本文共 4764 字,大约阅读时间需要 15 分钟。
Python爬虫实现百度学术资源获取
为了抓取百度学术(Baidu Scholar)上的论文或研究资料,可以通过Python编写爬虫脚本。以下是一个示例实现及其说明。
代码示例:
import json
import os

import requests

# Base endpoint of the Baidu Scholar search page; query arguments are supplied
# separately so that requests can percent-encode them.
_SEARCH_URL = 'https://xueshu.baidu.com/s'

# Seconds to wait for the server before giving up. Without a timeout,
# requests may block indefinitely on an unresponsive connection.
_REQUEST_TIMEOUT = 10

# SECURITY: the original listing hard-coded a complete logged-in Baidu session
# cookie (including BDUSS / BDUSS_BFESS tokens) in the source. Credentials must
# not live in code, so the cookie is read from the BAIDU_XUESHU_COOKIE
# environment variable instead. Paste your own browser cookie string there if
# the site requires one; when the variable is unset no Cookie header is sent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
}
_cookie = os.environ.get('BAIDU_XUESHU_COOKIE')
if _cookie:
    headers['Cookie'] = _cookie


def bai_du_scholar爬虫(content, begin, end):
    """Fetch Baidu Scholar search pages for a keyword and print the raw HTML.

    One GET request is issued for every ``pn`` value in ``begin..end``
    (inclusive), using the module-level ``headers``. Each response is decoded
    as UTF-8 and its body is printed to stdout; nothing is parsed or stored.

    Args:
        content: Search keyword, sent as the ``wd`` query parameter.
        begin: First value of the ``pn`` query parameter (the original
            docstring calls this the starting page number).
        end: Last value of the ``pn`` query parameter, inclusive (the
            original docstring calls this the ending page number).

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    # NOTE(review): ``pn`` is passed through unchanged, exactly as in the
    # original. Confirm against the site whether it is a page index or a
    # result offset before relying on the pagination.
    for pn in range(begin, end + 1):
        # Passing the query through ``params`` percent-encodes the keyword, so
        # non-ASCII text, spaces, '&' or '#' in ``content`` cannot corrupt the
        # URL the way raw f-string interpolation did.
        query = {
            'wd': content,
            'pn': pn,
            'tn': 'SE_baiduxueshu_c1gjeupa',
        }
        response = requests.get(
            _SEARCH_URL,
            params=query,
            headers=headers,
            timeout=_REQUEST_TIMEOUT,
        )
        response.encoding = 'utf-8'
        print(f'正在抓取第{pn}页')
        print(response.text)

# 功能说明:
注意事项:
该脚本可以根据实际需求进行调整,适用于个人学习或小范围的网页抓取。
转载地址:http://aeun.baihongyu.com/