Sep-11-2022, 06:17 PM
I have a python crawler that downloads Baidu index data for the search keywords I provide. It works for most Chinese characters. When it comes to some characters, python shows the "invalid literal for int() with base 10: ''" error. I have checked several times and make sure there is no empty space, but the error remains.
The code:
It would be really appreciated if someone could help! Thanks!
The code:
import requests
import json
from datetime import date, timedelta
import pandas as pd
class DownloadBaiDuIndex(object):
def __init__(self, cookie):
self.cookie = cookie
self.headers = {
"Connection": "keep-alive",
"Accept": "application/json, text/plain, */*",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Dest": "empty",
"Referer": "https://index.baidu.com/v2/main/index.html",
"Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6,ja;q=0.5,ru;q=0.4",
'Cookie': self.cookie,
"Host": "index.baidu.com",
"X-Requested-With": "XMLHttpRequest",
"Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
}
def decrypt(self, ptbk, index_data):
n = len(ptbk) // 2
a = dict(zip(ptbk[:n], ptbk[n:]))
return "".join([a[s] for s in index_data])
def get_index_data_json(self, keys, start=None, end=None):
words = [[{"name":key, "wordType":1}] for key in keys]
words = str(words).replace(" ", "").replace("'", "\"")
url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}'
print(words, start, end)
res = requests.get(url, headers=self.headers)
data = res.json()['data']
uniqid = data['uniqid']
url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
res = requests.get(url, headers=self.headers)
ptbk = res.json()['data']
result = {}
result["startDate"] = start
result["endDate"] = end
for userIndexe in data['userIndexes']:
name = userIndexe['word'][0]['name']
tmp = {}
index_all = userIndexe['all']['data']
index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")]
tmp["all"] = index_all_data
index_pc = userIndexe['pc']['data']
index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")]
tmp["pc"] = index_pc_data
index_wise = userIndexe['wise']['data']
index_wise_data = [int(e)
for e in self.decrypt(ptbk, index_wise).split(",")]
tmp["wise"] = index_wise_data
result[name] = tmp
return result
def GetIndex(self, keys, start=None, end=None):
today = date.today()
if start is None:
start = str(today - timedelta(days=8))
if end is None:
end = str(today - timedelta(days=2))
try:
raw_data = self.get_index_data_json(keys=keys, start=start, end=end)
raw_data = pd.DataFrame(raw_data[keys[0]])
raw_data.index = pd.date_range(start=start, end=end)
except Exception as e:
print(e)
raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
finally:
return raw_data
# Baidu Index Crawler
appended_data = []
names = ["万科", "宝钢股份", "宁波华翔", "邯郸钢铁"]
cookie = 'PSTM=1493135429; BIDUPSID=4E6249E17CE020DD96051F17E859065E; MCITY=-:; __yjs_duid=1_f2bdcf71928819b290246dfb5ec1f88b1627719752767; BAIDUID=EDE52BAB587B1289E4CFE6876999D70E:FG=1; BDUSS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV; ai_user=TU6P0S+hXPuepmudkzF3ab|2022-09-07T06:20:41.482Z; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1662531642; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc={"uid_":{"value":"2737153346","scope":1}}; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=ah840l8lal818k052kakma061hhppa518; ZFY=haOdTA3raQNehkj7kdTtwgp2mbl7MGaNDvXqpzATSJ0:C; BAIDUID_BFESS=EDE52BAB587B1289E4CFE6876999D70E:FG=1; delPer=0; PSINO=7; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_PSSID=36544_37117_37361_36885_37274_36806_36786_37244_37260_26350_37232; bdindexid=rn8uincmt59ojbfomaj3ijas86; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04129204022rfXD6ExmZPpUJhdFgUNKL4Qf58vyWFT2S9WLV9yYMYQuaAJmU3S1E39ned8eaf1lkOvzmrGB1ag+Jlrwf6nqg7TMyOdUIrHPpXAZtXo0rHsdOrEqtHM/bZHpdWofRX9/yLcabgvvrvlMoQqIlWgvw+ONVPggUTw1e2w+bt/EwSzYDQ1Yg467d2OHikWxK4pp63uhKDengOWFOCsY+vk7ptdZXsAvh2eijgfMNHfhdQq/zVIH9NFTRhT+pgZIC93eFJsp67DzV7YvSjinm0rZhA==60986356220794518584054979528195; __cas__rn__=412920402; __cas__st__212=10292876e9f34179535097291f4887e79777d53e349fc22b264d7913546801e104d3983195722c1cbba597de; __cas__id__212=42237265; CPID_212=42237265; CPTK_212=428319642; ab_sr=1.0.1_YTBlMzg0ZGQ2N2Y2NDUwZGM0OTUyOGZiYTRiYzg2NTE5ZTY5OWVkZjExY2Q2YTExZmJkMjFjMTZiMjVhN2E2ODkwYzE4MjJiZDU4ZGZiOGU3YWYwYmM3NmVkOWVkOWIyNGVmZWQ5YTFkNjM2NzU5OWJmMGJiOTZkZjAwNTVkMzUyNmMyNWEyMDIxZmYwZThiMmEwZGMwODgzODE3OTRiYQ==; RT="z=1&dm=baidu.com&si=c706899d-6871-43a3-8e6d-2c363b5d090b&ss=l7wbr8mm&sl=0&tt=0&bcn=https://fclog.baidu.com/log/weirwood?type=perf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1662907967; BDUSS_BFESS=BFa0FJYXZuVVNUQUlDRC10Nmc4bVBGamIwU3NzZnRHfmp0ZGNKeHBUcWY4TlJpSVFBQUFBJCQAAAAAAAAAAAEAAABCpSWjeWxpdTMxNTA5MTIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJ9jrWKfY61iZV'
downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
for name in names:
data = downloadbaiduindex.GetIndex(keys=[name], start='2020-01-01', end='2020-12-31')
data['Ticker'] = name
appended_data.append(data)
appended_data = pd.concat(appended_data)
appended_data.to_csv('Baidu_Index.csv')For example, in the name list, the first two "万科", "宝钢股份" work but the last two "宁波华翔", "邯郸钢铁" don't. It would be really appreciated if someone could help! Thanks!
