基本思路
- 首先查看robots.txt,文档说明是允许爬虫的
- 查找接口:第一次对整个页面进行了爬取,发现返回的内容并不包含我们所需要的信息。说明这部分的信息是通过AJAX渲染的,这就需要我们去查找接口。按F12打开开发者模式,寻找数据相对应的API接口。如下图:
- 查看请求头:
发现是GET请求,使用python之中的requests库对该接口模拟发送GET请求。
返回的内容是 JSONP 格式:JSON 数据被包裹在一个 jQuery 回调函数的调用之中,例如:
jQuery1124009615462983953937_1680183926555({"rc":0,"rt":6,"svr":182994499,"lt":1,"full":1,"dlmkts":"","data":{"total":123,"diff":[{"f1":2,"f2":2.95,"f3":12.6,"f4":0.33,"f5":163315,"f6":46468306.0,"f7":18.32,"f8":12.28,"f9":-1.9,"f10":1.88,"f11":0.0,"f12":"688086","f13":1,"f14":"*ST紫晶","f15":3.0,"f16":2.52,"f17":2.64,"f18":2.62,"f20":561625434,"f21":392266391,"f22":0.0,"f23":0.44,"f24":18.95,"f25":21.9,"f45":-295363225.26,"f62":4176541.0,"f115":-1.27,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":18.54,"f3":10.03,"f4":1.69,"f5":46483,"f6":85118291.46,"f7":5.7,"f8":7.72,"f9":18.55,"f10":4.15,"f11":0.0,"f12":"003013","f13":0,"f14":"地铁设计","f15":18.54,"f16":17.58,"f17":17.64,"f18":16.85,"f20":7416185400,"f21":1116235852,"f22":0.0,"f23":3.43,"f24":20.78,"f25":20.0,"f45":399837098.45,"f62":29303477.0,"f115":18.55,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":8.36,"f3":6.91,"f4":0.54,"f5":452302,"f6":373778816.0,"f7":11.13,"f8":3.58,"f9":18.09,"f10":3.53,"f11":0.72,"f12":"603300","f13":1,"f14":"华铁应急","f15":8.6,"f16":7.73,"f17":7.82,"f18":7.82,"f20":11600104487,"f21":10555104487,"f22":0.12,"f23":2.53,"f24":32.07,"f25":31.86,"f45":641292635.3,"f62":56773066.0,"f115":18.09,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":14.53,"f3":6.29,"f4":0.86,"f5":2399600,"f6":3391859221.4,"f7":14.41,"f8":26.1,"f9":116.24,"f10":1.22,"f11":0.48,"f12":"300166","f13":0,"f14":"东方国信","f15":15.13,"f16":13.16,"f17":13.52,"f18":13.67,"f20":16744102280,"f21":13359031528,"f22":0.55,"f23":2.4,"f24":85.81,"f25":81.85,"f45":108039435.05,"f62":87945664.0,"f115":84.19,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":39.08,"f3":3.8,"f4":1.43,"f5":765067,"f6":2945965192.79,"f7":9.0,"f8":9.09,"f9":108.5,"f10":1.26,"f11":0.0,"f12":"002268","f13":0,"f14":"电科网安","f15":39.99,"f16":36.6,"f17":37.41,"f18":37.65,"f20":33056857645,"f21":32877363987,"f22":0.0,"f23":6.32,"f24":30.22,"f25":28.01,"f45":304681465.04,"f62":162384768.0,"f115":122
.63,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":13.63,"f3":2.25,"f4":0.3,"f5":241620,"f6":323482342.0,"f7":4.8,"f8":9.51,"f9":-26.06,"f10":0.93,"f11":0.22,"f12":"600706","f13":1,"f14":"曲江文旅","f15":13.65,"f16":13.01,"f17":13.21,"f18":13.33,"f20":3476464870,"f21":3462431422,"f22":0.07,"f23":3.13,"f24":3.26,"f25":2.17,"f45":-100057560.45,"f62":-1907264.0,"f115":-50.67,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":16.78,"f3":1.76,"f4":0.29,"f5":469306,"f6":781530101.0,"f7":6.97,"f8":7.94,"f9":-216.25,"f10":1.1,"f11":-0.24,"f12":"603636","f13":1,"f14":"南威软件","f15":17.15,"f16":16.0,"f17":16.47,"f18":16.49,"f20":9913516239,"f21":9913516239,"f22":-0.18,"f23":3.94,"f24":26.07,"f25":22.13,"f45":-34381460.78,"f62":67686358.0,"f115":91.47,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":50.34,"f3":1.59,"f4":0.79,"f5":22478,"f6":112178689.0,"f7":4.04,"f8":1.33,"f9":88.04,"f10":0.52,"f11":0.06,"f12":"603383","f13":1,"f14":"顶点软件","f15":50.6,"f16":48.6,"f17":49.11,"f18":49.55,"f20":8622210835,"f21":8500841095,"f22":0.0,"f23":6.88,"f24":28.06,"f25":15.35,"f45":73449280.72,"f62":2351546.0,"f115":63.86,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":17.88,"f3":0.96,"f4":0.17,"f5":195218,"f6":347295198.74,"f7":7.4,"f8":17.33,"f9":42.27,"f10":1.5,"f11":0.06,"f12":"300605","f13":0,"f14":"恒锋信息","f15":18.37,"f16":17.06,"f17":17.42,"f18":17.71,"f20":2940598154,"f21":2014413993,"f22":0.0,"f23":5.24,"f24":25.74,"f25":30.04,"f45":52170587.42,"f62":-10049982.0,"f115":51.71,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":19.5,"f3":0.57,"f4":0.11,"f5":832599,"f6":1663215656.91,"f7":9.44,"f8":11.5,"f9":117.11,"f10":2.06,"f11":-0.66,"f12":"300188","f13":0,"f14":"美亚柏科","f15":20.95,"f16":19.12,"f17":20.36,"f18":19.39,"f20":16759807545,"f21":14122788323,"f22":0.05,"f23":3.99,"f24":51.63,"f25":51.63,"f45":143116900.0,"f62":-65400698.0,"f115":239.96,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"
f1":2,"f2":32.81,"f3":0.46,"f4":0.15,"f5":76034,"f6":249306989.73,"f7":5.45,"f8":5.84,"f9":41.14,"f10":0.85,"f11":0.03,"f12":"002987","f13":0,"f14":"京北方","f15":33.7,"f16":31.92,"f17":32.64,"f18":32.66,"f20":10331812140,"f21":4274869135,"f22":0.06,"f23":4.84,"f24":26.53,"f25":23.39,"f45":188347683.98,"f62":6321601.0,"f115":44.43,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":12.34,"f3":0.33,"f4":0.04,"f5":209484,"f6":267202094.35,"f7":7.4,"f8":15.36,"f9":-36.99,"f10":1.03,"f11":-0.64,"f12":"300588","f13":0,"f14":"熙菱信息","f15":13.24,"f16":12.33,"f17":12.37,"f18":12.3,"f20":2373858510,"f21":1683028290,"f22":-0.08,"f23":5.47,"f24":37.88,"f25":33.98,"f45":-48126003.15,"f62":-14190708.0,"f115":-20.88,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":41.38,"f3":0.15,"f4":0.06,"f5":224359,"f6":931478847.2,"f7":4.02,"f8":3.82,"f9":156.35,"f10":0.66,"f11":-0.07,"f12":"002368","f13":0,"f14":"太极股份","f15":42.34,"f16":40.68,"f17":41.3,"f18":41.32,"f20":24484461957,"f21":24320090501,"f22":-0.02,"f23":6.95,"f24":51.58,"f25":47.1,"f45":117453861.33,"f62":-2519416.0,"f115":60.54,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":25.7,"f3":-0.08,"f4":-0.02,"f5":13701,"f6":35026723.0,"f7":2.99,"f8":1.53,"f9":17.47,"f10":0.43,"f11":0.08,"f12":"002889","f13":0,"f14":"东方嘉盛","f15":25.95,"f16":25.18,"f17":25.44,"f18":25.72,"f20":3549206725,"f21":2308611828,"f22":0.0,"f23":1.72,"f24":12.08,"f25":13.62,"f45":152412263.98,"f62":217898.0,"f115":16.58,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":13.23,"f3":-0.15,"f4":-0.02,"f5":69280,"f6":91845900.28,"f7":2.19,"f8":1.73,"f9":247.43,"f10":0.53,"f11":0.0,"f12":"002090","f13":0,"f14":"金智科技","f15":13.35,"f16":13.06,"f17":13.2,"f18":13.25,"f20":5348425103,"f21":5285042145,"f22":0.0,"f23":4.34,"f24":12.4,"f25":8.89,"f45":16211813.05,"f62":4414167.0,"f115":44.69,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":22.5,"f3":-0.22,"f4":-0.05,"f5":241914,"f6":54963594
3.21,"f7":5.76,"f8":9.26,"f9":626.65,"f10":0.67,"f11":0.0,"f12":"300052","f13":0,"f14":"中青宝","f15":23.38,"f16":22.08,"f17":22.46,"f18":22.55,"f20":5891820975,"f21":5877321255,"f22":-0.18,"f23":9.09,"f24":19.3,"f25":12.22,"f45":7051614.23,"f62":20485472.0,"f115":-111.72,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.1,"f3":-0.24,"f4":-0.01,"f5":201454,"f6":82251709.05,"f7":2.43,"f8":1.7,"f9":-39.7,"f10":0.83,"f11":-0.24,"f12":"000863","f13":0,"f14":"三湘印象","f15":4.13,"f16":4.03,"f17":4.1,"f18":4.11,"f20":4937918886,"f21":4862646404,"f22":0.0,"f23":1.14,"f24":-6.39,"f25":-3.76,"f45":-93297369.65,"f62":414252.0,"f115":-16.36,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":9.25,"f3":-0.43,"f4":-0.04,"f5":57158,"f6":53343380.0,"f7":3.77,"f8":1.12,"f9":-25.16,"f10":0.81,"f11":-0.22,"f12":"603956","f13":1,"f14":"威派格","f15":9.5,"f16":9.15,"f17":9.23,"f18":9.29,"f20":4703027108,"f21":4703027108,"f22":0.0,"f23":2.39,"f24":26.02,"f25":23.99,"f45":-140193810.65,"f62":6863548.0,"f115":-54.9,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":73.0,"f3":-0.48,"f4":-0.35,"f5":104793,"f6":760460108.07,"f7":5.03,"f8":9.31,"f9":291.74,"f10":0.58,"f11":0.41,"f12":"300624","f13":0,"f14":"万兴科技","f15":74.44,"f16":70.75,"f17":72.15,"f18":73.35,"f20":9482703577,"f21":8216086417,"f22":0.63,"f23":11.26,"f24":154.09,"f25":148.64,"f45":24377989.07,"f62":-35126668.0,"f115":312.57,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":19.93,"f3":-0.5,"f4":-0.1,"f5":79585,"f6":158145745.03,"f7":3.05,"f8":2.24,"f9":47.03,"f10":0.64,"f11":0.1,"f12":"002150","f13":0,"f14":"通润装备","f15":20.14,"f16":19.53,"f17":20.0,"f18":20.03,"f20":7105384866,"f21":7078134079,"f22":0.15,"f23":4.51,"f24":-6.65,"f25":-2.92,"f45":151084400.3,"f62":-13451405.0,"f115":41.7,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2}]}});
使用正则表达式去掉外层的回调函数名和括号,即可得到标准的 JSON 字符串。
其中参数对应如下表:
参数 | 对应的名称 |
---|---|
f23 | 市净率 |
f14 | 股票名称 |
f12 | 股票代码 |
f9 | 市盈率 |
f2 | 最新价 |
f3 | 涨跌幅(百分比) |
f4 | 涨跌额 |
f5 | 成交量 |
f6 | 成交额 |
f7 | 振幅 |
f10 | 量比 |
f15 | 最高 |
f16 | 最低 |
f17 | 今开 |
f18 | 昨收 |
将 JSON 中 data.diff 列表转换为 pandas 的 DataFrame,并按上表重命名各列,最后导出为 Excel 文件。
代码
# %%
import json
import re

import pandas as pd
import requests
# Maps the API's opaque field ids to the column headers used in the exported
# spreadsheet. Fields that are not listed keep their raw "fNN" name.
dic = {
    'f23': '市净率',
    'f14': '股票名称',
    'f12': '股票代码',
    'f9': '市盈率',
    'f2': '最新价',
    'f3': '涨跌幅(百分比)',
    'f4': '涨跌额',
    'f5': '成交量',
    'f6': '成交额',
    'f7': '振幅',
    'f10': '量比',
    'f15': '最高',
    'f16': '最低',
    'f17': '今开',
    'f18': '昨收',
}

# Browser-like User-Agent sent with every request.
# NOTE: an earlier revision also sent a Cookie header copied from a logged-in
# douban.com browser session. Those cookies are unrelated to this endpoint and
# forwarding them to another host over plain HTTP leaks a session credential,
# so the header was removed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# Listing endpoint with the page number left as a placeholder. The callback
# name (cb) and the cache-buster (_) are taken from the first captured
# request; only `pn` varies between pages.
URL_TEMPLATE = (
    'http://19.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery1124009615462983953937_1680183926561'
    '&pn={pn}&pz=20&po=1&np=1'
    '&ut=bd1d9ddb04089700cf9c27f6f7426281'
    '&fltt=2&invt=2&wbp2u=%7C0%7C0%7C0%7Cweb&fid=f3'
    '&fs=b%3ABK1061+f%3A%2150'
    '&fields=f1%2Cf2%2Cf3%2Cf4%2Cf5%2Cf6%2Cf7%2Cf8%2Cf9%2Cf10%2Cf12%2Cf13'
    '%2Cf14%2Cf15%2Cf16%2Cf17%2Cf18%2Cf20%2Cf21%2Cf23%2Cf24%2Cf25%2Cf22'
    '%2Cf11%2Cf62%2Cf128%2Cf136%2Cf115%2Cf152%2Cf45'
    '&_=1680183926562'
)

# Number of pages to download (pages 1..PAGE_COUNT, 20 rows per page via pz=20).
PAGE_COUNT = 7

# Captures everything between the first "(" and the last ")" of a JSONP reply.
_JSONP_PAYLOAD = re.compile(r'\((.*)\)', re.DOTALL)


def build_url(page):
    """Return the listing URL for the 1-based page number *page*."""
    return URL_TEMPLATE.format(pn=page)


def parse_jsonp(text):
    """Extract the list of row dicts from a JSONP response body.

    Args:
        text: Raw response text of the form ``callback({...});``.

    Returns:
        The list stored under ``data.diff``, or an empty list when ``data``
        or ``diff`` is missing or null.

    Raises:
        ValueError: If *text* has no parenthesised payload or the payload is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    match = _JSONP_PAYLOAD.search(text)
    if match is None:
        raise ValueError('response is not a JSONP callback: %r' % text[:80])
    payload = json.loads(match.group(1))
    # `data` can be null rather than an object (presumably when the requested
    # page is past the end -- TODO confirm), so fall back to empty containers.
    data = payload.get('data') or {}
    return data.get('diff') or []


def rows_to_frame(rows):
    """Build a DataFrame from row dicts, renaming columns via ``dic``.

    The ``f1`` column is dropped when present; an empty *rows* list yields an
    empty DataFrame instead of raising.
    """
    frame = pd.DataFrame(rows).rename(columns=dic)
    return frame.drop(columns=['f1'], errors='ignore')


def func(request_url):
    """Download one page of the listing and return it as a DataFrame.

    Args:
        request_url: Full URL of the JSONP endpoint for a single page.

    Returns:
        A DataFrame with one row per stock, columns renamed via ``dic`` and
        the ``f1`` column removed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the body cannot be parsed as JSONP.
    """
    res = requests.get(request_url, headers=headers, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return rows_to_frame(parse_jsonp(res.text))


if __name__ == '__main__':
    # Fetch every page, stack them with a fresh 0..n-1 index, and export.
    df = pd.concat(
        [func(build_url(page)) for page in range(1, PAGE_COUNT + 1)],
        axis=0,
        ignore_index=True,
    )
    df.to_excel('finaloutput.xlsx')