基于 Python 多线程/多进程爬虫的 MAA 作业站技能使用分析
技能使用分析
多线程(8核)
import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool


def _tally_skills(content, out):
    """Add the skill choices found in one job's JSON ``content`` to ``out``.

    ``content`` is the JSON text of a job; its ``opers`` entry is a list of
    operators with a ``name`` and an optional ``skill`` number.  Every
    operator name gets an entry in ``out`` (``[0, 0, 0]`` when first seen);
    the slot ``skill - 1`` is incremented only when ``skill`` is 1, 2 or 3.
    """
    for oper in json.loads(content)["opers"]:
        counts = out.setdefault(oper["name"], [0, 0, 0])
        skill = oper.get("skill")
        # Guard the index: skill 0 would otherwise hit slot -1 (skill 3) and
        # anything above 3 would raise and abandon the rest of the job.
        if isinstance(skill, int) and 1 <= skill <= 3:
            counts[skill - 1] += 1


def maa(st):
    """Count operator skill usage over a contiguous range of copilot job ids.

    Args:
        st: ``[first_id, count]``.  Ids ``first_id`` .. ``first_id + count - 1``
            are requested one after another; a non-positive ``count`` requests
            nothing.

    Returns:
        dict mapping operator name to ``[n_skill1, n_skill2, n_skill3]``.
        Jobs that cannot be fetched or parsed are reported and skipped.
    """
    url = "https://prts.maa.plus/copilot/get/"
    first_id, count = st[0], st[1]
    out = {}
    for m, i in enumerate(range(first_id, first_id + count), start=1):
        print(i, "运行进度:" + str(m) + "/" + str(count))
        try:
            # A timeout keeps one stalled connection from hanging the worker.
            skills = requests.get(url=url + str(i), timeout=10).json()
            if skills["status_code"] == 200:
                _tally_skills(skills["data"]["content"], out)
        except (
            requests.RequestException,
            AttributeError,
            KeyError,
            TypeError,
            ValueError,
        ) as err:
            # Best effort: one bad job must not abort the whole range.
            print(i, "skipped:", repr(err))
    print(out)
    return out


start = 20000
end = 46625
pn = multiprocessing.cpu_count()  # number of workers in the pool

# Split the ids start..end (end treated as the last id to fetch) into pn
# contiguous [first_id, count] chunks.  The remainder is spread over the
# first chunks so no trailing ids are dropped by integer truncation.
total_ids = end - start + 1
base, extra = divmod(total_ids, pn)
print(base)
selist = []
next_id = start
for i in range(pn):
    size = base + 1 if i < extra else base
    selist.append([next_id, size])
    next_id += size
print(selist)

# Create the pool and run the crawler on every chunk.
print(multiprocessing.cpu_count())
pool = Pool(pn)
try:
    re = pool.map(maa, selist)
finally:
    # Shut the pool down even if a worker raised.
    pool.close()
    pool.join()
print(re)

# Merge the per-chunk {name: [s1, s2, s3]} tables into one.
addout = {}
for i in re:
    for j, per_skill in i.items():
        totals = addout.setdefault(j, [0, 0, 0])
        for k in range(3):
            totals[k] += per_skill[k]
print(addout)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
JSON 处理
import json
# Load the {operator: [skill1, skill2, skill3]} table written by the crawler.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data)

# Flatten it into one counter per "<operator><n>技能" label.
out = {}
for name in data:
    for slot, suffix in enumerate(("1技能", "2技能", "3技能")):
        label = name + suffix
        out[label] = out.get(label, 0) + data[name][slot]

# Rank the labels by usage, most used first, and print one pair per line.
out = list(out.items())
out.sort(key=lambda pair: pair[1], reverse=True)
for entry in out:
    print(entry)
多线程模式(容易死机)
import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool
import threading
from concurrent.futures import ThreadPoolExecutor

REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from pinning a thread


def _tally_skills(content, out):
    """Add the skill choices found in one job's JSON ``content`` to ``out``.

    Every operator name in ``opers`` gets an entry in ``out`` (``[0, 0, 0]``
    when first seen); slot ``skill - 1`` is incremented only when ``skill``
    is 1, 2 or 3, so an out-of-range value cannot hit the wrong slot.
    """
    for oper in json.loads(content)["opers"]:
        counts = out.setdefault(oper["name"], [0, 0, 0])
        skill = oper.get("skill")
        if isinstance(skill, int) and 1 <= skill <= 3:
            counts[skill - 1] += 1


def fetch_title(url, results, index):
    """Fetch one job and store its skill counts in ``results[index]``.

    Args:
        url: full job URL to request.
        results: shared list; slot ``index`` always receives a dict mapping
            operator name to ``[n_skill1, n_skill2, n_skill3]`` (empty when
            the job is missing, unreachable or malformed).
        index: position in ``results`` owned by this call.

    The module-level ``progress`` counter is advanced under ``lock`` once per
    call, whether or not the job was usable, so it can reach ``total_work``.
    """
    global progress
    out = {}
    try:
        skills = requests.get(url, timeout=REQUEST_TIMEOUT).json()
        if skills["status_code"] == 200:
            print(skills["data"]["id"])
            _tally_skills(skills["data"]["content"], out)
    except (
        requests.RequestException,
        AttributeError,
        KeyError,
        TypeError,
        ValueError,
    ) as err:
        # Best effort: report and keep whatever was counted so far.
        print(f"{url} skipped: {err!r}")
    finally:
        results[index] = out
        with lock:
            progress += 1
            print(f"Progress: {progress}/{total_work}")


def maa(st, max_threads=16):
    """Fetch a contiguous range of job ids with a bounded number of threads.

    Args:
        st: ``[first_id, count]``; ids ``first_id`` .. ``first_id + count - 1``.
        max_threads: upper bound on concurrent requests for this call.  The
            earlier one-thread-per-URL approach started thousands of threads
            at once.

    Returns:
        list with one per-job dict per id, in id order.
    """
    url = "https://prts.maa.plus/copilot/get/"
    urls = [url + str(i) for i in range(st[0], st[0] + st[1])]
    results = [None] * len(urls)
    if not urls:
        return results
    # Leaving the with-block waits for every submitted fetch to finish.
    with ThreadPoolExecutor(max_workers=max(1, max_threads)) as executor:
        for i, job_url in enumerate(urls):
            executor.submit(fetch_title, job_url, results, i)
    return results


progress = 0
lock = threading.Lock()  # guards the shared progress counter
start = 20000
end = 46625
total_work = end - start + 1  # ids start..end, both included
pn = multiprocessing.cpu_count()  # number of workers in the pool

# Split the total_work ids into pn contiguous [first_id, count] chunks.  The
# remainder is spread over the first chunks so the chunks add up to exactly
# total_work and no trailing ids are lost to integer truncation.
base, extra = divmod(total_work, pn)
print(base)
selist = []
next_id = start
for i in range(pn):
    size = base + 1 if i < extra else base
    selist.append([next_id, size])
    next_id += size
print(selist)

# Create the pool and run the crawler on every chunk.
print(pn)
pool = Pool(pn)
try:
    re = pool.map(maa, selist)
finally:
    # Shut the pool down even if a worker raised.
    pool.close()
    pool.join()
print(re)

# Merge every job's {name: [s1, s2, s3]} table from every chunk.  Reading
# only re[0] would discard all chunks but the first.
addout = {}
for chunk in re:
    for i in chunk:
        if not i:
            # None or {}: the job was missing or could not be parsed.
            continue
        for j, per_skill in i.items():
            totals = addout.setdefault(j, [0, 0, 0])
            for k in range(3):
                totals[k] += per_skill[k]
print(addout)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
出场率分析(单作业)
出场率分析(结合访问量)
出场率分析代码
import json

import requests

first_num = 20000
limit = 50
page = 1
url = "https://prts.maa.plus/copilot/query?desc=true&limit=%d&page=%d&order_by=id"
last_url = "https://prts.maa.plus/copilot/query?desc=true&limit=1&page=1&order_by=id"
result = {}
model = 0  # 0: count appearances, 1: weight each appearance by the job's views
if model == 0:
    txt_name = "result_num.txt"
else:
    txt_name = "result_view.txt"

# Ask for the single newest job to learn the highest id.
last_get = requests.get(last_url, timeout=10).json()
print(last_get)
last_num = last_get['data']['data'][0]['id']
print("maa最新ID:" + str(last_num))

# NOTE(review): the page count is estimated from the id span and the upper
# bound is exclusive, so the final estimated page is never requested; ids are
# also never compared with first_num. Confirm the intended stop condition.
for i in range(1, (last_num - first_num) // limit):
    print("第%d次请求" % i)
    try:
        maa_json = requests.get(url % (limit, i), timeout=10).json()
    except (requests.RequestException, ValueError):
        print(str(i) + "请求失败")
        break
    print(maa_json)
    if maa_json.get("status_code") != 200 or maa_json['data']['data'] == []:
        print(str(i) + "请求失败")
        break

    # Walk every job on the page (a limit - 1 bound would skip the last one).
    for job in maa_json['data']['data']:
        try:
            # The content field is JSON text from the server. Parse it with
            # json.loads rather than eval, which would execute whatever
            # expression the response contains.
            content = json.loads(job['content'])
            names = [m['name'] for m in content['opers']]
            weight = job['views'] if model == 1 else 1
        except (KeyError, TypeError, ValueError) as err:
            # One malformed job must not abort the whole run.
            print("skipped job:", repr(err))
            continue
        for name in names:
            result[name] = result.get(name, 0) + weight

    # Checkpoint once per page instead of rewriting the file for every job.
    with open(txt_name, "w", encoding="utf-8") as f:
        for n in result:
            f.write(str(n) + str(result[n]) + "\n")

# Final report: (name, score) pairs, highest score first.
result = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(result)
with open(txt_name, "w", encoding="utf-8") as f:
    for n in result:
        f.write(str(n) + "\n")