# LLM automated evaluation (LLM自动化评测)
#
# Dataset used: ceval-exam (使用的数据集：ceval-exam)

import requests
from datasets import load_dataset, concatenate_datasets
import re
from tqdm import tqdm
import re, time, tiktoken, ollama
from ollama import ChatResponse
from ollama import Options


def llm(model, query, temperature=0.6, stream=False, encoding=None, max_tokens=None):
    """Ask an Ollama chat model one single-choice question and return its reply.

    The request carries a fixed Chinese system prompt instructing the model to
    output only the option letter, followed by ``query`` as the user message.

    Args:
        model: Model name forwarded to ``ollama.chat``.
        query: Text of the user message (the formatted question).
        temperature: Sampling temperature placed in the request ``Options``.
        stream: When true, the reply is consumed chunk by chunk so that it can
            be cut short once ``max_tokens`` is exceeded.
        encoding: Object with an ``encode(str)`` method used to count the
            tokens streamed so far. When omitted, tiktoken's "gpt-4" encoding
            is loaded, but only if a streaming token cap is actually in
            effect (previously it was loaded at import time as a default
            argument).
        max_tokens: Optional cap on the streamed reply length, measured with
            ``encoding``. Ignored when ``stream`` is false.

    Returns:
        The reply text with ``<think>...</think>`` sections removed and
        surrounding whitespace stripped. If a ``<think>`` section was opened
        but never closed (e.g. the stream was cut by ``max_tokens``), the
        unfinished reasoning is discarded as well, so callers never parse
        reasoning text as if it were the final answer.
    """
    options = Options(
        temperature=temperature,
        num_gpu=0,  # num_gpu=0 means the computation runs on the CPU
        # num_thread=32,
        # num_ctx=4096,  # context window size
    )
    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "你是一个做题专家。请完成下列单项选择题。\n\n## output format\n只能输出一个选项编号字母，不要有解析等其他任何内容。",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        options=options,
        stream=stream,
        # NOTE(review): keep_alive=0 is forwarded unchanged; per the Ollama docs
        # this unloads the model right after the request — confirm that is intended.
        keep_alive=0,
    )

    if stream:
        capped = max_tokens is not None
        if capped and encoding is None:
            # Resolve the tokenizer lazily: only a capped streaming call needs it.
            encoding = tiktoken.encoding_for_model("gpt-4")
        text = ""
        for chunk in response:
            text += chunk["message"]["content"]
            # The accumulated text is re-encoded so the count reflects the
            # whole reply rather than a sum of per-chunk token counts.
            if capped and len(encoding.encode(text)) > max_tokens:
                break
    else:
        text = response["message"]["content"]

    # Remove completed reasoning sections emitted by "thinking" models.
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # A leftover opening tag means the reasoning never finished; nothing after
    # it is a final answer, so drop it instead of returning raw reasoning.
    unfinished = text.find("<think>")
    if unfinished != -1:
        text = text[:unfinished]
    return text.strip()


# C-Eval subject identifiers. Each one is passed as the ``name`` argument of
# ``load_dataset`` by ``test_split``; the order is index-aligned with
# ``task_chinese_name_list``.
task_list = [
    "computer_network",
    "operating_system",
    "computer_architecture",
    "college_programming",
    "college_physics",
    "college_chemistry",
    "advanced_mathematics",
    "probability_and_statistics",
    "discrete_mathematics",
    "electrical_engineer",
    "metrology_engineer",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "middle_school_mathematics",
    "middle_school_biology",
    "middle_school_physics",
    "middle_school_chemistry",
    "veterinary_medicine",
    "college_economics",
    "business_administration",
    "marxism",
    "mao_zedong_thought",
    "education_science",
    "teacher_qualification",
    "high_school_politics",
    "high_school_geography",
    "middle_school_politics",
    "middle_school_geography",
    "modern_chinese_history",
    "ideological_and_moral_cultivation",
    "logic",
    "law",
    "chinese_language_and_literature",
    "art_studies",
    "professional_tour_guide",
    "legal_professional",
    "high_school_chinese",
    "high_school_history",
    "middle_school_history",
    "civil_servant",
    "sports_science",
    "plant_protection",
    "basic_medicine",
    "clinical_medicine",
    "urban_and_rural_planner",
    "accountant",
    "fire_engineer",
    "environmental_impact_assessment_engineer",
    "tax_accountant",
    "physician",
]
# Chinese display names of the subjects, index-aligned with ``task_list``.
task_chinese_name_list = [
    "计算机网络",
    "操作系统",
    "计算机架构",
    "大学编程",
    "大学物理",
    "大学化学",
    "高等数学",
    "概率与统计",
    "离散数学",
    "电气工程师",
    "计量工程师",
    "高中数学",
    "高中物理",
    "高中化学",
    "高中生物学",
    "中学数学",
    "中学生物学",
    "中学物理",
    "中学化学",
    "兽医学",
    "大学经济学",
    "工商管理",
    "马克思主义",
    "毛泽东思想",
    "教育科学",
    "教师资格",
    "高中政治",
    "高中地理",
    "中学政治",
    "中学地理",
    "现代中国史",
    "思想道德修养",
    "逻辑",
    "法律",
    "汉语与文学",
    "艺术研究",
    "专业旅游指南",
    "法律专业",
    "高中汉语",
    "高中历史",
    "中学历史",
    "公务员",
    "体育科学",
    "植物保护",
    "基础医学",
    "临床医学",
    "城市与农村规划",
    "会计",
    "消防工程师",
    "环境影响评估工程师",
    "税务会计",
    "医生",
]


def _extract_choice(model_answer):
    """Return the option letter (A/B/C/D) found in the model output, or None."""
    # Prefer an uppercase option letter that is not part of a longer ASCII
    # word, so "Answer: B" yields "B" rather than the "A" of "Answer".
    standalone = re.search(r"(?<![A-Za-z])[A-D](?![A-Za-z])", model_answer)
    if standalone:
        return standalone.group(0)
    # Fallback (the original rule): first A-D letter of the upper-cased text,
    # which also accepts a lowercase reply such as "b".
    fallback = re.search(r"[A-D]", model_answer.upper())
    return fallback.group(0) if fallback else None


def _format_accuracy(correct, total):
    """Render ``correct/total = percentage``, tolerating ``total == 0``."""
    ratio = f"{correct / total:.2%}" if total else "N/A"
    return f"{correct}/{total} = {ratio}"


def test_split(model_name, start_index=26):
    """Evaluate an Ollama model on C-Eval subjects and record the accuracy.

    For every subject from ``start_index`` onward, the ``dev`` and ``val``
    splits obtained from ``load_dataset("ceval/data", name=subject)`` are
    concatenated, each question is sent to ``llm`` as a four-option prompt,
    and the extracted option letter is compared with the item's ``answer``
    field. Per-subject and overall accuracy are printed and appended to
    ``<model_name>.txt`` (with ``:`` and ``/`` replaced by ``_``).

    Args:
        model_name: Ollama model name to evaluate.
        start_index: Non-negative index into ``task_list`` of the first
            subject to run. The default of 26 keeps the previously hard-coded
            starting point; pass 0 to evaluate every subject.

    Notes:
        A subject whose dataset fails to load is skipped and excluded from the
        totals. A question whose processing raises is reported and counted as
        incorrect.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    report_path = model_name.replace(":", "_").replace("/", "_") + ".txt"

    sum_total = 0
    sum_correct = 0
    subjects = zip(task_list[start_index:], task_chinese_name_list[start_index:])
    for i, (task, chinese_name) in enumerate(subjects, start=start_index):
        label = f"No.{i}: {task}({chinese_name})"

        # Load the dataset for this subject.
        try:
            dataset_tmp = load_dataset(r"ceval/data", name=task)
            dataset = concatenate_datasets([dataset_tmp["dev"], dataset_tmp["val"]])
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still aborts the run.
            print(f"\n{label}数据集加载失败: {exc!r}")
            continue
        print(f"\n{label}数据集加载完成, len(dataset)={len(dataset)}")

        correct = 0
        total = len(dataset)
        for item in tqdm(dataset, desc=f"No.{i}: Processing"):
            try:
                # Build the full question with its four options.
                user_prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\n答案："
                model_answer = llm(model_name, user_prompt, stream=True, encoding=encoding, max_tokens=4096)
                if _extract_choice(model_answer) == item["answer"]:
                    correct += 1
            except Exception as exc:
                # Best effort: a failed question stays in `total` as a miss.
                print(f"\nerror: {exc!r}")

        sum_total += total
        sum_correct += correct
        summary = f"{label}数据集准确率: {_format_accuracy(correct, total)}"
        print(summary)
        with open(report_path, "a", encoding="utf-8") as f:
            f.write(summary + "\n\n")

    overall = f"总准确率: {_format_accuracy(sum_correct, sum_total)}"
    with open(report_path, "a", encoding="utf-8") as f:
        f.write(overall + "\n\n")
    print(overall)


# huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M
# qwen2.5:3b-instruct-q8_0                           
# qwen2.5:7b-instruct-q5_K_M                         
# deepseek-r1-7b:latest 
# test_split(model_name="qwen2.5:3b-instruct-q8_0")
# test_split(model_name="qwen2.5:7b-instruct-q5_K_M")
# test_split(model_name="huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M")
# test_split(model_name="qwen2.5:1.5b")
# test_split(model_name="qwen2.5:1.5b-instruct-fp16")
# test_split(model_name="qwen2.5:3b")
# test_split(model_name="gemma3:4b")
# test_split(model_name="qwen2.5:7b")
# test_split(model_name="gemma3:4b-it-q8_0")
# test_split(model_name="qwen2.5:0.5b-instruct-fp16")
# test_split(model_name="qwen2.5:0.5b")
test_split(model_name="deepseek-r1:1.5b")
# test_split(model_name="deepseek-r1:1.5b-qwen-distill-fp16")
# test_split(model_name="deepseek-r1:7b")