模型评分

大语言模型微调及其应用的探索 跟踪前沿的技术

模型评分

使用开源大模型给自己训练的模型评分

一、接口

import json
import urllib.request

def query_model(prompt, model="qwen3:latest", url="http://192.168.9.179:11434/api/chat"):
    """Send a single-turn chat request and return the model's complete reply.

    The request body follows the Ollama ``/api/chat`` format. The response is
    read as newline-delimited JSON and the ``message.content`` fragment of
    every line is concatenated in arrival order.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model name placed in the request body.
        url: Chat endpoint that receives the POST request.

    Returns:
        The concatenated content of all response lines.

    Raises:
        RuntimeError: If a response line carries an ``error`` field.
        json.JSONDecodeError: If a non-blank response line is not valid JSON.
        KeyError: If a response line has neither ``error`` nor
            ``message.content``.
    """
    data = {
        "model": model,
        # The Ollama API documents this key as "options" (plural); with the
        # former "option" spelling the seed/temperature settings below were
        # presumably never applied.
        "options": {
            "seed": 123,  # for deterministic responses
            "temperature": 0,  # for deterministic responses
        },
        "messages": [
            {"role": "user", "content": prompt},
        ],
    }

    payload = json.dumps(data).encode("utf-8")
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    chunks = []
    with urllib.request.urlopen(request) as response:
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # Blank separator lines carry no JSON; skip instead of crashing.
                continue
            response_json = json.loads(line)
            if "error" in response_json:
                # Report the server's own message rather than a bare KeyError.
                raise RuntimeError(
                    f"Model server returned an error: {response_json['error']}"
                )
            chunks.append(response_json["message"]["content"])
    return "".join(chunks)


# Quick check of the endpoint: send a short greeting and show the reply.
result = query_model(prompt="你好啊!!!")
print(result)


二、评分代码实现

# Load the instruction dataset; the loop below reads the 'output' and
# 'model_response' fields of each entry.
with open("instruction-data-with-response.json", "r", encoding="utf-8") as f:
    jsondata = json.load(f)

# Score the first three entries as a spot check and print the reference
# answer, the model's answer and the judge's score side by side.
# NOTE(review): format_input is not defined in this section; it must be
# provided elsewhere before this runs.
for entry in jsondata[:3]:
    prompt = (
        f" 给定一个输入: `{format_input(entry)}` "
        f" 正确的输出为: `{entry['output']}`, "
        f" 模型给的输出为: `{entry['model_response']}`"
        f" 请为模型的输出打分,0表示最差,100表示最好,只给出分数。"
    )
    print("\nDataset response:")
    print(">>", entry["output"])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    # query_model is the request helper defined in this file; the previous
    # call to chat_model referred to a name that is not defined here.
    print(">>", query_model(prompt))
    print("\n-------------------------")




#测试评分

from tqdm import tqdm
def generate_model_scores(json_data):
    """Ask the judge model for an integer score for every dataset entry.

    For each entry a prompt is built from the formatted input, the reference
    output and the model's response, and sent through ``query_model``. Replies
    that cannot be converted with ``int()`` are reported on stdout and left
    out of the result.

    Args:
        json_data: Iterable of entries; each entry is indexed with the keys
            ``'output'`` and ``'model_response'`` and passed to
            ``format_input``.

    Returns:
        List of integer scores, one per reply that parsed successfully, so it
        may be shorter than ``json_data``.
    """
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
            # This slot holds the formatted *input*, so the label says
            # "输入" (the earlier "输出" wording mislabelled it).
            f"给定一个输入: `{format_input(entry)}` "
            f"正确的输出为: `{entry['output']}`, "
            f"模型给的输出为: `{entry['model_response']}`"
            f" 请为模型的输出打分,0表示最差,100表示最好,只给出分数。"
        )
        # query_model is the request helper defined in this file; chat_model
        # is not defined here.
        score = query_model(prompt)
        try:
            scores.append(int(score))
        except ValueError:
            # Keep going: one unparsable reply should not abort the whole run.
            print(f"Could not convert score: {score}")

    return scores

# Score the whole dataset and summarise the result.
scores = generate_model_scores(jsondata)
print(f"Number of scores: {len(scores)} of {len(jsondata)}")
if scores:
    print(f"Average score: {sum(scores)/len(scores):.2f}\n")
else:
    # No reply could be parsed as an integer; avoid dividing by zero.
    print("Average score: n/a (no scores could be parsed)\n")