
Custom Scripts

Extend AITraining with custom preprocessing and post-processing scripts.

Requirements

Custom scripts can use the libraries that ship with AITraining:
  • Python >= 3.8
  • pandas - data processing
  • torch - PyTorch for model operations
  • transformers - Hugging Face models
  • peft - LoRA/adapter operations
These are installed automatically with AITraining; a quick availability check is sketched below.
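
To verify the environment before running a custom script, here is a minimal sketch (the exact versions printed depend on your install):

# check_env.py
import sys
import pandas
import torch
import transformers
import peft

print(f"Python {sys.version_info.major}.{sys.version_info.minor}")
for lib in (pandas, torch, transformers, peft):
    # All four packages expose a __version__ attribute
    print(f"{lib.__name__} {lib.__version__}")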

Data Preprocessing

Custom Data Pipeline

# prepare_data.py
import json
import pandas as pd
from pathlib import Path

def load_and_clean(input_path):
    """加载并清理原始数据。"""
    df = pd.read_csv(input_path)

    # Drop rows with missing text
    df = df.dropna(subset=['text'])

    # Clean up text and drop very short entries
    df['text'] = df['text'].str.strip()
    df = df[df['text'].str.len() > 10]

    return df

def convert_to_chat_format(df, text_col='text'):
    """转换为 SFT 的聊天格式。"""
    conversations = []

    for _, row in df.iterrows():
        conversations.append({
            "messages": [
                {"role": "user", "content": "Continue this text:"},
                {"role": "assistant", "content": row[text_col]}
            ]
        })

    return conversations

def main():
    # Load data
    df = load_and_clean("raw_data.csv")

    # Convert to chat format
    conversations = convert_to_chat_format(df)

    # Save
    output_path = Path("processed_data.jsonl")
    with open(output_path, 'w') as f:
        for conv in conversations:
            f.write(json.dumps(conv) + '\n')

    print(f"Saved {len(conversations)} conversations")

if __name__ == "__main__":
    main()
Usage:
python prepare_data.py
aitraining llm --train --data-path ./processed_data.jsonl ...
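
Each line of processed_data.jsonl is one standalone JSON object. For an illustrative CSV row whose text column reads "The sky was clear that morning.", the script emits:

{"messages": [{"role": "user", "content": "Continue this text:"}, {"role": "assistant", "content": "The sky was clear that morning."}]}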

DPO Data Preparation

# prepare_dpo_data.py
import json
from pathlib import Path

def create_dpo_pairs(preference_data):
    """将偏好数据转换为 DPO 格式。"""
    dpo_data = []

    for item in preference_data:
        dpo_data.append({
            "prompt": item["question"],
            "chosen": item["preferred_answer"],
            "rejected": item["rejected_answer"]
        })

    return dpo_data

def main():
    # Load preference annotations
    with open("preferences.json") as f:
        data = json.load(f)

    # Convert to DPO format
    dpo_data = create_dpo_pairs(data)

    # Save
    with open("dpo_data.jsonl", 'w') as f:
        for item in dpo_data:
            f.write(json.dumps(item) + '\n')

    print(f"Created {len(dpo_data)} DPO pairs")

if __name__ == "__main__":
    main()
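
The script expects preferences.json to contain a list of annotations with question, preferred_answer, and rejected_answer keys. An illustrative entry such as:

{"question": "What is the capital of France?", "preferred_answer": "Paris.", "rejected_answer": "Lyon."}

becomes this line in dpo_data.jsonl:

{"prompt": "What is the capital of France?", "chosen": "Paris.", "rejected": "Lyon."}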

Evaluation Scripts

Custom Evaluation

# evaluate_model.py
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_path):
    """加载训练好的模型。"""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return model, tokenizer

def evaluate_prompts(model, tokenizer, prompts):
    """为评估提示生成响应。"""
    results = []

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=100)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        results.append({
            "prompt": prompt,
            "response": response
        })

    return results

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--prompts-file", default="eval_prompts.txt")
    args = parser.parse_args()

    # Load model
    model, tokenizer = load_model(args.model_path)

    # Load prompts
    with open(args.prompts_file) as f:
        prompts = [line.strip() for line in f if line.strip()]

    # Evaluate
    results = evaluate_prompts(model, tokenizer, prompts)

    # Save results
    with open("eval_results.json", 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Evaluated {len(results)} prompts")

if __name__ == "__main__":
    main()
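
Usage (the paths are illustrative):
python evaluate_model.py --model-path ./my-model-merged --prompts-file eval_prompts.txt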

Benchmarking Script

# benchmark.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def benchmark_model(model_path, prompts, num_runs=5):
    """基准测试模型推理速度。"""
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    if torch.cuda.is_available():
        model = model.cuda()

    times = []

    for _ in range(num_runs):
        start = time.time()

        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=50)

        times.append(time.time() - start)

    avg_time = sum(times) / len(times)
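    # Approximate throughput: assumes the full 50 new tokens are generated for every prompt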
    tokens_per_sec = len(prompts) * 50 / avg_time

    return {
        "avg_time": avg_time,
        "tokens_per_second": tokens_per_sec
    }
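
benchmark_model has no CLI of its own, so it needs a small driver. A minimal sketch, assuming benchmark.py is importable and ./my-model-merged is a trained checkpoint:

# run_benchmark.py (hypothetical driver)
from benchmark import benchmark_model

prompts = ["Summarize LoRA in one sentence.", "What does DPO optimize?"]
stats = benchmark_model("./my-model-merged", prompts, num_runs=3)
print(f"avg batch time: {stats['avg_time']:.2f}s")
print(f"throughput: {stats['tokens_per_second']:.1f} tokens/sec")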

Post-processing Scripts

Model Merging

# merge_lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import argparse

def merge_and_save(base_model, adapter_path, output_path):
    """将 LoRA 适配器合并到基础模型中。"""
    # 加载基础模型
    model = AutoModelForCausalLM.from_pretrained(base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Load and merge the adapter
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()

    # Save the merged model
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print(f"Merged model saved to {output_path}")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", required=True)
    parser.add_argument("--adapter-path", required=True)
    parser.add_argument("--output-path", required=True)
    args = parser.parse_args()

    merge_and_save(args.base_model, args.adapter_path, args.output_path)

if __name__ == "__main__":
    main()
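Usage:
python merge_lora.py \
  --base-model google/gemma-3-270m \
  --adapter-path ./my-lora-model \
  --output-path ./merged-model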
Or use the built-in tool:
aitraining tools merge-llm-adapter \
  --base-model-path google/gemma-3-270m \
  --adapter-path ./my-lora-model \
  --output-folder ./merged-model
The built-in tool also supports the --token, --pad-to-multiple-of, and --push-to-hub flags. You must specify either --output-folder or --push-to-hub.
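For example, to publish the merged model directly instead of writing it locally (assuming --push-to-hub is a boolean switch, as its name suggests):
aitraining tools merge-llm-adapter \
  --base-model-path google/gemma-3-270m \
  --adapter-path ./my-lora-model \
  --push-to-hub \
  --token $HF_TOKEN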

Integration Example

Complete Training Script

#!/bin/bash
# train_with_scripts.sh

set -e

# 1. Prepare data
echo "Preparing data..."
python scripts/prepare_data.py

# 2. Train the model
echo "Training..."
aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path processed_data.jsonl \
  --project-name my-model \
  --peft

# 3. Merge the adapter
echo "Merging adapter..."
python scripts/merge_lora.py \
  --base-model google/gemma-3-270m \
  --adapter-path my-model \
  --output-path my-model-merged

# 4. Evaluate
echo "Evaluating..."
python scripts/evaluate_model.py \
  --model-path my-model-merged

echo "Done!"

Next Steps