自定义脚本
使用自定义预处理和后处理脚本扩展 AITraining。

要求

自定义脚本可以使用 AITraining 附带的库:
- Python >= 3.8
- pandas — 数据处理
- torch — PyTorch,用于模型操作
- transformers — Hugging Face 模型
- peft — LoRA/适配器操作
数据预处理
自定义数据流水线
复制
# prepare_data.py
import json
import pandas as pd
from pathlib import Path
def load_and_clean(input_path, text_col='text', min_len=10):
    """Load raw CSV data and clean its text column.

    Args:
        input_path: Path to the input CSV file.
        text_col: Name of the column holding the text (default ``'text'``,
            matching the previous hard-coded column).
        min_len: Rows whose stripped text is not longer than this are
            dropped (default 10, matching the previous hard-coded cutoff).

    Returns:
        A cleaned pandas DataFrame.
    """
    df = pd.read_csv(input_path)
    # Drop rows with a missing text value
    df = df.dropna(subset=[text_col])
    # Normalize surrounding whitespace, then filter out near-empty rows
    df[text_col] = df[text_col].str.strip()
    df = df[df[text_col].str.len() > min_len]
    return df
def convert_to_chat_format(df, text_col='text', prompt="Continue this text:"):
    """Convert a DataFrame into the chat format used for SFT.

    Args:
        df: DataFrame containing one text sample per row.
        text_col: Column holding the assistant-turn text.
        prompt: User-turn content paired with every sample (default
            ``"Continue this text:"``, matching the previous hard-coded
            string).

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per row, each with a
        user turn followed by an assistant turn.
    """
    return [
        {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": row[text_col]},
            ]
        }
        for _, row in df.iterrows()
    ]
def main():
    """Run the full pipeline: load, clean, convert, and write JSONL."""
    # Load and clean the raw CSV
    cleaned = load_and_clean("raw_data.csv")

    # Turn each row into a chat-format record
    records = convert_to_chat_format(cleaned)

    # Write one JSON object per line
    output_path = Path("processed_data.jsonl")
    with open(output_path, 'w') as f:
        f.writelines(json.dumps(record) + '\n' for record in records)
    print(f"Saved {len(records)} conversations")


if __name__ == "__main__":
    main()
复制
python prepare_data.py
aitraining llm --train --data-path ./processed_data.jsonl ...
DPO 数据准备
复制
# prepare_dpo_data.py
import json
from pathlib import Path
def create_dpo_pairs(preference_data):
    """Turn preference annotations into DPO-format records.

    Each input item must carry ``question``, ``preferred_answer`` and
    ``rejected_answer`` keys; the output uses the DPO field names
    ``prompt``/``chosen``/``rejected``.
    """
    return [
        {
            "prompt": entry["question"],
            "chosen": entry["preferred_answer"],
            "rejected": entry["rejected_answer"],
        }
        for entry in preference_data
    ]
def main():
    """Load preference annotations, convert them, and write dpo_data.jsonl."""
    # Load the preference annotations
    with open("preferences.json") as src:
        annotations = json.load(src)

    # Convert to the DPO pair format
    pairs = create_dpo_pairs(annotations)

    # Write one JSON object per line
    with open("dpo_data.jsonl", 'w') as dst:
        dst.writelines(json.dumps(pair) + '\n' for pair in pairs)
    print(f"Created {len(pairs)} DPO pairs")


if __name__ == "__main__":
    main()
评估脚本
自定义评估
复制
# evaluate_model.py
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
def load_model(model_path):
    """Load a fine-tuned model and its tokenizer from a checkpoint path.

    Returns a ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    mdl = AutoModelForCausalLM.from_pretrained(model_path)
    return mdl, tok
def evaluate_prompts(model, tokenizer, prompts, max_new_tokens=100):
    """Generate a response for each evaluation prompt.

    Args:
        model: Causal LM exposing a ``generate`` method.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        prompts: Iterable of prompt strings.
        max_new_tokens: Generation budget per prompt (default 100,
            matching the previous hard-coded value).

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts. Note the
        decoded response includes the prompt tokens, since the full output
        sequence is decoded.
    """
    results = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Decode the full sequence (prompt + continuation) to plain text
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({
            "prompt": prompt,
            "response": response,
        })
    return results
def main():
    """CLI entry point: load a model, run eval prompts, dump JSON results."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--prompts-file", default="eval_prompts.txt")
    args = parser.parse_args()

    # Load the trained model
    model, tokenizer = load_model(args.model_path)

    # Read prompts, one per line, skipping blank lines
    with open(args.prompts_file) as f:
        prompts = [text for text in (line.strip() for line in f) if text]

    # Generate a response for every prompt
    results = evaluate_prompts(model, tokenizer, prompts)

    # Persist the results
    with open("eval_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Evaluated {len(results)} prompts")


if __name__ == "__main__":
    main()
基准测试脚本
复制
# benchmark.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def benchmark_model(model_path, prompts, num_runs=5, max_new_tokens=50):
    """Benchmark model inference speed.

    Args:
        model_path: Checkpoint path or hub id to load.
        prompts: List of prompt strings run each pass.
        num_runs: Number of timed passes over all prompts.
        max_new_tokens: Generation budget per prompt (default 50,
            matching the previous hard-coded value).

    Returns:
        Dict with ``avg_time`` (seconds per pass) and
        ``tokens_per_second``.
    """
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Hoist the device check out of the timing loops
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    times = []
    for _ in range(num_runs):
        # perf_counter is monotonic and higher-resolution than time.time
        start = time.perf_counter()
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            if use_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}
            with torch.no_grad():
                model.generate(**inputs, max_new_tokens=max_new_tokens)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    # NOTE(review): assumes every prompt generates the full max_new_tokens;
    # actual throughput is lower if generation stops early at EOS — confirm.
    tokens_per_sec = len(prompts) * max_new_tokens / avg_time
    return {
        "avg_time": avg_time,
        "tokens_per_second": tokens_per_sec,
    }
后处理脚本
模型合并
复制
# merge_lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import argparse
def merge_and_save(base_model, adapter_path, output_path):
    """Merge a LoRA adapter into its base model and save the result.

    Args:
        base_model: Path or hub id of the base model.
        adapter_path: Path to the trained LoRA adapter.
        output_path: Directory where the merged model is written.
    """
    # Load the base model and its tokenizer
    base = AutoModelForCausalLM.from_pretrained(base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Attach the adapter, then fold its weights into the base model
    merged = PeftModel.from_pretrained(base, adapter_path)
    merged = merged.merge_and_unload()

    # Persist the merged weights alongside the tokenizer
    merged.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Merged model saved to {output_path}")
def main():
    """Parse CLI arguments and run the adapter merge."""
    parser = argparse.ArgumentParser()
    # All three paths are mandatory
    for flag in ("--base-model", "--adapter-path", "--output-path"):
        parser.add_argument(flag, required=True)
    args = parser.parse_args()
    merge_and_save(args.base_model, args.adapter_path, args.output_path)


if __name__ == "__main__":
    main()
复制
aitraining tools merge-llm-adapter \
--base-model-path google/gemma-3-270m \
--adapter-path ./my-lora-model \
--output-folder ./merged-model
内置工具还支持 --token、--pad-to-multiple-of 和 --push-to-hub 标志。您必须指定 --output-folder 或 --push-to-hub 两者之一。

集成示例
完整训练脚本
复制
#!/bin/bash
# train_with_scripts.sh
# End-to-end pipeline: prepare data, train with a LoRA adapter, merge the
# adapter into the base model, then evaluate the merged model.
# Aborts on the first failing step.
set -e
# 1. Prepare data
echo "Preparing data..."
python scripts/prepare_data.py
# 2. Train the model (PEFT/LoRA run; outputs adapter under ./my-model)
echo "Training..."
aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path processed/data.jsonl \
  --project-name my-model \
  --peft
# 3. Merge the trained adapter into the base model
echo "Merging adapter..."
python scripts/merge_lora.py \
  --base-model google/gemma-3-270m \
  --adapter-path my-model \
  --output-path my-model-merged
# 4. Evaluate the merged model
echo "Evaluating..."
python scripts/evaluate_model.py \
  --model-path my-model-merged
echo "Done!"