流水线自动化
为生产工作流构建自动化训练流水线。
简单流水线
顺序训练
复制
#!/bin/bash
# pipeline.sh
# Sequential training pipeline: prepare data, train, evaluate, upload to the Hub.
# Required environment:
#   HF_USERNAME - Hub account/namespace that receives the uploaded model.
set -euo pipefail # exit on errors, unset variables, and failed pipeline stages

# Fail fast: without this check an unset HF_USERNAME would only surface at the
# upload step, after training has finished, as the repo id "/production-model".
: "${HF_USERNAME:?HF_USERNAME must be set to the target Hub account}"

# Step 1: prepare the data
echo "Step 1: Data preparation..."
python prepare_data.py

# Step 2: train the model
echo "Step 2: Training..."
aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path ./processed_data \
  --project-name production-model

# Step 3: evaluate
echo "Step 3: Evaluation..."
python evaluate_model.py --model-path ./production-model

# Step 4: push to the Hub (using huggingface-cli)
echo "Step 4: Deploying..."
huggingface-cli upload "${HF_USERNAME}/production-model" ./production-model

echo "Pipeline complete!"
Python 流水线
使用 Python
复制
import subprocess
import json
from pathlib import Path
def run_training(config):
    """Run `aitraining --config <config>` and return its captured stdout.

    Args:
        config: Value passed after `--config` on the command line.

    Returns:
        The text the command wrote to stdout.

    Raises:
        RuntimeError: If the command exits with a non-zero status; the
            message carries the captured stderr.
    """
    completed = subprocess.run(
        ["aitraining", "--config", config],
        capture_output=True,
        text=True,
    )
    if completed.returncode == 0:
        return completed.stdout
    raise RuntimeError(f"Training failed: {completed.stderr}")
def get_training_metrics(model_path):
    """Read selected metrics from `<model_path>/trainer_state.json`.

    Args:
        model_path: Directory expected to contain `trainer_state.json`.

    Returns:
        A dict with the keys `best_metric`, `global_step` and `epoch`
        (each `None` when absent from the file), or `None` when the state
        file does not exist.
    """
    state_path = Path(model_path, "trainer_state.json")
    if not state_path.exists():
        return None

    trainer_state = json.loads(state_path.read_text())
    wanted_keys = ("best_metric", "global_step", "epoch")
    return {key: trainer_state.get(key) for key in wanted_keys}
def main():
    """Train with config.yaml, then apply a quality gate to the result.

    The gate passes only when `best_metric` from `./output` is present and
    not greater than 0.5. A missing state file or a missing/null
    `best_metric` counts as a gate failure instead of raising.

    Returns:
        True if the quality gate passed, False otherwise.

    Raises:
        RuntimeError: Propagated from `run_training` when training fails.
    """
    # Train
    print("Training model...")
    run_training("config.yaml")

    # Fetch metrics from the trainer state
    print("Checking results...")
    metrics = get_training_metrics("./output")

    # get_training_metrics returns None when trainer_state.json is missing,
    # and best_metric itself may be null in the file; treat both as "no metric".
    best_metric = metrics.get("best_metric") if metrics else None
    print(f"Best metric: {best_metric}")

    # Quality gate (best_metric is usually eval_loss, so lower is better)
    if best_metric is None or best_metric > 0.5:
        print("Quality gate failed!")
        return False

    print("Pipeline passed!")
    return True
if __name__ == "__main__":
    # Propagate the quality-gate result as the process exit status so that
    # callers (shell scripts, cron, CI) can detect a failed gate.
    raise SystemExit(0 if main() else 1)
多阶段流水线
训练 → 蒸馏 → 评估
复制
#!/bin/bash
# full_pipeline.sh
# Three stages run in order: train a teacher model, distill it into a
# student model, then run compare_models.py on both.
set -e # stop at the first failing stage

# Stage 1: train the teacher model
teacher_args=(
  --model google/gemma-2-2b
  --data-path ./data
  --project-name teacher-model
  --epochs 3
)
printf '%s\n' "=== Stage 1: Training teacher model ==="
aitraining llm --train "${teacher_args[@]}"

# Stage 2: distill into the student model
student_args=(
  --model google/gemma-3-270m
  --teacher-model ./teacher-model
  --data-path ./data
  --project-name student-model
  --use-distillation
  --distill-temperature 2.0
  --epochs 5
)
printf '%s\n' "=== Stage 2: Knowledge distillation ==="
aitraining llm --train "${student_args[@]}"

# Stage 3: evaluate both models
compare_args=(
  --teacher ./teacher-model
  --student ./student-model
)
printf '%s\n' "=== Stage 3: Evaluation ==="
python compare_models.py "${compare_args[@]}"

printf '%s\n' "Pipeline complete!"
条件流水线
带质量门
复制
#!/bin/bash
# quality_pipeline.sh
#######################################
# Train with the given config, then apply a quality gate to the result.
# Reads best_metric from output/trainer_state.json and compares it with the
# threshold. Fails closed: a failed training run, an unreadable state file,
# or a non-numeric metric/threshold all count as a gate failure.
# Arguments:
#   $1 - config file passed to `aitraining --config`
#   $2 - maximum acceptable best_metric
# Outputs:
#   Gate-failure message on stdout; other diagnostics on stderr.
# Returns:
#   0 if best_metric is not greater than the threshold, 1 otherwise.
#######################################
train_and_check() {
  local config=$1
  local threshold=$2
  local eval_loss
  # Plain or exponent-form decimal number (e.g. 0.5, 3, 1e-05)
  local -r number_re='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'

  if [[ ! "$threshold" =~ $number_re ]]; then
    echo "Quality gate failed: threshold '$threshold' is not a number" >&2
    return 1
  fi

  # Train; never evaluate a stale state file when training itself failed
  if ! aitraining --config "$config"; then
    echo "Quality gate failed: training exited with an error" >&2
    return 1
  fi

  # Read the best metric from trainer_state.json
  if ! eval_loss=$(python -c "
import json
with open('output/trainer_state.json') as f:
    state = json.load(f)
print(state.get('best_metric', 999))
"); then
    echo "Quality gate failed: could not read output/trainer_state.json" >&2
    return 1
  fi

  # A null best_metric prints as 'None'; reject anything that is not a number
  if [[ ! "$eval_loss" =~ $number_re ]]; then
    echo "Quality gate failed: best_metric '$eval_loss' is not a number" >&2
    return 1
  fi

  # Check the threshold; awk compares as floats and accepts exponent notation
  if awk -v loss="$eval_loss" -v limit="$threshold" \
    'BEGIN { exit !((loss + 0) > (limit + 0)) }'; then
    echo "Quality gate failed: $eval_loss > $threshold"
    return 1
  fi
  return 0
}
# Run training behind the quality gate; stop the pipeline when the gate fails
if ! train_and_check "config.yaml" 0.5; then
  echo "Pipeline stopped due to quality gate failure"
  exit 1
fi

echo "Proceeding to deployment..."
# Deployment steps go here
计划训练
Cron 作业
复制
# Add this entry to your crontab (crontab -e)
# Runs every day at 02:00; stdout and stderr are both appended to /var/log/training.log
0 2 * * * /path/to/training_pipeline.sh >> /var/log/training.log 2>&1
带数据更新
复制
#!/bin/bash
# scheduled_training.sh
# Retrain only when files newer than the last successful run exist in ./incoming.
set -euo pipefail # a failed merge/training step aborts before the marker moves

readonly INCOMING_DIR=./incoming
readonly MARKER=./last_training_marker

# Timestamp the start of this run BEFORE scanning. The marker is later set to
# this time, so files that arrive while training is running are still newer
# than the marker and get picked up by the next run.
run_stamp=$(mktemp)
trap 'rm -f -- "$run_stamp"' EXIT

# Check for new data. On the very first run there is no marker yet
# (find -newer would error out), so every file counts as new.
if [[ -e "$MARKER" ]]; then
  NEW_DATA=$(find "$INCOMING_DIR" -type f -newer "$MARKER" | wc -l)
else
  NEW_DATA=$(find "$INCOMING_DIR" -type f | wc -l)
fi

if [ "$NEW_DATA" -gt 0 ]; then
  echo "Found $NEW_DATA new files, starting training..."

  # Merge the new data
  python merge_data.py

  # Train
  aitraining --config production.yaml

  # Update the marker; only reached when merge and training both succeeded
  touch -r "$run_stamp" "$MARKER"
else
  echo "No new data, skipping training"
fi
CI/CD 集成
GitHub Actions
复制
# .github/workflows/train.yml
# Retrains the model whenever data or training configs change on main.
name: Train Model

on:
  push:
    branches: [main]
    # Only pushes that touch these paths trigger the workflow
    paths:
      - 'data/**'
      - 'configs/**'

jobs:
  train:
    runs-on: ubuntu-latest
    steps:
      # v3/v4 of these actions run on the deprecated Node 16 runtime;
      # use the current major versions.
      - uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install aitraining

      - name: Train model
        env:
          # Credentials come from repository secrets, never from the repo itself
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: aitraining --config configs/production.yaml