Custom Scripts
Extend AITraining with custom preprocessing and postprocessing scripts.

Requirements

Custom scripts can use libraries bundled with AITraining:

- Python >= 3.8
- pandas - Data manipulation
- torch - PyTorch for model operations
- transformers - Hugging Face models
- peft - LoRA/adapter operations
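To confirm these dependencies are available in your environment, here is a minimal sanity-check sketch (the filename is illustrative):

# check_env.py - verify the bundled libraries import correctly
import sys
assert sys.version_info >= (3, 8), "Python >= 3.8 required"

import pandas
import torch
import transformers
import peft

for lib in (pandas, torch, transformers, peft):
    print(lib.__name__, lib.__version__)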
Data Preprocessing
Custom Data Pipeline
# prepare_data.py
import json
import pandas as pd
from pathlib import Path


def load_and_clean(input_path):
    """Load and clean raw data."""
    df = pd.read_csv(input_path)
    # Remove empty rows
    df = df.dropna(subset=['text'])
    # Clean text
    df['text'] = df['text'].str.strip()
    df = df[df['text'].str.len() > 10]
    return df


def convert_to_chat_format(df, text_col='text'):
    """Convert to chat format for SFT."""
    conversations = []
    for _, row in df.iterrows():
        conversations.append({
            "messages": [
                {"role": "user", "content": "Continue this text:"},
                {"role": "assistant", "content": row[text_col]}
            ]
        })
    return conversations


def main():
    # Load data
    df = load_and_clean("raw_data.csv")
    # Convert to chat format
    conversations = convert_to_chat_format(df)
    # Save
    output_path = Path("processed_data.jsonl")
    with open(output_path, 'w') as f:
        for conv in conversations:
            f.write(json.dumps(conv) + '\n')
    print(f"Saved {len(conversations)} conversations")


if __name__ == "__main__":
    main()
Run the script, then train on the processed file:
python prepare_data.py
aitraining llm --train --data-path ./processed_data.jsonl ...
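Each line of processed_data.jsonl holds one conversation in the chat format the script produces; for example (the assistant content comes from your CSV):

{"messages": [{"role": "user", "content": "Continue this text:"}, {"role": "assistant", "content": "Once upon a time, a model learned to write."}]}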
DPO Data Preparation
# prepare_dpo_data.py
import json


def create_dpo_pairs(preference_data):
    """Convert preference data to DPO format."""
    dpo_data = []
    for item in preference_data:
        dpo_data.append({
            "prompt": item["question"],
            "chosen": item["preferred_answer"],
            "rejected": item["rejected_answer"]
        })
    return dpo_data


def main():
    # Load preference annotations
    with open("preferences.json") as f:
        data = json.load(f)
    # Convert to DPO format
    dpo_data = create_dpo_pairs(data)
    # Save
    with open("dpo_data.jsonl", 'w') as f:
        for item in dpo_data:
            f.write(json.dumps(item) + '\n')
    print(f"Created {len(dpo_data)} DPO pairs")


if __name__ == "__main__":
    main()
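The script assumes preferences.json is a JSON array of objects with question, preferred_answer, and rejected_answer keys. A single (illustrative) input item:

{"question": "What is LoRA?", "preferred_answer": "LoRA fine-tunes a small set of adapter weights instead of the full model.", "rejected_answer": "I don't know."}

becomes this line in dpo_data.jsonl:

{"prompt": "What is LoRA?", "chosen": "LoRA fine-tunes a small set of adapter weights instead of the full model.", "rejected": "I don't know."}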
Evaluation Scripts
Custom Evaluation
# evaluate_model.py
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_model(model_path):
    """Load trained model."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return model, tokenizer


def evaluate_prompts(model, tokenizer, prompts):
    """Generate responses for evaluation prompts."""
    results = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=100)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({
            "prompt": prompt,
            "response": response
        })
    return results


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--prompts-file", default="eval_prompts.txt")
    args = parser.parse_args()

    # Load model
    model, tokenizer = load_model(args.model_path)

    # Load prompts (one per line, skipping blanks)
    with open(args.prompts_file) as f:
        prompts = [line.strip() for line in f if line.strip()]

    # Evaluate
    results = evaluate_prompts(model, tokenizer, prompts)

    # Save results
    with open("eval_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Evaluated {len(results)} prompts")


if __name__ == "__main__":
    main()
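Run it against a trained model (paths here are examples):

python evaluate_model.py --model-path ./merged-model --prompts-file eval_prompts.txt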
Benchmark Script
# benchmark.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def benchmark_model(model_path, prompts, num_runs=5):
    """Benchmark model inference speed."""
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if torch.cuda.is_available():
        model = model.cuda()

    times = []
    for _ in range(num_runs):
        start = time.time()
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=50)
        times.append(time.time() - start)

    avg_time = sum(times) / len(times)
    # Rough throughput estimate: assumes each prompt generates the full
    # 50 new tokens (generation may stop earlier at an EOS token)
    tokens_per_sec = len(prompts) * 50 / avg_time
    return {
        "avg_time": avg_time,
        "tokens_per_second": tokens_per_sec
    }
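benchmark.py only defines the function, so invoke it yourself; a minimal sketch (the model path and prompts are placeholders):

if __name__ == "__main__":
    stats = benchmark_model(
        "./merged-model",
        ["Explain LoRA in one sentence.", "Summarize attention in two sentences."],
    )
    print(f"avg time: {stats['avg_time']:.2f}s, ~{stats['tokens_per_second']:.0f} tokens/sec")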
Post-Processing Scripts
Model Merging
# merge_lora.py
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel


def merge_and_save(base_model, adapter_path, output_path):
    """Merge LoRA adapter into base model."""
    # Load base model
    model = AutoModelForCausalLM.from_pretrained(base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Load and merge adapter
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()

    # Save merged model
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Merged model saved to {output_path}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", required=True)
    parser.add_argument("--adapter-path", required=True)
    parser.add_argument("--output-path", required=True)
    args = parser.parse_args()
    merge_and_save(args.base_model, args.adapter_path, args.output_path)


if __name__ == "__main__":
    main()
Alternatively, use AITraining's built-in merge tool:
aitraining tools merge-llm-adapter \
--base-model-path google/gemma-3-270m \
--adapter-path ./my-lora-model \
--output-folder ./merged-model
The built-in tool also supports --token, --pad-to-multiple-of, and --push-to-hub flags. You must specify either --output-folder or --push-to-hub.

Integration Example
Full Training Script
#!/bin/bash
# train_with_scripts.sh
set -e
# 1. Prepare data
echo "Preparing data..."
python scripts/prepare_data.py
# 2. Train model
echo "Training..."
aitraining llm --train \
--model google/gemma-3-270m \
  --data-path processed_data.jsonl \
--project-name my-model \
--peft
# 3. Merge adapter
echo "Merging adapter..."
python scripts/merge_lora.py \
--base-model google/gemma-3-270m \
--adapter-path my-model \
--output-path my-model-merged
# 4. Evaluate
echo "Evaluating..."
python scripts/evaluate_model.py \
--model-path my-model-merged
echo "Done!"