Skip to main content

Custom Scripts

Extend AITraining with custom preprocessing and postprocessing scripts.

Requirements

Custom scripts can use libraries bundled with AITraining:
  • Python >= 3.8
  • pandas - Data manipulation
  • torch - PyTorch for model operations
  • transformers - Hugging Face models
  • peft - LoRA/adapter operations
These are installed automatically with AITraining.

Data Preprocessing

Custom Data Pipeline

# prepare_data.py
import json
import pandas as pd
from pathlib import Path

def load_and_clean(input_path, text_col='text', min_length=10):
    """Load raw CSV data and drop unusable rows.

    Args:
        input_path: Path (or file-like object) readable by ``pd.read_csv``.
        text_col: Name of the column holding the text samples.
        min_length: Rows whose stripped text is not longer than this are
            discarded (default matches the original hard-coded threshold).

    Returns:
        A DataFrame with empty/NaN rows removed and text whitespace-stripped.
    """
    df = pd.read_csv(input_path)

    # Remove rows where the text field is missing entirely
    df = df.dropna(subset=[text_col])

    # Strip surrounding whitespace, then drop near-empty samples
    df[text_col] = df[text_col].str.strip()
    df = df[df[text_col].str.len() > min_length]

    return df

def convert_to_chat_format(df, text_col='text', prompt="Continue this text:"):
    """Convert a DataFrame of raw text into chat-format SFT conversations.

    Args:
        df: DataFrame holding the cleaned text samples.
        text_col: Column containing the assistant-side text.
        prompt: User-turn instruction paired with every sample (defaults to
            the original hard-coded continuation prompt).

    Returns:
        List of ``{"messages": [...]}`` dicts, one per DataFrame row, with a
        fixed user turn followed by the row's text as the assistant turn.
    """
    # Iterate the column directly instead of iterrows(): avoids building a
    # Series per row and keeps the loop a simple comprehension.
    return [
        {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": text},
            ]
        }
        for text in df[text_col]
    ]

def main():
    """Run the full preprocessing pipeline: clean, convert, save as JSONL."""
    cleaned = load_and_clean("raw_data.csv")
    conversations = convert_to_chat_format(cleaned)

    # Emit one JSON object per line (JSONL), as expected by the trainer.
    output_path = Path("processed_data.jsonl")
    with output_path.open('w') as f:
        f.writelines(json.dumps(conv) + '\n' for conv in conversations)

    print(f"Saved {len(conversations)} conversations")

if __name__ == "__main__":
    main()
Usage:
python prepare_data.py
aitraining llm --train --data-path ./processed_data.jsonl ...

DPO Data Preparation

# prepare_dpo_data.py
import json
from pathlib import Path

def create_dpo_pairs(preference_data):
    """Convert preference annotations to the DPO triple format.

    Args:
        preference_data: Iterable of dicts, each with ``question``,
            ``preferred_answer`` and ``rejected_answer`` keys.

    Returns:
        List of ``{"prompt", "chosen", "rejected"}`` dicts.

    Raises:
        KeyError: If an item is missing one of the expected keys.
    """
    # Pure element-wise mapping — a comprehension is the idiomatic form.
    return [
        {
            "prompt": item["question"],
            "chosen": item["preferred_answer"],
            "rejected": item["rejected_answer"],
        }
        for item in preference_data
    ]

def main():
    """Load preference annotations and write them out as DPO-format JSONL."""
    annotations = json.loads(Path("preferences.json").read_text())

    pairs = create_dpo_pairs(annotations)

    # One JSON object per line, as the DPO trainer expects.
    with open("dpo_data.jsonl", 'w') as f:
        f.writelines(json.dumps(pair) + '\n' for pair in pairs)

    print(f"Created {len(pairs)} DPO pairs")

if __name__ == "__main__":
    main()

Evaluation Scripts

Custom Evaluation

# evaluate_model.py
import argparse
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_path):
    """Load a fine-tuned causal LM and its tokenizer from *model_path*.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

def evaluate_prompts(model, tokenizer, prompts):
    """Generate one model response per evaluation prompt.

    Args:
        model: A causal LM exposing ``generate``.
        tokenizer: Tokenizer used both to encode prompts and decode outputs.
        prompts: Iterable of prompt strings.

    Returns:
        List of ``{"prompt", "response"}`` dicts, in input order.
    """
    def _generate(prompt):
        # Encode, generate (capped at 100 new tokens), and decode one prompt.
        encoded = tokenizer(prompt, return_tensors="pt")
        generated = model.generate(**encoded, max_new_tokens=100)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [{"prompt": p, "response": _generate(p)} for p in prompts]

def main():
    """CLI entry point: load a model, run eval prompts, save JSON results."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True)
    parser.add_argument("--prompts-file", default="eval_prompts.txt")
    args = parser.parse_args()

    model, tokenizer = load_model(args.model_path)

    # One prompt per line; blank lines are skipped.
    with open(args.prompts_file) as f:
        raw_lines = f.read().splitlines()
    prompts = [line.strip() for line in raw_lines if line.strip()]

    results = evaluate_prompts(model, tokenizer, prompts)

    with open("eval_results.json", 'w') as f:
        json.dump(results, f, indent=2)

    print(f"Evaluated {len(results)} prompts")

if __name__ == "__main__":
    main()

Benchmark Script

# benchmark.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def benchmark_model(model_path, prompts, num_runs=5, max_new_tokens=50):
    """Benchmark model inference speed over a set of prompts.

    Args:
        model_path: Hugging Face model name or local checkpoint directory.
        prompts: Iterable of prompt strings to generate from.
        num_runs: Number of timed passes over all prompts.
        max_new_tokens: Generation cap per prompt (previously hard-coded 50).

    Returns:
        Dict with ``avg_time`` (seconds per pass) and ``tokens_per_second``
        computed from the tokens actually generated, not the cap.
    """
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()  # disable dropout etc. for stable, representative timings

    use_cuda = torch.cuda.is_available()  # hoisted: checked once, not per item
    if use_cuda:
        model = model.cuda()

    times = []
    tokens_generated = 0

    for _ in range(num_runs):
        if use_cuda:
            torch.cuda.synchronize()  # don't time queued work from earlier runs
        start = time.perf_counter()  # monotonic, high-resolution clock

        run_tokens = 0
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            if use_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
            # Count tokens actually produced: generation may stop at EOS
            # before reaching max_new_tokens, so the old `len(prompts) * 50`
            # estimate overstated throughput.
            run_tokens += outputs.shape[1] - inputs["input_ids"].shape[1]

        if use_cuda:
            torch.cuda.synchronize()  # wait for generation to finish on GPU
        times.append(time.perf_counter() - start)
        tokens_generated = run_tokens  # same prompts each run → same count

    avg_time = sum(times) / len(times)
    tokens_per_sec = tokens_generated / avg_time

    return {
        "avg_time": avg_time,
        "tokens_per_second": tokens_per_sec
    }

Post-Processing Scripts

Model Merging

# merge_lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import argparse

def merge_and_save(base_model, adapter_path, output_path):
    """Fold a LoRA adapter into its base model and save the merged result.

    Args:
        base_model: Name or path of the original base model.
        adapter_path: Directory containing the trained LoRA adapter.
        output_path: Where the standalone merged model is written.
    """
    base = AutoModelForCausalLM.from_pretrained(base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Attach the adapter, then bake its weights into the base parameters so
    # the result no longer depends on peft at load time.
    merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()

    merged.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print(f"Merged model saved to {output_path}")

def main():
    """Parse CLI arguments and run the adapter merge."""
    parser = argparse.ArgumentParser()
    # All three paths are mandatory for a merge.
    for flag in ("--base-model", "--adapter-path", "--output-path"):
        parser.add_argument(flag, required=True)
    args = parser.parse_args()

    merge_and_save(args.base_model, args.adapter_path, args.output_path)

if __name__ == "__main__":
    main()
Or use the built-in tool:
aitraining tools merge-llm-adapter \
  --base-model-path google/gemma-3-270m \
  --adapter-path ./my-lora-model \
  --output-folder ./merged-model
The built-in tool also supports --token, --pad-to-multiple-of, and --push-to-hub flags. You must specify either --output-folder or --push-to-hub.

Integration Example

Full Training Script

#!/bin/bash
# train_with_scripts.sh — end-to-end pipeline: preprocess, train, merge, evaluate.

set -e  # abort immediately if any step fails

# Single source of truth for the model/project names used by every step.
BASE_MODEL="google/gemma-3-270m"
PROJECT="my-model"

# 1. Prepare data
echo "Preparing data..."
python scripts/prepare_data.py

# 2. Train model (LoRA via --peft)
echo "Training..."
aitraining llm --train \
  --model "$BASE_MODEL" \
  --data-path processed/data.jsonl \
  --project-name "$PROJECT" \
  --peft

# 3. Merge adapter into a standalone model
echo "Merging adapter..."
python scripts/merge_lora.py \
  --base-model "$BASE_MODEL" \
  --adapter-path "$PROJECT" \
  --output-path "${PROJECT}-merged"

# 4. Evaluate the merged model
echo "Evaluating..."
python scripts/evaluate_model.py \
  --model-path "${PROJECT}-merged"

echo "Done!"

Next Steps