Benchmarking

Evaluate and compare model performance.

Quick Evaluation

Using Enhanced Eval

aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path ./data \
  --project-name my-model \
  --use-enhanced-eval \
  --eval-metrics "perplexity,accuracy"

Eval Only (No Training)

python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained('./my-model')
tokenizer = AutoTokenizer.from_pretrained('./my-model')

# Compute perplexity on test data (see the sketch under Custom Evaluation below)
# ...
"

Metrics

Available Metrics

| Metric | Description | Use Case |
|------------|---------------------------|----------------|
| perplexity | Language modeling quality | LLMs |
| accuracy | Classification accuracy | Classification |
| f1 | F1 score | Classification |
| bleu | Translation quality | Seq2Seq |
| rouge | Summarization quality | Seq2Seq |
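For the classification metrics, the standalone Hugging Face evaluate library (installed separately with pip install evaluate) provides reference implementations you can run against any set of predictions. A minimal sketch with placeholder predictions and labels:

# metrics_sketch.py -- the predictions and labels below are placeholders
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

predictions = [0, 1, 1, 0]  # placeholder model outputs
references = [0, 1, 0, 0]   # placeholder ground-truth labels

print(accuracy.compute(predictions=predictions, references=references))
print(f1.compute(predictions=predictions, references=references))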

Custom Evaluation

Enhanced evaluation runs on your validation split during training:
aitraining llm --train \
  --model google/gemma-3-270m \
  --data-path ./data \
  --valid-split validation \
  --project-name my-model \
  --use-enhanced-eval \
  --eval-metrics "perplexity,accuracy"
Enhanced evaluation uses the validation data specified by --valid-split. To evaluate on a separate test set after training, use the LM Evaluation Harness or custom scripts shown below.
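For example, a minimal perplexity script over a handful of held-out texts looks like the sketch below. The test texts are placeholders; substitute your own test samples or read them from a file. Perplexity is the exponential of the average cross-entropy loss, which the model returns when the input IDs are passed as labels.

# eval_perplexity.py (sketch; test_texts are placeholders)
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./my-model")
tokenizer = AutoTokenizer.from_pretrained("./my-model")
model.eval()

test_texts = [
    "Example held-out sentence one.",
    "Example held-out sentence two.",
]

losses = []
with torch.no_grad():
    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt")
        # Passing labels=input_ids makes the model return the mean cross-entropy loss
        outputs = model(**inputs, labels=inputs["input_ids"])
        losses.append(outputs.loss.item())

avg_loss = sum(losses) / len(losses)
print(f"Average loss: {avg_loss:.4f}")
print(f"Perplexity: {math.exp(avg_loss):.2f}")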

Inference Speed

Throughput Test

# benchmark_speed.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def benchmark(model_path, num_samples=100, max_tokens=50):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()

    if torch.cuda.is_available():
        model = model.cuda()

    prompt = "The quick brown fox"
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        # Warmup: keep one-time CUDA/kernel initialization out of the timing
        for _ in range(5):
            model.generate(**inputs, max_new_tokens=max_tokens)

        # Benchmark: synchronize so the timer measures completed GPU work
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        for _ in range(num_samples):
            model.generate(**inputs, max_new_tokens=max_tokens)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        elapsed = time.time() - start

    # Assumes each run generates the full max_tokens (generation may stop early at EOS)
    tokens_per_second = (num_samples * max_tokens) / elapsed
    print(f"Throughput: {tokens_per_second:.2f} tokens/sec")
    print(f"Latency: {elapsed / num_samples * 1000:.2f} ms/sample")

benchmark("./my-model")

Memory Usage

# benchmark_memory.py
import torch
from transformers import AutoModelForCausalLM

def measure_memory(model_path):
    assert torch.cuda.is_available(), "CUDA is required for this measurement"
    torch.cuda.reset_peak_memory_stats()

    model = AutoModelForCausalLM.from_pretrained(model_path)
    model = model.cuda()

    # Peak allocation at this point reflects the model weights only;
    # activation and KV-cache memory are not included (see the sketch below).
    peak_memory = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Peak memory: {peak_memory:.2f} GB")

measure_memory("./my-model")

Model Comparison

Compare Multiple Models

# compare_models.py
import json
from pathlib import Path

def compare_models(model_paths):
    results = []

    for path in model_paths:
        state_file = Path(path) / "trainer_state.json"
        if state_file.exists():
            with open(state_file) as f:
                state = json.load(f)
            results.append({
                "model": path,
                "best_metric": state.get("best_metric"),
                "epoch": state.get("epoch"),
            })

    # Sort by best_metric (typically eval_loss, so lower is better)
    results.sort(key=lambda x: x["best_metric"] if x["best_metric"] is not None else float("inf"))

    print("Model Comparison:")
    print("-" * 50)
    for r in results:
        metric = r.get('best_metric')
        metric_str = f"{metric:.4f}" if metric is not None else "N/A"
        print(f"{r['model']}: best_metric={metric_str}")

compare_models([
    "./model-v1",
    "./model-v2",
    "./model-v3"
])

W&B Comparison

When logging to W&B, compare runs in the dashboard:
# Train multiple variants
aitraining llm --train --model modelA --project-name exp-a --log wandb
aitraining llm --train --model modelB --project-name exp-b --log wandb

# Compare in W&B dashboard

Standard Benchmarks

LM Evaluation Harness

For standard benchmarks like HellaSwag, ARC, and MMLU, use the LM Evaluation Harness after training:
pip install lm-eval

lm_eval --model hf \
  --model_args pretrained=./my-model \
  --tasks hellaswag,arc_easy,arc_challenge \
  --batch_size 8
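
To get closer to common leaderboard settings, you can request few-shot prompting and write results to disk; MMLU, for example, is typically reported 5-shot. The flags below are standard lm-eval options, but check lm_eval --help for your installed version:

lm_eval --model hf \
  --model_args pretrained=./my-model \
  --tasks mmlu \
  --num_fewshot 5 \
  --batch_size 8 \
  --output_path ./eval-results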

Common Benchmark Tasks

| Task | Description |
|---------------|----------------------------------|
| hellaswag | Commonsense reasoning |
| arc_easy | Science questions (easy) |
| arc_challenge | Science questions (hard) |
| mmlu | Multitask language understanding |
| winogrande | Commonsense reasoning |
| truthfulqa | Truthfulness evaluation |

Reporting

Generate Report

# generate_report.py
import json
from datetime import datetime

def generate_report(model_path, metrics, benchmark_results):
    report = {
        "model": model_path,
        "date": datetime.now().isoformat(),
        "metrics": metrics,
        "benchmarks": benchmark_results,
    }

    with open("benchmark_report.json", 'w') as f:
        json.dump(report, f, indent=2)

    # Print summary
    print(f"\nBenchmark Report - {model_path}")
    print("=" * 50)
    print(f"Eval Loss: {metrics.get('eval_loss', 'N/A')}")
    print(f"Perplexity: {metrics.get('perplexity', 'N/A')}")
    for name, score in benchmark_results.items():
        print(f"{name}: {score}")

Next Steps