Benchmarking
Evaluate and compare model performance.
Quick Evaluation
Using Enhanced Eval
aitraining llm --train \
--model google/gemma-3-270m \
--data-path ./data \
--project-name my-model \
--use-enhanced-eval \
--eval-metrics "perplexity,accuracy"
Eval Only (No Training)
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model = AutoModelForCausalLM.from_pretrained('./my-model')
tokenizer = AutoTokenizer.from_pretrained('./my-model')
# Compute perplexity on test data
# ...
"
Metrics
Available Metrics
| Metric | Description | Use Case |
|---|---|---|
| perplexity | Language modeling quality | LLMs |
| accuracy | Classification accuracy | Classification |
| f1 | F1 score | Classification |
| bleu | Translation quality | Seq2Seq |
| rouge | Summarization quality | Seq2Seq |
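These metrics can also be computed outside aitraining with the Hugging Face evaluate library (a separate package, installed with pip install evaluate); the predictions and references below are placeholder values.

# compute_metrics.py -- sketch using the Hugging Face `evaluate` library;
# some metrics (rouge, bleu) pull in extra dependencies when first loaded
import evaluate

accuracy = evaluate.load("accuracy")
print(accuracy.compute(predictions=[1, 0, 1, 1], references=[1, 1, 1, 0]))

f1 = evaluate.load("f1")
print(f1.compute(predictions=[1, 0, 1, 1], references=[1, 1, 1, 0]))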
Custom Evaluation
Enhanced evaluation runs on your validation split during training:
aitraining llm --train \
--model google/gemma-3-270m \
--data-path ./data \
--valid-split validation \
--project-name my-model \
--use-enhanced-eval \
--eval-metrics "perplexity,accuracy"
Enhanced evaluation uses the validation data specified by --valid-split. To evaluate on a separate test set after training, use the LM Evaluation Harness or custom scripts shown below.
Inference Speed
Throughput Test
# benchmark_speed.py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def benchmark(model_path, num_samples=100, max_tokens=50):
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()

    prompt = "The quick brown fox"
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        # Warmup
        for _ in range(5):
            model.generate(**inputs, max_new_tokens=max_tokens)

        # Benchmark (synchronize so GPU work is fully counted)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        for _ in range(num_samples):
            model.generate(**inputs, max_new_tokens=max_tokens)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        elapsed = time.time() - start

    tokens_per_second = (num_samples * max_tokens) / elapsed
    print(f"Throughput: {tokens_per_second:.2f} tokens/sec")
    print(f"Latency: {elapsed / num_samples * 1000:.2f} ms/sample")

benchmark("./my-model")
Memory Usage
# benchmark_memory.py
import torch
from transformers import AutoModelForCausalLM
def measure_memory(model_path):
    # Requires a CUDA device; measures the memory needed just to load the weights
    torch.cuda.reset_peak_memory_stats()
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model = model.cuda()
    peak_memory = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Peak memory: {peak_memory:.2f} GB")

measure_memory("./my-model")
Model Comparison
Compare Multiple Models
# compare_models.py
import json
from pathlib import Path
def compare_models(model_paths):
    results = []
    for path in model_paths:
        state_file = Path(path) / "trainer_state.json"
        if state_file.exists():
            with open(state_file) as f:
                state = json.load(f)
            results.append({
                "model": path,
                "best_metric": state.get("best_metric"),
                "epoch": state.get("epoch"),
            })

    # Sort by best_metric (typically eval_loss, so lower is better)
    results.sort(key=lambda x: x["best_metric"] if x["best_metric"] is not None else float("inf"))

    print("Model Comparison:")
    print("-" * 50)
    for r in results:
        metric = r["best_metric"]
        metric_str = f"{metric:.4f}" if metric is not None else "N/A"
        print(f"{r['model']}: best_metric={metric_str}")

compare_models([
    "./model-v1",
    "./model-v2",
    "./model-v3",
])
W&B Comparison
When logging to W&B, compare runs in the dashboard:
# Train multiple variants
aitraining llm --train --model modelA --project-name exp-a --log wandb
aitraining llm --train --model modelB --project-name exp-b --log wandb
# Compare in W&B dashboard
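Runs can also be compared programmatically through the W&B public API; the entity/project names and the metric key below are placeholders that depend on your account and on what the trainer logged.

# compare_wandb.py -- sketch using the W&B public API;
# "my-entity/exp-a" and the "eval/loss" key are placeholders
import wandb

api = wandb.Api()
for project in ["my-entity/exp-a", "my-entity/exp-b"]:
    for run in api.runs(project):
        # run.summary holds the last logged value of each metric;
        # the exact key depends on what the trainer logged
        print(f"{project} / {run.name}: eval_loss={run.summary.get('eval/loss', 'N/A')}")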
Standard Benchmarks
LM Evaluation Harness
For standard benchmarks like HellaSwag, ARC, and MMLU, use the LM Evaluation Harness after training:
pip install lm-eval
lm_eval --model hf \
--model_args pretrained=./my-model \
--tasks hellaswag,arc_easy,arc_challenge \
--batch_size 8
Common Benchmark Tasks
| Task | Description |
|---|---|
| hellaswag | Commonsense reasoning |
| arc_easy | Science questions (easy) |
| arc_challenge | Science questions (hard) |
| mmlu | Multitask language understanding |
| winogrande | Commonsense reasoning |
| truthfulqa | Truthfulness evaluation |
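lm_eval can also write its scores to a JSON file via --output_path; a sketch for pulling the per-task numbers back out (the file location and metric key names vary between harness versions, so treat this as an outline):

# parse_lm_eval.py -- sketch; point `path` at the JSON file lm_eval wrote
import json

path = "results.json"  # adjust to your --output_path location
with open(path) as f:
    data = json.load(f)

# the output has a top-level "results" dict keyed by task name
for task, metrics in data["results"].items():
    for name, value in metrics.items():
        if isinstance(value, float):
            print(f"{task:15s} {name:20s} {value:.4f}")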
Reporting
Generate Report
# generate_report.py
import json
from datetime import datetime
def generate_report(model_path, metrics, benchmark_results):
    report = {
        "model": model_path,
        "date": datetime.now().isoformat(),
        "metrics": metrics,
        "benchmarks": benchmark_results,
    }
    with open("benchmark_report.json", "w") as f:
        json.dump(report, f, indent=2)

    # Print summary
    print(f"\nBenchmark Report - {model_path}")
    print("=" * 50)
    print(f"Eval Loss: {metrics.get('eval_loss', 'N/A')}")
    print(f"Perplexity: {metrics.get('perplexity', 'N/A')}")
    for name, score in benchmark_results.items():
        print(f"{name}: {score}")
Next Steps