Error Handling
Handle errors gracefully in your training scripts.
Common Exceptions
ValueError
Configuration errors:
from autotrain.trainers.clm.params import LLMTrainingParams
# Build the params inside the try block so the ValueError that this page
# associates with configuration errors can be reported.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./data.jsonl",
        project_name="my-model",
        trainer="invalid_trainer",  # Invalid value
    )
except ValueError as e:
    print(f"Configuration error: {e}")
RuntimeError
Training failures:
from autotrain.project import AutoTrainProject
# Create and launch the project; a RuntimeError is reported as a training
# failure.
try:
    project = AutoTrainProject(
        params=params,
        backend="local",
        process=True,
    )
    job_id = project.create()
except RuntimeError as e:
    print(f"Training failed: {e}")
FileNotFoundError
Missing files:
# Point data_path at a file that is absent and report the resulting
# FileNotFoundError.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./nonexistent.jsonl",  # Doesn't exist
        project_name="my-model",
    )
except FileNotFoundError as e:
    print(f"File not found: {e}")
Handling GPU Errors
Out of Memory
Copy
import torch
# Catch CUDA out-of-memory errors and print the suggested mitigations.
try:
    # Your training code
    project.create()
except torch.cuda.OutOfMemoryError:
    print("GPU out of memory. Try:")
    print("- Reducing batch_size")
    print("- Enabling gradient_accumulation")
    print("- Using quantization")
Prevention
Copy
# Configuration that applies the memory-saving options up front instead of
# waiting for an out-of-memory error.
params = LLMTrainingParams(
    model="google/gemma-3-270m",
    data_path="./data.jsonl",
    project_name="my-model",
    # Memory-saving options
    batch_size=1,
    gradient_accumulation=16,
    auto_find_batch_size=True,
    quantization="int4",
)
Validation
Check Data Before Training
Copy
import json
from pathlib import Path
def validate_data(data_path):
    """Validate that a JSON Lines training file is well-formed.

    Every line must parse as a JSON object that contains a ``"text"`` or a
    ``"messages"`` key.

    Args:
        data_path: Path to the data file to check.

    Returns:
        ``True`` when every line passes validation.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If any line fails validation. The message lists every
            offending line on its own row, using zero-based line indices.
    """
    path = Path(data_path)
    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    errors = []
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: Invalid JSON")
                continue
            # Valid JSON that is not an object (a bare number, string, list,
            # ...) cannot carry the required keys, so report it rather than
            # letting the membership test raise TypeError or pass by accident.
            has_content = isinstance(data, dict) and (
                "text" in data or "messages" in data
            )
            if not has_content:
                errors.append(f"Line {i}: Missing 'text' or 'messages'")

    if errors:
        # Real newlines so each problem is shown on its own row.
        raise ValueError("Data validation failed:\n" + "\n".join(errors))
    return True
# Use before training: raises FileNotFoundError if the file is missing and
# ValueError if any line fails validation.
validate_data("./data.jsonl")
Check GPU Availability
Copy
import torch
def check_gpu():
    """Report whether CUDA GPUs are available and list them.

    Prints a warning when no GPU is found. Otherwise prints one line per
    device with its index, name, and total memory in GB (bytes / 1e9).

    Returns:
        ``True`` if CUDA is available, ``False`` otherwise.
    """
    if not torch.cuda.is_available():
        print("Warning: No GPU available, training will be slow")
        return False

    gpu_count = torch.cuda.device_count()
    for i in range(gpu_count):
        name = torch.cuda.get_device_name(i)
        # Scale the reported total memory down to decimal gigabytes.
        memory = torch.cuda.get_device_properties(i).total_memory / 1e9
        print(f"GPU {i}: {name} ({memory:.1f} GB)")
    return True
# Print the per-GPU summary, or the no-GPU warning.
check_gpu()
Retry Logic
Copy
import time
def train_with_retry(params, max_retries=3, base_delay=60):
    """Train with automatic retry on failure.

    Args:
        params: Training parameters passed to ``AutoTrainProject``.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Seconds to wait after the first failed attempt. The wait
            doubles after each further failure.

    Returns:
        The value returned by ``project.create()`` on the first attempt that
        succeeds.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        RuntimeError: Re-raised from the last attempt when every attempt
            fails.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function
        # returns None without having trained anything.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            project = AutoTrainProject(
                params=params,
                backend="local",
                process=True,
            )
            return project.create()
        except RuntimeError as e:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt * base_delay  # Exponential backoff
                print(f"Attempt {attempt + 1} failed: {e}")
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                # Out of attempts: let the caller see the final failure.
                raise
# Use with retry; if every attempt fails, the last RuntimeError propagates.
job_id = train_with_retry(params)
Logging
Enable Debug Logging
Copy
import logging
# Lower the "autotrain" logger's threshold to DEBUG, then configure
# logging at DEBUG level via basicConfig.
logger = logging.getLogger("autotrain")
logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.DEBUG)
# Now run training
Capture Errors to File
Copy
import sys
import traceback
try:
    project.create()
except Exception as e:
    # Persist the message and the full traceback to error.log, then re-raise
    # so the failure is not hidden from the caller.
    with open("error.log", "w", encoding="utf-8") as f:
        f.write(f"Error: {e}\n")
        traceback.print_exc(file=f)
    raise