Skip to main content

Error Handling

Handle errors gracefully in your training scripts.

Common Exceptions

ValueError

Configuration errors:
from autotrain.trainers.clm.params import LLMTrainingParams

# An unsupported option value is expected to be rejected while the parameter
# object is being built, so the constructor call itself is what gets guarded.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./data.jsonl",
        project_name="my-model",
        trainer="invalid_trainer",  # deliberately not a valid trainer
    )
except ValueError as config_error:
    print(f"Configuration error: {config_error}")

RuntimeError

Training failures:
from autotrain.project import AutoTrainProject

# Both building the project and launching it sit inside the guard, so a
# RuntimeError raised by either step is reported the same way.
try:
    project = AutoTrainProject(params=params, backend="local", process=True)
    job_id = project.create()
except RuntimeError as training_error:
    print(f"Training failed: {training_error}")

FileNotFoundError

Missing files:
# Point data_path at a file that is absent to demonstrate the missing-file
# handler below.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./nonexistent.jsonl",  # no such file on disk
        project_name="my-model",
    )
except FileNotFoundError as missing_file:
    print(f"File not found: {missing_file}")

Handling GPU Errors

Out of Memory

import torch

try:
    # Your training code
    project.create()
except torch.cuda.OutOfMemoryError:
    # Emit the headline and the three suggestions, one per line.
    print(
        "GPU out of memory. Try:",
        "- Reducing batch_size",
        "- Enabling gradient_accumulation",
        "- Using quantization",
        sep="\n",
    )

Prevention

# Configure the run with conservative memory settings up front instead of
# reacting to an out-of-memory error after it has happened.
params = LLMTrainingParams(
    model="google/gemma-3-270m",
    data_path="./data.jsonl",
    project_name="my-model",

    # Memory-saving options
    # NOTE(review): the exact effect of each option is defined by
    # LLMTrainingParams, which is not shown here; confirm against its docs.
    batch_size=1,
    gradient_accumulation=16,
    auto_find_batch_size=True,
    quantization="int4",
)

Validation

Check Data Before Training

import json
from pathlib import Path

def validate_data(data_path):
    """Validate that a JSON Lines training file is well-formed.

    Every line must be a JSON object that contains a "text" or a
    "messages" key.

    Args:
        data_path: Location of the JSONL file (string or path-like).

    Returns:
        True when every line passes validation.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If one or more lines are invalid. The message lists each
            offending line, numbered from 1, on its own line.
    """
    path = Path(data_path)

    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    errors = []
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        # Count from 1 so reported numbers match what an editor shows.
        for i, line in enumerate(f, start=1):
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: Invalid JSON")
                continue

            # Valid JSON that is not an object (number, list, string, ...)
            # cannot carry the required keys. Report it rather than crashing
            # on `in`, or passing because a list element or substring matched.
            if not isinstance(data, dict):
                errors.append(f"Line {i}: Expected a JSON object")
            elif "text" not in data and "messages" not in data:
                errors.append(f"Line {i}: Missing 'text' or 'messages'")

    if errors:
        # Real newlines, so each problem is printed on its own line.
        raise ValueError("Data validation failed:\n" + "\n".join(errors))

    return True

# Use before training: raises FileNotFoundError if the file is missing and
# ValueError if any line fails validation.
validate_data("./data.jsonl")

Check GPU Availability

import torch

def check_gpu():
    """Report the CUDA devices visible to PyTorch.

    Prints one line per device with its index, name and total memory in
    GB, or a warning when CUDA is unavailable.

    Returns:
        True if at least CUDA is available, otherwise False.
    """
    if torch.cuda.is_available():
        for device_index in range(torch.cuda.device_count()):
            device_name = torch.cuda.get_device_name(device_index)
            properties = torch.cuda.get_device_properties(device_index)
            # total_memory is in bytes; divide by 1e9 for decimal gigabytes.
            total_gb = properties.total_memory / 1e9
            print(f"GPU {device_index}: {device_name} ({total_gb:.1f} GB)")
        return True

    print("Warning: No GPU available, training will be slow")
    return False

# Print the detected GPUs, or a warning when none is available.
check_gpu()

Retry Logic

import time

def train_with_retry(params, max_retries=3, base_delay=60):
    """Run training, retrying with exponential backoff on RuntimeError.

    Args:
        params: Training parameters handed to AutoTrainProject.
        max_retries: Total number of attempts to make; must be at least 1.
        base_delay: Seconds to wait after the first failed attempt. The wait
            doubles after every further failure (60, 120, 240, ... by
            default).

    Returns:
        The value returned by ``project.create()`` on the first attempt that
        succeeds.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Without this check the
            loop would not run and the function would return None without
            ever starting a job.
        RuntimeError: Re-raised unchanged when the final attempt fails.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            project = AutoTrainProject(
                params=params,
                backend="local",
                process=True,
            )
            return project.create()

        except RuntimeError as e:
            # Out of attempts: let the caller see the original error.
            if attempt == max_retries - 1:
                raise

            wait_time = base_delay * 2 ** attempt  # Exponential backoff
            print(f"Attempt {attempt + 1} failed: {e}")
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

# Use with retry: RuntimeError failures are retried, up to max_retries
# attempts in total (3 by default).
job_id = train_with_retry(params)

Logging

Enable Debug Logging

import logging

# Lower the threshold on the "autotrain" logger itself ...
logger = logging.getLogger("autotrain")
logger.setLevel(logging.DEBUG)
# ... and configure the root logger at DEBUG so the records get emitted.
logging.basicConfig(level=logging.DEBUG)

# Now run training

Capture Errors to File

import sys
import traceback

try:
    project.create()
except Exception as e:
    # Persist the message and the full traceback, then re-raise so the
    # failure still propagates to the caller.
    with open("error.log", "w", encoding="utf-8") as f:
        # A real newline keeps the traceback from running on from this line.
        f.write(f"Error: {e}\n")
        traceback.print_exc(file=f)
    raise

Next Steps