Error Handling

Handle errors gracefully in your training scripts.

Common Exceptions

ValueError

Configuration errors:

from autotrain.trainers.clm.params import LLMTrainingParams

# Build the params inside the try block so that a rejected value is reported
# by the handler below instead of aborting the script.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./data.jsonl",
        project_name="my-model",
        trainer="invalid_trainer",  # Invalid value
    )
except ValueError as e:
    # The exception message is printed as-is.
    print(f"Configuration error: {e}")

RuntimeError

Training failures:

from autotrain.project import AutoTrainProject

# `params` is an LLMTrainingParams instance built beforehand.
# Both project construction and create() sit inside the try, so a RuntimeError
# from either one reaches the handler.
try:
    project = AutoTrainProject(
        params=params,
        backend="local",
        process=True
    )
    job_id = project.create()
except RuntimeError as e:
    print(f"Training failed: {e}")

FileNotFoundError

Missing files:

# NOTE(review): this example expects the missing data file to surface as
# FileNotFoundError while the params are being constructed — confirm that the
# path is checked at this point rather than later, when training starts.
try:
    params = LLMTrainingParams(
        model="google/gemma-3-270m",
        data_path="./nonexistent.jsonl",  # Doesn't exist
        project_name="my-model",
    )
except FileNotFoundError as e:
    print(f"File not found: {e}")

Handling GPU Errors

Out of Memory

import torch

# `project` is an AutoTrainProject created beforehand.
# NOTE(review): this handler only fires if the out-of-memory error is raised
# in the current process — confirm how the chosen backend runs training.
try:
    # Your training code
    project.create()
except torch.cuda.OutOfMemoryError:
    # Print the usual remedies; the exception is not re-raised.
    print("GPU out of memory. Try:")
    print("- Reducing batch_size")
    print("- Enabling gradient_accumulation")
    print("- Using quantization")

Prevention

# Training parameters tuned to reduce GPU memory use.
params = LLMTrainingParams(
    model="google/gemma-3-270m",
    data_path="./data.jsonl",
    project_name="my-model",

    # Memory-saving options
    batch_size=1,
    gradient_accumulation=16,  # presumably an effective batch of 1 x 16 — confirm
    auto_find_batch_size=True,
    quantization="int4",
)

Validation

Check Data Before Training

import json
from pathlib import Path

def validate_data(data_path):
    """Validate that a JSONL training file is well-formed.

    Every non-blank line must be a JSON object that contains a "text" or a
    "messages" key. All problems are collected first and reported together,
    so a single run shows every bad line.

    Args:
        data_path: Path to the JSONL file to check.

    Returns:
        True when every record passes validation.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If any line is not valid JSON, is not a JSON object, or
            lacks both keys. The message lists one entry per offending line,
            using 1-based line numbers.
    """
    path = Path(data_path)

    if not path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path}")

    errors = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        # Number from 1 so reported lines match what an editor shows.
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {line_no}: Invalid JSON")
                continue
            # A bare number/string/list parses fine but is not a record;
            # checking first also avoids a TypeError on the `in` tests.
            if not isinstance(record, dict):
                errors.append(f"Line {line_no}: Expected a JSON object")
            elif "text" not in record and "messages" not in record:
                errors.append(f"Line {line_no}: Missing 'text' or 'messages'")

    if errors:
        raise ValueError("Data validation failed:\n" + "\n".join(errors))

    return True

# Use before training
validate_data("./data.jsonl")

Check GPU Availability

import torch

def check_gpu():
    """Report the CUDA devices visible to PyTorch.

    Prints one line per device with its name and total memory (bytes divided
    by 1e9, shown as GB), or a warning when CUDA is not available.

    Returns:
        True if ``torch.cuda.is_available()`` reports CUDA, otherwise False.
    """
    if torch.cuda.is_available():
        for index in range(torch.cuda.device_count()):
            device_name = torch.cuda.get_device_name(index)
            total_bytes = torch.cuda.get_device_properties(index).total_memory
            print(f"GPU {index}: {device_name} ({total_bytes / 1e9:.1f} GB)")
        return True

    print("Warning: No GPU available, training will be slow")
    return False

check_gpu()

Retry Logic

import time

def train_with_retry(params, max_retries=3, base_delay=60):
    """Start a local training project, retrying when it raises RuntimeError.

    A new AutoTrainProject is built for every attempt. Between attempts the
    function sleeps with exponential backoff: ``base_delay``, then twice
    that, then four times that, and so on.

    Args:
        params: Training parameters passed to AutoTrainProject.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Seconds to wait after the first failed attempt.

    Returns:
        Whatever ``project.create()`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Without this check the
            loop would never run and the function would return None without
            training.
        RuntimeError: Re-raised unchanged when the final attempt fails.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            project = AutoTrainProject(
                params=params,
                backend="local",
                process=True
            )
            return project.create()

        except RuntimeError as e:
            # Last attempt: nothing left to retry, let the caller see it.
            if attempt == max_retries - 1:
                raise
            wait_time = 2 ** attempt * base_delay  # Exponential backoff
            print(f"Attempt {attempt + 1} failed: {e}")
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

# Use with retry
job_id = train_with_retry(params)

Logging

Enable Debug Logging

import logging

# Configure the root logger to emit DEBUG and above.
logging.basicConfig(level=logging.DEBUG)
# Also set the "autotrain" logger's own level to DEBUG explicitly.
logger = logging.getLogger("autotrain")
logger.setLevel(logging.DEBUG)

# Now run training

Capture Errors to File

import sys
import traceback

# `project` is an AutoTrainProject created beforehand.
try:
    project.create()
except Exception as e:
    # Record the message and the full traceback, then re-raise so the failure
    # is not hidden from the caller. Mode "w" overwrites any earlier log.
    with open("error.log", "w", encoding="utf-8") as f:
        f.write(f"Error: {e}\n")
        traceback.print_exc(file=f)
    raise

Getting Started

Core Endpoints

Task APIs

SDKs & Integration

Real-time

Error Handling

Error Handling

Common Exceptions

ValueError

RuntimeError

FileNotFoundError

Handling GPU Errors

Out of Memory

Prevention

Validation

Check Data Before Training

Check GPU Availability

Retry Logic

Logging

Enable Debug Logging

Capture Errors to File

Next Steps

Python SDK

Logging & Debugging

Getting Started

Core Endpoints

Task APIs

SDKs & Integration

Real-time

Error Handling

Common Exceptions

ValueError

RuntimeError

FileNotFoundError

Handling GPU Errors

Out of Memory

Prevention

Validation

Check Data Before Training

Check GPU Availability

Retry Logic

Logging

Enable Debug Logging

Capture Errors to File

Next Steps

Python SDK

Logging & Debugging

Error Handling

Common Exceptions

ValueError

RuntimeError

FileNotFoundError

Handling GPU Errors

Out of Memory

Prevention

Validation

Check Data Before Training

Check GPU Availability

Retry Logic

Logging

Enable Debug Logging

Capture Errors to File

Next Steps