
scikit-learn Template

The scikit-learn template creates a production-ready ML project built on scikit-learn and pandas. It is well suited for classification, regression, and other classical machine-learning tasks on tabular data.

Quick Start

Create a new scikit-learn project:
cirron init my-sklearn-model --template sklearn
The model type (classification, regression, etc.) is selected interactively during cirron init.

Project Structure

The scikit-learn template generates the following project structure:
my-sklearn-model/
├── src/
│   ├── model.py           # Model definition and preprocessing
│   ├── data_loader.py     # Data loading utilities
│   ├── train.py           # Training script
│   └── inference.py       # Inference/prediction script
├── data/                  # Data directory
├── models/                # Saved models (created during training)
├── requirements.txt      # Python dependencies
├── Dockerfile            # Container configuration
└── cirron.yaml           # Project configuration

Generated Files

requirements.txt

scikit-learn>=1.3.0
numpy>=1.21.0
pandas>=1.5.0
matplotlib>=3.5.0
seaborn>=0.11.0
joblib>=1.2.0

src/model.py

The model and preprocessing code are generated automatically based on your model type:

Classification Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

def create_model():
    """Create a classification model"""
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    return model

def preprocess_data(X, preprocessor=None, fit=True):
    """Preprocess the data"""
    if preprocessor is None:
        preprocessor = StandardScaler()
    
    if fit:
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)
    
    return X_processed, preprocessor
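
These two functions are used together: fit the scaler on the training split only, then reuse the fitted object for any later data. A minimal sketch with made-up data (the column names here are illustrative, not part of the template):

import numpy as np
import pandas as pd

X_train = pd.DataFrame({'f1': np.random.randn(100), 'f2': np.random.randn(100)})
X_new = pd.DataFrame({'f1': np.random.randn(10), 'f2': np.random.randn(10)})

# Fit on training data, then reuse the fitted scaler without refitting
X_train_proc, scaler = preprocess_data(X_train, fit=True)
X_new_proc, _ = preprocess_data(X_new, preprocessor=scaler, fit=False)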

Regression Model

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

def create_model():
    """Create a regression model"""
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    return model

def preprocess_data(X, preprocessor=None, fit=True):
    """Preprocess the data"""
    if preprocessor is None:
        preprocessor = StandardScaler()
    
    if fit:
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)
    
    return X_processed, preprocessor

src/data_loader.py

Data loading utilities for scikit-learn:
import pandas as pd
import numpy as np
import os

def load_data(data_path):
    """Load data from CSV file"""
    # TODO: Implement based on your data structure
    # This is a placeholder - customize for your data
    
    if not os.path.exists(data_path):
        # Create sample data if file doesn't exist
        print(f"Data file {data_path} not found. Creating sample data...")
        return create_sample_data()
    
    # Load your actual data
    data = pd.read_csv(data_path)
    
    # Separate features and target
    # TODO: Update these column names based on your data
    feature_columns = [col for col in data.columns if col != 'target']
    target_column = 'target'
    
    X = data[feature_columns]
    y = data[target_column]
    
    return X, y

def create_sample_data(n_samples=1000):
    """Create sample data for testing"""
    np.random.seed(42)
    
    # Generate sample features
    X = pd.DataFrame({
        'feature1': np.random.randn(n_samples),
        'feature2': np.random.randn(n_samples),
        'feature3': np.random.randn(n_samples),
        'feature4': np.random.randn(n_samples),
    })
    
    # Generate sample target (classification or regression)
    # Note: the branch is chosen randomly for demo purposes; in practice,
    # make it match the model_type in your training config
    if np.random.choice([True, False]):  # Random choice for demo
        # Classification target
        y = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])
    else:
        # Regression target
        y = 2 * X['feature1'] + 1.5 * X['feature2'] + np.random.randn(n_samples) * 0.1
    
    return X, y
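
load_data expects a CSV whose columns are the features plus a target column. To produce a compatible file for the default data_path, you can write the sample data to disk (a sketch using the functions above):

import os

X, y = create_sample_data()
df = X.copy()
df['target'] = y
os.makedirs('data', exist_ok=True)
df.to_csv('data/sample_data.csv', index=False)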

src/train.py

Complete training script with scikit-learn best practices:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from model import create_model, preprocess_data
from data_loader import load_data

class Trainer:
    def __init__(self, config):
        self.config = config
        self.model = create_model()
        self.preprocessor = None
        
        # Load data
        self.X, self.y = load_data(config['data_path'])
        
        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
    
    def preprocess_data(self):
        """Preprocess the data"""
        self.X_train_processed, self.preprocessor = preprocess_data(self.X_train, fit=True)
        self.X_test_processed, _ = preprocess_data(self.X_test, self.preprocessor, fit=False)
    
    def train(self):
        """Train the model"""
        print("Preprocessing data...")
        self.preprocess_data()
        
        print("Training model...")
        self.model.fit(self.X_train_processed, self.y_train)
        
        # Cross-validation
        cv_scores = cross_val_score(self.model, self.X_train_processed, self.y_train, cv=5)
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        # Test set evaluation
        y_pred = self.model.predict(self.X_test_processed)
        
        if self.config['model_type'] == 'classification':
            print("\nClassification Report:")
            print(classification_report(self.y_test, y_pred))
        else:
            mse = mean_squared_error(self.y_test, y_pred)
            r2 = r2_score(self.y_test, y_pred)
            print(f"\nTest MSE: {mse:.4f}")
            print(f"Test R²: {r2:.4f}")
    
    def save_model(self, filepath='models/model.joblib'):
        """Save the trained model and preprocessor"""
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        
        model_data = {
            'model': self.model,
            'preprocessor': self.preprocessor,
            'config': self.config
        }
        
        joblib.dump(model_data, filepath)
        print(f"Model saved to {filepath}")

if __name__ == "__main__":
    config = {
        'model_type': 'classification',  # or 'regression'
        'data_path': 'data/sample_data.csv',
    }
    
    trainer = Trainer(config)
    trainer.train()
    trainer.save_model()
    print("Training completed!")

src/inference.py

Production-ready inference script:
import joblib
import numpy as np
import pandas as pd
from typing import Optional
from model import create_model, preprocess_data

class ModelInference:
    def __init__(self, model_path: Optional[str] = None):
        self.model = create_model()
        self.preprocessor = None
        
        if model_path:
            self.load_model(model_path)
    
    def load_model(self, model_path: str):
        """Load trained model"""
        saved_objects = joblib.load(model_path)
        self.model = saved_objects['model']
        self.preprocessor = saved_objects.get('preprocessor', None)
        print(f"Model loaded from {model_path}")
    
    def preprocess(self, input_data):
        """Preprocess input data"""
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])
        elif isinstance(input_data, list):
            input_data = pd.DataFrame(input_data)
        
        if self.preprocessor:
            input_data = self.preprocessor.transform(input_data)
        
        return input_data
    
    def predict(self, input_data):
        """Make prediction"""
        processed_data = self.preprocess(input_data)
        predictions = self.model.predict(processed_data)
        return predictions
    
    def predict_proba(self, input_data):
        """Get prediction probabilities (for classification)"""
        if hasattr(self.model, 'predict_proba'):
            processed_data = self.preprocess(input_data)
            probabilities = self.model.predict_proba(processed_data)
            return probabilities
        else:
            raise ValueError("Model does not support probability predictions")

if __name__ == "__main__":
    # A trained model is required; predicting with an unfitted model raises an error
    inference = ModelInference('models/model.joblib')
    
    # Example usage; the keys must match the features used during training
    sample_input = {'feature1': 1.0, 'feature2': 2.0}  # Replace with actual features
    result = inference.predict(sample_input)
    print(f"Prediction: {result}")

Model Types

The scikit-learn template supports different model types:

Classification

  • Use case: Binary classification, multi-class classification
  • Output: Class predictions and probabilities
  • Default model: RandomForestClassifier
  • Metrics: Accuracy, precision, recall, F1-score

Regression

  • Use case: Continuous value prediction, time series forecasting
  • Output: Continuous values
  • Default model: RandomForestRegressor
  • Metrics: MSE, R² score

Training Configuration

Default training configuration:
config = {
    'model_type': 'classification',  # or 'regression'
    'data_path': 'data/sample_data.csv',
}
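
The Trainer only reads these two keys. If you need more control, extend the dict with your own keys and read them in Trainer.__init__; the extras below are hypothetical and have no effect until you wire them in:

config = {
    'model_type': 'classification',  # or 'regression'
    'data_path': 'data/sample_data.csv',
    # Hypothetical extensions (not read by the generated code):
    'test_size': 0.2,   # could be passed to train_test_split
    'cv_folds': 5,      # could be passed to cross_val_score
}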

Usage Examples

Basic Training

# Navigate to your project
cd my-sklearn-model

# Train the model
python src/train.py

Custom Training

# Modify config in src/train.py
config = {
    'model_type': 'regression',
    'data_path': 'data/my_data.csv',
}

Inference

from src.inference import ModelInference

# Load trained model
inference = ModelInference('models/model.joblib')

# Make prediction
sample_input = {'feature1': 1.5, 'feature2': 2.3, 'feature3': 0.8, 'feature4': -1.2}
result = inference.predict(sample_input)
print(f"Prediction: {result}")

# Get probabilities (for classification)
if hasattr(inference.model, 'predict_proba'):
    proba = inference.predict_proba(sample_input)
    print(f"Probabilities: {proba}")

scikit-learn Features

Built-in Preprocessing

The template includes essential preprocessing:
  • StandardScaler: Standardizes features by removing the mean and scaling to unit variance
  • Pipeline Support: Easy to extend with additional preprocessing steps (see the sketch after this list)
  • Fit/Transform Pattern: Proper handling of training vs inference preprocessing
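
If you prefer a single object, the scaler and the estimator can be combined into one Pipeline so a single fit/predict call handles both steps. A minimal sketch (not part of the generated files):

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42)),
])
# pipeline.fit(X_train, y_train) scales and trains in one call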

Model Evaluation

  • Cross-validation: 5-fold cross-validation for robust evaluation (see the note after this list)
  • Train/Test Split: 80/20 split with fixed random state
  • Comprehensive Metrics: Classification report or MSE/R² for regression
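
Note that cross_val_score falls back to the estimator's default scorer (accuracy for classifiers, R² for regressors). To evaluate a specific metric, pass scoring explicitly (a sketch; the variable names follow train.py):

from sklearn.model_selection import cross_val_score

# e.g. F1 for an imbalanced binary classification problem
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')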

Model Persistence

  • joblib: Efficient serialization of scikit-learn models (see the loading sketch below)
  • Preprocessor Storage: Saves both model and preprocessor together
  • Configuration Storage: Includes training configuration in saved model
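
The saved artifact is a plain dict, so it can also be loaded directly with joblib outside the inference class (a minimal sketch):

import joblib

saved = joblib.load('models/model.joblib')
model = saved['model']
preprocessor = saved['preprocessor']
print(saved['config'])  # the training configuration stored by save_model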

Customization

Adding Custom Models

  1. Modify src/model.py to add your custom model:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

def create_model():
    """Create a custom classification model"""
    # Choose your model
    model = SVC(probability=True, random_state=42)
    # or
    # model = LogisticRegression(random_state=42)
    return model

Custom Preprocessing

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif

def preprocess_data(X, y=None, preprocessor=None, fit=True):
    """Custom preprocessing pipeline"""
    if preprocessor is None:
        # Create custom preprocessing pipeline
        preprocessor = Pipeline([
            ('scaler', RobustScaler()),
            ('feature_selection', SelectKBest(f_classif, k=10))
        ])
    
    if fit:
        # SelectKBest is supervised: the target must be passed at fit time,
        # so update the call in train.py to pass y_train when fitting
        X_processed = preprocessor.fit_transform(X, y)
    else:
        X_processed = preprocessor.transform(X)
    
    return X_processed, preprocessor

Custom Data Loading

import pandas as pd

def load_data(data_path):
    """Custom data loading for your specific format"""
    # Load your data
    data = pd.read_csv(data_path)
    
    # Custom preprocessing
    data = data.dropna()  # Remove missing values
    data = data.drop_duplicates()  # Remove duplicates
    
    # Feature engineering
    data['new_feature'] = data['feature1'] * data['feature2']
    
    # Separate features and target
    feature_columns = ['feature1', 'feature2', 'feature3', 'new_feature']
    target_column = 'target'
    
    X = data[feature_columns]
    y = data[target_column]
    
    return X, y

Hyperparameter Tuning

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def create_model():
    """Create model with hyperparameter tuning"""
    base_model = RandomForestClassifier(random_state=42)
    
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10]
    }
    
    model = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy')
    return model
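
GridSearchCV implements fit and predict itself, so the Trainer can use it unchanged. After training, the selected configuration is exposed on the fitted object (a short sketch):

# After trainer.train() has run:
print(trainer.model.best_params_)   # best hyperparameter combination
print(trainer.model.best_score_)    # mean cross-validated accuracy for it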

Best Practices

Data Organization

data/
├── train_data.csv        # Training data
├── test_data.csv         # Test data (optional)
└── sample_data.csv       # Sample data for testing

Model Saving

  • Models are saved in models/ directory
  • Uses joblib for efficient serialization
  • Includes both model and preprocessor
  • Saves training configuration

Feature Engineering

  • Implement feature engineering in data_loader.py
  • Use scikit-learn’s preprocessing pipeline
  • Handle missing values and outliers
  • Scale features appropriately

Model Selection

  • Start with RandomForest for baseline
  • Try different algorithms based on your data
  • Use cross-validation for model comparison
  • Consider ensemble methods for better performance

Performance Optimization

Memory Efficiency

# Use chunked processing for large datasets
def load_large_data(data_path, chunk_size=10000):
    chunks = []
    for chunk in pd.read_csv(data_path, chunksize=chunk_size):
        # Filter, downcast, or aggregate each chunk here; concatenating
        # unmodified chunks still materializes the full dataset in memory
        chunks.append(chunk)
    return pd.concat(chunks)

Parallel Processing

# Enable parallel processing for RandomForest
model = RandomForestClassifier(n_jobs=-1, random_state=42)

Model Optimization

# Use more efficient algorithms for large datasets
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='log_loss', random_state=42)  # the old 'log' name was removed in scikit-learn 1.3
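
For data that does not fit in memory at all, SGDClassifier also supports incremental training via partial_fit, which pairs naturally with the chunked loading shown above (a sketch; all class labels must be declared on the first call):

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss', random_state=42)
classes = np.array([0, 1])  # every class label, known up front

for chunk in pd.read_csv('data/train_data.csv', chunksize=10000):
    X_chunk = chunk.drop(columns=['target'])
    y_chunk = chunk['target']
    model.partial_fit(X_chunk, y_chunk, classes=classes)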

Next Steps