scikit-learn Template
The scikit-learn template creates a production-ready ML project with scikit-learn, pandas, and traditional machine learning algorithms. This template is perfect for classification, regression, and other traditional ML tasks.
Quick Start
Create a new scikit-learn project:
cirron init my-sklearn-model --template sklearn
The model type (classification, regression, etc.) is selected interactively during cirron init.
Project Structure
The scikit-learn template generates the following project structure:
my-sklearn-model/
├── src/
│   ├── model.py          # Model definition and preprocessing
│   ├── data_loader.py    # Data loading utilities
│   ├── train.py          # Training script
│   └── inference.py      # Inference/prediction script
├── data/                 # Data directory
├── models/               # Saved models (created during training)
├── requirements.txt      # Python dependencies
├── Dockerfile            # Container configuration
└── cirron.yaml           # Project configuration
Generated Files
requirements.txt
scikit-learn>=1.3.0
numpy>=1.21.0
pandas>=1.5.0
matplotlib>=3.5.0
seaborn>=0.11.0
joblib>=1.2.0
src/model.py
The model and preprocessing pipeline are automatically generated based on the model type you select:
Classification Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

def create_model():
    """Create a classification model"""
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    return model

def preprocess_data(X, preprocessor=None, fit=True):
    """Preprocess the data"""
    if preprocessor is None:
        preprocessor = StandardScaler()
    if fit:
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)
    return X_processed, preprocessor
Regression Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

def create_model():
    """Create a regression model"""
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    return model

def preprocess_data(X, preprocessor=None, fit=True):
    """Preprocess the data"""
    if preprocessor is None:
        preprocessor = StandardScaler()
    if fit:
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)
    return X_processed, preprocessor
src/data_loader.py
Data loading utilities for scikit-learn:
import pandas as pd
import numpy as np
import os

def load_data(data_path):
    """Load data from CSV file"""
    # TODO: Implement based on your data structure
    # This is a placeholder - customize for your data
    if not os.path.exists(data_path):
        # Create sample data if file doesn't exist
        print(f"Data file {data_path} not found. Creating sample data...")
        return create_sample_data()

    # Load your actual data
    data = pd.read_csv(data_path)

    # Separate features and target
    # TODO: Update these column names based on your data
    feature_columns = [col for col in data.columns if col != 'target']
    target_column = 'target'

    X = data[feature_columns]
    y = data[target_column]

    return X, y

def create_sample_data(n_samples=1000):
    """Create sample data for testing"""
    np.random.seed(42)

    # Generate sample features
    X = pd.DataFrame({
        'feature1': np.random.randn(n_samples),
        'feature2': np.random.randn(n_samples),
        'feature3': np.random.randn(n_samples),
        'feature4': np.random.randn(n_samples),
    })

    # Generate sample target (classification or regression)
    if np.random.choice([True, False]):  # Random choice for demo
        # Classification target
        y = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])
    else:
        # Regression target
        y = 2 * X['feature1'] + 1.5 * X['feature2'] + np.random.randn(n_samples) * 0.1

    return X, y
src/train.py
Complete training script with scikit-learn best practices:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from model import create_model, preprocess_data
from data_loader import load_data

class Trainer:
    def __init__(self, config):
        self.config = config
        self.model = create_model()
        self.preprocessor = None

        # Load data
        self.X, self.y = load_data(config['data_path'])

        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )

    def preprocess_data(self):
        """Preprocess the data"""
        self.X_train_processed, self.preprocessor = preprocess_data(self.X_train, fit=True)
        self.X_test_processed, _ = preprocess_data(self.X_test, self.preprocessor, fit=False)

    def train(self):
        """Train the model"""
        print("Preprocessing data...")
        self.preprocess_data()

        print("Training model...")
        self.model.fit(self.X_train_processed, self.y_train)

        # Cross-validation
        cv_scores = cross_val_score(self.model, self.X_train_processed, self.y_train, cv=5)
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        # Test set evaluation
        y_pred = self.model.predict(self.X_test_processed)

        if self.config['model_type'] == 'classification':
            print("\nClassification Report:")
            print(classification_report(self.y_test, y_pred))
        else:
            mse = mean_squared_error(self.y_test, y_pred)
            r2 = r2_score(self.y_test, y_pred)
            print(f"\nTest MSE: {mse:.4f}")
            print(f"Test R²: {r2:.4f}")

    def save_model(self, filepath='models/model.joblib'):
        """Save the trained model and preprocessor"""
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        model_data = {
            'model': self.model,
            'preprocessor': self.preprocessor,
            'config': self.config
        }
        joblib.dump(model_data, filepath)
        print(f"Model saved to {filepath}")

if __name__ == "__main__":
    config = {
        'model_type': 'classification',  # or 'regression'
        'data_path': 'data/sample_data.csv',
    }

    trainer = Trainer(config)
    trainer.train()
    trainer.save_model()
    print("Training completed!")
src/inference.py
Production-ready inference script:
import joblib
import numpy as np
import pandas as pd
from model import create_model, preprocess_data

class ModelInference:
    def __init__(self, model_path: str = None):
        self.model = create_model()
        self.preprocessor = None
        if model_path:
            self.load_model(model_path)

    def load_model(self, model_path: str):
        """Load trained model"""
        saved_objects = joblib.load(model_path)
        self.model = saved_objects['model']
        self.preprocessor = saved_objects.get('preprocessor', None)
        print(f"Model loaded from {model_path}")

    def preprocess(self, input_data):
        """Preprocess input data"""
        if isinstance(input_data, dict):
            input_data = pd.DataFrame([input_data])
        elif isinstance(input_data, list):
            input_data = pd.DataFrame(input_data)
        if self.preprocessor:
            input_data = self.preprocessor.transform(input_data)
        return input_data

    def predict(self, input_data):
        """Make prediction"""
        processed_data = self.preprocess(input_data)
        predictions = self.model.predict(processed_data)
        return predictions

    def predict_proba(self, input_data):
        """Get prediction probabilities (for classification)"""
        if hasattr(self.model, 'predict_proba'):
            processed_data = self.preprocess(input_data)
            probabilities = self.model.predict_proba(processed_data)
            return probabilities
        else:
            raise ValueError("Model does not support probability predictions")

if __name__ == "__main__":
    # Load a trained model; predicting with an unfitted model would raise an error
    inference = ModelInference('models/model.joblib')

    # Example usage
    sample_input = {'feature1': 1.0, 'feature2': 2.0}  # Replace with actual features
    result = inference.predict(sample_input)
    print(f"Prediction: {result}")
Model Types
The scikit-learn template supports different model types:
Classification
- Use case: Binary classification, multi-class classification
- Output: Class predictions and probabilities
- Default model: RandomForestClassifier
- Metrics: Accuracy, precision, recall, F1-score
Regression
- Use case: Continuous value prediction, time series forecasting
- Output: Continuous values
- Default model: RandomForestRegressor
- Metrics: MSE, R² score
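For reference, the metrics listed above map directly onto functions in sklearn.metrics. A minimal, self-contained sketch with toy labels and values:

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score,
)

# Classification metrics on toy binary labels (use average='macro' for multi-class)
y_true_cls = [0, 1, 1, 0, 1]
y_pred_cls = [0, 1, 0, 0, 1]
print("Accuracy:", accuracy_score(y_true_cls, y_pred_cls))
print("Precision:", precision_score(y_true_cls, y_pred_cls))
print("Recall:", recall_score(y_true_cls, y_pred_cls))
print("F1:", f1_score(y_true_cls, y_pred_cls))

# Regression metrics on toy values
y_true_reg = [2.5, 0.0, 2.1, 1.6]
y_pred_reg = [2.4, 0.1, 2.0, 1.7]
print("MSE:", mean_squared_error(y_true_reg, y_pred_reg))
print("R²:", r2_score(y_true_reg, y_pred_reg))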
Training Configuration
Default training configuration:
config = {
    'model_type': 'classification',  # or 'regression'
    'data_path': 'data/sample_data.csv',
}
Usage Examples
Basic Training
# Navigate to your project
cd my-sklearn-model
# Train the model
python src/train.py
Custom Training
# Modify config in src/train.py
config = {
    'model_type': 'regression',
    'data_path': 'data/my_data.csv',
}
Inference
from src.inference import ModelInference

# Load trained model
inference = ModelInference('models/model.joblib')

# Make prediction
sample_input = {'feature1': 1.5, 'feature2': 2.3, 'feature3': 0.8, 'feature4': -1.2}
result = inference.predict(sample_input)
print(f"Prediction: {result}")

# Get probabilities (for classification)
if hasattr(inference.model, 'predict_proba'):
    proba = inference.predict_proba(sample_input)
    print(f"Probabilities: {proba}")
scikit-learn Features
Built-in Preprocessing
The template includes essential preprocessing:
- StandardScaler: Standardizes features by removing the mean and scaling to unit variance
- Pipeline Support: Easy to extend with additional preprocessing steps
- Fit/Transform Pattern: Proper handling of training vs inference preprocessing
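As a minimal illustration of the fit/transform pattern (toy data; in the template this logic lives in preprocess_data), statistics are fitted on training data only and reused at inference time:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[1.5, 15.0]])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit mean/variance on training data only
X_test_scaled = scaler.transform(X_test)        # reuse those statistics at inference time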
Model Evaluation
- Cross-validation: 5-fold cross-validation for robust evaluation
- Train/Test Split: 80/20 split with fixed random state
- Comprehensive Metrics: Classification report or MSE/R² for regression
Model Persistence
- joblib: Efficient serialization of scikit-learn models
- Preprocessor Storage: Saves both model and preprocessor together
- Configuration Storage: Includes training configuration in saved model
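A minimal round-trip sketch of this persistence pattern on a toy dataset (in the template, saving happens in train.py and loading in inference.py):

import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

X = [[0.0, 1.0], [1.0, 0.0], [0.5, 0.5], [1.0, 1.0]]
y = [0, 1, 0, 1]

scaler = StandardScaler()
model = RandomForestClassifier(random_state=42).fit(scaler.fit_transform(X), y)

# Save model, preprocessor, and config together, as train.py does
joblib.dump({'model': model, 'preprocessor': scaler, 'config': {}}, 'model.joblib')

# Load and reuse both objects
bundle = joblib.load('model.joblib')
preds = bundle['model'].predict(bundle['preprocessor'].transform(X))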
Customization
Adding Custom Models
Modify src/model.py to add your custom model:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

def create_model():
    """Create a custom classification model"""
    # Choose your model
    model = SVC(probability=True, random_state=42)
    # or
    # model = LogisticRegression(random_state=42)
    return model
Custom Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif

def preprocess_data(X, preprocessor=None, fit=True, y=None):
    """Custom preprocessing pipeline"""
    if preprocessor is None:
        # Create custom preprocessing pipeline
        preprocessor = Pipeline([
            ('scaler', RobustScaler()),
            ('feature_selection', SelectKBest(f_classif, k=10))
        ])
    if fit:
        # SelectKBest is supervised, so pass the target when fitting
        X_processed = preprocessor.fit_transform(X, y)
    else:
        X_processed = preprocessor.transform(X)
    return X_processed, preprocessor
Custom Data Loading
import pandas as pd

def load_data(data_path):
    """Custom data loading for your specific format"""
    # Load your data
    data = pd.read_csv(data_path)

    # Custom preprocessing
    data = data.dropna()  # Remove missing values
    data = data.drop_duplicates()  # Remove duplicates

    # Feature engineering
    data['new_feature'] = data['feature1'] * data['feature2']

    # Separate features and target
    feature_columns = ['feature1', 'feature2', 'feature3', 'new_feature']
    target_column = 'target'

    X = data[feature_columns]
    y = data[target_column]

    return X, y
Hyperparameter Tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def create_model():
    """Create model with hyperparameter tuning"""
    base_model = RandomForestClassifier(random_state=42)

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10]
    }

    model = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy')
    return model
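After fitting, the returned GridSearchCV object behaves like its best model: predict() delegates to the refitted best_estimator_, while best_params_ and best_score_ report the winning configuration and its mean cross-validated score.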
Best Practices
Data Organization
data/
├── train_data.csv    # Training data
├── test_data.csv     # Test data (optional)
└── sample_data.csv   # Sample data for testing
Model Saving
- Models are saved in the models/ directory
- Uses joblib for efficient serialization
- Includes both model and preprocessor
- Saves training configuration
Feature Engineering
- Implement feature engineering in data_loader.py
- Use scikit-learn’s preprocessing pipeline
- Handle missing values and outliers
- Scale features appropriately (see the sketch after this list)
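A minimal sketch of those last two points, combining imputation and outlier-robust scaling in one pipeline (the column names are illustrative):

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

X = pd.DataFrame({'feature1': [1.0, np.nan, 3.0, 100.0],  # a NaN and an outlier
                  'feature2': [10.0, 20.0, np.nan, 40.0]})

preprocessor = Pipeline([
    ('impute', SimpleImputer(strategy='median')),  # fill missing values
    ('scale', RobustScaler()),                     # median/IQR scaling resists outliers
])
X_clean = preprocessor.fit_transform(X)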
Model Selection
- Start with RandomForest for baseline
- Try different algorithms based on your data
- Use cross-validation for model comparison (see the sketch after this list)
- Consider ensemble methods for better performance
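For example, candidate models can be compared with cross_val_score on a common dataset. The sketch below uses synthetic data from make_classification, so swap in your own X and y:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=4, random_state=42)

candidates = {
    'random_forest': RandomForestClassifier(random_state=42),
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=42),
}
for name, candidate in candidates.items():
    scores = cross_val_score(candidate, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")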
Memory Efficiency
# Use chunked processing for large datasets
import pandas as pd

def load_large_data(data_path, chunk_size=10000):
    chunks = []
    for chunk in pd.read_csv(data_path, chunksize=chunk_size):
        chunks.append(chunk)
    return pd.concat(chunks)
Parallel Processing
# Enable parallel processing for RandomForest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs=-1, random_state=42)
Model Optimization
# Use more efficient algorithms for large datasets
# (loss='log' was removed in scikit-learn 1.3; use 'log_loss')
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss', random_state=42)
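If even a single pass over the full dataset does not fit in memory, estimators such as SGDClassifier also support incremental training via partial_fit. A sketch under the template's conventions (assumes data/train_data.csv with a 'target' column, and that all class labels are known up front):

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss', random_state=42)
classes = np.array([0, 1])  # partial_fit requires all classes on the first call

for chunk in pd.read_csv('data/train_data.csv', chunksize=10000):
    # For best SGD results, scale features consistently across chunks first
    X_chunk = chunk.drop(columns=['target'])
    y_chunk = chunk['target']
    model.partial_fit(X_chunk, y_chunk, classes=classes)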
Next Steps