8.1.2. Scikit-Learn Model Persistence#

Scikit-learn models are Python objects that can be serialized using pickle or joblib. Proper persistence of scikit-learn models goes beyond simply saving the model object—it requires understanding how to package preprocessing pipelines, track version information, and validate that a loaded model will behave exactly as it did during training.

8.1.2.2. Pipelines: Saving the Entire Preprocessing Chain#

When deploying models, you almost always need to apply the same preprocessing steps used during training. The most common serialization bug in machine learning comes from saving a model but forgetting to save—or improperly applying—the preprocessing. Pipelines solve this by bundling preprocessing and modeling into a single object that is saved and loaded as a unit.

Without Pipelines (Error-Prone)#

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# Training: fit the scaler first, then train on the scaled features.
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

model = LogisticRegression()
model.fit(X_scaled, y_train)

# Two separate artifacts must be persisted (easy to forget one!)
for artifact, path in [(scaler, 'scaler.joblib'), (model, 'model.joblib')]:
    joblib.dump(artifact, path)

# Loading: both artifacts must be restored, in the right roles!
scaler = joblib.load('scaler.joblib')
model = joblib.load('model.joblib')

# The caller must remember to apply the scaler before predicting (easy to forget!)
X_test_scaled = scaler.transform(X_test)
predictions = model.predict(X_test_scaled)

8.1.2.3. Complex Pipeline Example#

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Column groups that receive different preprocessing.
numeric_features = ['age', 'income', 'credit_score']
categorical_features = ['occupation', 'education']

# Numeric columns: median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: constant-fill imputation, then one-hot encoding
# that tolerates categories unseen during training.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing and model travel together as a single estimator.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100)),
])

# Train end to end; the preprocessor is fitted as part of the pipeline.
full_pipeline.fit(X_train, y_train)

# One artifact captures the entire chain.
joblib.dump(full_pipeline, 'full_model_pipeline.joblib')

# Loading restores preprocessing and model in one step.
pipeline = joblib.load('full_model_pipeline.joblib')
predictions = pipeline.predict(X_new)  # All preprocessing automatic!

8.1.2.4. Saving Model Metadata#

Include important information alongside your model for reproducibility and debugging.

import joblib
from datetime import datetime
from sklearn import __version__ as sklearn_version
import numpy as np

def save_model_with_metadata(model, X_train, y_train, filepath, 
                             feature_names=None, model_description=None):
    """
    Save a scikit-learn model with comprehensive metadata.

    Parameters
    ----------
    model : fitted estimator supporting ``.score()`` and ``.get_params()``
    X_train, y_train : training data, used only to record size and score
    filepath : destination path for the joblib dump
    feature_names : optional list of column names, stored for later validation
    model_description : optional free-text note stored in the metadata

    Returns
    -------
    dict with keys 'model' and 'metadata' — the exact object that was dumped.
    """
    # BUG FIX: `sys` was used below but never imported anywhere in the
    # snippet's imports; a function-scope import keeps the fix local.
    import sys

    # Training-set score — an optimistic estimate, recorded only as a
    # sanity-check reference for later comparison.
    train_score = model.score(X_train, y_train)
    
    # Bundle the model and everything needed to reproduce/debug it later.
    model_package = {
        'model': model,
        'metadata': {
            'training_date': datetime.now().isoformat(),
            'sklearn_version': sklearn_version,
            'python_version': f"{sys.version_info.major}.{sys.version_info.minor}",
            'train_score': train_score,
            'n_samples_train': len(y_train),
            'n_features': X_train.shape[1],
            'feature_names': feature_names,
            'model_type': type(model).__name__,
            'model_params': model.get_params(),
            'description': model_description
        }
    }
    
    joblib.dump(model_package, filepath)
    print(f"Model saved to {filepath}")
    return model_package

# Usage: persist the fitted pipeline together with its metadata.
feature_names = ['age', 'income', 'credit_score', 'employment_length']
model_package = save_model_with_metadata(
    pipeline,
    X_train,
    y_train,
    'model_v1.joblib',
    feature_names=feature_names,
    model_description="Credit risk classifier v1.0",
)

Loading with Metadata#

def load_model_with_metadata(filepath):
    """Load a saved model package and print a summary of its metadata."""
    package = joblib.load(filepath)

    model = package['model']
    metadata = package['metadata']

    # Human-readable report of what was stored alongside the model.
    banner = "=" * 50
    print(banner)
    print("MODEL INFORMATION")
    print(banner)
    print(f"Model Type: {metadata['model_type']}")
    print(f"Training Date: {metadata['training_date']}")
    print(f"Training Score: {metadata['train_score']:.4f}")
    print(f"Training Samples: {metadata['n_samples_train']}")
    print(f"Number of Features: {metadata['n_features']}")
    print(f"Scikit-learn Version: {metadata['sklearn_version']}")
    description = metadata.get('description')
    if description:
        print(f"Description: {description}")
    print(banner)

    return model, metadata

# Usage
model, metadata = load_model_with_metadata('model_v1.joblib')
predictions = model.predict(X_test)

8.1.2.5. Version Compatibility#

Scikit-learn makes no guarantee that a model serialized with one version can be loaded under a different version — and even when loading succeeds, predictions may silently differ. Always record the scikit-learn version used for training and verify it at load time.

Checking Version Compatibility#

from sklearn import __version__ as current_version
import joblib

def load_model_safely(filepath):
    """Load a model package, warning when the sklearn version differs."""
    package = joblib.load(filepath)

    # Compare the version recorded at save time with the one installed now.
    saved_version = package['metadata']['sklearn_version']
    if saved_version != current_version:
        print(f"Warning: Model saved with sklearn {saved_version}, "
              f"but current version is {current_version}")
        print("   Model may not work correctly. Consider retraining.")

    return package['model']

model = load_model_safely('model_v1.joblib')

8.1.2.6. Feature Names and Order#

When loading models, ensure features are in the same order as training.

import pandas as pd
import joblib

def predict_with_feature_validation(model, X, feature_names):
    """
    Make predictions with feature order validation.

    Parameters
    ----------
    model : fitted estimator supporting ``.predict()``
    X : input features; if a DataFrame, its columns are validated and
        reordered to match ``feature_names`` (non-DataFrames pass through)
    feature_names : column names, in training order

    Returns
    -------
    ``model.predict(X)``.

    Raises
    ------
    ValueError : if ``X`` is a DataFrame missing any required feature.
        (Previously this surfaced as an opaque pandas ``KeyError`` from
        the ``X[feature_names]`` reindex below.)
    """
    if isinstance(X, pd.DataFrame):
        # Fail loudly and clearly if a training feature is absent.
        missing = [name for name in feature_names if name not in X.columns]
        if missing:
            raise ValueError(f"Input is missing required features: {missing}")
        # Reorder columns to match training order
        if list(X.columns) != feature_names:
            print("Warning: Reordering features to match training data")
            X = X[feature_names]
    
    return model.predict(X)

# Save feature names with model
model_data = {
    'model': pipeline,
    'feature_names': list(X_train.columns),
}
joblib.dump(model_data, 'model_with_features.joblib')

# Load and validate: columns are checked/reordered before predicting.
model_data = joblib.load('model_with_features.joblib')
predictions = predict_with_feature_validation(
    model_data['model'],
    X_new,
    model_data['feature_names'],
)

8.1.2.7. Complete Example: Production-Ready Model Saving#

import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn import __version__ as sklearn_version
import json

class ModelManager:
    """Manage model saving and loading with metadata."""
    
    @staticmethod
    def save(model, filepath, X_train, y_train, feature_names=None, 
             additional_info=None):
        """Save model with comprehensive metadata.

        Works for plain estimators as well as Pipelines; for a Pipeline
        the recorded model type is that of its 'classifier' step when one
        exists. Also writes a sibling ``<stem>_metadata.json`` file for
        quick human inspection without unpickling the model.

        Returns the dict that was dumped ({'model': ..., 'metadata': ...}).
        """
        import os  # function-scope import keeps the snippet self-contained

        train_score = model.score(X_train, y_train)
        
        # BUG FIX: the original called model.named_steps directly, which
        # raises AttributeError for any non-Pipeline estimator. getattr
        # falls back to the model itself when there are no named steps.
        steps = getattr(model, 'named_steps', {})
        model_type = type(steps.get('classifier', model)).__name__

        package = {
            'model': model,
            'metadata': {
                'saved_at': datetime.now().isoformat(),
                'sklearn_version': sklearn_version,
                'train_score': train_score,
                'n_samples': len(y_train),
                'n_features': X_train.shape[1],
                'feature_names': feature_names or list(range(X_train.shape[1])),
                'model_type': model_type,
                'additional_info': additional_info or {}
            }
        }
        
        # Create the target directory if needed (e.g. 'models/...'), so
        # save() doesn't fail on a fresh checkout.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        joblib.dump(package, filepath, compress=3)
        
        # Save metadata separately as JSON for easy inspection.
        # BUG FIX: str.replace('.joblib', ...) silently overwrote the model
        # file with JSON whenever the path lacked the '.joblib' suffix;
        # splitext handles any (or no) extension correctly.
        metadata_path = os.path.splitext(filepath)[0] + '_metadata.json'
        with open(metadata_path, 'w') as f:
            json.dump(package['metadata'], f, indent=2, default=str)
        
        return package
    
    @staticmethod
    def load(filepath, check_version=True):
        """Load model with validation.

        Raises
        ------
        RuntimeError : if ``check_version`` is True and the sklearn version
            recorded at save time differs from the one installed now.
            (RuntimeError subclasses Exception, so existing ``except
            Exception`` callers still work.)
        """
        package = joblib.load(filepath)
        model = package['model']
        metadata = package['metadata']
        
        # Version check — an actionable message instead of a bare Exception.
        if check_version and metadata['sklearn_version'] != sklearn_version:
            raise RuntimeError(
                f"Version mismatch: model saved with sklearn "
                f"{metadata['sklearn_version']}, current is {sklearn_version}"
            )
        
        return model, metadata

# Usage
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline.fit(X_train, y_train)

# Save
ModelManager.save(
    pipeline, 
    'models/rf_model_v1.joblib',
    X_train, 
    y_train,
    feature_names=['feature1', 'feature2', 'feature3'],
    additional_info={'dataset': 'customer_churn', 'experiment_id': 'exp_001'}
)

# Load
model, metadata = ModelManager.load('models/rf_model_v1.joblib')
predictions = model.predict(X_test)

8.1.2.8. Summary#

  • Always use pipelines to save models with their preprocessing steps

  • Save models with metadata including version, date, features, and performance

  • Use joblib for efficient serialization of scikit-learn models

  • Track feature names and order to prevent prediction errors

  • Implement version checks when loading models

  • Validate library compatibility between training and deployment

Proper persistence ensures your models are reproducible, maintainable, and production-ready.