Skip to content

Preprocessing Pipeline (scikit-learn)

Why pipelines?

Pipelines help you:

Avoid data leakage
Keep preprocessing consistent across train/test
Make training reproducible

Example: numeric + categorical preprocessing

ColumnTransformer + Pipeline

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
 
# Example dataset: "age" and "city" each contain one missing value, so both
# the numeric and the categorical imputer have something to fill in.

df = pd.DataFrame({
    "age": [22, 25, None, 31, 35],
    "city": ["Pune", "Delhi", "Pune", "Mumbai", None],
    "score": [60, 65, 70, 75, 80],
    "target": [0, 0, 1, 1, 1],
})

# Separate the feature columns from the label column.
X = df.drop(columns=["target"])
y = df["target"]

# Hold out 2 of the 5 rows (test_size=0.4). A stratified split needs at least
# one test row per class; with two classes, test_size=0.2 would produce a
# single-row test set and train_test_split would raise ValueError.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

numeric_features = ["age", "score"]
cat_features = ["city"]

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from failing
# on a category that was not present when the encoder was fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# Chain preprocessing and the classifier so that fit() learns the
# preprocessing parameters from the training rows only.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=1000)),
])

model.fit(X_train, y_train)
# score() runs the already-fitted preprocessing on X_test before predicting.
print("Test score:", model.score(X_test, y_test))

ColumnTransformer + Pipeline

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
 
# Example dataset: "age" and "city" each contain one missing value, so both
# the numeric and the categorical imputer have something to fill in.

df = pd.DataFrame({
    "age": [22, 25, None, 31, 35],
    "city": ["Pune", "Delhi", "Pune", "Mumbai", None],
    "score": [60, 65, 70, 75, 80],
    "target": [0, 0, 1, 1, 1],
})

# Separate the feature columns from the label column.
X = df.drop(columns=["target"])
y = df["target"]

# Hold out 2 of the 5 rows (test_size=0.4). A stratified split needs at least
# one test row per class; with two classes, test_size=0.2 would produce a
# single-row test set and train_test_split would raise ValueError.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

numeric_features = ["age", "score"]
cat_features = ["city"]

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from failing
# on a category that was not present when the encoder was fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# Chain preprocessing and the classifier so that fit() learns the
# preprocessing parameters from the training rows only.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=1000)),
])

model.fit(X_train, y_train)
# score() runs the already-fitted preprocessing on X_test before predicting.
print("Test score:", model.score(X_test, y_test))

Key takeaways

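Preprocessing steps (imputers, scalers, encoders) are fit on the training data only.
The same fitted transformations are then applied to the test data, so no test-set statistics leak into training.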
Pipelines reduce bugs and make code cleaner.

If this helped you, consider buying me a coffee ☕

Buy me a coffee