kmodel

kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows.

It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.

Installation

pip install kmodel

Quick start

The examples below follow the notebooks under nbs/ in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.

01 ML

from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
species_Adelie species_Chinstrap species_Gentoo nfold
0 0.993427 0.137000 -0.130427 0
1 0.790344 0.103762 0.105894 1
2 0.673088 0.317647 0.009265 2
3 1.064457 0.046586 -0.111043 0
4 1.122991 0.154406 -0.277398 1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 1.211930e-01 8.846216e-09
3 0.958070 4.192990e-02 9.000554e-09
9 0.839056 1.188384e-01 4.210543e-02
11 0.612601 3.873988e-01 9.149350e-09
14 0.978535 9.311731e-09 2.146502e-02
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 0.121193 8.846216e-09
1 0.790344 0.103762 1.058942e-01
2 0.673088 0.317647 9.264531e-03
3 0.958070 0.041930 9.000554e-09
4 0.879124 0.120876 7.828416e-09
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052

02 DNN

from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
(342, 10)
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
torch.Size([8, 3])
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
  (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): ParametrizedLinear(
    in_features=10, out_features=3, bias=True
    (parametrizations): ModuleDict(
      (weight): ParametrizationList(
        (0): _WeightNorm()
      )
    )
  )
  (3): SiLU()
)
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
torch.Size([8, 3])
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df,
    feat_col,
    target_col,
    split0,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
    save='model',
)
pred.head()
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(),
    feat_col,
    target_col,
    model_func=get_mlp,
    model_pth='model',
    A=n_aa,
)
test_pred
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df,
    feat_col,
    target_col,
    splits=splits,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
)
oof.nfold.value_counts()