kmodel

kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows.

It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.

Installation

pip install kmodel

Quick start

The examples below follow the notebooks under nbs/ in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.

01 ML

from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
species_Adelie species_Chinstrap species_Gentoo nfold
0 0.993427 0.137000 -0.130427 0
1 0.790344 0.103762 0.105894 1
2 0.673088 0.317647 0.009265 2
3 1.064457 0.046586 -0.111043 0
4 1.122991 0.154406 -0.277398 1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 1.211930e-01 8.846216e-09
3 0.958070 4.192990e-02 9.000554e-09
9 0.839056 1.188384e-01 4.210543e-02
11 0.612601 3.873988e-01 9.149350e-09
14 0.978535 9.311731e-09 2.146502e-02
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 0.121193 8.846216e-09
1 0.790344 0.103762 1.058942e-01
2 0.673088 0.317647 9.264531e-03
3 0.958070 0.041930 9.000554e-09
4 0.879124 0.120876 7.828416e-09
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052

02 DNN

from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
(342, 10)
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
torch.Size([8, 3])
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
  (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): ParametrizedLinear(
    in_features=10, out_features=3, bias=True
    (parametrizations): ModuleDict(
      (weight): ParametrizationList(
        (0): _WeightNorm()
      )
    )
  )
  (3): SiLU()
)
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
torch.Size([8, 3])
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df,
    feat_col,
    target_col,
    split0,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
    save='model',
)
pred.head()
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(),
    feat_col,
    target_col,
    model_func=get_mlp,
    model_pth='model',
    A=n_aa,
)
test_pred
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df,
    feat_col,
    target_col,
    splits=splits,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
)
oof.nfold.value_counts()