kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows. It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.
Installation
pip install kmodel
Quick start
The examples below follow the notebooks under nbs/ in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.
01 ML
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
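The ML examples work on the seaborn penguins dataset, with four numeric measurement columns as features and one-hot species_* columns as the multi-output target. The exact preparation lives in the notebook; the sketch below matches the shapes shown later, but the column choices and preprocessing are assumptions.

```python
# Build a multi-output frame: numeric penguin measurements as features, one-hot species as targets.
df = sns.load_dataset("penguins")
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
df = df.dropna(subset=feat_col).reset_index(drop=True)
df = df.join(pd.get_dummies(df["species"], prefix="species").astype(float))
target_col = ["species_Adelie", "species_Chinstrap", "species_Gentoo"]
```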
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
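Each split is a scikit-learn-style pair of index arrays, (train positions, test positions), which is how `df.iloc[split0[1]]` selects the held-out rows later on. The variable names in this small sketch are illustrative.

```python
# split0 unpacks into train/test row positions usable with .iloc
train_idx, test_idx = split0
train_rows, test_rows = df.iloc[train_idx], df.iloc[test_idx]
len(train_rows), len(test_rows)  # (228, 114)
```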
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.993427           0.137000       -0.130427
3         1.064457           0.046586       -0.111043
9         0.839056           0.118838        0.042105
11        0.669557           0.423417       -0.092974
14        1.050863          -0.073914        0.023052
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
   species_Adelie  species_Chinstrap  species_Gentoo  nfold
0        0.993427           0.137000       -0.130427      0
1        0.790344           0.103762        0.105894      1
2        0.673088           0.317647        0.009265      2
3        1.064457           0.046586       -0.111043      0
4        1.122991           0.154406       -0.277398      1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
    species_Adelie  species_Chinstrap  species_Gentoo
0         0.878807       1.211930e-01    8.846216e-09
3         0.958070       4.192990e-02    9.000554e-09
9         0.839056       1.188384e-01    4.210543e-02
11        0.612601       3.873988e-01    9.149350e-09
14        0.978535       9.311731e-09    2.146502e-02
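The values above can be reproduced by flooring each prediction at a small epsilon and renormalizing every row to sum to one. The sketch below illustrates the idea; the 1e-8 floor is an assumption inferred from the output, not the library code.

```python
# Floor probability-like predictions at a small epsilon, then renormalize each row to sum to 1.
clipped = pred.head().clip(lower=1e-8)
clipped.div(clipped.sum(axis=1), axis=0)
```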
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
   species_Adelie  species_Chinstrap  species_Gentoo
0        0.878807           0.121193    8.846216e-09
1        0.790344           0.103762    1.058942e-01
2        0.673088           0.317647    9.264531e-03
3        0.958070           0.041930    9.000554e-09
4        0.879124           0.120876    7.828416e-09
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
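The remaining examples come from the deep learning notebook and rely on notebook-defined objects: a feature batch xb with soft targets yb, model logits, and the sizes n_feature, n_target, and n_aa. They also assume the DL helpers have been imported; the module path below is a guess modeled on the ML import above and is not confirmed by this README.

```python
# Assumed imports for the deep learning examples; the kmodel.dl path is an assumption.
from kmodel.dl import (CNN1D, PSSM_model, init_weights, CE, KLD, JSD,
                       train_dl, predict_dl, train_dl_cv)
```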
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
torch.Size([8, 3])
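init_weights is passed to nn.Module.apply, which visits every submodule. A minimal sketch of that kind of initializer is shown below; the library version may also cover biases or other layer types.

```python
import torch.nn as nn

def kaiming_conv_init(m):
    # Kaiming (He) normal initialization for convolution weights; apply() calls this on each submodule.
    if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        nn.init.kaiming_normal_(m.weight)
```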
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
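CE takes soft labels: the targets are probability distributions rather than class indices. A rough equivalent is sketched below; the exact reduction over batch and positions is an assumption.

```python
import torch.nn.functional as F

def soft_cross_entropy(logits, target_probs):
    # -(p * log softmax(logits)), summed over the class dimension and averaged over the rest
    return -(target_probs * F.log_softmax(logits, dim=-1)).sum(-1).mean()
```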
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
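Both divergences compare the target distribution p with q = softmax(logits) along the last dimension. The reference implementations below are rough sketches; the epsilons and reductions are assumptions, not the library code.

```python
import torch.nn.functional as F

def kl_divergence(logits, target_probs, eps=1e-12):
    # KL(p || q) with q = softmax(logits), summed over classes, averaged over batch/positions
    p, log_q = target_probs, F.log_softmax(logits, dim=-1)
    return (p * ((p + eps).log() - log_q)).sum(-1).mean()

def js_divergence(logits, target_probs, eps=1e-12):
    # JSD(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m), where m = (p + q) / 2
    p, q = target_probs, F.softmax(logits, dim=-1)
    m = 0.5 * (p + q)
    kl = lambda a, b: (a * ((a + eps).log() - (b + eps).log())).sum(-1)
    return (0.5 * kl(p, m) + 0.5 * kl(q, m)).mean()
```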
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df, feat_col, target_col, split0,
    model_func=get_mlp, A=n_aa, n_epoch=1, bs=16, lr=3e-3, save='model',
)
pred.head()
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(), feat_col, target_col,
    model_func=get_mlp, model_pth='model', A=n_aa,
)
test_pred
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df, feat_col, target_col, splits=splits,
    model_func=get_mlp, A=n_aa, n_epoch=1, bs=16, lr=3e-3,
)
oof.nfold.value_counts()