# kmodel

kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows. It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.

## Installation

``` bash
pip install kmodel
```

## Quick start

The examples below follow the notebooks under `nbs/` in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.

### 01 ML

``` python
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
```

``` python
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
```

``` python
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
```

    (342, 10)

``` python
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
```

    StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
    # species in train set: 3
    # species in test set: 3

    (228, 114)

``` python
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
```

    ((228, 4), (228, 3), (114, 4), (114, 3))

``` python
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
```

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.993427	0.137000	-0.130427
3	1.064457	0.046586	-0.111043
9	0.839056	0.118838	0.042105
11	0.669557	0.423417	-0.092974
14	1.050863	-0.073914	0.023052

``` python
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
```

	species_Adelie	species_Chinstrap	species_Gentoo	nfold
0	0.993427	0.137000	-0.130427	0
1	0.790344	0.103762	0.105894	1
2	0.673088	0.317647	0.009265	2
3	1.064457	0.046586	-0.111043	0
4	1.122991	0.154406	-0.277398	1

``` python
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
```

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.878807	1.211930e-01	8.846216e-09
3	0.958070	4.192990e-02	9.000554e-09
9	0.839056	1.188384e-01	4.210543e-02
11	0.612601	3.873988e-01	9.149350e-09
14	0.978535	9.311731e-09	2.146502e-02

``` python
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
```

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.878807	0.121193	8.846216e-09
1	0.790344	0.103762	1.058942e-01
2	0.673088	0.317647	9.264531e-03
3	0.958070	0.041930	9.000554e-09
4	0.879124	0.120876	7.828416e-09

``` python
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
```

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.993427	0.137000	-0.130427
3	1.064457	0.046586	-0.111043
9	0.839056	0.118838	0.042105
11	0.669557	0.423417	-0.092974
14	1.050863	-0.073914	0.023052

### 02 DNN

``` python
from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
```

``` python
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
```

``` python
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
```

    (342, 10)

``` python
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
```

    torch.Size([8, 3])

``` python
# Weight-normalized linear block.
lin_wn(10, 3)
```

    Sequential(
      (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.1, inplace=False)
      (2): ParametrizedLinear(
        in_features=10, out_features=3, bias=True
        (parametrizations): ModuleDict(
          (weight): ParametrizationList(
            (0): _WeightNorm()
          )
        )
      )
      (3): SiLU()
    )

``` python
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
```

    torch.Size([8, 3])

``` python
# Cross-entropy with soft labels.
CE(logits, yb)
```

    tensor(1.0681, grad_fn=)

``` python
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
```

    tensor(1.0681, grad_fn=)

``` python
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
```

    tensor(0.3023, grad_fn=)

``` python
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df,
    feat_col,
    target_col,
    split0,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
    save='model',
)
pred.head()
```

``` python
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(),
    feat_col,
    target_col,
    model_func=get_mlp,
    model_pth='model',
    A=n_aa,
)
test_pred
```

``` python
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df,
    feat_col,
    target_col,
    splits=splits,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
)
oof.nfold.value_counts()
```