# kmodel


<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

kmodel provides reusable machine learning and deep learning helpers for
multi-output modeling workflows. It covers tabular model training,
scoring, post-processing, prediction, and fastai-based deep learning
utilities through runnable examples derived from the project notebooks.

## Installation

``` bash
pip install kmodel
```

## Quick start

The examples below follow the notebooks under `nbs/` in order. Each
function example lives in its own cell and starts with a short comment
derived from the function docstring.

### 01 ML

``` python
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
```

``` python
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
```

``` python
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
```

    (342, 10)

``` python
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
```

    StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
    # species in train set: 3
    # species in test set: 3

    (228, 114)

``` python
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
```

    ((228, 4), (228, 3), (114, 4), (114, 3))

``` python
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>

<table class="dataframe" data-quarto-postprocess="true" data-border="1">
<thead>
<tr style="text-align: right;">
<th data-quarto-table-cell-role="th"></th>
<th data-quarto-table-cell-role="th">species_Adelie</th>
<th data-quarto-table-cell-role="th">species_Chinstrap</th>
<th data-quarto-table-cell-role="th">species_Gentoo</th>
</tr>
</thead>
<tbody>
<tr>
<td data-quarto-table-cell-role="th">0</td>
<td>0.993427</td>
<td>0.137000</td>
<td>-0.130427</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">3</td>
<td>1.064457</td>
<td>0.046586</td>
<td>-0.111043</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">9</td>
<td>0.839056</td>
<td>0.118838</td>
<td>0.042105</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">11</td>
<td>0.669557</td>
<td>0.423417</td>
<td>-0.092974</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">14</td>
<td>1.050863</td>
<td>-0.073914</td>
<td>0.023052</td>
</tr>
</tbody>
</table>

</div>

``` python
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>

<table class="dataframe" data-quarto-postprocess="true" data-border="1">
<thead>
<tr style="text-align: right;">
<th data-quarto-table-cell-role="th"></th>
<th data-quarto-table-cell-role="th">species_Adelie</th>
<th data-quarto-table-cell-role="th">species_Chinstrap</th>
<th data-quarto-table-cell-role="th">species_Gentoo</th>
<th data-quarto-table-cell-role="th">nfold</th>
</tr>
</thead>
<tbody>
<tr>
<td data-quarto-table-cell-role="th">0</td>
<td>0.993427</td>
<td>0.137000</td>
<td>-0.130427</td>
<td>0</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">1</td>
<td>0.790344</td>
<td>0.103762</td>
<td>0.105894</td>
<td>1</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">2</td>
<td>0.673088</td>
<td>0.317647</td>
<td>0.009265</td>
<td>2</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">3</td>
<td>1.064457</td>
<td>0.046586</td>
<td>-0.111043</td>
<td>0</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">4</td>
<td>1.122991</td>
<td>0.154406</td>
<td>-0.277398</td>
<td>1</td>
</tr>
</tbody>
</table>

</div>

``` python
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>

<table class="dataframe" data-quarto-postprocess="true" data-border="1">
<thead>
<tr style="text-align: right;">
<th data-quarto-table-cell-role="th"></th>
<th data-quarto-table-cell-role="th">species_Adelie</th>
<th data-quarto-table-cell-role="th">species_Chinstrap</th>
<th data-quarto-table-cell-role="th">species_Gentoo</th>
</tr>
</thead>
<tbody>
<tr>
<td data-quarto-table-cell-role="th">0</td>
<td>0.878807</td>
<td>1.211930e-01</td>
<td>8.846216e-09</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">3</td>
<td>0.958070</td>
<td>4.192990e-02</td>
<td>9.000554e-09</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">9</td>
<td>0.839056</td>
<td>1.188384e-01</td>
<td>4.210543e-02</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">11</td>
<td>0.612601</td>
<td>3.873988e-01</td>
<td>9.149350e-09</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">14</td>
<td>0.978535</td>
<td>9.311731e-09</td>
<td>2.146502e-02</td>
</tr>
</tbody>
</table>

</div>

``` python
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>

<table class="dataframe" data-quarto-postprocess="true" data-border="1">
<thead>
<tr style="text-align: right;">
<th data-quarto-table-cell-role="th"></th>
<th data-quarto-table-cell-role="th">species_Adelie</th>
<th data-quarto-table-cell-role="th">species_Chinstrap</th>
<th data-quarto-table-cell-role="th">species_Gentoo</th>
</tr>
</thead>
<tbody>
<tr>
<td data-quarto-table-cell-role="th">0</td>
<td>0.878807</td>
<td>0.121193</td>
<td>8.846216e-09</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">1</td>
<td>0.790344</td>
<td>0.103762</td>
<td>1.058942e-01</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">2</td>
<td>0.673088</td>
<td>0.317647</td>
<td>9.264531e-03</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">3</td>
<td>0.958070</td>
<td>0.041930</td>
<td>9.000554e-09</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">4</td>
<td>0.879124</td>
<td>0.120876</td>
<td>7.828416e-09</td>
</tr>
</tbody>
</table>

</div>

``` python
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>

<table class="dataframe" data-quarto-postprocess="true" data-border="1">
<thead>
<tr style="text-align: right;">
<th data-quarto-table-cell-role="th"></th>
<th data-quarto-table-cell-role="th">species_Adelie</th>
<th data-quarto-table-cell-role="th">species_Chinstrap</th>
<th data-quarto-table-cell-role="th">species_Gentoo</th>
</tr>
</thead>
<tbody>
<tr>
<td data-quarto-table-cell-role="th">0</td>
<td>0.993427</td>
<td>0.137000</td>
<td>-0.130427</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">3</td>
<td>1.064457</td>
<td>0.046586</td>
<td>-0.111043</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">9</td>
<td>0.839056</td>
<td>0.118838</td>
<td>0.042105</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">11</td>
<td>0.669557</td>
<td>0.423417</td>
<td>-0.092974</td>
</tr>
<tr>
<td data-quarto-table-cell-role="th">14</td>
<td>1.050863</td>
<td>-0.073914</td>
<td>0.023052</td>
</tr>
</tbody>
</table>

</div>

### 02 DNN

``` python
from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
```

``` python
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
```

``` python
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
```

    (342, 10)

``` python
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
```

    torch.Size([8, 3])

``` python
# Weight-normalized linear block.
lin_wn(10, 3)
```

    Sequential(
      (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.1, inplace=False)
      (2): ParametrizedLinear(
        in_features=10, out_features=3, bias=True
        (parametrizations): ModuleDict(
          (weight): ParametrizationList(
            (0): _WeightNorm()
          )
        )
      )
      (3): SiLU()
    )

``` python
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
```

    torch.Size([8, 3])

``` python
# Cross-entropy with soft labels.
CE(logits, yb)
```

    tensor(1.0681, grad_fn=<MeanBackward0>)

``` python
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
```

    tensor(1.0681, grad_fn=<MeanBackward0>)

``` python
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
```

    tensor(0.3023, grad_fn=<MeanBackward0>)

``` python
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df,
    feat_col,
    target_col,
    split0,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
    save='model',
)
pred.head()
```

``` python
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(),
    feat_col,
    target_col,
    model_func=get_mlp,
    model_pth='model',
    A=n_aa,
)
test_pred
```

``` python
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df,
    feat_col,
    target_col,
    splits=splits,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
)
oof.nfold.value_counts()
```
