ml

Generic helpers for tabular multi-output training and scoring, with runnable examples built from a seaborn dataset.

Setup

Example Data

import pandas as pd
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
df[feat_col + ["species"] + target_col].head()
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species species_Adelie species_Chinstrap species_Gentoo
0 39.1 18.7 181.0 3750.0 Adelie 1.0 0.0 0.0
1 39.5 17.4 186.0 3800.0 Adelie 1.0 0.0 0.0
2 40.3 18.0 195.0 3250.0 Adelie 1.0 0.0 0.0
3 36.7 19.3 193.0 3450.0 Adelie 1.0 0.0 0.0
4 39.3 20.6 190.0 3650.0 Adelie 1.0 0.0 0.0

Splitter


source

get_splits


def get_splits(
    df:DataFrame, stratified:str | None=None, # col used for stratified sampling
    group:str | None=None, # col used to keep grouped rows together
    nfold:int=5, seed:int=123
)->list:

Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.

splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)

Train/Test Split


source

split_data


def split_data(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    split:tuple
)->tuple:

Given a split tuple of (train, test) row positions, return X_train, y_train, X_test, and y_test.

X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))

Trainer


source

train_ml


def train_ml(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    split:tuple, model, # sklearn model instance
    save:str | pathlib.Path | None=None, # output path for joblib model
    params:dict | None=None, # kwargs forwarded to model.fit
)->tuple:

Fit and predict with a sklearn model, returning validation targets and predictions.

from sklearn.linear_model import LinearRegression
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052

Cross-Validation


source

train_ml_cv


def train_ml_cv(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    splits:Sequence, model, # sklearn model instance
    save:str | None=None, # model name prefix for saved folds
    params:dict | None=None, # kwargs forwarded to model.fit
)->DataFrame:

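Run cross-validation over the given splits and return the out-of-fold predictions, with an nfold column recording the fold in which each row was predicted.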

oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
species_Adelie species_Chinstrap species_Gentoo nfold
0 0.993427 0.137000 -0.130427 0
1 0.790344 0.103762 0.105894 1
2 0.673088 0.317647 0.009265 2
3 1.064457 0.046586 -0.111043 0
4 1.122991 0.154406 -0.277398 1

Score


source

post_process


def post_process(
    pred_like:pandas.DataFrame | pandas.Series, epsilon:float=1e-08
):

Clip negative values of probability-like predictions to epsilon, then renormalize so that each row sums to 1.

post_process(pred.head())
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 1.211930e-01 8.846216e-09
3 0.958070 4.192990e-02 9.000554e-09
9 0.839056 1.188384e-01 4.210543e-02
11 0.612601 3.873988e-01 9.149350e-09
14 0.978535 9.311731e-09 2.146502e-02

source

post_process_oof


def post_process_oof(
    oof_ml:DataFrame, target_col:Sequence
)->DataFrame:

Post-process prediction columns in an out-of-fold dataframe.

oof = post_process_oof(oof, target_col)
oof[target_col].head()
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 0.121193 8.846216e-09
1 0.790344 0.103762 1.058942e-01
2 0.673088 0.317647 9.264531e-03
3 0.958070 0.041930 9.000554e-09
4 0.879124 0.120876 7.828416e-09

source

get_score


def get_score(
    target:DataFrame, pred:DataFrame, func:Callable
)->Series:

Apply a row-wise score function to aligned target and prediction frames.


source

js_divergence_flat


def js_divergence_flat(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute Jensen-Shannon divergence between two flattened probability vectors.


source

kl_divergence_flat


def kl_divergence_flat(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute KL divergence between two flattened probability vectors.


source

calculate_ce


def calculate_ce(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute cross-entropy between two flattened probability vectors.

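# NOTE(review): get_score_jsd, get_score_kld and get_score_ce are not documented on this page;
# presumably they are get_score with func fixed to js_divergence_flat, kl_divergence_flat
# and calculate_ce respectively — confirm and document them.
target = df.loc[oof.index, target_col].copy()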
pd.DataFrame({
    "jsd": get_score_jsd(target, oof),
    "kld": get_score_kld(target, oof),
    "ce": get_score_ce(target, oof),
}).head()
jsd kld ce
0 0.043958 0.129190 0.129190
1 0.078813 0.235286 0.235287
2 0.129371 0.395879 0.395879
3 0.014756 0.042834 0.042835
4 0.043837 0.128829 0.128829

Predictor


source

predict_ml


def predict_ml(
    df:DataFrame, # dataframe with features
    feat_col:Sequence, # feature columns
    target_col:collections.abc.Sequence[str] | None=None, model_pth:str | pathlib.Path='model.joblib'
)->DataFrame:

Predict from a saved sklearn model.

from pathlib import Path
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052