ml

Generic helpers for tabular multi-output training and scoring, with runnable examples built from a seaborn dataset.

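Setup

The examples on this page assume the helpers documented below are already imported, together with:

from pathlib import Path

import pandas as pd
from sklearn.linear_model import LinearRegression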

Example Data

import seaborn as sns

df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape

(342, 10)

df[feat_col + ["species"] + target_col].head()

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	species	species_Adelie	species_Chinstrap	species_Gentoo
0	39.1	18.7	181.0	3750.0	Adelie	1.0	0.0	0.0
1	39.5	17.4	186.0	3800.0	Adelie	1.0	0.0	0.0
2	40.3	18.0	195.0	3250.0	Adelie	1.0	0.0	0.0
3	36.7	19.3	193.0	3450.0	Adelie	1.0	0.0	0.0
4	39.3	20.6	190.0	3650.0	Adelie	1.0	0.0	0.0

Splitter

source

get_splits


def get_splits(
    df:DataFrame, stratified:str | None=None, # col used for stratified sampling
    group:str | None=None, # col used to keep grouped rows together
    nfold:int=5, seed:int=123
)->list:

Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.

splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])

StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3

(228, 114)

Train/Test Split

source

split_data


def split_data(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    split:tuple
)->tuple:

Given a split tuple, return X_train, y_train, X_test, and y_test.

X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((228, 4), (228, 3), (114, 4), (114, 3))

Trainer

source

train_ml


def train_ml(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    split:tuple, model, # sklearn model instance
    save:str | pathlib.Path | None=None, # output path for joblib model
    params:dict | None=None, # kwargs forwarded to model.fit
)->tuple:

Fit and predict with a sklearn model, returning validation targets and predictions.

model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.993427	0.137000	-0.130427
3	1.064457	0.046586	-0.111043
9	0.839056	0.118838	0.042105
11	0.669557	0.423417	-0.092974
14	1.050863	-0.073914	0.023052

Cross-Validation

source

train_ml_cv


def train_ml_cv(
    df:DataFrame, # dataframe of values
    feat_col:Sequence, # feature columns
    target_col:Sequence, # target columns
    splits:Sequence, model, # sklearn model instance
    save:str | None=None, # model name prefix for saved folds
    params:dict | None=None, # kwargs forwarded to model.fit
)->DataFrame:

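Run cross-validation over the given splits and return the out-of-fold predictions, with an `nfold` column recording the fold in which each row was predicted.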

oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()

	species_Adelie	species_Chinstrap	species_Gentoo	nfold
0	0.993427	0.137000	-0.130427	0
1	0.790344	0.103762	0.105894	1
2	0.673088	0.317647	0.009265	2
3	1.064457	0.046586	-0.111043	0
4	1.122991	0.154406	-0.277398	1

Score

source

post_process


def post_process(
    pred_like:pandas.DataFrame | pandas.Series, epsilon:float=1e-08
):

Clip negative values in probability-like predictions (they end up at roughly `epsilon` rather than exactly zero) and renormalize so that each row sums to 1.

post_process(pred.head())

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.878807	1.211930e-01	8.846216e-09
3	0.958070	4.192990e-02	9.000554e-09
9	0.839056	1.188384e-01	4.210543e-02
11	0.612601	3.873988e-01	9.149350e-09
14	0.978535	9.311731e-09	2.146502e-02

source

post_process_oof


def post_process_oof(
    oof_ml:DataFrame, target_col:Sequence
)->DataFrame:

Post-process prediction columns in an out-of-fold dataframe.

oof = post_process_oof(oof, target_col)
oof[target_col].head()

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.878807	0.121193	8.846216e-09
1	0.790344	0.103762	1.058942e-01
2	0.673088	0.317647	9.264531e-03
3	0.958070	0.041930	9.000554e-09
4	0.879124	0.120876	7.828416e-09

source

get_score


def get_score(
    target:DataFrame, pred:DataFrame, func:Callable
)->Series:

Apply a row-wise score function to aligned target and prediction frames.

source

js_divergence_flat


def js_divergence_flat(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute Jensen-Shannon divergence between two flattened probability vectors.

source

kl_divergence_flat


def kl_divergence_flat(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute KL divergence between two flattened probability vectors.

source

calculate_ce


def calculate_ce(
    target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:

Compute cross-entropy between two flattened probability vectors.

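# NOTE(review): get_score_jsd, get_score_kld and get_score_ce are not documented on this page;
# presumably they are get_score bound to js_divergence_flat, kl_divergence_flat and calculate_ce — confirm.
target = df.loc[oof.index, target_col].copy()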
pd.DataFrame({
    "jsd": get_score_jsd(target, oof),
    "kld": get_score_kld(target, oof),
    "ce": get_score_ce(target, oof),
}).head()

	jsd	kld	ce
0	0.043958	0.129190	0.129190
1	0.078813	0.235286	0.235287
2	0.129371	0.395879	0.395879
3	0.014756	0.042834	0.042835
4	0.043837	0.128829	0.128829

Predictor

source

predict_ml


def predict_ml(
    df:DataFrame, # dataframe with features
    feat_col:Sequence, # feature columns
    target_col:collections.abc.Sequence[str] | None=None, model_pth:str | pathlib.Path='model.joblib'
)->DataFrame:

Predict from a saved sklearn model.

model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()

	species_Adelie	species_Chinstrap	species_Gentoo
0	0.993427	0.137000	-0.130427
3	1.064457	0.046586	-0.111043
9	0.839056	0.118838	0.042105
11	0.669557	0.423417	-0.092974
14	1.050863	-0.073914	0.023052