import seaborn as sns

ml
Generic helpers for tabular multi-output training and scoring, with runnable examples built from a seaborn dataset.
Setup
Example Data
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
df[feat_col + ["species"] + target_col].head()

| | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | species | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|---|---|---|---|---|
| 0 | 39.1 | 18.7 | 181.0 | 3750.0 | Adelie | 1.0 | 0.0 | 0.0 |
| 1 | 39.5 | 17.4 | 186.0 | 3800.0 | Adelie | 1.0 | 0.0 | 0.0 |
| 2 | 40.3 | 18.0 | 195.0 | 3250.0 | Adelie | 1.0 | 0.0 | 0.0 |
| 3 | 36.7 | 19.3 | 193.0 | 3450.0 | Adelie | 1.0 | 0.0 | 0.0 |
| 4 | 39.3 | 20.6 | 190.0 | 3650.0 | Adelie | 1.0 | 0.0 | 0.0 |
Splitter
get_splits
def get_splits(
df:DataFrame, stratified:str | None=None, # col used for stratified sampling
group:str | None=None, # col used to keep grouped rows together
nfold:int=5, seed:int=123
)->list:
Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])

StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
Train/Test Split
split_data
def split_data(
df:DataFrame, # dataframe of values
feat_col:Sequence, # feature columns
target_col:Sequence, # target columns
split:tuple
)->tuple:
Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
Trainer
train_ml
def train_ml(
df:DataFrame, # dataframe of values
feat_col:Sequence, # feature columns
target_col:Sequence, # target columns
split:tuple, model, # sklearn model instance
save:str | pathlib.Path | None=None, # output path for joblib model
params:dict | None=None, # kwargs forwarded to model.fit
)->tuple:
Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()

| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 |
| 3 | 1.064457 | 0.046586 | -0.111043 |
| 9 | 0.839056 | 0.118838 | 0.042105 |
| 11 | 0.669557 | 0.423417 | -0.092974 |
| 14 | 1.050863 | -0.073914 | 0.023052 |
Cross-Validation
train_ml_cv
def train_ml_cv(
df:DataFrame, # dataframe of values
feat_col:Sequence, # feature columns
target_col:Sequence, # target columns
splits:Sequence, model, # sklearn model instance
save:str | None=None, # model name prefix for saved folds
params:dict | None=None, # kwargs forwarded to model.fit
)->DataFrame:
Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()

| | species_Adelie | species_Chinstrap | species_Gentoo | nfold |
|---|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 | 0 |
| 1 | 0.790344 | 0.103762 | 0.105894 | 1 |
| 2 | 0.673088 | 0.317647 | 0.009265 | 2 |
| 3 | 1.064457 | 0.046586 | -0.111043 | 0 |
| 4 | 1.122991 | 0.154406 | -0.277398 | 1 |
Score
post_process
def post_process(
pred_like:pandas.DataFrame | pandas.Series, epsilon:float=1e-08
):
Clip negatives and renormalize probability-like predictions.
post_process(pred.head())

| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.878807 | 1.211930e-01 | 8.846216e-09 |
| 3 | 0.958070 | 4.192990e-02 | 9.000554e-09 |
| 9 | 0.839056 | 1.188384e-01 | 4.210543e-02 |
| 11 | 0.612601 | 3.873988e-01 | 9.149350e-09 |
| 14 | 0.978535 | 9.311731e-09 | 2.146502e-02 |
post_process_oof
def post_process_oof(
oof_ml:DataFrame, target_col:Sequence
)->DataFrame:
Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()

| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.878807 | 0.121193 | 8.846216e-09 |
| 1 | 0.790344 | 0.103762 | 1.058942e-01 |
| 2 | 0.673088 | 0.317647 | 9.264531e-03 |
| 3 | 0.958070 | 0.041930 | 9.000554e-09 |
| 4 | 0.879124 | 0.120876 | 7.828416e-09 |
get_score
def get_score(
target:DataFrame, pred:DataFrame, func:Callable
)->Series:
Apply a row-wise score function to aligned target and prediction frames.
js_divergence_flat
def js_divergence_flat(
target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:
Compute Jensen-Shannon divergence between two flattened probability vectors.
kl_divergence_flat
def kl_divergence_flat(
target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:
Compute KL divergence between two flattened probability vectors.
calculate_ce
def calculate_ce(
target_series:pandas.Series | numpy.ndarray, pred_series:pandas.Series | numpy.ndarray, epsilon:float=1e-08
)->float:
Compute cross-entropy between two flattened probability vectors.
target = df.loc[oof.index, target_col].copy()
pd.DataFrame({
"jsd": get_score_jsd(target, oof),
"kld": get_score_kld(target, oof),
"ce": get_score_ce(target, oof),
}).head()

| | jsd | kld | ce |
|---|---|---|---|
| 0 | 0.043958 | 0.129190 | 0.129190 |
| 1 | 0.078813 | 0.235286 | 0.235287 |
| 2 | 0.129371 | 0.395879 | 0.395879 |
| 3 | 0.014756 | 0.042834 | 0.042835 |
| 4 | 0.043837 | 0.128829 | 0.128829 |
Predictor
predict_ml
def predict_ml(
df:DataFrame, # dataframe with features
feat_col:Sequence, # feature columns
target_col:collections.abc.Sequence[str] | None=None, model_pth:str | pathlib.Path='model.joblib'
)->DataFrame:
Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()

| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 |
| 3 | 1.064457 | 0.046586 | -0.111043 |
| 9 | 0.839056 | 0.118838 | 0.042105 |
| 11 | 0.669557 | 0.423417 | -0.092974 |
| 14 | 1.050863 | -0.073914 | 0.023052 |