Train ML

A collection of machine learning tools

Overview

A collection of utilities for training, evaluating, and deploying scikit-learn models for kinase substrate specificity prediction.


Data Splitting

get_splits - Creates cross-validation splits using stratified, grouped, or stratified-grouped KFold methods. This ensures proper data separation to avoid data leakage (e.g., keeping kinases from the same subfamily in the same fold).

splits = get_splits(
    df=pspa_info,        # DataFrame containing metadata for splitting
    stratified=None,     # column name for stratified sampling (samples from different strata in each fold)
    group='subfamily',   # column name for group splitting (train/test never share groups)
    nfold=5,             # number of cross-validation folds
    seed=123,            # random seed for reproducibility
)

split_data - Splits a dataframe into train/test features and targets based on a single split tuple from get_splits.

X_train, y_train, X_test, y_test = split_data(
    df=df,               # full DataFrame with features and targets
    feat_col=feat_col,   # list of feature column names (e.g., T5 embeddings)
    target_col=target_col,  # list of target column names (e.g., PSSM values)
    split=splits[0],     # tuple of (train_indices, test_indices)
)

Model Training

train_ml - Fits a single sklearn model on one train/test split and returns predictions on the test set. Optionally saves the trained model.

y_test, y_pred = train_ml(
    df=df,                    # DataFrame with features and targets
    feat_col=feat_col,        # feature column names
    target_col=target_col,    # target column names
    split=splits[0],          # single split tuple (train_idx, test_idx)
    model=LinearRegression(), # any sklearn-compatible model
    save='models/lr_fold0.joblib',  # path to save model (None to skip)
    params={},                # extra kwargs passed to model.fit()
)

train_ml_cv - Performs full cross-validation across all splits, returning out-of-fold (OOF) predictions for the entire dataset.

oof = train_ml_cv(
    df=df,                    # DataFrame with features and targets
    feat_col=feat_col,        # feature column names
    target_col=target_col,    # target column names
    splits=splits,            # list of split tuples from get_splits
    model=Ridge(alpha=1.0),   # sklearn model (re-instantiated each fold)
    save='ridge',             # base name for saved models (becomes ridge_0.joblib, etc.)
    params={},                # extra kwargs for model.fit()
)

Post-Processing

post_process - Cleans raw PSSM predictions by clipping negatives to zero, cleaning position zero, and normalizing each position to sum to 1.

pssm_clean = post_process(
    pssm_df=raw_pssm,    # raw PSSM DataFrame (positions × amino acids)
)

post_process_oof - Applies post_process to all rows in an OOF prediction DataFrame.

oof_clean = post_process_oof(
    oof_ml=oof,          # OOF DataFrame from train_ml_cv
    target_col=target_col,  # target column names to process
)

Scoring

get_score - Computes a per-sample score between target and prediction using a custom function.

scores = get_score(
    target=df[target_col],  # ground truth DataFrame
    pred=oof[target_col],   # predictions DataFrame
    func=js_divergence_flat,  # scoring function (target_row, pred_row) -> float
)

Convenience partials - Pre-configured scorers:

jsd_scores = get_score_jsd(target=df[target_col], pred=oof)  # Jensen-Shannon divergence
kld_scores = get_score_kld(target=df[target_col], pred=oof)  # KL divergence
ce_scores  = get_score_ce(target=df[target_col], pred=oof)   # Cross-entropy

Inference

predict_ml - Loads a saved model and generates predictions on new data.

predictions = predict_ml(
    df=new_data,              # DataFrame containing features
    feat_col=feat_col,        # feature column names (must match training)
    target_col=target_col,    # column names for output DataFrame
    model_pth='models/ridge_0.joblib',  # path to saved model
)

Typical Workflow

# 1. Prepare splits (group by subfamily to prevent leakage)
splits = get_splits(df=info, group='subfamily', nfold=5)

# 2. Train with cross-validation
oof = train_ml_cv(df=df, feat_col=feat_col, target_col=target_col, 
                  splits=splits, model=Ridge(), save='ridge')

# 3. Post-process predictions
oof = post_process_oof(oof_ml=oof, target_col=target_col)

# 4. Evaluate
info['jsd'] = get_score_jsd(target=df[target_col], pred=oof)
print(f"Mean JSD: {info.groupby('nfold').jsd.mean()}")

# 5. Deploy
pred = predict_ml(df=test_df, feat_col=feat_col, target_col=target_col,
                  model_pth='models/ridge_0.joblib')

Setup

Splitter


get_splits


def get_splits(
    df:DataFrame, # df contains info for split
    stratified:str=None, # colname to make stratified kfold; sampling from different strata
    group:str=None, # colname to make group kfold; test and train are from different groups
    nfold:int=5, seed:int=123
):

Split samples in a dataframe using the Stratified, Group, or StratifiedGroup KFold method

!ls
00_data.ipynb        02e_pssm_compare.ipynb  07_pathway.ipynb      custom.scss
01_utils.ipynb       03_scoring.ipynb        10_ML.ipynb       index.ipynb
02a_pssm_core.ipynb  04_clustering.ipynb     11_DNN.ipynb      models
02b_pssm_plot.ipynb  04b_hierarchical.ipynb  _08_statistics.ipynb  nbdev.yml
02c_pssm_lo.ipynb    05_feature.ipynb        _quarto.yml       styles.css
02d_pssm_pspa.ipynb  06_plot.ipynb       _test.ipynb
# df=pd.read_parquet('paper/kinase_domain/train/pspa_t5.parquet')
# info=Data.get_kinase_info()

# info = info[info.pseudo=='0']

# info = info[info.kd_ID.notna()]

# subfamily_map = info[['kd_ID','subfamily']].drop_duplicates().set_index('kd_ID')['subfamily']

# pspa_info = pd.DataFrame(df.index.tolist(),columns=['kinase'])

# pspa_info['subfamily'] = pspa_info.kinase.map(subfamily_map)

# splits = get_splits(pspa_info, group='subfamily',nfold=5)

# split0 = splits[0]
# df=df.reset_index()
# df.columns
# # column name of feature and target
# feat_col = df.columns[df.columns.str.startswith('T5_')]
# target_col = df.columns[~df.columns.isin(feat_col)][1:]
# feat_col
# target_col

split_data


def split_data(
    df:DataFrame, # dataframe of values
    feat_col:list, # feature columns
    target_col:list, # target columns
    split:tuple, # one of the split in splits
):

Given split tuple, split dataframe into X_train, y_train, X_test, y_test

# X_train, y_train, X_test, y_test = split_data(df,feat_col, target_col, split0)
# X_train.shape,y_train.shape,X_test.shape,y_test.shape

Trainer


train_ml


def train_ml(
    df, # dataframe of values
    feat_col, # feature columns
    target_col, # target columns
    split, # one split in splits
    model, # a sklearn model
    save:NoneType=None, # file (.joblib) to save, e.g. 'model.joblib'
    params:NoneType=None, # dict parameters for model.fit from sklearn
):

Fit and predict using sklearn model format, return target and pred of valid dataset.

# model = LinearRegression()

# ## Uncomment to run with saving the model
# # target,pred = train_ml(df, feat_col, target_col, split0, model,'model.joblib')

# # Run without saving model
# target,pred = train_ml(df, feat_col, target_col, split0, model)

# pred.head()

Cross-Validation


train_ml_cv


def train_ml_cv(
    df, # dataframe of values
    feat_col, # feature columns
    target_col, # target columns
    splits, # splits
    model, # sklearn model
    save:NoneType=None, # model name to be saved, e.g., 'LR'
    params:NoneType=None, # act as kwargs, for model.fit
):

Cross-validation through the given splits

# oof = train_ml_cv(df,feat_col,target_col,splits=splits,model=model)

Score


post_process


def post_process(
    pssm_df
):

Convert negative values to 0, clean all but the last three values in position zero, and normalize each position

# pssm = post_process(recover_pssm(oof.iloc[0,:-1].sort_values()))
# pssm.sum()

post_process_oof


def post_process_oof(
    oof_ml, target_col
):
# oof = post_process_oof(oof,target_col)

get_score


def get_score(
    target, pred, func
):
# target = df[target_col].copy()
# pspa_info['jsd'] =get_score_jsd(target,oof)
# pspa_info['kld'] =get_score_kld(target,oof)
# pspa_info['jsd']
# pspa_info['kld']

calculate_ce


def calculate_ce(
    target_series, pred_series
):
# pspa_info['ce'] =get_score_ce(target,oof)
# pspa_info['ce']
# pspa_info['nfold'] = oof['nfold']
# pspa_info.groupby('nfold').jsd.mean()

Predictor


predict_ml


def predict_ml(
    df, # Dataframe that contains features
    feat_col, # feature columns
    target_col:NoneType=None, model_pth:str='model.joblib'
):

Make predictions based on trained model.

Uncomment below to run if you have a saved model at model_pth:

# pred2 = predict_ml(X_test,feat_col, target_col, model_pth = 'model.joblib')
# pred2.head()
## or
# predict_ml(df.iloc[split0[1]], feat_col, model_pth='model.joblib')