Train DL

Deep neural nets for PSSM

Overview

Utilities

seed_everything(seed=123) — Sets random seeds across Python, NumPy, and PyTorch for reproducibility. Ensures deterministic behavior on CUDA.

seed_everything(
    seed=42,  # random seed for reproducibility
)

init_weights(m, leaky=0.) — Applies Kaiming initialization to Conv layers. Pass to model.apply() for weight initialization.

model = CNN1D(ni=1024, nf=230).apply(
    init_weights,  # initializes Conv layers with Kaiming normal
)

Layer Builders

lin_wn(ni, nf, dp=0.1, act=nn.SiLU) — Creates a weight-normalized linear layer with BatchNorm, Dropout, and activation.

layer = lin_wn(
    ni=1024,       # input features
    nf=512,        # output features  
    dp=0.1,        # dropout probability
    act=nn.SiLU,   # activation function (None to disable)
)

conv_wn(ni, nf, ks=3, stride=1, padding=1, dp=0.1, act=nn.ReLU) — Creates a weight-normalized 1D convolution with BatchNorm, Dropout, and activation.

layer = conv_wn(
    ni=256,        # input channels
    nf=512,        # output channels
    ks=5,          # kernel size
    stride=1,      # stride
    padding=2,     # padding
    dp=0.1,        # dropout probability
    act=nn.ReLU,   # activation function
)

Models

MLP(num_features, num_targets, hidden_units=[512, 218], dp=0.2) — Builds a multi-layer perceptron with BatchNorm and PReLU activations.

model = MLP(
    num_features=1024,          # input dimension (e.g., T5 embeddings)
    num_targets=230,            # output dimension (23 AA × 10 positions)
    hidden_units=[512, 256],    # list of hidden layer sizes
    dp=0.2,                     # dropout rate (currently commented out)
)

CNN1D(ni, nf, amp_scale=16) — 1D CNN that amplifies input features, applies convolutions with skip connections, then projects to output.

model = CNN1D(
    ni=1024,        # input features
    nf=230,         # output features (flattened PSSM)
    amp_scale=16,   # amplification factor for feature expansion
).apply(init_weights)

PSSM_model(n_features, n_targets, model='MLP') — Wrapper that reshapes flat output to (batch, 23, positions) PSSM format with softmax-ready logits.

model = PSSM_model(
    n_features=1024,   # input feature dimension
    n_targets=230,     # total targets (must be divisible by 23)
    model='CNN',       # 'MLP' or 'CNN' architecture
)
# Output shape: (batch, 23, 10) for 10 positions

Dataset

GeneralDataset(df, feat_col, target_col=None, A=23, dtype=np.float32) — PyTorch Dataset that extracts features and reshapes targets to (23, L) PSSM matrices.

ds = GeneralDataset(
    df=train_df,           # DataFrame with features and targets
    feat_col=feat_col,     # Index/list of feature column names
    target_col=target_col, # Index/list of target columns (None for test mode)
    A=23,                  # number of amino acids (including pS, pT, pY)
    dtype=np.float32,      # data type for tensors
)
# Returns (X, y) where y.shape = (23, L)

Loss Function

CE(logits, target_probs) — Cross-entropy loss with soft labels. Applies log_softmax to logits and computes against target probability distributions.

loss = CE(
    logits=model_output,      # (B, 23, 10) raw logits
    target_probs=target_pssm, # (B, 23, 10) target probabilities (sum to 1 per position)
)

Metrics

KLD(logits, target_probs) — Kullback-Leibler divergence between target distribution (p) and predicted softmax distribution (q).

kl_div = KLD(
    logits=model_output,      # (B, 23, 10) raw logits
    target_probs=target_pssm, # (B, 23, 10) target probabilities
)

JSD(logits, target_probs) — Jensen-Shannon divergence (symmetric metric) between target and predicted distributions.

js_div = JSD(
    logits=model_output,      # (B, 23, 10) raw logits  
    target_probs=target_pssm, # (B, 23, 10) target probabilities
)

Training

train_dl(df, feat_col, target_col, split, model_func, ...) — Trains a model on a single train/valid split using fastai’s Learner with one-cycle policy.

target, pred = train_dl(
    df=df,                     # full DataFrame
    feat_col=feat_col,         # feature column names
    target_col=target_col,     # target column names
    split=split0,              # (train_idx, valid_idx) tuple
    model_func=get_cnn,        # callable returning fresh model
    n_epoch=10,                # number of training epochs
    bs=32,                     # batch size
    lr=3e-3,                   # learning rate
    loss=CE,                   # loss function
    save='my_model',           # save to models/my_model.pth
    sampler=None,              # optional custom sampler
    lr_find=True,              # run lr_find before training
)
# Returns (target_df, pred_df) for validation set

train_dl_cv(df, feat_col, target_col, splits, model_func, save=None, **kwargs) — Cross-validation wrapper that trains across multiple folds and concatenates OOF predictions.

oof = train_dl_cv(
    df=df,                     # full DataFrame
    feat_col=feat_col,         # feature column names
    target_col=target_col,     # target column names
    splits=splits,             # list of (train_idx, valid_idx) tuples
    model_func=get_cnn,        # callable returning fresh model
    save='cnn',                # saves as cnn_fold0.pth, cnn_fold1.pth, ...
    n_epoch=10,                # passed to train_dl
    lr=3e-3,                   # passed to train_dl
)
# Returns DataFrame with all OOF predictions + 'nfold' column

Prediction

predict_dl(df, feat_col, target_col, model_func, model_pth) — Loads a saved model and generates predictions for a DataFrame.

preds = predict_dl(
    df=test_df,                # DataFrame to predict
    feat_col=feat_col,         # feature column names
    target_col=target_col,     # used for output column names
    model_func=get_cnn,        # must match saved architecture
    model_pth='cnn_fold0',     # model name (without .pth)
)
# Returns DataFrame with softmax probabilities, same shape as target_col

Setup

Utils


seed_everything


def seed_everything(
    seed:int=123
):
seed_everything()
def_device
'cpu'

Load Data

# df=pd.read_parquet('paper/kinase_domain/train/pspa_t5.parquet')
# info=Data.get_kinase_info()

# info = info[info.pseudo=='0']

# info = info[info.kd_ID.notna()]

# subfamily_map = info[['kd_ID','subfamily']].drop_duplicates().set_index('kd_ID')['subfamily']

# pspa_info = pd.DataFrame(df.index.tolist(),columns=['kinase'])

# pspa_info['subfamily'] = pspa_info.kinase.map(subfamily_map)

# splits = get_splits(pspa_info, group='subfamily',nfold=5)

# split0 = splits[0]
# df=df.reset_index()
# df.columns
# # column name of feature and target
# feat_col = df.columns[df.columns.str.startswith('T5_')]
# target_col = df.columns[~df.columns.isin(feat_col)][1:]
# feat_col
# target_col

Dataset


GeneralDataset


def GeneralDataset(
    df, feat_col, # list/Index of feature columns (e.g., 100 cols)
    target_col:NoneType=None, # list/Index of flattened PSSM cols; AA-first; A=23
    A:int=23, dtype:type=float32
):

An abstract class representing a :class:Dataset.

All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite :meth:__getitem__, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite :meth:__len__, which is expected to return the size of the dataset by many :class:~torch.utils.data.Sampler implementations and the default options of :class:~torch.utils.data.DataLoader. Subclasses could also optionally implement :meth:__getitems__, for speedup batched samples loading. This method accepts list of indices of samples of batch and returns list of samples.

.. note:: :class:~torch.utils.data.DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.

# # dataset
# ds = GeneralDataset(df,feat_col,target_col)
# len(ds)
# dl = DataLoader(ds, batch_size=64, shuffle=True)
# xb,yb = next(iter(dl))

# xb.shape,yb.shape

Models

MLP


MLP


def MLP(
    num_features, num_targets, hidden_units:list=[512, 218], dp:float=0.2
):
# n_feature = len(feat_col)
# n_target = len(target_col)
# model = MLP(n_feature, n_target)
# model(xb)

CNN1D


lin_wn


def lin_wn(
    ni, nf, dp:float=0.1, act:type=SiLU
):

Weight norm of linear.

lin_wn(10,3)
Sequential(
  (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): ParametrizedLinear(
    in_features=10, out_features=3, bias=True
    (parametrizations): ModuleDict(
      (weight): ParametrizationList(
        (0): _WeightNorm()
      )
    )
  )
  (3): SiLU()
)

conv_wn


def conv_wn(
    ni, nf, ks:int=3, stride:int=1, padding:int=1, dp:float=0.1, act:type=ReLU
):

Weight norm of conv.


CNN1D


def CNN1D(
    ni, nf, amp_scale:int=16
):

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will also have their parameters converted when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool


init_weights


def init_weights(
    m, leaky:float=0.0
):

Initiate any Conv layer with Kaiming norm.

# model = CNN1D(n_feature,n_target).apply(init_weights)
# model(xb).shape

Wrapper


PSSM_model


def PSSM_model(
    n_features, n_targets, model:str='MLP'
):

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will also have their parameters converted when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

# model = PSSM_model(n_feature,n_target)
# logits= model(xb)
# logits.shape
# def get_mlp(): return PSSM_model(n_feature,n_target,model='MLP')

# def get_cnn(): return PSSM_model(n_feature,n_target,model='CNN')

Loss


CE


def CE(
    logits:Tensor, target_probs:Tensor
):

Cross-entropy with soft labels. logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1

# CE(logits,yb)

Metrics


KLD


def KLD(
    logits:Tensor, target_probs:Tensor
):

Averaged KL divergence across positions between target_probs (p) and softmax(logits) (q).

logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1

# KLD(logits,yb)

JSD


def JSD(
    logits:Tensor, target_probs:Tensor
):

Averaged Jensen-Shannon Divergence across positions between target_probs (p) and softmax(logits) (q).

logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1

# JSD(logits,yb)

Trainer


train_dl


def train_dl(
    df, feat_col, target_col, split, # tuple of numpy array for split index
    model_func, # function to get pytorch model
    n_epoch:int=4, # number of epochs
    bs:int=32, # batch size
    lr:float=0.01, # will be useless if lr_find is True
    loss:function=CE, # loss function
    save:NoneType=None, # models/{save}.pth
    sampler:NoneType=None, lr_find:bool=False, # if true, will use lr from lr_find
):

A DL trainer.

# target, pred = train_dl(df, 
#                         feat_col, 
#                         target_col,
#                         split0, 
#                         model_func=get_cnn,
#                         n_epoch=1,
#                         lr = 3e-3,
#                         lr_find=True,
#                         save = 'test')
# pred
# pred_pssm = recover_pssm(pred.iloc[0])
# pred_pssm.sum()

Predict


predict_dl


def predict_dl(
    df, feat_col, target_col, model_func, # model architecture
    model_pth, # only name, not with .pth
):

Predict dataframe given a deep learning model

# test = df.loc[split0[1]].copy()
# test_pred = predict_dl(test, 
#                feat_col, 
#                target_col,
#                model_func=get_cnn, # model architecture
#                model_pth='test', # only name, not with .pth
#               )
# test_pred.columns
# pssm_pred = recover_pssm(test_pred.iloc[0])
# pssm_pred.sum()
# plot_heatmap(pssm_pred)

CV train

cross-validation


train_dl_cv


def train_dl_cv(
    df, feat_col, target_col, splits, # list of tuples
    model_func, # zero-argument callable returning a fresh model, e.g. lambda: PSSM_model(num_feat, num_target)
    save:str=None, kwargs:VAR_KEYWORD
):
# oof = train_dl_cv(df,feat_col,target_col,
#                   splits = splits,
#                   model_func = get_cnn,
#                   n_epoch=1,lr=3e-3,save='cnn')
# oof.nfold.value_counts()