seed_everything()

Train DL
Overview
Utilities
seed_everything(seed=123) — Sets random seeds across Python, NumPy, and PyTorch for reproducibility. Ensures deterministic behavior on CUDA.
seed_everything(
seed=42, # random seed for reproducibility
)

init_weights(m, leaky=0.) — Applies Kaiming initialization to Conv layers. Pass to model.apply() for weight initialization.
model = CNN1D(ni=1024, nf=230).apply(
init_weights, # initializes Conv layers with Kaiming normal
)

Layer Builders
lin_wn(ni, nf, dp=0.1, act=nn.SiLU) — Creates a weight-normalized linear layer with BatchNorm, Dropout, and activation.
layer = lin_wn(
ni=1024, # input features
nf=512, # output features
dp=0.1, # dropout probability
act=nn.SiLU, # activation function (None to disable)
)

conv_wn(ni, nf, ks=3, stride=1, padding=1, dp=0.1, act=nn.ReLU) — Creates a weight-normalized 1D convolution with BatchNorm, Dropout, and activation.
layer = conv_wn(
ni=256, # input channels
nf=512, # output channels
ks=5, # kernel size
stride=1, # stride
padding=2, # padding
dp=0.1, # dropout probability
act=nn.ReLU, # activation function
)

Models
MLP(num_features, num_targets, hidden_units=[512, 218], dp=0.2) — Builds a multi-layer perceptron with BatchNorm and PReLU activations.
model = MLP(
num_features=1024, # input dimension (e.g., T5 embeddings)
num_targets=230, # output dimension (23 AA × 10 positions)
hidden_units=[512, 256], # list of hidden layer sizes
dp=0.2, # dropout rate (currently commented out)
)

CNN1D(ni, nf, amp_scale=16) — 1D CNN that amplifies input features, applies convolutions with skip connections, then projects to output.
model = CNN1D(
ni=1024, # input features
nf=230, # output features (flattened PSSM)
amp_scale=16, # amplification factor for feature expansion
).apply(init_weights)

PSSM_model(n_features, n_targets, model='MLP') — Wrapper that reshapes flat output to (batch, 23, positions) PSSM format with softmax-ready logits.
model = PSSM_model(
n_features=1024, # input feature dimension
n_targets=230, # total targets (must be divisible by 23)
model='CNN', # 'MLP' or 'CNN' architecture
)
# Output shape: (batch, 23, 10) for 10 positions

Dataset
GeneralDataset(df, feat_col, target_col=None, A=23, dtype=np.float32) — PyTorch Dataset that extracts features and reshapes targets to (23, L) PSSM matrices.
ds = GeneralDataset(
df=train_df, # DataFrame with features and targets
feat_col=feat_col, # Index/list of feature column names
target_col=target_col, # Index/list of target columns (None for test mode)
A=23, # number of amino acids (including pS, pT, pY)
dtype=np.float32, # data type for tensors
)
# Returns (X, y) where y.shape = (23, L)

Loss Function
CE(logits, target_probs) — Cross-entropy loss with soft labels. Applies log_softmax to logits and computes against target probability distributions.
loss = CE(
logits=model_output, # (B, 23, 10) raw logits
target_probs=target_pssm, # (B, 23, 10) target probabilities (sum to 1 per position)
)

Metrics
KLD(logits, target_probs) — Kullback-Leibler divergence between target distribution (p) and predicted softmax distribution (q).
kl_div = KLD(
logits=model_output, # (B, 23, 10) raw logits
target_probs=target_pssm, # (B, 23, 10) target probabilities
)

JSD(logits, target_probs) — Jensen-Shannon divergence (symmetric metric) between target and predicted distributions.
js_div = JSD(
logits=model_output, # (B, 23, 10) raw logits
target_probs=target_pssm, # (B, 23, 10) target probabilities
)

Training
train_dl(df, feat_col, target_col, split, model_func, ...) — Trains a model on a single train/valid split using fastai’s Learner with one-cycle policy.
target, pred = train_dl(
df=df, # full DataFrame
feat_col=feat_col, # feature column names
target_col=target_col, # target column names
split=split0, # (train_idx, valid_idx) tuple
model_func=get_cnn, # callable returning fresh model
n_epoch=10, # number of training epochs
bs=32, # batch size
lr=3e-3, # learning rate
loss=CE, # loss function
save='my_model', # save to models/my_model.pth
sampler=None, # optional custom sampler
lr_find=True, # run lr_find before training
)
# Returns (target_df, pred_df) for validation set

train_dl_cv(df, feat_col, target_col, splits, model_func, save=None, **kwargs) — Cross-validation wrapper that trains across multiple folds and concatenates OOF predictions.
oof = train_dl_cv(
df=df, # full DataFrame
feat_col=feat_col, # feature column names
target_col=target_col, # target column names
splits=splits, # list of (train_idx, valid_idx) tuples
model_func=get_cnn, # callable returning fresh model
save='cnn', # saves as cnn_fold0.pth, cnn_fold1.pth, ...
n_epoch=10, # passed to train_dl
lr=3e-3, # passed to train_dl
)
# Returns DataFrame with all OOF predictions + 'nfold' column

Prediction
predict_dl(df, feat_col, target_col, model_func, model_pth) — Loads a saved model and generates predictions for a DataFrame.
preds = predict_dl(
df=test_df, # DataFrame to predict
feat_col=feat_col, # feature column names
target_col=target_col, # used for output column names
model_func=get_cnn, # must match saved architecture
model_pth='cnn_fold0', # model name (without .pth)
)
# Returns DataFrame with softmax probabilities, same shape as target_col

Setup
Utils
seed_everything
def seed_everything(
seed:int=123
):
def_device = 'cpu'
Load Data
# df = pd.read_parquet('paper/kinase_domain/train/pspa_t5.parquet')
# info = Data.get_kinase_info()
# info = info[info.pseudo=='0']
# info = info[info.kd_ID.notna()]
# subfamily_map = info[['kd_ID','subfamily']].drop_duplicates().set_index('kd_ID')['subfamily']
# pspa_info = pd.DataFrame(df.index.tolist(),columns=['kinase'])
# pspa_info['subfamily'] = pspa_info.kinase.map(subfamily_map)
# splits = get_splits(pspa_info, group='subfamily',nfold=5)
# split0 = splits[0]
# df = df.reset_index()
# df.columns
# # column name of feature and target
# feat_col = df.columns[df.columns.str.startswith('T5_')]
# target_col = df.columns[~df.columns.isin(feat_col)][1:]
# feat_col
# target_col

Dataset
GeneralDataset
def GeneralDataset(
df, feat_col, # list/Index of feature columns (e.g., 100 cols)
target_col:NoneType=None, # list/Index of flattened PSSM cols; AA-first; A=23
A:int=23, dtype:type=float32
):
An abstract class representing a :class:Dataset.
All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite :meth:__getitem__, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite :meth:__len__, which is expected to return the size of the dataset by many :class:~torch.utils.data.Sampler implementations and the default options of :class:~torch.utils.data.DataLoader. Subclasses could also optionally implement :meth:__getitems__, for speedup batched samples loading. This method accepts list of indices of samples of batch and returns list of samples.
.. note:: :class:~torch.utils.data.DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.
# # dataset
# ds = GeneralDataset(df, feat_col, target_col)
# len(ds)
# dl = DataLoader(ds, batch_size=64, shuffle=True)
# xb, yb = next(iter(dl))
# xb.shape, yb.shape

Models
MLP
MLP
def MLP(
num_features, num_targets, hidden_units:list=[512, 218], dp:float=0.2
):
# n_feature = len(feat_col)
# n_target = len(target_col)
# model = MLP(n_feature, n_target)
# model(xb)

CNN1D
lin_wn
def lin_wn(
ni, nf, dp:float=0.1, act:type=SiLU
):
Weight norm of linear.
lin_wn(10,3)
Sequential(
(0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Dropout(p=0.1, inplace=False)
(2): ParametrizedLinear(
in_features=10, out_features=3, bias=True
(parametrizations): ModuleDict(
(weight): ParametrizationList(
(0): _WeightNorm()
)
)
)
(3): SiLU()
)
conv_wn
def conv_wn(
ni, nf, ks:int=3, stride:int=1, padding:int=1, dp:float=0.1, act:type=ReLU
):
Weight norm of conv.
CNN1D
def CNN1D(
ni, nf, amp_scale:int=16
):
Base class for all neural network modules.
Your models should also subclass this class.
Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes::
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.conv1 = nn.Conv2d(1, 20, 5)
self.conv2 = nn.Conv2d(20, 20, 5)
def forward(self, x):
x = F.relu(self.conv1(x))
return F.relu(self.conv2(x))
Submodules assigned in this way will be registered, and will also have their parameters converted when you call :meth:to, etc.
.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.
:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool
init_weights
def init_weights(
m, leaky:float=0.0
):
Initiate any Conv layer with Kaiming norm.
# model = CNN1D(n_feature, n_target).apply(init_weights)
# model(xb).shape

Wrapper
PSSM_model
def PSSM_model(
n_features, n_targets, model:str='MLP'
):
Base class for all neural network modules.
Your models should also subclass this class.
Modules can also contain other Modules, allowing them to be nested in a tree structure. You can assign the submodules as regular attributes::
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.conv1 = nn.Conv2d(1, 20, 5)
self.conv2 = nn.Conv2d(20, 20, 5)
def forward(self, x):
x = F.relu(self.conv1(x))
return F.relu(self.conv2(x))
Submodules assigned in this way will be registered, and will also have their parameters converted when you call :meth:to, etc.
.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.
:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool
# model = PSSM_model(n_feature, n_target)
# logits = model(xb)
# logits.shape
# def get_mlp(): return PSSM_model(n_feature, n_target, model='MLP')
# def get_cnn(): return PSSM_model(n_feature, n_target, model='CNN')

Loss
CE
def CE(
logits:Tensor, target_probs:Tensor
):
Cross-entropy with soft labels. logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1
# CE(logits, yb)

Metrics
KLD
def KLD(
logits:Tensor, target_probs:Tensor
):
Averaged KL divergence across positions between target_probs (p) and softmax(logits) (q).
logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1
# KLD(logits, yb)

JSD
def JSD(
logits:Tensor, target_probs:Tensor
):
Averaged Jensen-Shannon Divergence across positions between target_probs (p) and softmax(logits) (q).
logits: (B, 23, 10) target_probs: (B, 23, 10), each column (over AA) sums to 1
# JSD(logits, yb)

Trainer
train_dl
def train_dl(
df, feat_col, target_col, split, # tuple of numpy array for split index
model_func, # function to get pytorch model
n_epoch:int=4, # number of epochs
bs:int=32, # batch size
lr:float=0.01, # will be useless if lr_find is True
loss:function=CE, # loss function
save:NoneType=None, # models/{save}.pth
sampler:NoneType=None, lr_find:bool=False, # if true, will use lr from lr_find
):
A DL trainer.
# target, pred = train_dl(df,
# feat_col,
# target_col,
# split0,
# model_func=get_cnn,
# n_epoch=1,
# lr = 3e-3,
# lr_find=True,
# save = 'test')
# pred
# pred_pssm = recover_pssm(pred.iloc[0])
# pred_pssm.sum()

Predict
predict_dl
def predict_dl(
df, feat_col, target_col, model_func, # model architecture
model_pth, # only name, not with .pth
):
Predict dataframe given a deep learning model
# test = df.loc[split0[1]].copy()
# test_pred = predict_dl(test,
# feat_col,
# target_col,
# model_func=get_cnn, # model architecture
# model_pth='test', # only name, not with .pth
# )
# test_pred.columns
# pssm_pred = recover_pssm(test_pred.iloc[0])
# pssm_pred.sum()
# plot_heatmap(pssm_pred)

CV train
cross-validation
train_dl_cv
def train_dl_cv(
df, feat_col, target_col, splits, # list of tuples
model_func, # functions like lambda x: return MLP_1(num_feat, num_target)
save:str=None, # saved as {save}_fold{i}.pth
**kwargs # extra keyword arguments passed through to train_dl
):
# oof = train_dl_cv(df,feat_col,target_col,
# splits = splits,
# model_func = get_cnn,
# n_epoch=1, lr=3e-3, save='cnn')
# oof.nfold.value_counts()