Predict PSSMs

Based on trained models, predict PSSMs of previously uncharacterized kinases.

Setup

from katlas.imports import *
from fastbook import *
from functools import reduce
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

Load data

# Kinase annotation table; exclude pseudo kinase (rows whose `pseudo` field is not "0")
df = Data.get_kinase_info().query('pseudo=="0"')

# Remove too long proteins as they can't fit into the model: both the full
# sequence and the kinase domain must be shorter than 7000 characters.
df = df[
    (df.human_uniprot_sequence.str.len() < 7000)
    & (df.kinasecom_domain.str.len() < 7000)
]

Get protein embeddings

Uncomment the code below to compute protein embeddings for the kinases

# Remove too long proteins as they can't fit into the model
# valid_uniprot = df[df.human_uniprot_sequence.str.len()<7000]
# valid_kd = df[df.kinasecom_domain.str.len()<7000]
# feat_esm = get_esm(valid_uniprot,'human_uniprot_sequence')
# feat_esm_kd = get_esm(valid_kd,'kinasecom_domain')

# feat_t5 = get_t5(valid_uniprot,'human_uniprot_sequence')
# feat_t5_kd = get_t5(valid_kd,'kinasecom_domain')
# feat_esm.index=valid_uniprot.kinase

# feat_t5.index = valid_uniprot.kinase

# feat_esm_kd.index = valid_kd.kinase
# feat_t5_kd.index= valid_kd.kinase
# feat_esm.astype(float).to_parquet('raw/esm_unknown.parquet')
# feat_esm_kd.astype(float).to_parquet('raw/esm_unknown_kd.parquet')

# feat_t5.astype(float).to_parquet('raw/t5_unknown.parquet')
# feat_t5_kd.astype(float).to_parquet('raw/t5_unknown_kd.parquet')

Or directly load

# Precomputed embeddings, read in the order ESM, ESM (kd), T5, T5 (kd)
feat_esm, feat_esm_kd, feat_t5, feat_t5_kd = (
    pd.read_parquet(path)
    for path in (
        'raw/esm_unknown.parquet',
        'raw/esm_unknown_kd.parquet',
        'raw/t5_unknown.parquet',
        'raw/t5_unknown_kd.parquet',
    )
)
# (rows, columns) of each embedding table
feat_esm.shape, feat_esm_kd.shape, feat_t5.shape, feat_t5_kd.shape
((508, 1280), (503, 1280), (508, 1024), (503, 1024))
# filter out pseudokinase: keep only the rows whose index label is still
# listed in `df.kinase`
feat_esm, feat_esm_kd, feat_t5, feat_t5_kd = (
    feat[feat.index.isin(df.kinase)]
    for feat in (feat_esm, feat_esm_kd, feat_t5, feat_t5_kd)
)

Prepare models

5 Fold x 3 models = 15 models

# Top model configurations ('mlp_t5' is currently left out)
top = ['cnn_esm_kd', 'cnn_t5_kd', 'cnn_t5']
# will not use these data, just to get t5_col and esm_col
t5 = pd.read_parquet('train_data/combine_t5.parquet').reset_index()
esm = pd.read_parquet('train_data/combine_esm.parquet').reset_index()

# feature column: recognised by the column-name prefix
t5_col = t5.columns[t5.columns.str.startswith('T5_')]
esm_col = esm.columns[esm.columns.str.startswith('esm_')]

# target column: every non-feature column of `t5` except the first one
target_col = t5.columns[~t5.columns.isin(t5_col)][1:]

# sizes handed to the model constructors
num_esm, num_t5, num_target = len(esm_col), len(t5_col), len(target_col)
def cnn_esm():
    "Return a new `CNN1D_2` built with `num_esm` and `num_target`"
    net = CNN1D_2(num_esm, num_target)
    return net

def cnn_t5():
    "Return a new `CNN1D_2` built with `num_t5` and `num_target`"
    net = CNN1D_2(num_t5, num_target)
    return net

def mlp_t5():
    "Return a new `MLP_1` built with `num_t5` and `num_target`"
    net = MLP_1(num_t5, num_target)
    return net

def mlp_esm():
    "Return a new `MLP_1` built with `num_esm` and `num_target`"
    net = MLP_1(num_esm, num_target)
    return net
# model name -> (features to predict on, feature columns, newly built network)
models = {}
models['cnn_esm_kd'] = (feat_esm_kd, esm_col, cnn_esm())
models['cnn_t5_kd'] = (feat_t5_kd, t5_col, cnn_t5())
models['cnn_t5'] = (feat_t5, t5_col, cnn_t5())

Predict through top models

predict_dl?
Signature: predict_dl(df, feat_col, target_col, model, model_pth)
Docstring: Predict dataframe given a deep learning model
File:      /usr/local/lib/python3.9/dist-packages/katlas/dl.py
Type:      function
N_FOLD = 5

# One prediction per (model, fold); the model path passed to predict_dl is
# '<model name>_fold<fold number>'.
pred_list = []
for model_name, (data, feat_col, model) in models.items():
    pred_list.extend(
        predict_dl(data, feat_col, target_col, model, f'{model_name}_fold{i}')
        for i in range(N_FOLD)
    )
len(pred_list)
15

Aggregate results

# Add up everything: element-wise sum over all predictions. `fill_value=0`
# makes an entry missing from one of the two frames count as 0.
preds = reduce(
    lambda running, nxt: running.add(nxt, fill_value=0),
    pred_list,
)
def get_heatmap(df, # Stacked Dataframe with kinase as index, substrates as columns
                kinase, # A specific kinase name in index
                figsize = (7.5,10)
               ):
    "Draw the PSSM heatmap of one `kinase` taken from `df`"
    # rows of the plotted matrix are arranged in this amino-acid order
    row_order = list('PGACSTVILMFYWHKRQNDEsty')

    # pull out the kinase's matrix, transpose it, then reorder its rows
    pssm = get_one_kinase(df, kinase, drop_s=False).T.reindex(index=row_order)

    plot_heatmap(pssm, title=kinase, figsize=figsize)
# Global plotting appearance for the figures drawn from here on:
# figure and savefig dpi set to 300, 'notebook' context, 'ticks' style.
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")
# Quick look at one kinase; the last three columns are sliced off before plotting.
get_heatmap(preds.iloc[:,:-3],'CK1A2')

Post-process

# remove kinase with duplicated name (the first occurrence is kept)
preds = preds[~preds.index.duplicated()]

# Min-max scale the predictions of each kinase. MinMaxScaler scales per column,
# hence the transpose before and after. fit_transform returns a plain NumPy
# array, so the result is wrapped back into a DataFrame to restore the kinase
# index and substrate columns that the loop below relies on.
preds_minmax = pd.DataFrame(
    MinMaxScaler().fit_transform(preds.T).T,
    index=preds.index,
    columns=preds.columns,
)

data = []
for k in preds_minmax.index:
    # matrix of a single kinase; every column is then divided by its own sum
    w = get_one_kinase(preds_minmax,k,drop_s=False).T
    w = w/w.sum()
    # flatten back to one value per (position, aa) pair, labelled '<position><aa>'
    w2 = w.unstack().reset_index(name=k)
    w2['substrate'] = w2.position.astype(str)+w2.aa
    w3 = w2.set_index('substrate')[k]
    data.append(w3)
# one row per kinase again
preds_final = pd.concat(data,axis=1).T

# put the columns in `target_col` order, then drop kinases listed in `t5.kinase`
preds_final = preds_final[target_col]
preds_final = preds_final[~preds_final.index.isin(t5.kinase)]
preds_final.index
Index(['ADCK1', 'ADCK2', 'COQ8A', 'COQ8B', 'ADCK5', 'ACVRL1', 'ACVR1C',
       'ALPK3', 'ALPK2', 'ARAF', 'CASK', 'CDK20', 'CDKL2', 'CDKL3', 'CDKL4',
       'STK35', 'PDIK1L', 'DCLK3', 'DDR1', 'CDC42BPG', 'STK17B', 'ERBB2',
       'MAPK6', 'MAPK4', 'STK36', 'TNNI3K', 'AATK', 'LMTK2', 'LMTK3', 'LRRK1',
       'MAP3K13', 'MAP2K6', 'MAP3K4', 'MAST1', 'MAST2', 'MAST3', 'MAST4',
       'AMHR2', 'PKMYT1', 'NEK10', 'NRK', 'PDHK3', 'CDK15', 'PIK3R4', 'CDK11B',
       'PSKH1', 'RIOK1', 'RIOK2', 'RIOK3', 'RNASEL', 'ROR1', 'ROR2', 'SBK2',
       'SBK3', 'ANKK1', 'RSKR', 'SPEG', 'TESK2', 'TIE1', 'KALRN', 'TRIO',
       'TSSK3', 'TSSK4', 'WEE2', 'STK32A'],
      dtype='object')
# Uncomment to save the predictions computed above:
# preds_final.to_parquet('raw/predicted.parquet')
# or directly load a previously saved copy (this replaces `preds_final`)
preds_final = pd.read_parquet('raw/predicted.parquet')

Select kinase families with high out-of-fold (OOF) Pearson scores

# Family-level scores; columns are renamed so the family and subfamily
# tables do not share 'kinase'/'Pearson' column names.
family_score = (
    pd.read_csv('raw/oof_corr_family.csv')
    .rename(columns={'kinase':'family_count','Pearson':'Pearson_family'})
)
# Subfamily-level scores, renamed the same way
subfamily_score = (
    pd.read_csv('raw/oof_corr_subfamily.csv')
    .rename(columns={'kinase':'subfamily_count','Pearson':'Pearson_subfamily'})
)
family_score
family Pearson_family family_count
0 ALK 0.968724 2
1 Abl 0.959211 2
2 Ack 0.830424 2
3 Akt 0.974935 3
4 Alpha 0.198162 4
... ... ... ...
95 VEGFR 0.964378 3
96 VRK 0.651387 2
97 WEE -0.090115 1
98 WNK 0.618240 4
99 YANK 0.816684 2

100 rows × 3 columns

# One row per predicted kinase: annotations from `df` plus the family and
# subfamily scores. Left joins keep every predicted kinase even when no
# matching annotation or score exists.
preds_info = (
    pd.DataFrame(preds_final.index, columns=['kinase'])
    .merge(df, how='left')
    .merge(family_score, how='left')
    .merge(subfamily_score, how='left')
)[[
    'kinase', 'ID_coral', 'uniprot', 'ID_HGNC',
    'group', 'family', 'subfamily',
    'Pearson_family', 'family_count',
    'Pearson_subfamily', 'subfamily_count',
]]

# keep only the kinases listed in the selection file
selected = pd.read_csv('raw/pred_kinase.csv')
selected_df = preds_info[preds_info.kinase.isin(selected.kinase)]
selected_df
kinase ID_coral uniprot ID_HGNC group family subfamily Pearson_family family_count Pearson_subfamily subfamily_count
5 ACVRL1 ALK1 P37023 ACVRL1 TKL STKR STKR1 0.857840 9.0 0.908758 5.0
6 ACVR1C ALK7 Q8NER5 ACVR1C TKL STKR STKR1 0.857840 9.0 0.908758 5.0
11 CDK20 CCRK Q8IZL9 CDK20 CMGC CDK CDK 0.923265 17.0 NaN NaN
12 CDKL2 CDKL2 Q92772 CDKL2 CMGC CDKL CDKL 0.763117 2.0 0.763117 2.0
13 CDKL3 CDKL3 Q8IVW4 CDKL3 CMGC CDKL CDKL 0.763117 2.0 0.763117 2.0
14 CDKL4 CDKL4 Q5MAI5 CDKL4 CMGC CDKL CDKL 0.763117 2.0 0.763117 2.0
17 DCLK3 DCAMKL3 Q9C098 DCLK3 CAMK DCAMKL DCAMKL 0.901985 2.0 0.901985 2.0
19 CDC42BPG DMPK2 Q6DT37 CDC42BPG AGC DMPK GEK 0.954669 6.0 0.961369 3.0
20 STK17B DRAK2 O94768 STK17B CAMK DAPK DAPK 0.792083 4.0 0.792083 4.0
22 MAPK6 Erk3 Q16659 MAPK6 CMGC MAPK ERK3 0.881876 12.0 NaN NaN
23 MAPK4 Erk4 P31152 MAPK4 CMGC MAPK ERK3 0.881876 12.0 NaN NaN
25 TNNI3K HH498 Q59H18 TNNI3K TKL MLK HH498 0.733784 7.0 NaN NaN
30 MAP3K13 LZK O43283 MAP3K13 TKL MLK LZK 0.733784 7.0 0.497632 1.0
32 MAP3K4 MAP3K4 Q9Y6R4 MAP3K4 STE STE11 STE11 0.749903 7.0 0.749903 7.0
37 AMHR2 MISR2 Q16671 AMHR2 TKL STKR STKR2 0.857840 9.0 0.794193 4.0
39 NEK10 NEK10 Q6ZWH5 NEK10 Other NEK NEK 0.778235 10.0 0.778235 10.0
40 NRK NRK Q7Z2Y5 NRK STE STE20 MSN 0.863932 27.0 0.945743 3.0
41 PDHK3 PDHK3 Q15120 PDK3 Atypical PDHK PDHK 0.676690 3.0 0.676690 3.0
42 CDK15 PFTAIRE2 Q96Q40 CDK15 CMGC CDK PFTAIRE 0.923265 17.0 0.949171 1.0
44 CDK11B PITSLRE P21127 CDK11B CMGC CDK CDK11 0.923265 17.0 NaN NaN
52 SBK2 SgK069 P0C263 SBK2 Other NKF1 NKF1 0.734262 1.0 0.734262 1.0
53 SBK3 SgK110 P0C264 SBK3 Other NKF1 NKF1 0.734262 1.0 0.734262 1.0
54 ANKK1 SgK288 Q8NFD2 ANKK1 TKL RIPK RIPK 0.627125 4.0 0.627125 4.0
57 TESK2 TESK2 Q96S53 TESK2 TKL LISK TESK 0.576401 3.0 0.099881 1.0
61 TSSK3 TSSK3 Q96PN8 TSSK3 CAMK TSSK TSSK 0.847705 3.0 0.847705 3.0
62 TSSK4 TSSK4 Q6SA08 TSSK4 CAMK TSSK TSSK 0.847705 3.0 0.847705 3.0
64 STK32A YANK1 Q8WU08 STK32A AGC YANK YANK 0.816684 2.0 0.816684 2.0
# Disabled alternative: derive `selected` from the family scores in
# `preds_info` instead of loading it from 'raw/pred_kinase.csv'.
# selected = preds_info[preds_info['Pearson_family']>=0.55]\
# .sort_values('Pearson_family',ascending=False)\
# .query('group!="TK"')

# # Remove MAP2K families, as it is potentially contaminated with MAPK families
# selected = selected[~selected.kinase.str.startswith('MAP2')]
# number of rows in the selection table
len(selected)
29

To save PSSMs:

preds_final.index
Index(['ADCK1', 'ADCK2', 'COQ8A', 'COQ8B', 'ADCK5', 'ACVRL1', 'ACVR1C',
       'ALPK3', 'ALPK2', 'ARAF', 'CASK', 'CDK20', 'CDKL2', 'CDKL3', 'CDKL4',
       'STK35', 'PDIK1L', 'DCLK3', 'DDR1', 'CDC42BPG', 'STK17B', 'ERBB2',
       'MAPK6', 'MAPK4', 'STK36', 'TNNI3K', 'AATK', 'LMTK2', 'LMTK3', 'LRRK1',
       'MAP3K13', 'MAP2K6', 'MAP3K4', 'MAST1', 'MAST2', 'MAST3', 'MAST4',
       'AMHR2', 'PKMYT1', 'NEK10', 'NRK', 'CDK15', 'PIK3R4', 'CDK11B', 'PSKH1',
       'RIOK1', 'RIOK2', 'RIOK3', 'RNASEL', 'ROR1', 'ROR2', 'SBK2', 'SBK3',
       'ANKK1', 'RSKR', 'SPEG', 'TESK2', 'TIE1', 'KALRN', 'TRIO', 'TSSK3',
       'TSSK4', 'WEE2', 'STK32A'],
      dtype='object')
# Rows of `preds_final` for the selected kinases, in the order of `selected.kinase`.
# NOTE(review): `.loc` with a list of labels raises KeyError if any label is
# missing; `len(selected)` is 29 while `selected_df` has 27 rows — confirm that
# every name in `selected.kinase` is present in `preds_final.index`.
selected_PSSM = preds_final.loc[selected.kinase]
# Uncomment to export the selected PSSMs:
# selected_PSSM.to_csv('source/Supplementary_table3_predicted_PSSM.csv')
preds_final
substrate -5P -5G -5A -5C -5S -5T -5V -5I -5L -5M -5F -5Y -5W -5H -5K -5R -5Q -5N -5D -5E -5s -5t -5y -4P -4G -4A -4C -4S -4T -4V -4I -4L -4M -4F -4Y -4W -4H -4K -4R -4Q -4N -4D -4E -4s -4t -4y -3P -3G -3A -3C -3S -3T -3V -3I -3L -3M -3F -3Y -3W -3H -3K -3R -3Q -3N -3D -3E -3s -3t -3y -2P -2G -2A -2C -2S -2T -2V -2I -2L -2M -2F -2Y -2W -2H -2K -2R -2Q -2N -2D -2E -2s -2t -2y -1P -1G -1A -1C -1S -1T -1V -1I -1L -1M -1F -1Y -1W -1H -1K -1R -1Q -1N -1D -1E -1s -1t -1y 1P 1G 1A 1C 1S 1T 1V 1I 1L 1M 1F 1Y 1W 1H 1K 1R 1Q 1N 1D 1E 1s 1t 1y 2P 2G 2A 2C 2S 2T 2V 2I 2L 2M 2F 2Y 2W 2H 2K 2R 2Q 2N 2D 2E 2s 2t 2y 3P 3G 3A 3C 3S 3T 3V 3I 3L 3M 3F 3Y 3W 3H 3K 3R 3Q 3N 3D 3E 3s 3t 3y 4P 4G 4A 4C 4S 4T 4V 4I 4L 4M 4F 4Y 4W 4H 4K 4R 4Q 4N 4D 4E 4s 4t 4y 0s 0t 0y
ADCK1 0.048353 0.044662 0.049684 0.047756 0.039644 0.038732 0.041271 0.037347 0.041508 0.038331 0.053608 0.049934 0.062422 0.053521 0.049245 0.060374 0.029276 0.031936 0.034305 0.029565 0.038343 0.035231 0.044951 0.044934 0.055889 0.053259 0.051039 0.040561 0.039071 0.029120 0.026485 0.032313 0.043253 0.043563 0.045449 0.055382 0.051374 0.051370 0.058486 0.042473 0.033350 0.037059 0.042023 0.037581 0.039726 0.046240 0.045317 0.043955 0.041623 0.038439 0.031459 0.036182 0.017883 0.025107 0.028089 0.039873 0.038124 0.047898 0.048998 0.048629 0.060935 0.169430 0.039727 0.039115 0.027784 0.027859 0.026473 0.025607 0.051496 0.052765 0.066532 0.069552 0.057155 0.035952 0.036574 0.029716 0.024590 0.024692 0.044897 0.037173 0.038102 0.036595 0.045834 0.042720 0.070292 0.053697 0.044977 0.034447 0.035987 0.033721 0.033731 0.050301 0.050042 0.068772 0.047273 0.037983 0.037060 0.039980 0.016494 0.016991 0.048666 0.050307 0.035136 0.047393 0.021929 0.049534 0.049586 0.067902 0.043523 0.063060 0.035749 0.029079 0.014841 0.017725 0.110975 0.037987 0.036425 0.031539 0.035683 0.029945 0.028544 0.049934 0.052586 0.064842 0.065427 0.068493 0.070078 0.055551 0.037420 0.034944 0.041709 0.127522 0.034614 0.011865 0.012881 0.019960 0.019043 0.033009 0.038021 0.037081 0.045470 0.044291 0.037363 0.037934 0.032741 0.023745 0.026896 0.036136 0.043042 0.064908 0.042939 0.073471 0.058376 0.082958 0.044870 0.047893 0.049456 0.037746 0.036087 0.036824 0.021754 0.048246 0.052635 0.044155 0.047448 0.037878 0.042588 0.030934 0.027071 0.034234 0.036288 0.040991 0.053992 0.051278 0.051650 0.056384 0.083639 0.037271 0.044311 0.040318 0.036551 0.032791 0.030228 0.039123 0.044532 0.046071 0.034738 0.039499 0.041556 0.039789 0.026431 0.034100 0.032969 0.039661 0.043047 0.048863 0.067794 0.052228 0.059207 0.054760 0.044493 0.039064 0.038197 0.039430 0.044199 0.045629 0.043743 0.592792 0.407208 0.000000
ADCK2 0.045721 0.039397 0.046075 0.048963 0.039347 0.039411 0.037317 0.039063 0.038259 0.038223 0.050568 0.046183 0.060257 0.053303 0.053477 0.059690 0.032951 0.037253 0.037623 0.035300 0.040233 0.038882 0.042505 0.038253 0.046895 0.046298 0.050421 0.040807 0.040307 0.026036 0.026761 0.031410 0.040487 0.041790 0.049121 0.067160 0.053791 0.054370 0.057504 0.040842 0.038828 0.038541 0.046431 0.037180 0.038448 0.048320 0.029899 0.032401 0.036139 0.042894 0.034648 0.035971 0.016074 0.024989 0.037466 0.050001 0.041506 0.051284 0.054756 0.047511 0.055568 0.170383 0.034518 0.033387 0.029179 0.024037 0.028684 0.028524 0.060181 0.037890 0.045830 0.057961 0.060794 0.037078 0.037156 0.029906 0.035417 0.040394 0.069018 0.050424 0.050308 0.056359 0.049178 0.036979 0.050035 0.047813 0.034787 0.024446 0.031328 0.033462 0.036423 0.047013 0.044032 0.054508 0.044736 0.038589 0.038306 0.039960 0.019210 0.015471 0.043412 0.050007 0.032492 0.046547 0.028663 0.048846 0.062203 0.079447 0.049108 0.061290 0.034761 0.033345 0.013798 0.020700 0.100570 0.072016 0.052588 0.035022 0.042979 0.032550 0.032083 0.033524 0.041327 0.041449 0.056695 0.059243 0.057737 0.045298 0.040674 0.053588 0.058783 0.066609 0.050871 0.016541 0.022681 0.025931 0.025844 0.035966 0.043190 0.031350 0.040698 0.040927 0.039169 0.039970 0.029137 0.024877 0.024263 0.038165 0.042098 0.061697 0.048007 0.071997 0.060763 0.092504 0.040901 0.046491 0.033582 0.031696 0.036643 0.036539 0.045335 0.044955 0.046267 0.038843 0.048163 0.041402 0.042168 0.028033 0.027368 0.029552 0.039374 0.043419 0.057464 0.053689 0.051217 0.066311 0.101883 0.038626 0.047659 0.030986 0.022662 0.028900 0.029022 0.042037 0.049299 0.041710 0.031999 0.042755 0.043463 0.042232 0.027886 0.036487 0.037486 0.038706 0.039611 0.049975 0.063763 0.052915 0.062923 0.064993 0.048762 0.040808 0.032854 0.029899 0.040699 0.041583 0.039191 0.550779 0.449222 0.000000
COQ8A 0.047078 0.049584 0.046538 0.046978 0.038505 0.039265 0.038297 0.027514 0.025965 0.034214 0.046496 0.053125 0.061218 0.063530 0.050068 0.057296 0.030144 0.036608 0.041208 0.034382 0.040725 0.038488 0.052773 0.051598 0.054722 0.051946 0.053349 0.041256 0.040011 0.028248 0.022370 0.025431 0.038975 0.042358 0.047993 0.058613 0.053117 0.044062 0.050123 0.041462 0.035505 0.039927 0.042196 0.040626 0.042263 0.053851 0.047354 0.044989 0.042722 0.043670 0.036494 0.039349 0.019971 0.025216 0.025044 0.042077 0.042146 0.052869 0.057933 0.045585 0.046349 0.114434 0.041640 0.040620 0.037419 0.036016 0.028373 0.029125 0.060607 0.053200 0.076206 0.065363 0.053134 0.034243 0.035309 0.027042 0.027100 0.026685 0.040070 0.041957 0.044627 0.037925 0.047342 0.033303 0.061909 0.042819 0.042647 0.039713 0.036886 0.035117 0.037581 0.059822 0.046798 0.075342 0.045890 0.037855 0.035734 0.039024 0.015485 0.014795 0.051195 0.048579 0.034145 0.045877 0.022837 0.045838 0.046570 0.062046 0.037391 0.058346 0.035356 0.024353 0.018455 0.018843 0.139246 0.046620 0.029290 0.026538 0.029174 0.029126 0.030876 0.048084 0.055013 0.066582 0.067302 0.064741 0.074518 0.062068 0.035251 0.038081 0.038229 0.112505 0.026391 0.011765 0.018384 0.024527 0.025697 0.039235 0.034462 0.038124 0.039552 0.039801 0.034877 0.035696 0.030061 0.021655 0.026670 0.038152 0.040993 0.065945 0.040576 0.082912 0.063481 0.080361 0.041272 0.043819 0.047356 0.041445 0.042968 0.042554 0.027267 0.047743 0.054207 0.042449 0.043962 0.038254 0.039598 0.027100 0.024370 0.032305 0.036350 0.039941 0.054749 0.049598 0.049072 0.061979 0.091424 0.034625 0.039491 0.038573 0.036898 0.037693 0.037084 0.042537 0.040794 0.048981 0.034644 0.042590 0.041290 0.038335 0.021906 0.022611 0.024033 0.032745 0.038359 0.051069 0.071394 0.053739 0.060517 0.056973 0.045843 0.038744 0.041111 0.042403 0.050887 0.051535 0.049496 0.531098 0.468902 0.000000
COQ8B 0.046490 0.047984 0.047754 0.047483 0.039761 0.038309 0.038994 0.028313 0.026969 0.035838 0.042184 0.050925 0.056083 0.059763 0.058692 0.067034 0.035713 0.038473 0.037042 0.033089 0.039843 0.037535 0.045730 0.051322 0.054295 0.053244 0.051641 0.041342 0.041967 0.029125 0.021728 0.025911 0.040115 0.038629 0.046331 0.054629 0.053792 0.050985 0.055641 0.042761 0.040816 0.036494 0.041124 0.036740 0.040735 0.050634 0.049042 0.048814 0.042762 0.043352 0.036251 0.038490 0.021945 0.025510 0.028467 0.042587 0.040218 0.053032 0.054805 0.047075 0.046827 0.114656 0.045907 0.040083 0.032092 0.036369 0.024304 0.026974 0.060437 0.060197 0.071454 0.059700 0.054718 0.034807 0.035518 0.032697 0.033934 0.035685 0.047181 0.039248 0.044588 0.038767 0.050232 0.038700 0.068320 0.041183 0.041750 0.023728 0.025068 0.031471 0.034198 0.056857 0.051250 0.067170 0.048648 0.041807 0.037135 0.038623 0.021044 0.016097 0.047293 0.047487 0.034862 0.050749 0.028458 0.049276 0.042625 0.061439 0.040653 0.058289 0.032244 0.027185 0.016145 0.017861 0.123662 0.046231 0.054320 0.043225 0.038327 0.030678 0.031225 0.046657 0.043060 0.056026 0.058741 0.052604 0.066302 0.053787 0.042357 0.047298 0.056374 0.094522 0.036503 0.008882 0.015888 0.021940 0.022703 0.032349 0.029345 0.034197 0.032630 0.038210 0.034895 0.035072 0.030126 0.023677 0.031024 0.040142 0.041913 0.070701 0.045380 0.071101 0.067837 0.097443 0.043714 0.038530 0.037926 0.041200 0.035452 0.036403 0.043080 0.041556 0.047100 0.040494 0.044526 0.039906 0.041232 0.029107 0.027733 0.036204 0.038227 0.043508 0.057160 0.051152 0.049759 0.061597 0.099651 0.036943 0.038938 0.035402 0.032446 0.032767 0.032437 0.042155 0.041628 0.047614 0.034308 0.043401 0.041791 0.039117 0.026839 0.029370 0.025726 0.033912 0.039222 0.050926 0.066690 0.053031 0.066319 0.065283 0.047686 0.037915 0.035601 0.037493 0.046414 0.044682 0.045031 0.536430 0.463570 0.000000
ADCK5 0.049543 0.047891 0.053344 0.047442 0.039538 0.039139 0.037515 0.030666 0.034010 0.034438 0.048031 0.047656 0.058163 0.057253 0.053287 0.060471 0.032071 0.034439 0.040639 0.036601 0.039099 0.036382 0.042382 0.046010 0.055103 0.051187 0.048136 0.040229 0.039656 0.026984 0.025265 0.029893 0.040407 0.040468 0.048090 0.060049 0.052049 0.046587 0.056406 0.040163 0.036351 0.042446 0.047558 0.037579 0.038367 0.051016 0.044579 0.041849 0.043394 0.040439 0.034076 0.037129 0.020297 0.029052 0.041045 0.045282 0.043976 0.054678 0.053243 0.049444 0.054603 0.134756 0.039868 0.034255 0.029839 0.030173 0.018328 0.019532 0.060163 0.056545 0.062542 0.064457 0.056482 0.034845 0.037462 0.032346 0.032776 0.029499 0.049168 0.035670 0.039854 0.036803 0.045590 0.041027 0.082523 0.049648 0.044459 0.029937 0.032879 0.031460 0.031788 0.042241 0.047725 0.070456 0.052987 0.044120 0.038745 0.038989 0.015254 0.011927 0.052403 0.050014 0.032129 0.047100 0.022718 0.048942 0.046519 0.058769 0.051422 0.066435 0.038945 0.036135 0.013666 0.014764 0.099837 0.045244 0.049914 0.037949 0.042928 0.029092 0.027040 0.041000 0.035742 0.057063 0.056079 0.054661 0.061901 0.043877 0.037606 0.036988 0.051406 0.169357 0.047310 0.009869 0.010754 0.014510 0.013932 0.025780 0.038718 0.032160 0.044575 0.038416 0.034899 0.037351 0.027102 0.018443 0.025340 0.031271 0.037153 0.059982 0.038112 0.062765 0.059484 0.095888 0.045951 0.041892 0.058679 0.054161 0.039517 0.042078 0.036063 0.048144 0.047688 0.042921 0.045241 0.038341 0.041129 0.033237 0.028478 0.038002 0.035906 0.040726 0.050294 0.049203 0.046171 0.055096 0.084852 0.038221 0.040089 0.041478 0.039187 0.038181 0.036559 0.040855 0.048178 0.046452 0.036643 0.039390 0.041241 0.039432 0.028720 0.033507 0.033618 0.036911 0.042227 0.047590 0.065330 0.048556 0.058778 0.057889 0.044317 0.036743 0.036225 0.038623 0.047162 0.046304 0.046163 0.589432 0.410568 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TRIO 0.049070 0.047619 0.050919 0.036940 0.039468 0.036601 0.037051 0.039088 0.055922 0.033747 0.037797 0.037885 0.040365 0.047620 0.089619 0.104386 0.035882 0.035138 0.027533 0.026947 0.029696 0.027546 0.033160 0.040242 0.054292 0.049277 0.036617 0.038336 0.036097 0.032922 0.029114 0.039965 0.034677 0.033214 0.033950 0.051522 0.042696 0.086770 0.096365 0.040047 0.039212 0.031899 0.046810 0.039532 0.033757 0.032687 0.040588 0.046487 0.041025 0.035235 0.024366 0.022327 0.015677 0.015624 0.027098 0.021550 0.015253 0.022121 0.028663 0.042066 0.110473 0.274714 0.042374 0.026190 0.018635 0.025052 0.039983 0.032963 0.031535 0.039414 0.053819 0.059979 0.051875 0.030931 0.028657 0.028994 0.020251 0.031121 0.028722 0.026451 0.028520 0.038303 0.052640 0.077293 0.169409 0.053991 0.044506 0.015306 0.026811 0.034840 0.028631 0.029538 0.059929 0.060988 0.043823 0.031445 0.038540 0.034964 0.027002 0.015795 0.052652 0.040019 0.031302 0.043654 0.020271 0.052015 0.084335 0.077021 0.050008 0.065065 0.047315 0.034443 0.025273 0.022954 0.041188 0.027463 0.057164 0.036142 0.048622 0.023223 0.019486 0.040813 0.038318 0.045951 0.044914 0.083225 0.036573 0.037518 0.022715 0.029867 0.036701 0.049416 0.031456 0.036215 0.042563 0.091499 0.083831 0.036325 0.029434 0.029243 0.033266 0.036495 0.024849 0.025063 0.034100 0.032383 0.042886 0.029725 0.037479 0.043795 0.029674 0.048883 0.063765 0.093569 0.033577 0.039036 0.042169 0.039494 0.067533 0.063865 0.079716 0.041778 0.041596 0.034564 0.059608 0.031858 0.030933 0.041257 0.046932 0.060865 0.030918 0.042661 0.036810 0.046814 0.037225 0.052091 0.068956 0.031620 0.033985 0.035322 0.034865 0.050751 0.044158 0.064431 0.052995 0.050392 0.041937 0.034362 0.037274 0.033597 0.037413 0.031098 0.038037 0.026956 0.040720 0.037006 0.046596 0.044965 0.066581 0.067287 0.042255 0.039186 0.039640 0.042475 0.052884 0.046734 0.049610 0.477542 0.522458 0.000000
TSSK3 0.058653 0.025380 0.066570 0.056317 0.037207 0.036167 0.080225 0.075408 0.115814 0.049548 0.037163 0.036261 0.024276 0.026531 0.046915 0.083276 0.037862 0.020326 0.007830 0.007382 0.021717 0.022070 0.027101 0.027753 0.058953 0.062683 0.042829 0.038773 0.036912 0.039729 0.030383 0.037500 0.049506 0.021706 0.032364 0.026124 0.056058 0.116581 0.107571 0.041487 0.037895 0.022279 0.037350 0.029320 0.030523 0.015721 0.057714 0.044055 0.054457 0.034306 0.030038 0.031732 0.027433 0.024502 0.018529 0.024557 0.014175 0.025756 0.019820 0.051878 0.128521 0.239667 0.041065 0.032839 0.008127 0.015395 0.027027 0.027420 0.020989 0.018026 0.039466 0.090171 0.075016 0.036252 0.037163 0.065410 0.035393 0.020649 0.030855 0.031480 0.036507 0.026200 0.050527 0.126116 0.125357 0.054350 0.047741 0.002026 0.007067 0.012767 0.014999 0.016464 0.090926 0.052566 0.058482 0.030202 0.041133 0.039203 0.029094 0.021753 0.033467 0.040912 0.024244 0.038699 0.025500 0.045168 0.103450 0.108203 0.062956 0.048850 0.023629 0.027969 0.013680 0.014419 0.025493 0.007954 0.025397 0.036891 0.053578 0.041111 0.040525 0.037977 0.026780 0.040914 0.059131 0.048805 0.051981 0.040530 0.049537 0.059025 0.078777 0.070291 0.056160 0.028709 0.035887 0.040605 0.042591 0.026845 0.022039 0.027035 0.042073 0.043381 0.040552 0.040987 0.054155 0.066387 0.063583 0.059575 0.052021 0.068460 0.054099 0.129755 0.020011 0.036822 0.042941 0.037583 0.009656 0.017845 0.027976 0.026101 0.016962 0.018217 0.029149 0.036300 0.065432 0.043473 0.042880 0.025155 0.023838 0.032150 0.039964 0.055690 0.043721 0.075102 0.076906 0.045870 0.051201 0.045220 0.091284 0.078466 0.034623 0.011269 0.010538 0.023553 0.032837 0.030686 0.030414 0.041301 0.033481 0.034361 0.063916 0.108155 0.139418 0.086923 0.071254 0.028400 0.035041 0.036489 0.034246 0.030458 0.034184 0.036234 0.026309 0.018402 0.015990 0.015742 0.015759 0.620700 0.379300 0.000000
TSSK4 0.052728 0.027791 0.059114 0.053775 0.036460 0.036364 0.077692 0.076985 0.115165 0.056947 0.044673 0.036815 0.035141 0.033569 0.040793 0.060000 0.034332 0.024112 0.013683 0.011410 0.022643 0.021738 0.028070 0.029160 0.058073 0.059669 0.042281 0.039931 0.039793 0.040242 0.034511 0.042144 0.051907 0.027392 0.036469 0.032939 0.054183 0.099418 0.088901 0.040957 0.040556 0.024174 0.039583 0.028542 0.031062 0.018114 0.057667 0.046000 0.050294 0.032826 0.034368 0.035929 0.028808 0.023344 0.025414 0.033727 0.021281 0.030275 0.027176 0.057613 0.111045 0.189078 0.041813 0.038692 0.014025 0.023164 0.027983 0.028601 0.020876 0.024183 0.046774 0.083527 0.070331 0.041056 0.042470 0.060570 0.036528 0.023113 0.034264 0.039010 0.037698 0.034880 0.049172 0.099556 0.077305 0.056535 0.060101 0.017323 0.018576 0.014551 0.015900 0.016576 0.081065 0.061904 0.055609 0.032369 0.042901 0.040985 0.030295 0.024896 0.035281 0.045901 0.028236 0.039934 0.028533 0.046500 0.093109 0.093223 0.060791 0.049695 0.026890 0.026179 0.013563 0.014949 0.027192 0.000000 0.029969 0.039070 0.047886 0.041985 0.042070 0.039563 0.033557 0.043256 0.065946 0.066799 0.055416 0.044437 0.049172 0.053947 0.070393 0.067609 0.050689 0.026755 0.032607 0.034816 0.035600 0.028457 0.022125 0.030072 0.043746 0.043979 0.040514 0.040628 0.056415 0.067637 0.062479 0.057405 0.055342 0.068655 0.059069 0.107542 0.016290 0.023216 0.039592 0.036376 0.014476 0.020323 0.026238 0.026215 0.041665 0.016244 0.031620 0.036699 0.057738 0.041766 0.042658 0.029229 0.029311 0.036618 0.042735 0.057426 0.047430 0.085156 0.070591 0.038712 0.042855 0.043761 0.083382 0.072574 0.035593 0.014580 0.016641 0.026681 0.035851 0.033285 0.033330 0.041113 0.036743 0.036320 0.058757 0.093853 0.119285 0.082519 0.066321 0.029773 0.036372 0.037973 0.033146 0.029864 0.035657 0.038069 0.030552 0.023436 0.021363 0.022620 0.023797 0.569166 0.358863 0.071971
WEE2 0.043967 0.075610 0.066270 0.022744 0.035380 0.016544 0.036876 0.052500 0.080799 0.022085 0.026341 0.022279 0.018525 0.029408 0.080315 0.075319 0.040636 0.042321 0.066750 0.054230 0.031021 0.028095 0.031986 0.042869 0.086758 0.045162 0.023713 0.029243 0.023958 0.034570 0.044249 0.072020 0.026170 0.025481 0.016079 0.021329 0.033396 0.069211 0.068525 0.040078 0.039380 0.075558 0.068705 0.072290 0.029349 0.011905 0.059183 0.091238 0.078219 0.017050 0.016508 0.009258 0.038960 0.035152 0.043076 0.017049 0.023477 0.012678 0.008085 0.031861 0.073281 0.069517 0.038651 0.039370 0.049084 0.080480 0.093174 0.052211 0.022438 0.034737 0.097239 0.050473 0.034056 0.019104 0.006526 0.032920 0.023955 0.032882 0.018267 0.015706 0.004761 0.002667 0.014464 0.033755 0.042027 0.044284 0.070194 0.132836 0.134048 0.094661 0.049406 0.011032 0.044062 0.078456 0.059391 0.017279 0.017465 0.015591 0.060694 0.036641 0.082583 0.026307 0.035697 0.025075 0.007191 0.046183 0.060556 0.081128 0.044798 0.053526 0.052816 0.048690 0.039354 0.019848 0.046669 0.000000 0.146733 0.039054 0.036102 0.016717 0.011760 0.066982 0.074497 0.036724 0.037622 0.039006 0.012272 0.015387 0.014001 0.084789 0.098231 0.032632 0.037891 0.037673 0.067815 0.036568 0.030217 0.027327 0.033869 0.071613 0.071329 0.029926 0.017721 0.018773 0.066162 0.044255 0.054582 0.025666 0.031754 0.032632 0.026301 0.045391 0.075666 0.104309 0.027431 0.069056 0.036004 0.032605 0.043191 0.033639 0.008123 0.070788 0.083901 0.044681 0.017935 0.017282 0.013780 0.060448 0.045041 0.094324 0.019183 0.050876 0.021346 0.027009 0.040054 0.071068 0.109179 0.034782 0.036738 0.046085 0.034930 0.030356 0.018590 0.011623 0.053925 0.082393 0.064062 0.013706 0.022838 0.018505 0.049247 0.036580 0.056739 0.016486 0.034869 0.018795 0.033647 0.030841 0.094766 0.096597 0.040385 0.036239 0.040689 0.067024 0.036481 0.036286 0.018901 0.345801 0.372509 0.281690
STK32A 0.042088 0.048958 0.043909 0.042148 0.039149 0.039619 0.035109 0.038230 0.032561 0.036990 0.039822 0.044389 0.042409 0.046752 0.047287 0.052673 0.036711 0.042259 0.042804 0.037494 0.053057 0.054500 0.061083 0.043605 0.046288 0.041353 0.039434 0.041191 0.038272 0.034217 0.032662 0.035199 0.036823 0.036835 0.036977 0.041268 0.038293 0.041026 0.045493 0.035155 0.038190 0.041974 0.047614 0.074720 0.074024 0.059386 0.035833 0.030871 0.032162 0.035356 0.031169 0.031806 0.030441 0.029950 0.027791 0.030205 0.030785 0.033947 0.029999 0.032071 0.025611 0.061684 0.030103 0.029443 0.036652 0.047217 0.130831 0.130034 0.066041 0.044565 0.034198 0.034086 0.043920 0.036184 0.036320 0.031351 0.032022 0.051327 0.039138 0.049859 0.041268 0.038005 0.034934 0.027656 0.030420 0.035404 0.036394 0.026301 0.031036 0.059159 0.060801 0.145654 0.056115 0.048990 0.038302 0.038636 0.041204 0.040568 0.039856 0.036125 0.040662 0.047873 0.043651 0.048880 0.036103 0.041382 0.052906 0.064304 0.043274 0.038753 0.027183 0.032817 0.044028 0.043816 0.054571 0.021531 0.028537 0.028373 0.038668 0.039189 0.038405 0.051296 0.062902 0.043929 0.062803 0.089980 0.043558 0.040010 0.032584 0.033808 0.046026 0.037831 0.022418 0.015478 0.028611 0.060652 0.061338 0.072072 0.017621 0.021457 0.030677 0.031442 0.026585 0.026038 0.034290 0.034485 0.026500 0.030659 0.036728 0.043694 0.044186 0.027152 0.015201 0.015454 0.021167 0.018053 0.028275 0.029752 0.037259 0.037739 0.365584 0.021352 0.031640 0.033771 0.040891 0.036303 0.037887 0.042047 0.044009 0.039734 0.039842 0.044166 0.043477 0.052977 0.039275 0.026209 0.031257 0.031869 0.031070 0.037814 0.042506 0.068686 0.068696 0.114522 0.041301 0.035156 0.037080 0.032705 0.037500 0.037323 0.034375 0.033726 0.033730 0.034566 0.032953 0.034890 0.040020 0.036438 0.036881 0.041973 0.034480 0.033865 0.035673 0.040218 0.061115 0.061496 0.152536 0.488526 0.511474 0.000000

65 rows × 210 columns

Visualize predicted PSSMs

# For every selected kinase: print its family score, then show the logo
# followed by the heatmap.
for i, r in selected_df.iterrows():
    print(f'{r.family}:{r.Pearson_family}')

    k = r.kinase

    # logo of the kinase's matrix
    matrix = get_one_kinase(preds_final, k, drop_s=False).T
    get_logo2(matrix, k)
    plt.show(); plt.close()

    # heatmap; the last three columns are sliced off before plotting
    get_heatmap(preds_final.iloc[:, :-3], k)
    plt.show(); plt.close()
STKR:0.8578397866239376

STKR:0.8578397866239376

CDK:0.9232645242649458

CDKL:0.76311665060942

CDKL:0.76311665060942

CDKL:0.76311665060942

DCAMKL:0.9019845007257056

DMPK:0.9546690741027648

DAPK:0.7920828111362217

MAPK:0.8818759660317733

MAPK:0.8818759660317733

MLK:0.7337835153843144

MLK:0.7337835153843144

STE11:0.7499031572306992

STKR:0.8578397866239376

NEK:0.7782348100131159

STE20:0.8639317629980943

PDHK:0.6766899079964322

CDK:0.9232645242649458

CDK:0.9232645242649458

NKF1:0.7342624924789267

NKF1:0.7342624924789267

RIPK:0.62712483511819

LISK:0.5764010061153408

TSSK:0.8477050673723502

TSSK:0.8477050673723502

YANK:0.8166840920260132

Save images

Create folders: predict/logo, predict/heatmap, predict/combine

# !mkdir predict
# !mkdir predict/logo
# !mkdir predict/heatmap
# !mkdir predict/combine
# clear contents in the folder
# !rm -r predict/logo/*
# !rm -r predict/heatmap/*
# !rm -r predict/combine/*
# for i,r in selected.iterrows():
    
#     print(f'{r.family}:{r.Pearson_family}')
    
#     k = r.kinase
#     matrix = get_one_kinase(preds_final,k,drop_s=False).T
#     get_logo2(matrix, k)
#     plt.savefig(f'predict/logo/{k}.png',bbox_inches='tight', pad_inches=0.3) #0.3
#     plt.close()
    
#     get_heatmap(preds_final.iloc[:,:-3],k,figsize=(7.5,10))
#     plt.savefig(f'predict/heatmap/{k}.png',bbox_inches='tight', pad_inches=0)
#     plt.close()
    
#     # break

Save and combine images for pdf

def combine_images_vertically(image_paths, output_path):
    "Stack the images in `image_paths` top to bottom and save the result to `output_path`"
    # every input is opened and converted to RGBA
    frames = [Image.open(path).convert('RGBA') for path in image_paths]

    # the canvas is as wide as the widest frame and as tall as all frames together
    canvas_size = (
        max(frame.width for frame in frames),
        sum(frame.height for frame in frames),
    )
    canvas = Image.new('RGBA', canvas_size)

    next_y = 0
    for frame in frames:
        # left-aligned at x=0; the frame is also passed as the third (mask) argument
        canvas.paste(frame, (0, next_y), frame)
        next_y += frame.height

    canvas.save(output_path)
# folders = ["predict/logo", "predict/heatmap"]

# for i,r in tqdm(selected.iterrows(),total=len(selected)):
    
#     k = r.kinase
#     filename = f"{k}.png"
#     image_paths = [os.path.join(folder, filename) for folder in folders]
#     output_path = f"predict/combine/{k}.png"
    
#     combine_images_vertically(image_paths, output_path)
#     # break
# !zip -rq predict.zip predict/combine/*