Get kinase-specific PSSMs

from pathlib import Path

from katlas.core import *
import pandas as pd
# Load the kinase-substrate dataset; `Data` is presumably provided by the
# `katlas.core` star import above (confirm against katlas).
df = Data.get_ks_dataset()
CPU times: user 878 ms, sys: 338 ms, total: 1.22 s
Wall time: 5.63 s
# Per-kinase identifier: "<kinase_uniprot>_<first whitespace-separated token
# of kinase_protein>".
df['kinase_id'] = (
    df['kinase_uniprot'] + '_' + df['kinase_protein'].str.split().str[0]
)
# Number of rows per kinase_id; keep the ids that have at least 40 rows.
cnt = df['kinase_id'].value_counts()
idx = cnt.loc[cnt.ge(40)].index
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
       'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
       'P29323_EPHB2', 'P54762_EPHB1',
       ...
       'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
       'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
       'Q6P0Q8_MAST2', 'O14976_GAK'],
      dtype='object', name='kinase_id', length=333)
df.shape
(187066, 22)

We only evaluate kinases on the kinome tree; here we keep those with at least 40 records (the `kinase_id` values in `idx`)

# Restrict the dataset to the kinase ids selected in `idx`; copy so the column
# assignments that follow act on an independent frame.
df = df.loc[df['kinase_id'].isin(idx)].copy()
df.shape
(185883, 22)
# Upper-cased copy of the site sequence.
df['site_seq_upper'] = df['site_seq'].str.upper()
# Number of '|'-separated entries in the source field.
df['source_len'] = df['source'].str.split('|').str.len()
def get_LO_all(pssms,site_type='STY'):
    """Apply the LO transform to every row of `pssms`.

    Each row is passed to `get_pssm_LO_flat` together with `site_type`, the
    result is flattened with `flatten_pssm`, and the per-row results are
    stacked into a DataFrame that reuses the index of `pssms`.
    """
    lo_rows = [
        flatten_pssm(get_pssm_LO_flat(row, site_type))
        for _, row in pssms.iterrows()
    ]
    return pd.DataFrame(lo_rows, index=pssms.index)

Hold out 20% of the PSP-sourced rows of each kinase as the test set

# Rows whose source field contains the substring 'PSP'; copied so the subset
# is independent of `df`.
psp = df.loc[df['source'].str.contains('PSP')].copy()
def sample(group: pd.DataFrame, frac: float = 0.2, random_state: int = 42) -> pd.DataFrame:
    """Draw a reproducible random subset of one groupby group.

    Meant to be passed to ``DataFrameGroupBy.apply``: it reads ``group.name``
    (the group key) and writes it into the ``kinase_uniprot`` column of the
    sampled rows, so the key is still present when the grouping column was
    excluded from ``group``.

    Args:
        group: Rows of a single group; must expose the group key as ``.name``.
        frac: Fraction of rows to draw, within ``[0, 1]``. The number of rows
            is ``int(len(group) * frac)`` but never less than 1.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        The sampled rows (drawn without replacement) with ``kinase_uniprot``
        set to the group key.

    Raises:
        ValueError: If ``frac`` is outside ``[0, 1]``.
    """
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be within [0, 1], got {frac!r}")

    n = max(1, int(len(group) * frac))  # at least 1 row even for tiny groups
    sampled = group.sample(n=n, replace=False, random_state=random_state)

    # Add back the group key as a column.
    sampled['kinase_uniprot'] = group.name
    return sampled
# Sample each kinase_uniprot group independently; `sample` re-attaches the
# group key because include_groups=False hides it from the function.
test = (
    psp.groupby('kinase_uniprot', group_keys=False)
    .apply(sample, include_groups=False)
)
test.kinase_group.value_counts()
kinase_group
CMGC        777
AGC         511
TK          387
CAMK        357
Atypical    148
Other       108
STE          89
CK1          63
TKL          44
NEK          16
Name: count, dtype: int64
test
kin_sub_site substrate_uniprot site source substrate_genes substrate_phosphoseq position site_seq sub_site substrate_sequence ... kinase_family kinase_subfamily kinase_pspa_big kinase_pspa_small kinase_coral_ID num_kin kinase_id site_seq_upper source_len kinase_uniprot
160 O00141_P46527_T157 P46527 T157 SIGNOR|EPSD|PSP CDKN1B KIP1 p27 MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... 157 PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG P46527_T157 MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... ... SGK SGK Basophilic Akt/rock SGK1 6 O00141_SGK1 PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG 3 O00141
309 O00141_Q96J92_S1217 Q96J92 S1217 EPSD|PSP WNK4 PRKWNK4 MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... 1217 SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA Q96J92_S1217 MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... ... SGK SGK Basophilic Akt/rock SGK1 1 O00141_SGK1 SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA 2 O00141
369 O00141_Q9UN36_S346 Q9UN36 S346 EPSD|PSP NDRG2 KIAA1248 SYLD MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... 346 sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT Q9UN36_S346 MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... ... SGK SGK Basophilic Akt/rock SGK1 1 O00141_SGK1 SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT 2 O00141
39 O00141_O60343_T642 O60343 T642 GPS6|PSP TBC1D4 AS160 KIAA0603 MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... 642 AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ O60343_T642 MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... ... SGK SGK Basophilic Akt/rock SGK1 4 O00141_SGK1 AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ 2 O00141
314 O00141_Q96PU5_S448 Q96PU5 S448 GPS6|SIGNOR|ELM|EPSD|PSP NEDD4L KIAA0439 NEDL3 MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... 448 sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR Q96PU5_S448 MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... ... SGK SGK Basophilic Akt/rock SGK1 4 O00141_SGK1 SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR 5 O00141
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
185492 Q9Y4K4_P49841_S9 P49841 S9 SIGNOR|EPSD|PSP GSK3B MSGRPRttsFAEsCKPVQQPsAFGsMKVSRDKDGSKVTTVVAtPGQ... 9 ____________MSGRPRttsFAEsCKPVQQPsAFGsMKVS P49841_S9 MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ... ... STE20 KHS Map4k Map4k KHS1 25 Q9Y4K4_KHS1 ____________MSGRPRTTSFAESCKPVQQPSAFGSMKVS 3 Q9Y4K4
185874 Q9Y5S2_O43255_S6 O43255 S6 PSP SIAH2 MSRPsstGPsANKPCsKQPPPQPQHtPsPAAPPAAATISAAGPGSS... 6 _______________MSRPsstGPsANKPCsKQPPPQPQHt O43255_S6 MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... ... DMPK GEK Basophilic Akt/rock MRCKb 1 Q9Y5S2_MRCKB _______________MSRPSSTGPSANKPCSKQPPPQPQHT 1 Q9Y5S2
186807 Q9Y6E0_Q9Y2K2_T221 Q9Y2K2 T221 PSP SIK3 KIAA0999 QSK L19 MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG... 221 NIKIADFGFSNLFTPGQLLKtWCGSPPYAAPELFEGKEYDG Q9Y2K2_T221 MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG... ... STE20 YSK Map4k Map4k MST3 5 Q9Y6E0_MST3 NIKIADFGFSNLFTPGQLLKTWCGSPPYAAPELFEGKEYDG 1 Q9Y6E0
186392 Q9Y6E0_P57059_T182 P57059 T182 PSP SIK1 SIK SNF1LK MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH... 182 DIKLADFGFGNFYKSGEPLStWCGsPPYAAPEVFEGKEYEG P57059_T182 MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH... ... STE20 YSK Map4k Map4k MST3 3 Q9Y6E0_MST3 DIKLADFGFGNFYKSGEPLSTWCGSPPYAAPEVFEGKEYEG 1 Q9Y6E0
186845 Q9Y6M4_O75581_S1490 O75581 S1490 PSP LRP6 MGAVLRSLLACSFCVLLRAAPLLLYANRRDLRLVDATNGKENATIV... 1490 SSSSssTKGtYFPAILNPPPsPAtERsHYTMEFGYSsNsPs O75581_S1490 MGAVLRSLLACSFCVLLRAAPLLLYANRRDLRLVDATNGKENATIV... ... CK1 CK1 Acidophilic Ck1 CK1g3 12 Q9Y6M4_CK1G3 SSSSSSTKGTYFPAILNPPPSPATERSHYTMEFGYSSNSPS 1 Q9Y6M4

2500 rows × 24 columns

# Persist the held-out test rows. Create out/ first: to_parquet does not
# create missing parent directories, so a fresh checkout would fail here.
Path('out').mkdir(parents=True, exist_ok=True)
test.to_parquet('out/CDDM_test_set.parquet')
test.head()
kin_sub_site substrate_uniprot site source substrate_genes substrate_phosphoseq position site_seq sub_site substrate_sequence ... kinase_family kinase_subfamily kinase_pspa_big kinase_pspa_small kinase_coral_ID num_kin kinase_id site_seq_upper source_len kinase_uniprot
160 O00141_P46527_T157 P46527 T157 SIGNOR|EPSD|PSP CDKN1B KIP1 p27 MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... 157 PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG P46527_T157 MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... ... SGK SGK Basophilic Akt/rock SGK1 6 O00141_SGK1 PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG 3 O00141
309 O00141_Q96J92_S1217 Q96J92 S1217 EPSD|PSP WNK4 PRKWNK4 MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... 1217 SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA Q96J92_S1217 MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... ... SGK SGK Basophilic Akt/rock SGK1 1 O00141_SGK1 SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA 2 O00141
369 O00141_Q9UN36_S346 Q9UN36 S346 EPSD|PSP NDRG2 KIAA1248 SYLD MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... 346 sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT Q9UN36_S346 MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... ... SGK SGK Basophilic Akt/rock SGK1 1 O00141_SGK1 SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT 2 O00141
39 O00141_O60343_T642 O60343 T642 GPS6|PSP TBC1D4 AS160 KIAA0603 MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... 642 AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ O60343_T642 MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... ... SGK SGK Basophilic Akt/rock SGK1 4 O00141_SGK1 AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ 2 O00141
314 O00141_Q96PU5_S448 Q96PU5 S448 GPS6|SIGNOR|ELM|EPSD|PSP NEDD4L KIAA0439 NEDL3 MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... 448 sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR Q96PU5_S448 MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... ... SGK SGK Basophilic Akt/rock SGK1 4 O00141_SGK1 SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR 5 O00141

5 rows × 24 columns

df.shape
(185883, 24)
# Every row whose index label is not in the held-out test set is kept for
# building the evaluation PSSMs.
df_eval = df.loc[~df.index.isin(test.index)]
df_eval.shape
(183383, 24)

Get eval PSSMs

# The kinases were already filtered above, so both thresholds are disabled (None).
pssms = get_cluster_pssms(
    df_eval,
    cluster_col='kinase_id',
    count_thr=None,
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 30.25it/s]
pssms.shape
(333, 943)
# Same call as for `pssms`, but built from the upper-cased site sequences.
# The kinases were already filtered above, so both thresholds are disabled (None).
pssms_upper = get_cluster_pssms(
    df_eval,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 29.50it/s]

LO of eval PSSMs

# LO transform of both PSSM tables; the upper-cased table uses the
# 'STY_upper' site type instead of the default.
LO = get_LO_all(pssms)
LO_upper = get_LO_all(pssms_upper, site_type='STY_upper')
LO.shape
(333, 943)

Remove isoforms and pseudogenes

info = Data.get_kinase_info()
# Keep only rows whose `pseudo` flag is the string '0'.
info = info.loc[info['pseudo'] == '0'].copy()
# Same "<uniprot>_<kinase>" layout as the PSSM index labels.
info['id'] = info['uniprot'] + '_' + info['kinase']
# Show the LO rows whose index label has no match in the filtered info table.
LO.loc[~LO.index.isin(info['id'])]
-20P -20G -20A -20C -20S -20T -20V -20I -20L -20M ... 20H 20K 20R 20Q 20N 20D 20E 20pS 20pT 20pY
P07948-2_LYN -0.695061 0.248942 -0.181606 -0.523522 -0.241880 -0.087184 0.270835 0.248626 -0.177055 0.151905 ... -1.002663 0.187090 -0.131176 -0.477873 0.089392 -0.069890 0.272381 -0.624875 -0.293957 0.686397
O60566_BUB1B -0.098063 0.177522 0.006417 -2.364642 -0.336759 -0.656228 0.622357 0.285512 -1.214575 -1.313708 ... 0.538365 0.313082 -0.184508 -1.521805 0.314920 0.267606 0.993983 -0.586346 -0.015962 -0.647041
P05771-2_PKCB -0.513296 0.082197 -0.419347 0.153007 0.387344 0.446387 1.033094 1.359558 -0.959957 -0.059090 ... 1.086193 -0.296631 -0.264711 -1.710942 0.415289 0.443465 0.117313 0.283410 -0.205099 -0.099214
Q13976-2_PKG1 0.267160 -0.880855 0.326552 -20.552054 -0.980098 -0.599127 0.959565 -0.422922 -0.420509 -0.841570 ... 0.565165 0.388792 0.076757 0.183066 -0.243242 -0.843097 -0.192211 0.440454 -1.311089 1.379757

4 rows × 943 columns

# Apply the same index filter to all four tables: keep only rows whose index
# label appears in info.id.
LO, LO_upper, pssms, pssms_upper = (
    frame[frame.index.isin(info.id)]
    for frame in (LO, LO_upper, pssms, pssms_upper)
)
LO.shape,pssms.shape
((329, 943), (329, 943))
# Persist the filtered PSSMs and their LO transforms. Create out/ first:
# to_parquet does not create missing parent directories.
Path('out').mkdir(parents=True, exist_ok=True)

pssms.to_parquet('out/CDDM_pssms_eval_psp_02.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_eval_upper_psp_02.parquet')

LO.to_parquet('out/CDDM_pssms_LO_eval_psp_02.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_eval_upper_psp_02.parquet')