from katlas.core import *
import pandas as pd
Get kinase-specific PSSMs
# Load the kinase-substrate dataset into a DataFrame.
df = Data.get_ks_dataset()
CPU times: user 878 ms, sys: 338 ms, total: 1.22 s
Wall time: 5.63 s
# Composite kinase identifier: UniProt accession + '_' + the first
# whitespace-separated token of the kinase protein name.
df['kinase_id'] = df['kinase_uniprot'] + '_' + df['kinase_protein'].str.split().str[0]

# Number of rows recorded for each kinase.
cnt = df['kinase_id'].value_counts()

# Keep only kinases that have at least 40 rows.
idx = cnt[cnt>=40].index
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
'P29323_EPHB2', 'P54762_EPHB1',
...
'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
'Q6P0Q8_MAST2', 'O14976_GAK'],
dtype='object', name='kinase_id', length=333)
df.shape
(187066, 22)
We only evaluate kinases that are on the kinome tree; the filter below keeps the kinases in `idx` (those with at least 40 rows).
# Restrict the dataset to the selected kinases; .copy() so later column
# assignments do not operate on a view of the original frame.
df = df[df['kinase_id'].isin(idx)].copy()
df.shape
(185883, 22)
# Upper-cased copy of the site sequence.
df['site_seq_upper'] = df.site_seq.str.upper()
# Number of '|'-separated entries in the `source` field.
df['source_len'] = df.source.str.split('|').str.len()
def get_LO_all(pssms, site_type='STY'):
    """Apply `get_pssm_LO_flat` followed by `flatten_pssm` to every row of `pssms`.

    Parameters
    ----------
    pssms : pd.DataFrame
        One flattened PSSM per row.
    site_type : str, default 'STY'
        Passed through unchanged as the second argument of `get_pssm_LO_flat`.

    Returns
    -------
    pd.DataFrame
        One row per input row, carrying the same index as `pssms`.

    NOTE(review): `get_pssm_LO_flat` / `flatten_pssm` are not defined in this
    file (presumably they come from `katlas.core`); "LO" presumably means
    log-odds -- confirm against that module.
    """
    rows = [
        flatten_pssm(get_pssm_LO_flat(flat_pssm, site_type))
        for _, flat_pssm in pssms.iterrows()
    ]
    return pd.DataFrame(rows, index=pssms.index)
Hold out 20% of the PSP-sourced rows of each kinase as the test set
# Rows whose `source` field mentions 'PSP'.
psp = df[df.source.str.contains('PSP')].copy()
def sample(group):
    """Draw a reproducible 20% sample (at least one row) from one group.

    Intended to be used as the callable of
    ``groupby('kinase_uniprot').apply(..., include_groups=False)``.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single group. `group.name` must hold the group key;
        `groupby(...).apply` sets it on each group it passes in.

    Returns
    -------
    pd.DataFrame
        The sampled rows with the group key restored in a
        'kinase_uniprot' column.
    """
    # At least 1 row, even when the group is small.
    n = max(1, int(len(group) * 0.2))

    # Sampling is uniform and without replacement. An earlier draft
    # weighted rows by `source_len`; that is not applied here.
    sampled = group.sample(n=n, replace=False, random_state=42)

    # Add back the group key as a column.
    sampled['kinase_uniprot'] = group.name
    return sampled
# Sample within each kinase. With include_groups=False the grouping column
# is not part of the frame handed to `sample`, which is why `sample`
# restores it from the group name.
test = psp.groupby('kinase_uniprot', group_keys=False)\
          .apply(sample,include_groups=False)
test.kinase_group.value_counts()
kinase_group
CMGC 777
AGC 511
TK 387
CAMK 357
Atypical 148
Other 108
STE 89
CK1 63
TKL 44
NEK 16
Name: count, dtype: int64
test
kin_sub_site | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | ... | kinase_family | kinase_subfamily | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | kinase_id | site_seq_upper | source_len | kinase_uniprot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
160 | O00141_P46527_T157 | P46527 | T157 | SIGNOR|EPSD|PSP | CDKN1B KIP1 p27 | MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... | 157 | PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG | P46527_T157 | MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 6 | O00141_SGK1 | PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG | 3 | O00141 |
309 | O00141_Q96J92_S1217 | Q96J92 | S1217 | EPSD|PSP | WNK4 PRKWNK4 | MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... | 1217 | SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA | Q96J92_S1217 | MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 1 | O00141_SGK1 | SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA | 2 | O00141 |
369 | O00141_Q9UN36_S346 | Q9UN36 | S346 | EPSD|PSP | NDRG2 KIAA1248 SYLD | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | 346 | sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT | Q9UN36_S346 | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 1 | O00141_SGK1 | SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT | 2 | O00141 |
39 | O00141_O60343_T642 | O60343 | T642 | GPS6|PSP | TBC1D4 AS160 KIAA0603 | MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... | 642 | AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ | O60343_T642 | MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 4 | O00141_SGK1 | AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ | 2 | O00141 |
314 | O00141_Q96PU5_S448 | Q96PU5 | S448 | GPS6|SIGNOR|ELM|EPSD|PSP | NEDD4L KIAA0439 NEDL3 | MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... | 448 | sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR | Q96PU5_S448 | MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 4 | O00141_SGK1 | SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR | 5 | O00141 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
185492 | Q9Y4K4_P49841_S9 | P49841 | S9 | SIGNOR|EPSD|PSP | GSK3B | MSGRPRttsFAEsCKPVQQPsAFGsMKVSRDKDGSKVTTVVAtPGQ... | 9 | ____________MSGRPRttsFAEsCKPVQQPsAFGsMKVS | P49841_S9 | MSGRPRTTSFAESCKPVQQPSAFGSMKVSRDKDGSKVTTVVATPGQ... | ... | STE20 | KHS | Map4k | Map4k | KHS1 | 25 | Q9Y4K4_KHS1 | ____________MSGRPRTTSFAESCKPVQQPSAFGSMKVS | 3 | Q9Y4K4 |
185874 | Q9Y5S2_O43255_S6 | O43255 | S6 | PSP | SIAH2 | MSRPsstGPsANKPCsKQPPPQPQHtPsPAAPPAAATISAAGPGSS... | 6 | _______________MSRPsstGPsANKPCsKQPPPQPQHt | O43255_S6 | MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... | ... | DMPK | GEK | Basophilic | Akt/rock | MRCKb | 1 | Q9Y5S2_MRCKB | _______________MSRPSSTGPSANKPCSKQPPPQPQHT | 1 | Q9Y5S2 |
186807 | Q9Y6E0_Q9Y2K2_T221 | Q9Y2K2 | T221 | PSP | SIK3 KIAA0999 QSK L19 | MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG... | 221 | NIKIADFGFSNLFTPGQLLKtWCGSPPYAAPELFEGKEYDG | Q9Y2K2_T221 | MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG... | ... | STE20 | YSK | Map4k | Map4k | MST3 | 5 | Q9Y6E0_MST3 | NIKIADFGFSNLFTPGQLLKTWCGSPPYAAPELFEGKEYDG | 1 | Q9Y6E0 |
186392 | Q9Y6E0_P57059_T182 | P57059 | T182 | PSP | SIK1 SIK SNF1LK | MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH... | 182 | DIKLADFGFGNFYKSGEPLStWCGsPPYAAPEVFEGKEYEG | P57059_T182 | MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH... | ... | STE20 | YSK | Map4k | Map4k | MST3 | 3 | Q9Y6E0_MST3 | DIKLADFGFGNFYKSGEPLSTWCGSPPYAAPEVFEGKEYEG | 1 | Q9Y6E0 |
186845 | Q9Y6M4_O75581_S1490 | O75581 | S1490 | PSP | LRP6 | MGAVLRSLLACSFCVLLRAAPLLLYANRRDLRLVDATNGKENATIV... | 1490 | SSSSssTKGtYFPAILNPPPsPAtERsHYTMEFGYSsNsPs | O75581_S1490 | MGAVLRSLLACSFCVLLRAAPLLLYANRRDLRLVDATNGKENATIV... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 12 | Q9Y6M4_CK1G3 | SSSSSSTKGTYFPAILNPPPSPATERSHYTMEFGYSSNSPS | 1 | Q9Y6M4 |
2500 rows × 24 columns
# Persist the held-out test set.
test.to_parquet('out/CDDM_test_set.parquet')
test.head()
kin_sub_site | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | ... | kinase_family | kinase_subfamily | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | kinase_id | site_seq_upper | source_len | kinase_uniprot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
160 | O00141_P46527_T157 | P46527 | T157 | SIGNOR|EPSD|PSP | CDKN1B KIP1 p27 | MSNVRVSNGsPsLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... | 157 | PsDsQTGLAEQCAGIRKRPAtDDSSTQNKRANRTEENVsDG | P46527_T157 | MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 6 | O00141_SGK1 | PSDSQTGLAEQCAGIRKRPATDDSSTQNKRANRTEENVSDG | 3 | O00141 |
309 | O00141_Q96J92_S1217 | Q96J92 | S1217 | EPSD|PSP | WNK4 PRKWNK4 | MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... | 1217 | SRRNsLQRSEPPGPGIMRRNsLsGsSTGSQEQRASKGVTFA | Q96J92_S1217 | MLASPATETTVLMSQTEADLALRPPPPLGTAGQPRLGPPPRRARRF... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 1 | O00141_SGK1 | SRRNSLQRSEPPGPGIMRRNSLSGSSTGSQEQRASKGVTFA | 2 | O00141 |
369 | O00141_Q9UN36_S346 | Q9UN36 | S346 | EPSD|PSP | NDRG2 KIAA1248 SYLD | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | 346 | sRsRtAsLtsAAsVDGNRsRsRtLsQssEsGtLsSGPPGHT | Q9UN36_S346 | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 1 | O00141_SGK1 | SRSRTASLTSAASVDGNRSRSRTLSQSSESGTLSSGPPGHT | 2 | O00141 |
39 | O00141_O60343_T642 | O60343 | T642 | GPS6|PSP | TBC1D4 AS160 KIAA0603 | MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... | 642 | AWQTFPEEDSDSPQFRRRAHtFsHPPsstKRKLNLQDGRAQ | O60343_T642 | MEPPSCIQDEPFPHPLEPEPGVSAQPGPGKPSDKRFRLWYVGGSCL... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 4 | O00141_SGK1 | AWQTFPEEDSDSPQFRRRAHTFSHPPSSTKRKLNLQDGRAQ | 2 | O00141 |
314 | O00141_Q96PU5_S448 | Q96PU5 | S448 | GPS6|SIGNOR|ELM|EPSD|PSP | NEDD4L KIAA0439 NEDL3 | MATGLGEPVYGLsEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... | 448 | sATNSNNHLIEPQIRRPRsLssPtVTLSAPLEGAKDsPVRR | Q96PU5_S448 | MATGLGEPVYGLSEDEGESRILRVKVVSGIDLAKKDIFGASDPYVK... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 4 | O00141_SGK1 | SATNSNNHLIEPQIRRPRSLSSPTVTLSAPLEGAKDSPVRR | 5 | O00141 |
5 rows × 24 columns
df.shape
(185883, 24)
# Everything that was not sampled into the test set (matched by row index).
df_eval = df[~df.index.isin(test.index)]
df_eval.shape
(183383, 24)
Get eval PSSMs
# One PSSM per kinase_id, built from the evaluation rows.
pssms = get_cluster_pssms(df_eval,
                          cluster_col='kinase_id',
                          count_thr=None,  # since we already filtered, we set None here
                          valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 30.25it/s]
pssms.shape
(333, 943)
# Same as above, but computed from the upper-cased site sequences.
pssms_upper = get_cluster_pssms(df_eval,
                                seq_col='site_seq_upper',
                                cluster_col='kinase_id',
                                count_thr=None,  # since we already filtered, we set None here
                                valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 29.50it/s]
LO of eval PSSMs
# LO transform of both PSSM sets; the upper-cased set uses site_type 'STY_upper'.
LO = get_LO_all(pssms)
LO_upper = get_LO_all(pssms_upper,'STY_upper')
LO.shape
(333, 943)
Remove isoforms and pseudogenes
info = Data.get_kinase_info()
# Keep rows whose `pseudo` flag equals the string '0'.
info = info[info.pseudo=='0'].copy()
# Same "<uniprot>_<kinase>" format as the PSSM index.
info['id'] = info.uniprot+'_'+info.kinase
# Show the PSSM rows whose id is absent from `info` (these get dropped next).
LO[~LO.index.isin(info.id)]
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
P07948-2_LYN | -0.695061 | 0.248942 | -0.181606 | -0.523522 | -0.241880 | -0.087184 | 0.270835 | 0.248626 | -0.177055 | 0.151905 | ... | -1.002663 | 0.187090 | -0.131176 | -0.477873 | 0.089392 | -0.069890 | 0.272381 | -0.624875 | -0.293957 | 0.686397 |
O60566_BUB1B | -0.098063 | 0.177522 | 0.006417 | -2.364642 | -0.336759 | -0.656228 | 0.622357 | 0.285512 | -1.214575 | -1.313708 | ... | 0.538365 | 0.313082 | -0.184508 | -1.521805 | 0.314920 | 0.267606 | 0.993983 | -0.586346 | -0.015962 | -0.647041 |
P05771-2_PKCB | -0.513296 | 0.082197 | -0.419347 | 0.153007 | 0.387344 | 0.446387 | 1.033094 | 1.359558 | -0.959957 | -0.059090 | ... | 1.086193 | -0.296631 | -0.264711 | -1.710942 | 0.415289 | 0.443465 | 0.117313 | 0.283410 | -0.205099 | -0.099214 |
Q13976-2_PKG1 | 0.267160 | -0.880855 | 0.326552 | -20.552054 | -0.980098 | -0.599127 | 0.959565 | -0.422922 | -0.420509 | -0.841570 | ... | 0.565165 | 0.388792 | 0.076757 | 0.183066 | -0.243242 | -0.843097 | -0.192211 | 0.440454 | -1.311089 | 1.379757 |
4 rows × 943 columns
# Keep only the kinases whose id is present in `info`, in all four tables.
LO = LO[LO.index.isin(info.id)]
LO_upper = LO_upper[LO_upper.index.isin(info.id)]

pssms = pssms[pssms.index.isin(info.id)]
pssms_upper = pssms_upper[pssms_upper.index.isin(info.id)]
LO.shape,pssms.shape
((329, 943), (329, 943))
# Persist the evaluation PSSMs (original-case and upper-case variants).
pssms.to_parquet('out/CDDM_pssms_eval_psp_02.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_eval_upper_psp_02.parquet')

# Persist the corresponding LO matrices.
LO.to_parquet('out/CDDM_pssms_LO_eval_psp_02.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_eval_upper_psp_02.parquet')