Get kinase-specific PSSMs

from katlas.core import *
import pandas as pd
df = Data.get_ks_dataset()
CPU times: user 2.9 s, sys: 404 ms, total: 3.31 s
Wall time: 7.36 s
# Build a per-kinase identifier: UniProt accession + '_' + the first
# whitespace-separated token of the kinase protein name.
df['kinase_id'] = df['kinase_uniprot'] + '_' + df['kinase_protein'].str.split().str[0]
# Number of rows per kinase_id.
cnt = df['kinase_id'].value_counts()
# Keep only the kinase ids that have at least 40 rows.
idx = cnt[cnt>=40].index
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
       'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
       'P29323_EPHB2', 'P54762_EPHB1',
       ...
       'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
       'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
       'Q6P0Q8_MAST2', 'O14976_GAK'],
      dtype='object', name='kinase_id', length=333)
df.shape
(187066, 22)

All PSSMs

# Restrict the dataset to the kinase ids selected above (>= 40 rows each);
# .copy() so later column assignments do not write into a slice of the original.
df = df[df['kinase_id'].isin(idx)].copy()
# One PSSM row per kinase_id, computed by katlas' get_cluster_pssms.
pssms_all = get_cluster_pssms(df,
                          cluster_col = 'kinase_id',
                          count_thr=None, # since we already filtered, we set None here
                          valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 27.83it/s]
pssms_all.shape
(333, 943)
# pssms_all.to_parquet('out/CDDM_pssms.parquet')

Uppercase site sequences

# Upper-cased copy of every site sequence, stored as a separate column.
df['site_seq_upper'] = df.site_seq.str.upper()
# Same per-kinase PSSM computation as for pssms_all, but reading the
# upper-cased sequence column.
pssms_all_upper = get_cluster_pssms(df,
                              seq_col='site_seq_upper',
                          cluster_col = 'kinase_id',
                          count_thr=None, # since we already filtered, we set None here
                          valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 27.98it/s]
# pssms_all_upper.to_parquet('out/CDDM_pssms_upper.parquet')

All log-odds

def get_LO_all(pssms,site_type='STY'):
    """Build a log-odds table with one row per row of `pssms`.

    Every row of `pssms` is handed to `get_pssm_LO_flat` together with
    `site_type`; that result is passed through `flatten_pssm`, and the
    flattened outputs are stacked into a DataFrame that reuses the index
    of `pssms`.
    """
    lo_rows = [
        flatten_pssm(get_pssm_LO_flat(row, site_type))
        for _, row in pssms.iterrows()
    ]
    return pd.DataFrame(lo_rows, index=pssms.index)
# Log-odds for the original-case PSSMs (default site_type 'STY').
LO_all = get_LO_all(pssms_all)
# LO_all.to_parquet('out/CDDM_pssms_LO.parquet')
# Log-odds for the upper-cased PSSMs, using the 'STY_upper' site type.
LO_all_upper = get_LO_all(pssms_all_upper,site_type='STY_upper')
# LO_all_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')

Split dataset for eval

# Number of '|'-separated entries in the `source` field of each row.
df['source_len'] = df.source.str.split('|').str.len()
df.source_len.value_counts()
source_len
1    173678
2      5977
3      3232
4      1291
5       901
6       754
7        50
Name: count, dtype: int64
def sample_with_weights(group, frac=0.1, random_state=42):
    """Draw a weighted sample, without replacement, from one group of rows.

    Each row is drawn with probability proportional to its ``source_len``
    value. Intended to be passed to ``DataFrameGroupBy.apply``.

    Args:
        group: DataFrame for a single group. Must contain a ``source_len``
            column and carry a ``name`` attribute holding the group key
            (``groupby(...).apply`` sets it).
        frac: Fraction of the group's rows to draw. The row count is
            truncated to an int and is never less than 1.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        The sampled rows, with a ``kinase_uniprot`` column set to
        ``group.name``.
    """
    # At least 1 row, even if the group is small.
    n = max(1, int(len(group) * frac))

    # Normalize so the weights sum to 1.
    weights = group['source_len'].to_numpy()
    weights = weights / weights.sum()

    sampled = group.sample(
        n=n, weights=weights, replace=False, random_state=random_state
    )

    # Add back the group key as a column.
    sampled['kinase_uniprot'] = group.name
    return sampled
# Draw the held-out test rows separately within each kinase_uniprot group.
# include_groups=False leaves the grouping column out of the frame handed to
# sample_with_weights, which re-adds it from the group key.
test = df.groupby('kinase_uniprot', group_keys=False)\
    .apply(sample_with_weights,include_groups=False)
# test.to_parquet('out/CDDM_test_set.parquet')
test.head()
kin_sub_site substrate_uniprot site source substrate_genes substrate_phosphoseq position site_seq sub_site substrate_sequence ... kinase_family kinase_subfamily kinase_pspa_big kinase_pspa_small kinase_coral_ID num_kin kinase_id site_seq_upper source_len kinase_uniprot
154 O00141_P43243_S188 P43243 S188 Sugiyama MATR3 KIAA0723 MsKsFQQssLsRDsQGHGRDLsAAGIGLLAAAtQsLsMPAsLGRMN... 188 EPPyRVPRDDWEEKRHFRRDsFDDRGPsLNPVLDyDHGsRs P43243_S188 MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN... ... SGK SGK Basophilic Akt/rock SGK1 106 O00141_SGK1 EPPYRVPRDDWEEKRHFRRDSFDDRGPSLNPVLDYDHGSRS 1 O00141
370 O00141_Q9UN36_T330 Q9UN36 T330 GPS6|SIGNOR|ELM|EPSD|PSP NDRG2 KIAA1248 SYLD MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... 330 FLQGMGYMASSCMTRLsRsRtAsLtsAAsVDGNRsRsRtLs Q9UN36_T330 MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... ... SGK SGK Basophilic Akt/rock SGK1 2 O00141_SGK1 FLQGMGYMASSCMTRLSRSRTASLTSAASVDGNRSRSRTLS 5 O00141
299 O00141_Q92597_T356 Q92597 T356 GPS6|SIGNOR|ELM|EPSD|PSP NDRG1 CAP43 DRG1 RTP MsREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... 356 sLDGtRsRsHtSEGTRsRsHtsEGtRsRsHtsEGAHLDItP Q92597_T356 MSREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... ... SGK SGK Basophilic Akt/rock SGK1 2 O00141_SGK1 SLDGTRSRSHTSEGTRSRSHTSEGTRSRSHTSEGAHLDITP 5 O00141
251 O00141_Q15149_S4386 Q15149 S4386 Sugiyama PLEC PLEC1 MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... 4386 ItEFADMLsGNAGGFRsRsssVGssssyPIsPAVsRtQLAs Q15149_S4386 MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... ... SGK SGK Basophilic Akt/rock SGK1 8 O00141_SGK1 ITEFADMLSGNAGGFRSRSSSVGSSSSYPISPAVSRTQLAS 1 O00141
58 O00141_P00338_T322 P00338 T322 Sugiyama LDHA PIG19 MAtLKDQLIyNLLKEEQtPQNKITVVGVGAVGMACAISILMKDLAD... 322 DLVKVTLtsEEEARLKKsADtLWGIQKELQF__________ P00338_T322 MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD... ... SGK SGK Basophilic Akt/rock SGK1 52 O00141_SGK1 DLVKVTLTSEEEARLKKSADTLWGIQKELQF__________ 1 O00141

5 rows × 24 columns

df.shape
(185883, 24)
# Evaluation training set: every row whose index label is not in the test sample.
# NOTE(review): this relies on df's index labels being unique — confirm.
df_eval = df[~df.index.isin(test.index)]
df_eval.shape
(167441, 24)

Get eval PSSMs

# Per-kinase PSSMs computed from the rows that remain after removing the test sample.
pssms_eval = get_cluster_pssms(df_eval,
                          cluster_col = 'kinase_id',
                          count_thr=None, # since we already filtered, we set None here
                          valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:09<00:00, 33.41it/s]
# pssms_eval.to_parquet('out/CDDM_pssms_eval.parquet')
# Same as pssms_eval, but built from the upper-cased site sequences.
pssms_eval_upper = get_cluster_pssms(df_eval,
                                     seq_col='site_seq_upper',
                          cluster_col = 'kinase_id',
                          count_thr=None, # since we already filtered, we set None here
                          valid_thr=None)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 28.91it/s]
# NOTE(review): unlike the other PSSM/log-odds exports above and just below,
# which are commented out, this one is active and rewrites the file on every
# run — confirm that is intended.
pssms_eval_upper.to_parquet('out/CDDM_pssms_eval_upper.parquet')

LO of eval PSSMs

# Log-odds for the eval PSSMs (default site_type 'STY').
LO_eval = get_LO_all(pssms_eval)
# LO_eval.to_parquet('out/CDDM_pssms_LO_eval.parquet')
# Log-odds for the upper-cased eval PSSMs, using the 'STY_upper' site type.
LO_eval_upper = get_LO_all(pssms_eval_upper,'STY_upper')
# LO_eval_upper.to_parquet('out/CDDM_pssms_LO_eval_upper.parquet')

Eval: keep only the kinases that are on the kinome tree

import pandas as pd
from katlas.data import *

CDDM

# Reload the previously exported CDDM tables: log-odds and PSSMs, each in
# original-case and upper-case variants.
LO = pd.read_parquet('out/CDDM_pssms_LO.parquet')
LO_upper = pd.read_parquet('out/CDDM_pssms_LO_upper.parquet')
pssms = pd.read_parquet('out/CDDM_pssms.parquet')
pssms_upper = pd.read_parquet('out/CDDM_pssms_upper.parquet')
LO.shape,pssms.shape
((333, 943), (333, 943))
info= Data.get_kinase_info()
# Keep rows whose `pseudo` field is the string '0'
# (presumably the non-pseudokinases — confirm against the katlas kinase-info schema).
info = info[info.pseudo=='0'].copy()
# Same '<uniprot>_<kinase>' format as the index of the CDDM tables.
info['id'] = info.uniprot+'_'+info.kinase
# Show the CDDM rows whose id has no match in the kinase info table.
LO[~LO.index.isin(info.id)]
-20P -20G -20A -20C -20S -20T -20V -20I -20L -20M ... 20H 20K 20R 20Q 20N 20D 20E 20pS 20pT 20pY
index
P07948-2_LYN -0.695061 0.248942 -0.181606 -0.523522 -0.241880 -0.087184 0.270835 0.248626 -0.177055 0.151905 ... -1.002663 0.187090 -0.131176 -0.477873 0.089392 -0.069890 0.272381 -0.624875 -0.293957 0.686397
O60566_BUB1B -0.098063 0.177522 0.006417 -2.364642 -0.336759 -0.656228 0.622357 0.285512 -1.214575 -1.313708 ... 0.538365 0.313082 -0.184508 -1.521805 0.314920 0.267606 0.993983 -0.586346 -0.015962 -0.647041
P05771-2_PKCB -0.513296 0.082197 -0.419347 0.153007 0.387344 0.446387 1.033094 1.359558 -0.959957 -0.059090 ... 1.086193 -0.296631 -0.264711 -1.710942 0.415289 0.443465 0.117313 0.283410 -0.205099 -0.099214
Q13976-2_PKG1 0.267160 -0.880855 0.326552 -20.552054 -0.980098 -0.599127 0.959565 -0.422922 -0.420509 -0.841570 ... 0.565165 0.388792 0.076757 0.183066 -0.243242 -0.843097 -0.192211 0.440454 -1.311089 1.379757

4 rows × 943 columns

# Keep only the rows whose id is present in the kinase info table,
# applying the same filter to all four tables.
LO = LO[LO.index.isin(info.id)]
LO_upper =LO_upper[LO_upper.index.isin(info.id)]
pssms= pssms[pssms.index.isin(info.id)]
pssms_upper = pssms_upper[pssms_upper.index.isin(info.id)]
LO.shape,pssms.shape
((329, 943), (329, 943))
# Write the filtered tables back to the same paths they were loaded from,
# overwriting the unfiltered files.
LO.to_parquet('out/CDDM_pssms_LO.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')
pssms.to_parquet('out/CDDM_pssms.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_upper.parquet')

Repeat the same filtering and re-export for the `_eval` parquet files.

Overlap

# Load the PSPA table bundled with katlas
# (presumably the normalized PSPA matrices for all kinases — confirm against katlas docs).
pspa = Data.get_pspa_all_norm()
pspa.shape
(396, 236)
# Drop every PSPA row whose index label contains an underscore,
# i.e. the entries carrying a `_TYR` suffix.
pspa = pspa[~pspa.index.str.contains('_')]
pspa.shape
(381, 236)
# Lookup from kinase name to the '<uniprot>_<kinase>' id.
idx_map = info.set_index('kinase').id
# Relabel the PSPA rows with those ids so they can be matched against the CDDM index.
pspa.index = pspa.index.map(idx_map)
# Sanity check: count of duplicated ids after relabeling.
pspa.index.duplicated().sum()
np.int64(0)
pspa.index.isna().sum()
np.int64(0)

In CDDM but not in PSPA:

Some of these kinases are present in PSPA only under a `_TYR` suffix (for example LIMK1/2 and the MAP2Ks), and those entries were dropped above.

LO[~LO.index.isin(pspa.index)]
-20P -20G -20A -20C -20S -20T -20V -20I -20L -20M ... 20H 20K 20R 20Q 20N 20D 20E 20pS 20pT 20pY
index
P53667_LIMK1 -0.743331 0.425765 0.460108 -0.617596 -0.152646 -0.530667 -0.174914 0.667565 -0.286954 0.696375 ... -0.264557 0.331659 0.416959 -0.865297 0.511997 -0.585357 -0.205241 0.102584 0.766077 1.331394
Q6PHR2_ULK3 -0.490634 -0.087072 0.029732 -0.867398 0.123012 -0.158981 0.634177 0.110333 -0.887254 -0.986387 ... 0.814737 0.168879 -1.088708 -0.245434 0.813684 0.069440 0.504820 0.011954 0.845372 0.688223
Q9Y3S1_WNK2 -0.549822 0.658648 -0.455872 0.438409 -2.556071 0.525339 0.755560 0.849101 -0.617971 -1.417543 ... 1.166616 0.338166 -0.973869 -1.031058 1.542632 0.205812 -1.380340 -0.095599 -0.847142 1.165632
Q9HBY8_SGK2 -0.015252 -0.725862 0.230006 -0.889517 -0.392146 -0.011175 1.460055 -1.326822 -0.739447 -1.423542 ... 0.249261 -1.512075 -0.016755 -0.395873 -1.144109 -0.284533 0.676309 1.561955 -1.890026 0.800819
P30291_WEE1 -1.324755 -0.357293 0.466785 -1.783981 -0.649181 -0.753637 -0.144904 -0.899360 -0.218875 -0.318009 ... -1.155933 1.212013 -0.322414 -1.216105 -0.156987 -1.242268 -0.465851 0.041283 0.874701 -1.019412
Q15569_TESK1 -1.198287 0.147686 0.654654 0.190482 -0.067034 -0.101100 1.033094 -0.509857 -1.507445 -0.343544 ... 1.711714 0.490946 -0.066201 -2.807887 -0.234197 -0.155980 0.264293 0.034462 0.282917 1.388802
P45985_MAP2K4 -0.646683 0.642707 0.708758 1.701443 -0.456536 0.924434 0.073736 -0.543216 -0.081372 -0.639936 ... 0.073312 0.311976 -0.192703 0.013141 0.001870 -0.334951 -0.406530 0.685566 1.518984 -19.873740
P53671_LIMK2 0.402872 0.611342 1.044310 -0.930824 -0.603378 0.584948 -1.166214 0.216833 -0.780754 -21.086079 ... 0.944579 0.768205 0.593674 -2.437519 -1.185757 -1.200649 0.187203 0.627222 0.653286 -0.240828
O14733_MAP2K7 0.561353 0.376813 0.262292 1.687088 0.277569 1.073578 -2.455191 -0.487182 -0.069732 -21.086079 ... 0.610969 0.586599 0.859526 -0.449202 0.217599 -0.060328 -0.546945 1.223223 -0.528320 -19.873740
P46734_MAP2K3 -0.610158 0.816736 -0.435288 1.515577 -0.156976 0.545923 -1.304775 0.400200 -0.111960 -21.086079 ... 0.207613 0.253632 0.041133 0.954797 -1.600794 -2.200649 0.120089 -0.010207 0.653286 0.344134
P52564_MAP2K6 -0.206700 0.608759 1.024753 1.597106 -0.490484 0.627453 -22.372406 0.159802 -0.252823 -21.086079 ... 0.207613 0.031240 0.041133 0.147442 -0.015832 -2.200649 -0.365338 0.726758 1.653285 -19.873740
Q59H18_TNNI3K -1.210887 0.341537 1.642054 -20.552054 -22.224607 -21.843636 -0.320542 -1.522457 -0.198117 -21.086079 ... -21.010260 0.301329 -2.010705 -1.582468 -0.330705 0.654402 0.582823 -1.325080 0.923375 2.199185
Q15120_PDHK3 -0.583490 0.120938 1.491844 0.957282 -0.130308 -21.843636 0.944285 0.841906 -0.892647 -21.086079 ... -21.010260 0.506149 -1.805885 -0.377649 -0.125886 -22.418522 -0.212357 0.879739 -20.564588 0.819042
Q01974_ROR2 -1.935252 0.839564 -0.497349 0.190482 -0.482071 -0.101100 2.070569 -21.989358 -0.659448 -0.343544 ... -21.010260 1.151753 -0.382674 -22.070431 -0.702674 -1.302529 2.018209 -0.697049 -20.564588 -19.873740
Q99640_PKMYT1 -1.687325 -0.719863 1.165616 -20.552054 -22.224607 0.731789 1.203019 -0.998895 -0.996483 -21.086079 ... 0.010576 1.571168 0.107131 -22.070431 -0.797831 -1.397686 -1.884303 0.207793 0.456249 -19.873740
Q01973_ROR1 0.540085 0.507547 -1.606973 -20.552054 -1.006733 1.959200 1.652822 -0.771485 -0.769072 -21.086079 ... -21.010260 0.066864 -0.245171 -22.070431 0.434829 -1.165025 0.670286 1.762382 -20.564588 1.379757
Q6P0Q8_MAST2 -0.528628 0.760762 -0.090724 -20.552054 -1.075446 0.890487 0.361717 -21.989358 -0.837785 1.063081 ... -21.010260 -0.553723 1.889130 -22.070431 0.399205 -1.200649 -1.687266 -0.595170 -20.564588 -19.873740

17 rows × 943 columns

In PSPA but not in CDDM

pspa[~pspa.index.isin(LO.index)]
-5P -5G -5A -5C -5S -5T -5V -5I -5L -5M ... 5H 5K 5R 5Q 5N 5D 5E 5s 5t 5y
kinase
Q2M2I8_AAK1 0.0720 0.0245 0.0284 0.0456 0.0425 0.0425 0.0951 0.1554 0.0993 0.0864 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q13705_ACVR2B 0.0533 0.0517 0.0566 0.0772 0.0533 0.0533 0.0543 0.0442 0.0471 0.0516 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q96QP1_ALPHAK3 0.0571 0.0478 0.0253 0.0384 0.0571 0.0571 0.0586 0.0602 0.0394 0.0673 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
P57078_ANKRD3 0.0542 0.0555 0.0611 0.0521 0.0554 0.0554 0.0509 0.0515 0.0545 0.0534 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
O14874_BCKDK 0.0482 0.0672 0.0598 0.0694 0.0566 0.0566 0.0517 0.0467 0.0505 0.0566 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Q8IYT8_ULK2 0.0593 0.0724 0.0812 0.0682 0.0603 0.0603 0.0479 0.0399 0.0440 0.0415 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q96J92_WNK4 0.0369 0.0523 0.0539 0.0544 0.0580 0.0580 0.0524 0.0631 0.0668 0.0580 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q9NY57_YANK2 0.0580 0.0699 0.0637 0.0602 0.0580 0.0580 0.0433 0.0470 0.0459 0.0469 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q86UX6_YANK3 0.0625 0.0776 0.0647 0.0598 0.0545 0.0545 0.0502 0.0537 0.0561 0.0543 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Q56UN5_YSK4 0.0593 0.0728 0.0744 0.0734 0.0597 0.0597 0.0517 0.0400 0.0433 0.0512 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

69 rows × 236 columns

# CDDM log-odds rows for the kinases that are also present in PSPA.
overlap_cddm = LO[LO.index.isin(pspa.index)].copy()
overlap_cddm.shape
(312, 943)
# PSPA rows for the kinases that are also present in the CDDM log-odds table.
overlap_pspa = pspa[pspa.index.isin(LO.index)].copy()
overlap_pspa.shape
(312, 236)
# Export the two overlapping tables.
# overlap_cddm.to_parquet('raw/overlap_cddm_eval.parquet')
overlap_cddm.to_parquet('raw/overlap_cddm.parquet')
overlap_pspa.to_parquet('raw/overlap_pspa.parquet')