from katlas.core import *
import pandas as pd
Get kinase-specific PSSMs
# Load the kinase-substrate dataset through katlas' Data helper.
df = Data.get_ks_dataset()
CPU times: user 2.9 s, sys: 404 ms, total: 3.31 s
Wall time: 7.36 s
# Unique kinase identifier: UniProt accession + '_' + first whitespace-separated
# token of the protein name (e.g. 'P12931_SRC').
df['kinase_id'] = df['kinase_uniprot'] + '_' + df['kinase_protein'].str.split().str[0]

# Number of rows per kinase identifier.
cnt = df['kinase_id'].value_counts()

# Keep only kinase identifiers that occur at least 40 times.
idx = cnt[cnt >= 40].index
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
'P29323_EPHB2', 'P54762_EPHB1',
...
'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
'Q6P0Q8_MAST2', 'O14976_GAK'],
dtype='object', name='kinase_id', length=333)
# Display (rows, columns) of the full dataset before filtering.
df.shape
(187066, 22)
All PSSMs
# Restrict the dataset to the kinases selected in `idx`.
df = df[df['kinase_id'].isin(idx)].copy()

# Build one PSSM per kinase_id.
pssms_all = get_cluster_pssms(
    df,
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 27.83it/s]
# Display (rows, columns) of the PSSM table.
pssms_all.shape
(333, 943)
# pssms_all.to_parquet('out/CDDM_pssms.parquet')
Upper
# Upper-cased copy of the site sequence.
df['site_seq_upper'] = df.site_seq.str.upper()

# Same per-kinase PSSMs as above, but built from the upper-cased sequences.
pssms_all_upper = get_cluster_pssms(
    df,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 27.98it/s]
# pssms_all_upper.to_parquet('out/CDDM_pssms_upper.parquet')
All log-odds
def get_LO_all(pssms, site_type='STY'):
    """Apply the log-odds transform to every row of a flattened-PSSM table.

    Each row of `pssms` is passed to `get_pssm_LO_flat(row, site_type)` and the
    result is flattened again with `flatten_pssm`.

    Parameters
    ----------
    pssms : pd.DataFrame
        One flattened PSSM per row.
    site_type : str, default 'STY'
        Forwarded unchanged to `get_pssm_LO_flat`.

    Returns
    -------
    pd.DataFrame
        One transformed row per input row, indexed like `pssms`.
    """
    rows = [
        flatten_pssm(get_pssm_LO_flat(flat_pssm, site_type))
        for _, flat_pssm in pssms.iterrows()
    ]
    return pd.DataFrame(rows, index=pssms.index)
# Log-odds tables for the original-case and upper-case PSSMs.
LO_all = get_LO_all(pssms_all)
# LO_all.to_parquet('out/CDDM_pssms_LO.parquet')
LO_all_upper = get_LO_all(pssms_all_upper, site_type='STY_upper')
# LO_all_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')
Split dataset for eval
# Number of '|'-separated entries in the `source` column for each row.
df['source_len'] = df.source.str.split('|').str.len()
df.source_len.value_counts()
source_len
1 173678
2 5977
3 3232
4 1291
5 901
6 754
7 50
Name: count, dtype: int64
def sample_with_weights(group):
    """Draw a reproducible ~10% weighted sample from one groupby group.

    Rows are drawn without replacement, with probability proportional to
    their `source_len` value.

    Parameters
    ----------
    group : pd.DataFrame
        One group as passed by `DataFrameGroupBy.apply`; it must have a
        `source_len` column and a `.name` attribute holding the group key.

    Returns
    -------
    pd.DataFrame
        The sampled rows, with the group key restored in a
        `kinase_uniprot` column.
    """
    n = max(1, int(len(group) * 0.1))  # At least 1 row if group is small
    weights = group['source_len'].values
    weights = weights / weights.sum()  # normalize

    sampled = group.sample(
        n=n, weights=weights, replace=False, random_state=42
    )
    # Add back the group key as a column
    sampled['kinase_uniprot'] = group.name
    return sampled
# Hold out a weighted sample per kinase as the test set.
# include_groups=False drops the grouping column from each group, which is why
# sample_with_weights adds `kinase_uniprot` back.
test = (
    df.groupby('kinase_uniprot', group_keys=False)
    .apply(sample_with_weights, include_groups=False)
)
# test.to_parquet('out/CDDM_test_set.parquet')
test.head()
kin_sub_site | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | ... | kinase_family | kinase_subfamily | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | kinase_id | site_seq_upper | source_len | kinase_uniprot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
154 | O00141_P43243_S188 | P43243 | S188 | Sugiyama | MATR3 KIAA0723 | MsKsFQQssLsRDsQGHGRDLsAAGIGLLAAAtQsLsMPAsLGRMN... | 188 | EPPyRVPRDDWEEKRHFRRDsFDDRGPsLNPVLDyDHGsRs | P43243_S188 | MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 106 | O00141_SGK1 | EPPYRVPRDDWEEKRHFRRDSFDDRGPSLNPVLDYDHGSRS | 1 | O00141 |
370 | O00141_Q9UN36_T330 | Q9UN36 | T330 | GPS6|SIGNOR|ELM|EPSD|PSP | NDRG2 KIAA1248 SYLD | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | 330 | FLQGMGYMASSCMTRLsRsRtAsLtsAAsVDGNRsRsRtLs | Q9UN36_T330 | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 2 | O00141_SGK1 | FLQGMGYMASSCMTRLSRSRTASLTSAASVDGNRSRSRTLS | 5 | O00141 |
299 | O00141_Q92597_T356 | Q92597 | T356 | GPS6|SIGNOR|ELM|EPSD|PSP | NDRG1 CAP43 DRG1 RTP | MsREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... | 356 | sLDGtRsRsHtSEGTRsRsHtsEGtRsRsHtsEGAHLDItP | Q92597_T356 | MSREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 2 | O00141_SGK1 | SLDGTRSRSHTSEGTRSRSHTSEGTRSRSHTSEGAHLDITP | 5 | O00141 |
251 | O00141_Q15149_S4386 | Q15149 | S4386 | Sugiyama | PLEC PLEC1 | MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... | 4386 | ItEFADMLsGNAGGFRsRsssVGssssyPIsPAVsRtQLAs | Q15149_S4386 | MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 8 | O00141_SGK1 | ITEFADMLSGNAGGFRSRSSSVGSSSSYPISPAVSRTQLAS | 1 | O00141 |
58 | O00141_P00338_T322 | P00338 | T322 | Sugiyama | LDHA PIG19 | MAtLKDQLIyNLLKEEQtPQNKITVVGVGAVGMACAISILMKDLAD... | 322 | DLVKVTLtsEEEARLKKsADtLWGIQKELQF__________ | P00338_T322 | MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 52 | O00141_SGK1 | DLVKVTLTSEEEARLKKSADTLWGIQKELQF__________ | 1 | O00141 |
5 rows × 24 columns
# Display (rows, columns) of the filtered dataset.
df.shape
(185883, 24)
# Evaluation set: every row whose index label is not in the held-out test set.
df_eval = df[~df.index.isin(test.index)]
df_eval.shape
(167441, 24)
Get eval PSSMs
# Per-kinase PSSMs computed from the evaluation rows only.
pssms_eval = get_cluster_pssms(
    df_eval,
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:09<00:00, 33.41it/s]
# pssms_eval.to_parquet('out/CDDM_pssms_eval.parquet')
# Per-kinase PSSMs from the upper-cased sequences of the evaluation rows.
pssms_eval_upper = get_cluster_pssms(
    df_eval,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 333/333 [00:11<00:00, 28.91it/s]
# Persist the upper-case evaluation PSSMs.
pssms_eval_upper.to_parquet('out/CDDM_pssms_eval_upper.parquet')
LO of eval PSSMs
# Log-odds tables for the evaluation PSSMs (original-case and upper-case).
LO_eval = get_LO_all(pssms_eval)
# LO_eval.to_parquet('out/CDDM_pssms_LO_eval.parquet')
LO_eval_upper = get_LO_all(pssms_eval_upper, 'STY_upper')
# LO_eval_upper.to_parquet('out/CDDM_pssms_LO_eval_upper.parquet')
Eval: Filter those on kinome tree
import pandas as pd
from katlas.data import *
CDDM
# Reload the previously saved log-odds and PSSM tables.
LO = pd.read_parquet('out/CDDM_pssms_LO.parquet')
LO_upper = pd.read_parquet('out/CDDM_pssms_LO_upper.parquet')
pssms = pd.read_parquet('out/CDDM_pssms.parquet')
pssms_upper = pd.read_parquet('out/CDDM_pssms_upper.parquet')
LO.shape, pssms.shape
((333, 943), (333, 943))
# Kinase annotation table; keep rows whose `pseudo` flag is the string '0'
# (presumably the non-pseudokinases - confirm against katlas docs).
info = Data.get_kinase_info()
info = info[info.pseudo == '0'].copy()

# Same '<uniprot>_<kinase>' identifier format as the PSSM index.
info['id'] = info.uniprot + '_' + info.kinase

# Show the PSSM rows whose identifier is absent from the annotation table.
LO[~LO.index.isin(info.id)]
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
P07948-2_LYN | -0.695061 | 0.248942 | -0.181606 | -0.523522 | -0.241880 | -0.087184 | 0.270835 | 0.248626 | -0.177055 | 0.151905 | ... | -1.002663 | 0.187090 | -0.131176 | -0.477873 | 0.089392 | -0.069890 | 0.272381 | -0.624875 | -0.293957 | 0.686397 |
O60566_BUB1B | -0.098063 | 0.177522 | 0.006417 | -2.364642 | -0.336759 | -0.656228 | 0.622357 | 0.285512 | -1.214575 | -1.313708 | ... | 0.538365 | 0.313082 | -0.184508 | -1.521805 | 0.314920 | 0.267606 | 0.993983 | -0.586346 | -0.015962 | -0.647041 |
P05771-2_PKCB | -0.513296 | 0.082197 | -0.419347 | 0.153007 | 0.387344 | 0.446387 | 1.033094 | 1.359558 | -0.959957 | -0.059090 | ... | 1.086193 | -0.296631 | -0.264711 | -1.710942 | 0.415289 | 0.443465 | 0.117313 | 0.283410 | -0.205099 | -0.099214 |
Q13976-2_PKG1 | 0.267160 | -0.880855 | 0.326552 | -20.552054 | -0.980098 | -0.599127 | 0.959565 | -0.422922 | -0.420509 | -0.841570 | ... | 0.565165 | 0.388792 | 0.076757 | 0.183066 | -0.243242 | -0.843097 | -0.192211 | 0.440454 | -1.311089 | 1.379757 |
4 rows × 943 columns
# Keep only the kinases whose identifier is present in the annotation table.
LO = LO[LO.index.isin(info.id)]
LO_upper = LO_upper[LO_upper.index.isin(info.id)]
pssms = pssms[pssms.index.isin(info.id)]
pssms_upper = pssms_upper[pssms_upper.index.isin(info.id)]
LO.shape, pssms.shape
((329, 943), (329, 943))
# Write the filtered tables back, overwriting the parquet files loaded above.
LO.to_parquet('out/CDDM_pssms_LO.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_upper.parquet')
pssms.to_parquet('out/CDDM_pssms.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_upper.parquet')
Repeat the same filtering and saving steps for the `_eval` parquet files.
Overlap
# Load the normalized PSPA table through katlas' Data helper.
pspa = Data.get_pspa_all_norm()
pspa.shape
(396, 236)
# drop those with TYR (entries whose index label contains an underscore)
pspa = pspa[~pspa.index.str.contains('_')]
pspa.shape
(381, 236)
# Map PSPA kinase names to the '<uniprot>_<kinase>' identifiers used by CDDM.
idx_map = info.set_index('kinase').id
pspa.index = pspa.index.map(idx_map)

# Sanity check: count duplicated identifiers after the mapping.
pspa.index.duplicated().sum()
np.int64(0)
# Sanity check: count PSPA kinases that found no match in the mapping.
pspa.index.isna().sum()
np.int64(0)
In CDDM but not in PSPA:
Some are _TYR in PSPA, such as LIMK1/2 and MAP2K
# CDDM kinases that have no PSPA counterpart.
LO[~LO.index.isin(pspa.index)]
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
P53667_LIMK1 | -0.743331 | 0.425765 | 0.460108 | -0.617596 | -0.152646 | -0.530667 | -0.174914 | 0.667565 | -0.286954 | 0.696375 | ... | -0.264557 | 0.331659 | 0.416959 | -0.865297 | 0.511997 | -0.585357 | -0.205241 | 0.102584 | 0.766077 | 1.331394 |
Q6PHR2_ULK3 | -0.490634 | -0.087072 | 0.029732 | -0.867398 | 0.123012 | -0.158981 | 0.634177 | 0.110333 | -0.887254 | -0.986387 | ... | 0.814737 | 0.168879 | -1.088708 | -0.245434 | 0.813684 | 0.069440 | 0.504820 | 0.011954 | 0.845372 | 0.688223 |
Q9Y3S1_WNK2 | -0.549822 | 0.658648 | -0.455872 | 0.438409 | -2.556071 | 0.525339 | 0.755560 | 0.849101 | -0.617971 | -1.417543 | ... | 1.166616 | 0.338166 | -0.973869 | -1.031058 | 1.542632 | 0.205812 | -1.380340 | -0.095599 | -0.847142 | 1.165632 |
Q9HBY8_SGK2 | -0.015252 | -0.725862 | 0.230006 | -0.889517 | -0.392146 | -0.011175 | 1.460055 | -1.326822 | -0.739447 | -1.423542 | ... | 0.249261 | -1.512075 | -0.016755 | -0.395873 | -1.144109 | -0.284533 | 0.676309 | 1.561955 | -1.890026 | 0.800819 |
P30291_WEE1 | -1.324755 | -0.357293 | 0.466785 | -1.783981 | -0.649181 | -0.753637 | -0.144904 | -0.899360 | -0.218875 | -0.318009 | ... | -1.155933 | 1.212013 | -0.322414 | -1.216105 | -0.156987 | -1.242268 | -0.465851 | 0.041283 | 0.874701 | -1.019412 |
Q15569_TESK1 | -1.198287 | 0.147686 | 0.654654 | 0.190482 | -0.067034 | -0.101100 | 1.033094 | -0.509857 | -1.507445 | -0.343544 | ... | 1.711714 | 0.490946 | -0.066201 | -2.807887 | -0.234197 | -0.155980 | 0.264293 | 0.034462 | 0.282917 | 1.388802 |
P45985_MAP2K4 | -0.646683 | 0.642707 | 0.708758 | 1.701443 | -0.456536 | 0.924434 | 0.073736 | -0.543216 | -0.081372 | -0.639936 | ... | 0.073312 | 0.311976 | -0.192703 | 0.013141 | 0.001870 | -0.334951 | -0.406530 | 0.685566 | 1.518984 | -19.873740 |
P53671_LIMK2 | 0.402872 | 0.611342 | 1.044310 | -0.930824 | -0.603378 | 0.584948 | -1.166214 | 0.216833 | -0.780754 | -21.086079 | ... | 0.944579 | 0.768205 | 0.593674 | -2.437519 | -1.185757 | -1.200649 | 0.187203 | 0.627222 | 0.653286 | -0.240828 |
O14733_MAP2K7 | 0.561353 | 0.376813 | 0.262292 | 1.687088 | 0.277569 | 1.073578 | -2.455191 | -0.487182 | -0.069732 | -21.086079 | ... | 0.610969 | 0.586599 | 0.859526 | -0.449202 | 0.217599 | -0.060328 | -0.546945 | 1.223223 | -0.528320 | -19.873740 |
P46734_MAP2K3 | -0.610158 | 0.816736 | -0.435288 | 1.515577 | -0.156976 | 0.545923 | -1.304775 | 0.400200 | -0.111960 | -21.086079 | ... | 0.207613 | 0.253632 | 0.041133 | 0.954797 | -1.600794 | -2.200649 | 0.120089 | -0.010207 | 0.653286 | 0.344134 |
P52564_MAP2K6 | -0.206700 | 0.608759 | 1.024753 | 1.597106 | -0.490484 | 0.627453 | -22.372406 | 0.159802 | -0.252823 | -21.086079 | ... | 0.207613 | 0.031240 | 0.041133 | 0.147442 | -0.015832 | -2.200649 | -0.365338 | 0.726758 | 1.653285 | -19.873740 |
Q59H18_TNNI3K | -1.210887 | 0.341537 | 1.642054 | -20.552054 | -22.224607 | -21.843636 | -0.320542 | -1.522457 | -0.198117 | -21.086079 | ... | -21.010260 | 0.301329 | -2.010705 | -1.582468 | -0.330705 | 0.654402 | 0.582823 | -1.325080 | 0.923375 | 2.199185 |
Q15120_PDHK3 | -0.583490 | 0.120938 | 1.491844 | 0.957282 | -0.130308 | -21.843636 | 0.944285 | 0.841906 | -0.892647 | -21.086079 | ... | -21.010260 | 0.506149 | -1.805885 | -0.377649 | -0.125886 | -22.418522 | -0.212357 | 0.879739 | -20.564588 | 0.819042 |
Q01974_ROR2 | -1.935252 | 0.839564 | -0.497349 | 0.190482 | -0.482071 | -0.101100 | 2.070569 | -21.989358 | -0.659448 | -0.343544 | ... | -21.010260 | 1.151753 | -0.382674 | -22.070431 | -0.702674 | -1.302529 | 2.018209 | -0.697049 | -20.564588 | -19.873740 |
Q99640_PKMYT1 | -1.687325 | -0.719863 | 1.165616 | -20.552054 | -22.224607 | 0.731789 | 1.203019 | -0.998895 | -0.996483 | -21.086079 | ... | 0.010576 | 1.571168 | 0.107131 | -22.070431 | -0.797831 | -1.397686 | -1.884303 | 0.207793 | 0.456249 | -19.873740 |
Q01973_ROR1 | 0.540085 | 0.507547 | -1.606973 | -20.552054 | -1.006733 | 1.959200 | 1.652822 | -0.771485 | -0.769072 | -21.086079 | ... | -21.010260 | 0.066864 | -0.245171 | -22.070431 | 0.434829 | -1.165025 | 0.670286 | 1.762382 | -20.564588 | 1.379757 |
Q6P0Q8_MAST2 | -0.528628 | 0.760762 | -0.090724 | -20.552054 | -1.075446 | 0.890487 | 0.361717 | -21.989358 | -0.837785 | 1.063081 | ... | -21.010260 | -0.553723 | 1.889130 | -22.070431 | 0.399205 | -1.200649 | -1.687266 | -0.595170 | -20.564588 | -19.873740 |
17 rows × 943 columns
In PSPA but not in CDDM
# PSPA kinases that have no CDDM counterpart.
pspa[~pspa.index.isin(LO.index)]
-5P | -5G | -5A | -5C | -5S | -5T | -5V | -5I | -5L | -5M | ... | 5H | 5K | 5R | 5Q | 5N | 5D | 5E | 5s | 5t | 5y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
kinase | |||||||||||||||||||||
Q2M2I8_AAK1 | 0.0720 | 0.0245 | 0.0284 | 0.0456 | 0.0425 | 0.0425 | 0.0951 | 0.1554 | 0.0993 | 0.0864 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q13705_ACVR2B | 0.0533 | 0.0517 | 0.0566 | 0.0772 | 0.0533 | 0.0533 | 0.0543 | 0.0442 | 0.0471 | 0.0516 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q96QP1_ALPHAK3 | 0.0571 | 0.0478 | 0.0253 | 0.0384 | 0.0571 | 0.0571 | 0.0586 | 0.0602 | 0.0394 | 0.0673 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
P57078_ANKRD3 | 0.0542 | 0.0555 | 0.0611 | 0.0521 | 0.0554 | 0.0554 | 0.0509 | 0.0515 | 0.0545 | 0.0534 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
O14874_BCKDK | 0.0482 | 0.0672 | 0.0598 | 0.0694 | 0.0566 | 0.0566 | 0.0517 | 0.0467 | 0.0505 | 0.0566 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Q8IYT8_ULK2 | 0.0593 | 0.0724 | 0.0812 | 0.0682 | 0.0603 | 0.0603 | 0.0479 | 0.0399 | 0.0440 | 0.0415 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q96J92_WNK4 | 0.0369 | 0.0523 | 0.0539 | 0.0544 | 0.0580 | 0.0580 | 0.0524 | 0.0631 | 0.0668 | 0.0580 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q9NY57_YANK2 | 0.0580 | 0.0699 | 0.0637 | 0.0602 | 0.0580 | 0.0580 | 0.0433 | 0.0470 | 0.0459 | 0.0469 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q86UX6_YANK3 | 0.0625 | 0.0776 | 0.0647 | 0.0598 | 0.0545 | 0.0545 | 0.0502 | 0.0537 | 0.0561 | 0.0543 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Q56UN5_YSK4 | 0.0593 | 0.0728 | 0.0744 | 0.0734 | 0.0597 | 0.0597 | 0.0517 | 0.0400 | 0.0433 | 0.0512 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
69 rows × 236 columns
# CDDM log-odds rows for kinases that are also present in PSPA.
overlap_cddm = LO[LO.index.isin(pspa.index)].copy()
overlap_cddm.shape
(312, 943)
# PSPA rows for kinases that are also present in CDDM.
overlap_pspa = pspa[pspa.index.isin(LO.index)].copy()
overlap_pspa.shape
(312, 236)
# Persist the overlapping CDDM and PSPA tables.
# overlap_cddm.to_parquet('raw/overlap_cddm_eval.parquet')
overlap_cddm.to_parquet('raw/overlap_cddm.parquet')
overlap_pspa.to_parquet('raw/overlap_pspa.parquet')