from katlas.core import *
import pandas as pd
Get kinase-specific PSSMs
# Load the dataset via katlas' `Data.get_ks_dataset()`
# (presumably the kinase-substrate site table; confirm against katlas docs).
df = Data.get_ks_dataset()
CPU times: user 917 ms, sys: 454 ms, total: 1.37 s
Wall time: 6.42 s
# Build a per-kinase identifier: "<kinase_uniprot>_<first whitespace-separated
# word of kinase_protein>".
df['kinase_id'] = df['kinase_uniprot'] + '_' + df['kinase_protein'].str.split().str[0]

# Count rows per kinase and keep the ids of kinases with at least 40 rows.
cnt = df['kinase_id'].value_counts()
idx = cnt[cnt >= 40].index
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
'P29323_EPHB2', 'P54762_EPHB1',
...
'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
'Q6P0Q8_MAST2', 'O14976_GAK'],
dtype='object', name='kinase_id', length=333)
# Show (rows, columns) of `df` at this point in the notebook.
df.shape
(187066, 22)
# Keep only rows whose kinase passed the count filter stored in `idx`.
df = df[df['kinase_id'].isin(idx)].copy()

# Upper-cased copy of the site sequence, used for the "upper" PSSM variant.
df['site_seq_upper'] = df.site_seq.str.upper()

# Rows whose `source` string mentions both PSP and Sugiyama (two lookaheads,
# so the order of the two names does not matter). Only inspected here; the
# evaluation set is built separately.
test = df[df.source.str.contains("(?=.*PSP)(?=.*Sugiyama)")].copy()
test.source.value_counts()
source
EPSD|PSP|Sugiyama 122
PSP|Sugiyama 73
SIGNOR|EPSD|PSP|Sugiyama 58
GPS6|SIGNOR|ELM|iPTMNet|EPSD|PSP|Sugiyama 50
GPS6|SIGNOR|EPSD|PSP|Sugiyama 27
SIGNOR|ELM|iPTMNet|EPSD|PSP|Sugiyama 26
SIGNOR|PSP|Sugiyama 22
GPS6|EPSD|PSP|Sugiyama 11
SIGNOR|iPTMNet|EPSD|PSP|Sugiyama 10
GPS6|SIGNOR|ELM|EPSD|PSP|Sugiyama 8
SIGNOR|ELM|iPTMNet|PSP|Sugiyama 5
GPS6|SIGNOR|PSP|Sugiyama 5
SIGNOR|ELM|EPSD|PSP|Sugiyama 5
iPTMNet|EPSD|PSP|Sugiyama 3
ELM|EPSD|PSP|Sugiyama 3
GPS6|ELM|iPTMNet|EPSD|PSP|Sugiyama 3
ELM|iPTMNet|EPSD|PSP|Sugiyama 3
GPS6|SIGNOR|ELM|PSP|Sugiyama 3
SIGNOR|iPTMNet|PSP|Sugiyama 3
GPS6|SIGNOR|iPTMNet|EPSD|PSP|Sugiyama 3
GPS6|PSP|Sugiyama 2
GPS6|ELM|EPSD|PSP|Sugiyama 2
ELM|iPTMNet|PSP|Sugiyama 2
ELM|PSP|Sugiyama 1
iPTMNet|PSP|Sugiyama 1
Name: count, dtype: int64
# Evaluation set: every row whose `source` string does not contain "PSP".
df_eval = df[~df.source.str.contains('PSP')].copy()
Get eval PSSMs
# One PSSM per kinase_id, computed from the PSP-free evaluation rows.
pssms = get_cluster_pssms(
    df_eval,
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 332/332 [00:10<00:00, 32.91it/s]
# Show (rows, columns) of the PSSM table returned by get_cluster_pssms.
pssms.shape
(332, 943)
# Same as `pssms`, but built from the upper-cased site sequences.
pssms_upper = get_cluster_pssms(
    df_eval,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,  # since we already filtered, we set None here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 332/332 [00:11<00:00, 29.87it/s]
LO of eval PSSMs
def get_LO_all(pssms, site_type='STY'):
    """Apply the LO transform to every flattened PSSM row of `pssms`.

    Each row is passed to `get_pssm_LO_flat` together with `site_type`, the
    result is flattened again with `flatten_pssm`, and the per-row results
    are stacked into a new DataFrame.

    Args:
        pssms: DataFrame holding one flattened PSSM per row.
        site_type: Forwarded unchanged to `get_pssm_LO_flat`
            (this notebook uses 'STY' and 'STY_upper').

    Returns:
        DataFrame with one row per input row, indexed by `pssms.index`.
    """
    # The row label is not needed inside the loop; the original index is
    # re-attached when the DataFrame is assembled below.
    rows = [
        flatten_pssm(get_pssm_LO_flat(flat_pssm, site_type))
        for _, flat_pssm in pssms.iterrows()
    ]
    return pd.DataFrame(rows, index=pssms.index)
# LO-transformed PSSMs using the default site_type ('STY').
LO = get_LO_all(pssms)
LO.shape
(332, 943)
# LO-transformed PSSMs for the upper-cased sequences, with site_type 'STY_upper'.
LO_upper = get_LO_all(pssms_upper, 'STY_upper')
LO_upper.shape
(332, 943)
Drop isoform and pseudokinase
# Kinase annotation table; keep only entries whose `pseudo` flag is the
# string '0'.
info = Data.get_kinase_info()
info = info[info.pseudo == '0'].copy()

# Same "<uniprot>_<kinase>" id format as the PSSM index, so the two can be matched.
info['id'] = info.uniprot + '_' + info.kinase

# Show the LO rows that have no matching id in `info`
# (these are the rows removed in the next step).
LO[~LO.index.isin(info.id)]
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
P07948-2_LYN | -0.695061 | 0.248942 | -0.181606 | -0.523522 | -0.241880 | -0.087184 | 0.270835 | 0.248626 | -0.177055 | 0.151905 | ... | -1.002663 | 0.187090 | -0.131176 | -0.477873 | 0.089392 | -0.069890 | 0.272381 | -0.624875 | -0.293957 | 0.686397 |
O60566_BUB1B | -0.098063 | 0.177522 | 0.006417 | -2.364642 | -0.336759 | -0.656228 | 0.622357 | 0.285512 | -1.214575 | -1.313708 | ... | 0.538365 | 0.313082 | -0.184508 | -1.521805 | 0.314920 | 0.267606 | 0.993983 | -0.586346 | -0.015962 | -0.647041 |
P05771-2_PKCB | -0.513296 | 0.082197 | -0.419347 | 0.153007 | 0.387344 | 0.446387 | 1.033094 | 1.359558 | -0.959957 | -0.059090 | ... | 1.086193 | -0.296631 | -0.264711 | -1.710942 | 0.415289 | 0.443465 | 0.117313 | 0.283410 | -0.205099 | -0.099214 |
Q13976-2_PKG1 | 0.267160 | -0.880855 | 0.326552 | -20.552054 | -0.980098 | -0.599127 | 0.959565 | -0.422922 | -0.420509 | -0.841570 | ... | 0.565165 | 0.388792 | 0.076757 | 0.183066 | -0.243242 | -0.843097 | -0.192211 | 0.440454 | -1.311089 | 1.379757 |
4 rows × 943 columns
# Restrict all four tables to kinases whose id is present in `info.id`.
LO = LO[LO.index.isin(info.id)]
LO_upper = LO_upper[LO_upper.index.isin(info.id)]

pssms = pssms[pssms.index.isin(info.id)]
pssms_upper = pssms_upper[pssms_upper.index.isin(info.id)]

LO.shape, pssms.shape
((328, 943), (328, 943))
# Persist the filtered evaluation PSSMs and their LO-transformed versions.
# NOTE(review): the `out/` directory must already exist; to_parquet does not
# appear to create it - confirm before running on a fresh checkout.
pssms.to_parquet('out/CDDM_pssms_eval_no_psp.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_eval_upper_no_psp.parquet')

LO.to_parquet('out/CDDM_pssms_LO_eval_no_psp.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_eval_upper_no_psp.parquet')