Get kinase-specific PSSMs

from katlas.core import *
import pandas as pd
# Load the dataset (presumably kinase-substrate pairs, going by the loader's
# name - confirm against katlas). The cells below read its `kinase_uniprot`,
# `kinase_protein`, `site_seq` and `source` columns.
df = Data.get_ks_dataset()
CPU times: user 871 ms, sys: 369 ms, total: 1.24 s
Wall time: 5.33 s
# Build a per-kinase identifier: "<kinase_uniprot>_<first whitespace-separated
# word of kinase_protein>".
protein_first_word = df['kinase_protein'].str.split().str[0]
df['kinase_id'] = df['kinase_uniprot'] + '_' + protein_first_word

# Count rows per identifier and keep the identifiers seen in at least 40 rows.
cnt = df['kinase_id'].value_counts()
idx = cnt.index[cnt.values >= 40]
idx
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
       'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
       'P29323_EPHB2', 'P54762_EPHB1',
       ...
       'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_CAMK1B', 'O00311_CDC7',
       'Q9NYV4_CDK12', 'Q15746_SMMLCK', 'Q01973_ROR1', 'P15056_BRAF',
       'Q6P0Q8_MAST2', 'O14976_GAK'],
      dtype='object', name='kinase_id', length=333)
# Size of the full dataset, before it is restricted to the ids kept in `idx`.
df.shape
(187066, 22)
# Restrict to the kinases that passed the count filter; `.copy()` gives an
# independent frame so the column added next is not written to a slice.
df = df.loc[df['kinase_id'].isin(idx)].copy()
df['site_seq_upper'] = df['site_seq'].str.upper()

# Split on whether the `source` string contains 'PSP': matching rows go to
# `test`, all remaining rows go to `df_eval` (used to build the eval PSSMs).
from_psp = df['source'].str.contains('PSP')
test = df.loc[from_psp].copy()
df_eval = df.loc[~from_psp].copy()

Get eval PSSMs

# One PSSM per `kinase_id`, built from the non-PSP rows only.
# NOTE(review): the >=40 count filter was applied to `df` before the PSP
# split, so a kinase can have fewer than 40 rows in `df_eval` (333 ids passed
# the filter but only 332 PSSMs come out) - confirm this is intended.
pssms = get_cluster_pssms(
    df_eval,
    cluster_col='kinase_id',
    count_thr=None,  # kinases were already filtered by count, so no threshold here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 332/332 [00:10<00:00, 32.91it/s]
# Quick size check of the PSSM table (rows are indexed by `kinase_id`).
pssms.shape
(332, 943)
# Same as `pssms`, but built from the upper-cased sequences in
# `site_seq_upper` instead of the default sequence column.
pssms_upper = get_cluster_pssms(
    df_eval,
    seq_col='site_seq_upper',
    cluster_col='kinase_id',
    count_thr=None,  # kinases were already filtered by count, so no threshold here
    valid_thr=None,
)
100%|███████████████████████████████████████████████████████████████████████████| 332/332 [00:11<00:00, 29.87it/s]

LO of eval PSSMs

def get_LO_all(pssms,site_type='STY'):
    """Apply `get_pssm_LO_flat` to every row of `pssms` and stack the results.

    Parameters
    ----------
    pssms : DataFrame
        One flattened PSSM per row; rows are visited with `iterrows()`.
    site_type : str
        Passed through unchanged as the second argument of
        `get_pssm_LO_flat` (this notebook uses 'STY' and 'STY_upper').

    Returns
    -------
    DataFrame
        One row per input row, holding `flatten_pssm(get_pssm_LO_flat(...))`,
        with the same index as `pssms`.

    NOTE(review): "LO" presumably stands for log-odds - confirm against the
    katlas documentation.
    """
    lo_rows = [
        flatten_pssm(get_pssm_LO_flat(row, site_type))
        for _, row in pssms.iterrows()
    ]
    return pd.DataFrame(lo_rows, index=pssms.index)
# LO table for the eval PSSMs, using the 'STY' site type (the function default).
LO = get_LO_all(pssms, site_type='STY')
LO.shape
(332, 943)
# LO table for the upper-cased PSSMs, using the 'STY_upper' site type.
LO_upper = get_LO_all(pssms_upper, site_type='STY_upper')
LO_upper.shape
(332, 943)

Drop isoforms and pseudokinases

# Kinase annotation table; keep only the rows whose `pseudo` column equals
# the string '0'.
info = Data.get_kinase_info()
not_pseudo = info['pseudo'] == '0'
info = info.loc[not_pseudo].copy()

# Same "<uniprot>_<kinase>" layout as the `kinase_id` values indexing LO.
info['id'] = info['uniprot'] + '_' + info['kinase']

# Show the LO rows whose id is missing from the filtered annotation table;
# these are the rows removed in the next step.
LO.loc[~LO.index.isin(info['id'])]
-20P -20G -20A -20C -20S -20T -20V -20I -20L -20M ... 20H 20K 20R 20Q 20N 20D 20E 20pS 20pT 20pY
P07948-2_LYN -0.695061 0.248942 -0.181606 -0.523522 -0.241880 -0.087184 0.270835 0.248626 -0.177055 0.151905 ... -1.002663 0.187090 -0.131176 -0.477873 0.089392 -0.069890 0.272381 -0.624875 -0.293957 0.686397
O60566_BUB1B -0.098063 0.177522 0.006417 -2.364642 -0.336759 -0.656228 0.622357 0.285512 -1.214575 -1.313708 ... 0.538365 0.313082 -0.184508 -1.521805 0.314920 0.267606 0.993983 -0.586346 -0.015962 -0.647041
P05771-2_PKCB -0.513296 0.082197 -0.419347 0.153007 0.387344 0.446387 1.033094 1.359558 -0.959957 -0.059090 ... 1.086193 -0.296631 -0.264711 -1.710942 0.415289 0.443465 0.117313 0.283410 -0.205099 -0.099214
Q13976-2_PKG1 0.267160 -0.880855 0.326552 -20.552054 -0.980098 -0.599127 0.959565 -0.422922 -0.420509 -0.841570 ... 0.565165 0.388792 0.076757 0.183066 -0.243242 -0.843097 -0.192211 0.440454 -1.311089 1.379757

4 rows × 943 columns

# Keep, in all four tables, only the rows whose index appears in `info.id`.
# Each table is masked with its own index.
valid_ids = info['id']
LO = LO.loc[LO.index.isin(valid_ids)]
LO_upper = LO_upper.loc[LO_upper.index.isin(valid_ids)]
pssms = pssms.loc[pssms.index.isin(valid_ids)]
pssms_upper = pssms_upper.loc[pssms_upper.index.isin(valid_ids)]
LO.shape, pssms.shape
((328, 943), (328, 943))
# Persist the filtered eval PSSMs and their LO tables.
# `DataFrame.to_parquet` does not create missing parent directories, so make
# sure the output folder exists before writing (no-op when it already does).
import os

os.makedirs('out', exist_ok=True)

pssms.to_parquet('out/CDDM_pssms_eval_no_psp.parquet')
pssms_upper.to_parquet('out/CDDM_pssms_eval_upper_no_psp.parquet')

LO.to_parquet('out/CDDM_pssms_LO_eval_no_psp.parquet')
LO_upper.to_parquet('out/CDDM_pssms_LO_eval_upper_no_psp.parquet')