from katlas.core import *
Checker and Scoring
Checker
- Check that the middle position of each sequence is s, t, or y.
- Check that all sequences have the same length.
- Convert lowercase residues other than s, t, y to upper case.
- Convert rare amino acids to `_`.
df = Data.get_psp_human_site()
df.head()
gene | protein | uniprot | site | gene_site | SITE_GRP_ID | species | site_seq | LT_LIT | MS_LIT | MS_CST | CST_CAT# | Ambiguous_Site | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | YWHAB | 14-3-3 beta | P31946 | T2 | YWHAB_T2 | 15718712 | human | ______MtMDksELV | NaN | 3.0 | 1.0 | None | 0 |
1 | YWHAB | 14-3-3 beta | P31946 | S6 | YWHAB_S6 | 15718709 | human | __MtMDksELVQkAk | NaN | 8.0 | NaN | None | 0 |
2 | YWHAB | 14-3-3 beta | P31946 | Y21 | YWHAB_Y21 | 3426383 | human | LAEQAERyDDMAAAM | NaN | NaN | 4.0 | None | 0 |
3 | YWHAB | 14-3-3 beta | P31946 | T32 | YWHAB_T32 | 23077803 | human | AAAMkAVtEQGHELs | NaN | NaN | 1.0 | None | 0 |
4 | YWHAB | 14-3-3 beta | P31946 | S39 | YWHAB_S39 | 27442700 | human | tEQGHELsNEERNLL | NaN | 4.0 | NaN | None | 0 |
try:
    check_seq_df(df, 'site_seq')
except Exception as e:
    print(f"Error in check_site: {e}")
Error in check_site: SSEVQFGhAGACANQ has h at position 7; need to have one of 's', 't', or 'y' in the center
df['site_seq'].str[7].value_counts()
site_seq
s 141851
t 58761
y 39367
h 14
k 4
r 3
g 3
p 2
n 1
f 1
l 1
a 1
i 1
d 1
Name: count, dtype: int64
Remove non-sty centered sequences:
df = df[df['site_seq'].str[7].isin(list('sty'))].copy()
df['site_seq'] = check_seq_df(df, 'site_seq')
df['site_seq']
0 ______MtMDKsELV
1 __MtMDKsELVQKAK
2 LAEQAERyDDMAAAM
3 AAAMKAVtEQGHELs
4 tEQGHELsNEERNLL
...
240006 PsAKESAsQHITEEE
240007 GLPARPKsPLDPKKD
240008 LEQLLIKyPPEEVEs
240009 yPPEEVEsRRWQKIA
240010 NTAVEDAsDDESIPI
Name: site_seq, Length: 239979, dtype: object
Scoring
Params()
Available parameter sets:
['PSPA_st', 'PSPA_y', 'PSPA', 'CDDM', 'CDDM_upper']
Single sequence:
for param in ['PSPA', 'CDDM','CDDM_upper']:
    print(predict_kinase('PSVEPPLsQETFSDL',**Params(param)).head())
considering string: ['-5V', '-4E', '-3P', '-2P', '-1L', '0s', '1Q', '2E', '3T', '4F', '5S']
kinase
ATM 5.037
SMG1 4.385
DNAPK 3.818
ATR 3.507
FAM20C 3.170
dtype: float64
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0s', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
kinase
ATR 3.064
ATM 2.909
DNAPK 2.270
CK2A1 1.873
TSSK1 1.856
dtype: float64
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0S', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
kinase
ATR 3.229
ATM 3.038
DNAPK 2.479
CK2A1 2.006
CDK8 1.999
dtype: float64
Multiple sequences scoring
CDDM:
out_cddm = predict_kinase_df(df.head(500),'site_seq', **Params('CDDM'))
input dataframe has a length 500
Preprocessing
Finish preprocessing
Merging reference
Finish merging
CPU times: user 25.6 ms, sys: 4.73 ms, total: 30.3 ms
Wall time: 29.4 ms
PSPA:
out_cddm = predict_kinase_df(df.head(500),'site_seq', **Params('PSPA'))
input dataframe has a length 500
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 396/396 [00:00<00:00, 423.14it/s]
CPU times: user 973 ms, sys: 12.4 ms, total: 985 ms
Wall time: 998 ms
PSPA percentile score
Single sequence
st_pct = Data.get_pspa_st_pct()
y_pct = Data.get_pspa_tyr_pct()
a = get_pct('PSVEPPLyPETFSDL',**Params('PSPA_y'), pct_ref=y_pct)
a.sort_values('percentile',ascending=False)
considering string: ['-5V', '-4E', '-3P', '-2P', '-1L', '0Y', '1P', '2E', '3T', '4F', '5S']
log2(score) | percentile | |
---|---|---|
TEK | 1.789 | 93.820916 |
PDHK4_TYR | 0.851 | 87.204375 |
DDR2 | 0.204 | 80.929597 |
CSF1R | 1.030 | 80.423787 |
PDHK3_TYR | 0.477 | 79.384826 |
... | ... | ... |
FLT1 | -3.121 | 7.723855 |
TNNI3K_TYR | -3.979 | 7.682843 |
PDGFRB | -3.144 | 7.177033 |
PTK6 | -3.540 | 4.743677 |
MUSK | -5.376 | 3.773069 |
93 rows × 2 columns
get_pct('PSVEPPLsQETFSDL',**Params('PSPA_st'), pct_ref=st_pct)
considering string: ['-5V', '-4E', '-3P', '-2P', '-1L', '0S', '1Q', '2E', '3T', '4F']
log2(score) | percentile | |
---|---|---|
ATM | 5.037 | 99.822351 |
SMG1 | 4.385 | 99.831819 |
DNAPK | 3.818 | 99.205315 |
ATR | 3.507 | 99.680344 |
FAM20C | 3.170 | 95.370556 |
... | ... | ... |
PKN1 | -7.275 | 14.070436 |
P70S6K | -7.295 | 4.089816 |
AKT3 | -7.375 | 11.432995 |
PKCI | -7.742 | 8.129511 |
NEK3 | -8.254 | 4.637240 |
303 rows × 2 columns
Multiple sequences
score_df = predict_kinase_df(df.head(1000),'site_seq', **Params('PSPA_st'))
input dataframe has a length 1000
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 303/303 [00:00<00:00, 387.86it/s]
# get percentile reference
pct_ref = Data.get_pspa_st_pct()
pct = get_pct_df(score_df,pct_ref)
pct # the lower the better
100%|██████████| 303/303 [00:00<00:00, 1095.91it/s]
AAK1 | ACVR2A | ACVR2B | AKT1 | AKT2 | AKT3 | ALK2 | ALK4 | ALPHAK3 | AMPKA1 | ... | VRK1 | VRK2 | WNK1 | WNK3 | WNK4 | YANK2 | YANK3 | YSK1 | YSK4 | ZAK | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 73.188 | 87.606 | 84.729 | 79.877 | 79.533 | 81.850 | 89.768 | 91.201 | 98.426 | 54.554 | ... | 84.632 | 77.239 | 82.520 | 69.104 | 87.543 | 99.299 | 92.838 | 89.845 | 95.202 | 81.707 |
1 | 56.703 | 95.693 | 95.585 | 13.314 | 9.253 | 7.540 | 89.098 | 91.358 | 96.091 | 48.544 | ... | 30.292 | 25.050 | 17.132 | 47.388 | 20.742 | 80.747 | 87.061 | 8.621 | 53.530 | 45.247 |
2 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | ... | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 |
3 | 73.105 | 82.606 | 85.710 | 62.441 | 67.236 | 58.350 | 83.195 | 88.883 | 72.216 | 62.304 | ... | 99.196 | 88.782 | 42.032 | 26.203 | 48.761 | 48.276 | 36.724 | 68.810 | 65.439 | 40.623 |
4 | 41.466 | 93.063 | 95.262 | 25.830 | 32.035 | 22.619 | 98.084 | 96.177 | 53.175 | 29.332 | ... | 31.234 | 6.206 | 19.819 | 38.389 | 18.550 | 79.177 | 45.058 | 10.560 | 42.568 | 26.996 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 78.612 | 48.108 | 49.316 | 70.027 | 68.095 | 73.951 | 54.940 | 60.196 | 67.759 | 27.998 | ... | 80.102 | 79.886 | 48.167 | 46.783 | 49.481 | 90.781 | 52.997 | 84.096 | 44.233 | 84.789 |
996 | 15.366 | 42.891 | 37.567 | 42.280 | 46.271 | 53.040 | 31.365 | 24.402 | 46.969 | 56.985 | ... | 65.541 | 68.537 | 28.641 | 52.885 | 39.271 | 75.061 | 76.763 | 52.028 | 81.469 | 74.303 |
997 | 37.572 | 21.732 | 22.480 | 83.025 | 76.807 | 79.376 | 14.507 | 46.188 | 3.412 | 88.018 | ... | 78.811 | 58.363 | 95.636 | 96.562 | 91.573 | 81.036 | 77.758 | 85.756 | 75.917 | 82.245 |
998 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | ... | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 | 100.000 |
999 | 94.640 | 73.473 | 82.830 | 7.624 | 11.623 | 5.405 | 53.584 | 48.375 | 94.639 | 25.493 | ... | 87.950 | 66.768 | 8.615 | 0.937 | 14.440 | 14.019 | 3.233 | 73.357 | 35.516 | 42.516 |
1000 rows × 303 columns