from katlas.core import *
import pandas as pd
Scoring evaluation
# Load the two PSSM reference tables used for scoring below.
# NOTE(review): "LO" presumably stands for log-odds (see the section heading) — confirm.
LO = pd.read_parquet('out/CDDM_pssms_LO_eval.parquet')
LO_upper = pd.read_parquet('out/CDDM_pssms_LO_eval_upper.parquet')
Log-odds + sum
# Score one site sequence against the LO reference, combining scores with `sumup`.
predict_kinase("PSVEPPLsQETFSDL", ref=LO, func=sumup)
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0s', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
index
Q13535_ATR 11.215
Q13315_ATM 10.362
P78527_DNAPK 6.586
O96017_CHK2 2.101
P49840_GSK3A 1.719
...
Q6P2M8_CAMK1B -90.700
O00311_CDC7 -90.895
Q9NYV4_CDK12 -107.828
P15056_BRAF -135.335
Q59H18_TNNI3K -139.231
Length: 333, dtype: float64
# Same site sequence, scored against the LO_upper reference.
predict_kinase("PSVEPPLsQETFSDL", ref=LO_upper, func=sumup)
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0s', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
index
Q13535_ATR 10.045
Q13315_ATM 8.797
P78527_DNAPK 5.814
O96017_CHK2 3.416
P49761_CLK3 2.908
...
O75385_ULK1 -67.640
Q9NYV4_CDK12 -67.855
Q6P2M8_CAMK1B -73.139
Q59H18_TNNI3K -97.097
P15056_BRAF -114.566
Length: 333, dtype: float64
# NOTE(review): `ref` is not assigned anywhere in the visible cells — confirm where it comes from.
predict_kinase("PSVEPPLsQETFSDL", ref=ref, func=sumup)
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0s', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
index
Q13535_ATR 11.214
Q13315_ATM 10.362
P78527_DNAPK 6.586
O96017_CHK2 2.101
P49840_GSK3A 1.718
...
Q6P2M8_CAMK1B -90.701
O00311_CDC7 -90.896
Q9NYV4_CDK12 -107.828
P15056_BRAF -135.335
Q59H18_TNNI3K -139.231
Length: 333, dtype: float64
# Same peptide with an upper-case centre residue ("S" instead of "s").
predict_kinase("PSVEPPLSQETFSDL", ref=ref, func=sumup)
considering string: ['-7P', '-6S', '-5V', '-4E', '-3P', '-2P', '-1L', '0S', '1Q', '2E', '3T', '4F', '5S', '6D', '7L']
index
Q13535_ATR 11.214
Q13315_ATM 10.362
P78527_DNAPK 6.586
O96017_CHK2 2.101
P49840_GSK3A 1.718
...
Q6P2M8_CAMK1B -90.701
O00311_CDC7 -90.896
Q9NYV4_CDK12 -107.828
P15056_BRAF -135.335
Q59H18_TNNI3K -139.231
Length: 333, dtype: float64
# Longer site sequence scored with the same reference and aggregation.
predict_kinase("AFEEKRYREMRRKNIIGQVCDsPKSyDNVMHVGLRKVTFKWQA", ref=ref, func=sumup)
considering string: ['-20F', '-19E', '-18E', '-17K', '-16R', '-15Y', '-14R', '-13E', '-12M', '-11R', '-10R', '-9K', '-8N', '-7I', '-6I', '-5G', '-4Q', '-3V', '-2C', '-1D', '0s', '1P', '2K', '3S', '4y', '5D', '6N', '7V', '8M', '9H', '10V', '11G', '12L', '13R', '14K', '15V', '16T', '17F', '18K', '19W', '20Q']
index
P05129_PKCG 2.485
P05771-2_PKCB 1.566
Q13164_ERK5 1.337
Q13233_MEKK1 0.626
O00506_YSK1 0.495
...
Q01974_ROR2 -250.551
Q99640_PKMYT1 -261.428
P19525_PKR -283.355
Q99986_VRK1 -287.525
Q9NYV4_CDK12 -412.807
Length: 333, dtype: float64
Evaluate on test set
# Load the held-out test set and score every site sequence in it.
data = pd.read_parquet('out/CDDM_test_set.parquet')
# `out` is later indexed by the row labels of `data` and has one column per kinase id.
out = predict_kinase_df(data, seq_col='site_seq', ref=ref, func=sumup)
input dataframe has a length 18461
Preprocessing
Finish preprocessing
Merging reference
Finish merging
data
kin_sub_site | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | ... | kinase_family | kinase_subfamily | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | kinase_id | source_len | kinase_uniprot | rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
154 | O00141_P43243_S188 | P43243 | S188 | Sugiyama | MATR3 KIAA0723 | MsKsFQQssLsRDsQGHGRDLsAAGIGLLAAAtQsLsMPAsLGRMN... | 188 | EPPyRVPRDDWEEKRHFRRDsFDDRGPsLNPVLDyDHGsRs | P43243_S188 | MSKSFQQSSLSRDSQGHGRDLSAAGIGLLAAATQSLSMPASLGRMN... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 106 | O00141_SGK1 | 1 | O00141 | 248 |
370 | O00141_Q9UN36_T330 | Q9UN36 | T330 | GPS6|SIGNOR|ELM|EPSD|PSP | NDRG2 KIAA1248 SYLD | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | 330 | FLQGMGYMASSCMTRLsRsRtAsLtsAAsVDGNRsRsRtLs | Q9UN36_T330 | MAELQEVQITEEKPLLPGQTPEAAKEAELAARILLDQGQTHSVETP... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 2 | O00141_SGK1 | 5 | O00141 | 38 |
299 | O00141_Q92597_T356 | Q92597 | T356 | GPS6|SIGNOR|ELM|EPSD|PSP | NDRG1 CAP43 DRG1 RTP | MsREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... | 356 | sLDGtRsRsHtSEGTRsRsHtsEGtRsRsHtsEGAHLDItP | Q92597_T356 | MSREMQDVDLAEVKPLVEKGETITGLLQEFDVQEQDIETLHGSVHV... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 2 | O00141_SGK1 | 5 | O00141 | 10 |
251 | O00141_Q15149_S4386 | Q15149 | S4386 | Sugiyama | PLEC PLEC1 | MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... | 4386 | ItEFADMLsGNAGGFRsRsssVGssssyPIsPAVsRtQLAs | Q15149_S4386 | MVAGMLMPRDQLRAIYEVLFREGVMVAKKDRRPRSLHPHVPGVTNL... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 8 | O00141_SGK1 | 1 | O00141 | 13 |
58 | O00141_P00338_T322 | P00338 | T322 | Sugiyama | LDHA PIG19 | MAtLKDQLIyNLLKEEQtPQNKITVVGVGAVGMACAISILMKDLAD... | 322 | DLVKVTLtsEEEARLKKsADtLWGIQKELQF__________ | P00338_T322 | MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD... | ... | SGK | SGK | Basophilic | Akt/rock | SGK1 | 52 | O00141_SGK1 | 1 | O00141 | 37 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
186963 | Q9Y6M4_P68133_S62 | P68133 | S62 | Sugiyama | ACTA1 ACTA | MCDEDETTALVCDNGSGLVKAGFAGDDAPRAVFPsIVGRPRHQGVM... | 62 | HQGVMVGMGQKDsyVGDEAQsKRGILTLKyPIEHGIItNWD | P68133_S62 | MCDEDETTALVCDNGSGLVKAGFAGDDAPRAVFPSIVGRPRHQGVM... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 56 | Q9Y6M4_CSNK1G3 | 1 | Q9Y6M4 | 8 |
186857 | Q9Y6M4_P04406_S241 | P04406 | S241 | Sugiyama | GAPDH GAPD CDABP0047 OK/SW-cl.12 | MGKVKVGVNGFGRIGRLVtRAAFNsGKVDIVAINDPFIDLNyMVYM... | 241 | IPELNGKLtGMAFRVPtANVsVVDLtCRLEKPAKyDDIKKV | P04406_S241 | MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 150 | Q9Y6M4_CSNK1G3 | 1 | Q9Y6M4 | 254 |
186892 | Q9Y6M4_P14625_S106 | P14625 | S106 | Sugiyama | HSP90B1 GRP94 HSPC4 TRA1 | MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGsRtDD... | 106 | MKLIINSLYKNKEIFLRELIsNAsDALDKIRLISLTDENAL | P14625_S106 | MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGSRTDD... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 66 | Q9Y6M4_CSNK1G3 | 1 | Q9Y6M4 | 129 |
186908 | Q9Y6M4_P29692_S162 | P29692 | S162 | Sugiyama | EEF1D EF1D | MATNFLAHEKIWFDKFKYDDAERRFyEQMNGPVAGAsRQENGAsVI... | 162 | AKKPAtPAEDDEDDDIDLFGsDNEEEDKEAAQLREERLRQY | P29692_S162 | MATNFLAHEKIWFDKFKYDDAERRFYEQMNGPVAGASRQENGASVI... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 56 | Q9Y6M4_CSNK1G3 | 1 | Q9Y6M4 | 148 |
186929 | Q9Y6M4_P60174_S212 | P60174 | S212 | Sugiyama | TPI1 TPI | MAPSRKFFVGGNWKMNGRKQsLGELIGtLNAAKVPADtEVVCAPPt... | 212 | WLKsNVsDAVAQstRIIyGGsVtGAtCKELASQPDVDGFLV | P60174_S212 | MAPSRKFFVGGNWKMNGRKQSLGELIGTLNAAKVPADTEVVCAPPT... | ... | CK1 | CK1 | Acidophilic | Ck1 | CK1g3 | 143 | Q9Y6M4_CSNK1G3 | 1 | Q9Y6M4 | 271 |
18461 rows × 23 columns
out.columns
Index(['P12931_SRC', 'P29320_EPHA3', 'P07332_FES', 'Q16288_NTRK3',
'Q9UM73_ALK', 'P00519_ABL1', 'P36888_FLT3', 'P29322_EPHA8',
'P29323_EPHB2', 'P54762_EPHB1',
...
'P35626_GRK3', 'Q99640_PKMYT1', 'Q6P2M8_PNCK', 'O00311_CDC7',
'Q9NYV4_CDK12', 'Q15746_MYLK', 'Q01973_ROR1', 'O14976_GAK',
'P15056_BRAF', 'Q6P0Q8_MAST2'],
dtype='object', length=335)
def get_kinase_rank(row_index):
    """Return the 1-based rank of the true kinase among the scores of one row.

    Reads the module-level frames `data` (true label in column 'kinase_id')
    and `out` (one score column per kinase), both addressed by `row_index`.

    Parameters
    ----------
    row_index : label
        Row label present in both `data` and `out`.

    Returns
    -------
    int
        1 when the true kinase has the highest score in that row.

    Raises
    ------
    KeyError
        If the row label or the true kinase id is missing from `out`.
    """
    kinase = data.loc[row_index, 'kinase_id']
    scores = out.loc[row_index]
    ranked = scores.sort_values(ascending=False)
    rank = ranked.index.get_loc(kinase) + 1  # +1 to make rank start from 1
    return rank
# Rank of the true kinase for every scored row, stored next to the test-set metadata.
data['rank'] = out.index.to_series().apply(get_kinase_rank)
Top 5/10
def top_k_accuracy(reference_df, result_df, k):
    """Return the fraction of rows whose true kinase is among the k best scores.

    Parameters
    ----------
    reference_df : pandas.DataFrame
        Holds the true label in column 'kinase_id'; must contain every row
        label of `result_df`.
    result_df : pandas.DataFrame
        One row per site and one score column per kinase id.
    k : int
        Number of top-scoring kinases to accept per row.

    Returns
    -------
    float
        Mean of the per-row hit indicators over the rows of `result_df`.
    """
    def is_in_top_k(row_index):
        kinase = reference_df.loc[row_index, 'kinase_id']
        scores = result_df.loc[row_index]
        top_k = scores.nlargest(k).index
        return kinase in top_k

    return result_df.index.to_series().apply(is_in_top_k).mean()
# Overall top-1 / top-5 / top-10 accuracy on the test set.
top1 = top_k_accuracy(data, out, 1)
top5 = top_k_accuracy(data, out, 5)
top10 = top_k_accuracy(data, out, 10)
print(f"Top-1 accuracy: {top1:.3f}")
print(f"Top-5 accuracy: {top5:.3f}")
print(f"Top-10 accuracy: {top10:.3f}")
Top-1 accuracy: 0.017
Top-5 accuracy: 0.087
Top-10 accuracy: 0.161
Groupby kinase subfamily
# Module-level copies read by top_k_accuracy_group.
reference_df = data.copy()
result_df = out.copy()
def top_k_accuracy_group(group_indices, k):
    """Return the top-k accuracy restricted to the given row labels.

    Reads the module-level frames `reference_df` (true label in column
    'kinase_id') and `result_df` (one score column per kinase id).

    Parameters
    ----------
    group_indices : iterable of labels
        Row labels present in both `reference_df` and `result_df`.
    k : int
        Number of top-scoring kinases to accept per row.

    Returns
    -------
    float
        Mean of the per-row hit indicators over `group_indices`.
    """
    def is_correct(row_index):
        kinase = reference_df.loc[row_index, 'kinase_id']
        scores = result_df.loc[row_index]
        top_k = scores.nlargest(k).index
        return kinase in top_k

    return pd.Series(group_indices).apply(is_correct).mean()
# Group indices by subfamily
grouped = data.groupby('kinase_subfamily').groups  # dict: subfamily → list of indices

# Compute accuracy per subfamily
topk_scores = {
    subfam: top_k_accuracy_group(indices, k=10)
    for subfam, indices in grouped.items()
}

# Convert to DataFrame for easy viewing, best-performing subfamilies first
topk_df = pd.DataFrame.from_dict(
    topk_scores, orient='index', columns=['topk_accuracy']
).sort_values('topk_accuracy', ascending=False)
topk_df['topk_accuracy']
ATM 0.764706
CRK7 0.750000
CDC2 0.637097
ERK1 0.591837
ATR 0.583333
...
CDC7 0.000000
ChaK 0.000000
SRPK 0.000000
VRK 0.000000
eEF2K 0.000000
Name: topk_accuracy, Length: 143, dtype: float64
# Bar chart of top-k accuracy per subfamily (already sorted in descending order).
topk_df['topk_accuracy'].plot.bar(figsize=(20, 4))
# TODO: add hue
By kinase group?
Bar plot of rank across kinase group
data.columns
Index(['kin_sub_site', 'substrate_uniprot', 'site', 'source',
'substrate_genes', 'substrate_phosphoseq', 'position', 'site_seq',
'sub_site', 'substrate_sequence', 'kinase_on_tree', 'kinase_genes',
'kinase_group', 'kinase_family', 'kinase_subfamily', 'kinase_pspa_big',
'kinase_pspa_small', 'kinase_coral_ID', 'num_kin', 'kinase_id',
'source_len', 'kinase_uniprot', 'rank'],
dtype='object')
# Rank per kinase subfamily: all kinases, then only the 'TK' group, then everything else.
plot_bar(data, value='rank', group='kinase_subfamily', dots=False, figsize=(30, 5))
plot_bar(data[data.kinase_group == 'TK'], value='rank', group='kinase_subfamily', dots=False, figsize=(30, 5))
plot_bar(data[data.kinase_group != 'TK'], value='rank', group='kinase_subfamily', dots=False, figsize=(30, 5))
AUCDF
from katlas.plot import *
# AUCDF of the 'rank' column over the whole test set.
get_AUCDF(data, 'rank')
np.float64(0.8289094824229805)
# Split the test set into the 'TK' kinase group and all other groups.
data_tk = data[data.kinase_group == 'TK']
data_st = data[data.kinase_group != 'TK']
# In a notebook only the last expression is displayed, so the single printed
# value belongs to `data_st`.
get_AUCDF(data_tk, 'rank')
get_AUCDF(data_st, 'rank')
np.float64(0.7998083324521585)
# Distribution of the true-kinase rank across all test rows.
data['rank'].hist(bins=50)