from katlas.core import *
from katlas.plot import *
from katlas.feature import *
import pandas as pd, seaborn as sns, numpy as np
import matplotlib.pyplot as plt
Prepare training set
# Global plotting defaults: figure/savefig resolution, the seaborn
# 'notebook' plotting context, and the "ticks" axes style.
sns.set(rc={"figure.dpi":200,'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")
Get data
# Load the table returned by Data.get_ks_dataset() and preview its first rows.
df = Data.get_ks_dataset()
df.head()
kin_sub_site | kinase_uniprot | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | ... | kinase_family | kinase_pspa_big | kinase_pspa_small | SUB | num_phospho | kinase_gene | upper_in_human_phosphoproteome | source_all | substrate_gene | kinase_uniprot_gene | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O00141_A4FU28_S140 | O00141 | A4FU28 | S140 | Sugiyama | CTAGE9 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 140 | AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH | A4FU28_S140 | ... | SGK | basophilic | AKT/ROCK | AAAEEARSLEATCEKLSRSNSELEDEILCLEKDLKEEKSKH | 2 | SGK1 | 1 | human_phosphoproteome|Sugiyama | CTAGE9 | O00141_SGK1 |
1 | O00141_O00141_S252 | O00141 | O00141 | S252 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 252 | SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ | O00141_S252 | ... | SGK | basophilic | AKT/ROCK | SQGHIVLTDFGLCKENIEHNSTTSTFCGTPEYLAPEVLHKQ | 6 | SGK1 | 0 | Sugiyama | SGK1 | O00141_SGK1 |
2 | O00141_O00141_S255 | O00141 | O00141 | S255 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 255 | HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD | O00141_S255 | ... | SGK | basophilic | AKT/ROCK | HIVLTDFGLCKENIEHNSTTSTFCGTPEYLAPEVLHKQPYD | 6 | SGK1 | 1 | human_phosphoproteome|Sugiyama | SGK1 | O00141_SGK1 |
3 | O00141_O00141_S397 | O00141 | O00141 | S397 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 397 | sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA | O00141_S397 | ... | SGK | basophilic | AKT/ROCK | SGPNDLRHFDPEFTEEPVPNSIGKSPDSVLVTASVKEAAEA | 5 | SGK1 | 1 | human_phosphoproteome|Sugiyama | SGK1 | O00141_SGK1 |
4 | O00141_O00141_S404 | O00141 | O00141 | S404 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 404 | HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA | O00141_S404 | ... | SGK | basophilic | AKT/ROCK | HFDPEFTEEPVPNSIGKSPDSVLVTASVKEAAEAFLGFSYA | 5 | SGK1 | 0 | Sugiyama | SGK1 | O00141_SGK1 |
5 rows × 24 columns
# (n_rows, n_columns) of the loaded dataset
df.shape
(187066, 24)
Deal with duplicate site sequences
Here is the process we follow:
- Group by the all-uppercase form of the site sequence
- Build a pivot table whose columns are the kinases (UniProt ID and gene name)
- Each value in the pivot table is the count of entries for that kinase
- Replace the all-uppercase site sequence with the most phosphorylated version of that sequence
SEQ_COL = 'site_seq'

# Uppercase copy of the site sequence; used as the key that identifies
# duplicated sequences regardless of which residues are lowercase.
df['SUB'] = df[SEQ_COL].str.upper()
# Count of lowercase s/t/y characters in the site sequence.
df['num_phospho'] = df[SEQ_COL].str.count(r'[sty]')
# Keep only the first space-separated entry of the kinase gene names.
df['kinase_gene'] = df['kinase_genes'].str.split(' ').str[0]

# Map each uppercase sequence to its variant with the highest num_phospho:
# sort descending so drop_duplicates keeps that variant for every 'SUB'.
seq_map = (
    df.sort_values('num_phospho', ascending=False)
      .drop_duplicates(subset='SUB')
      .set_index('SUB')[SEQ_COL]
)
# Column label for each kinase: "<uniprot>_<gene>".
df['kinase_uniprot_gene'] = df['kinase_uniprot'] + '_' + df['kinase_gene']

# Count rows per (uppercase sequence, kinase) pair and spread the kinases
# into columns; pairs that never occur are filled with 0.
pivot_data = df[['SUB', 'kinase_uniprot_gene']]
pivot_table = pivot_data.groupby(['SUB', 'kinase_uniprot_gene']).size().unstack(fill_value=0)

# replace the uppercase seq with most phosphorylated form
pivot_table.index = pivot_table.index.map(seq_map)
pivot_table = pivot_table.reset_index().rename(columns={'SUB': 'site_seq'})
pivot_table
kinase_uniprot_gene | site_seq | O00141_SGK1 | O00238_BMPR1B | O00311_CDC7 | O00329_PIK3CD | O00418_EEF2K | O00443_PIK3C2A | O00444_PLK4 | O00506_STK25 | O14578_CIT | ... | Q9Y2K2_SIK3 | Q9Y2U5_MAP3K2 | Q9Y3S1_WNK2 | Q9Y463_DYRK1B | Q9Y4K4_MAP4K5 | Q9Y572_RIPK3 | Q9Y5S2_CDC42BPB | Q9Y6E0_STK24 | Q9Y6M4_CSNK1G3 | Q9Y6R4_MAP3K4 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
29151 | ___________________MstVHEILCKLsLEGDHstPPs | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29152 | ___________________MsYRRELEKyRDLDEDEILGAL | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29153 | ___________________MtAKMETtFYDDALNASFLPSE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29154 | ___________________MtSSyGHVLERQPALGGRLDsP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29155 | ___________________MttsQKHRDFVAEPMGEKPVGS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29156 rows × 456 columns
Info table
# Load the table returned by Data.get_human_site() and give it the same
# uppercase key that is used on df.
human = Data.get_human_site()
human['SUB'] = human['site_seq'].str.upper()

# 1 if the uppercase sequence also occurs in `human`, else 0.
df['upper_in_human_phosphoproteome'] = df.SUB.isin(human.SUB).astype(int)
df.upper_in_human_phosphoproteome.value_counts()
upper_in_human_phosphoproteome
1 106327
0 80739
Name: count, dtype: int64
# Prefix the source with 'human_phosphoproteome|' when the flag is 1;
# when it is 0 the prefix is empty, leaving a leading '|' that is stripped.
df['source_all'] = df.upper_in_human_phosphoproteome.map({1:'human_phosphoproteome',0:''})+'|'+df.source
df['source_all'] = df['source_all'].str.lstrip('|')
df['source_all']
0 human_phosphoproteome|Sugiyama
1 Sugiyama
2 human_phosphoproteome|Sugiyama
3 human_phosphoproteome|Sugiyama
4 Sugiyama
...
187061 human_phosphoproteome|Sugiyama
187062 Sugiyama
187063 Sugiyama
187064 human_phosphoproteome|SIGNOR|EPSD|PSP
187065 Sugiyama
Name: source_all, Length: 187066, dtype: object
# Keep only the first space-separated entry of the substrate gene names,
# then count how many rows have no substrate gene.
df['substrate_gene'] = df['substrate_genes'].str.split(' ').str[0]
df.substrate_gene.isna().sum()
7
# Collapse df to one row per uppercase sequence:
#   sub_site       -> comma-joined unique values
#   substrate_gene -> comma-joined unique non-null values
#   source_all     -> '|'-joined set of every individual source token
# NOTE: the source tokens pass through a set, so their order in the joined
# string is arbitrary and can differ between runs; wrap the set in sorted()
# if a reproducible order is required.
df_info = df.groupby('SUB').agg({
    'sub_site': lambda x: ','.join(x.unique()),
    'substrate_gene': lambda x: ','.join(x.dropna().unique()),
    'source_all': lambda x: '|'.join(set('|'.join(x.unique()).split('|'))),
}).reset_index()

# Attach the most phosphorylated variant of each uppercase sequence.
df_info['site_seq'] = df_info['SUB'].map(seq_map)
df_info.columns
Index(['SUB', 'sub_site', 'substrate_gene', 'source_all', 'site_seq'], dtype='object')
# Drop the 'SUB' key, reorder the info columns, and rename 'source_all'
# to 'site_source_all'.
df_info = df_info[['site_seq','source_all','substrate_gene','sub_site']]
df_info.columns = ['site_seq','site_source_all','substrate_gene','sub_site']

# Join the per-site info with the kinase count table on the site sequence.
df_final = df_info.merge(pivot_table,on='site_seq')
df_final
site_seq | site_source_all | substrate_gene | sub_site | O00141_SGK1 | O00238_BMPR1B | O00311_CDC7 | O00329_PIK3CD | O00418_EEF2K | O00443_PIK3C2A | ... | Q9Y2K2_SIK3 | Q9Y2U5_MAP3K2 | Q9Y3S1_WNK2 | Q9Y463_DYRK1B | Q9Y4K4_MAP4K5 | Q9Y572_RIPK3 | Q9Y5S2_CDC42BPB | Q9Y6E0_STK24 | Q9Y6M4_CSNK1G3 | Q9Y6R4_MAP3K4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ | SIGNOR|human_phosphoproteome|PSP|iPTMNet | MAZ | P56270_S460 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP | PSP|human_phosphoproteome|EPSD|Sugiyama | ARID1A | O14497_S363 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH | human_phosphoproteome|PSP|EPSD | RBM4 | Q9BWF3_S309 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs | human_phosphoproteome|Sugiyama | ZFP91 | Q96JP5_S83 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL | Sugiyama | RCN2 | Q14257_S37 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
29151 | ___________________MstVHEILCKLsLEGDHstPPs | SIGNOR|human_phosphoproteome|EPSD | ANXA2 | P07355_S2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29152 | ___________________MsYRRELEKyRDLDEDEILGAL | human_phosphoproteome|PSP|EPSD | TMOD1 | P28289_S2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29153 | ___________________MtAKMETtFYDDALNASFLPSE | SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 | JUN | P05412_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29154 | ___________________MtSSyGHVLERQPALGGRLDsP | Sugiyama | PRRX1 | P54821_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29155 | ___________________MttsQKHRDFVAEPMGEKPVGS | SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 | BANF1 | O75531_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29156 rows × 459 columns
# df_final.to_parquet('~/katlas/dataset/CDDM/ks_datasets_seq_unique_20250407.parquet')
The data is available under Data.get_ks_unique()