Prepare training set

from katlas.core import *
from katlas.plot import *
from katlas.feature import *
import pandas as pd, seaborn as sns, numpy as np
import matplotlib.pyplot as plt
# Global plotting defaults for this notebook: on-screen figures use 200 dpi,
# saved figures use 300 dpi.
sns.set(rc={"figure.dpi":200,'savefig.dpi':300})
# Apply seaborn's 'notebook' context and 'ticks' style on top of the defaults set above.
sns.set_context('notebook')
sns.set_style("ticks")

Get data

# Load the kinase-substrate table via katlas' Data helper and preview the first rows.
df = Data.get_ks_dataset()
df.head()
kin_sub_site kinase_uniprot substrate_uniprot site source substrate_genes substrate_phosphoseq position site_seq sub_site ... kinase_family kinase_pspa_big kinase_pspa_small SUB num_phospho kinase_gene upper_in_human_phosphoproteome source_all substrate_gene kinase_uniprot_gene
0 O00141_A4FU28_S140 O00141 A4FU28 S140 Sugiyama CTAGE9 MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... 140 AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH A4FU28_S140 ... SGK basophilic AKT/ROCK AAAEEARSLEATCEKLSRSNSELEDEILCLEKDLKEEKSKH 2 SGK1 1 human_phosphoproteome|Sugiyama CTAGE9 O00141_SGK1
1 O00141_O00141_S252 O00141 O00141 S252 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 252 SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ O00141_S252 ... SGK basophilic AKT/ROCK SQGHIVLTDFGLCKENIEHNSTTSTFCGTPEYLAPEVLHKQ 6 SGK1 0 Sugiyama SGK1 O00141_SGK1
2 O00141_O00141_S255 O00141 O00141 S255 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 255 HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD O00141_S255 ... SGK basophilic AKT/ROCK HIVLTDFGLCKENIEHNSTTSTFCGTPEYLAPEVLHKQPYD 6 SGK1 1 human_phosphoproteome|Sugiyama SGK1 O00141_SGK1
3 O00141_O00141_S397 O00141 O00141 S397 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 397 sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA O00141_S397 ... SGK basophilic AKT/ROCK SGPNDLRHFDPEFTEEPVPNSIGKSPDSVLVTASVKEAAEA 5 SGK1 1 human_phosphoproteome|Sugiyama SGK1 O00141_SGK1
4 O00141_O00141_S404 O00141 O00141 S404 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 404 HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA O00141_S404 ... SGK basophilic AKT/ROCK HFDPEFTEEPVPNSIGKSPDSVLVTASVKEAAEAFLGFSYA 5 SGK1 0 Sugiyama SGK1 O00141_SGK1

5 rows × 24 columns

# (n_rows, n_columns) of the raw table before de-duplication.
df.shape
(187066, 24)

Deal with duplicate site sequences

Here is the process we follow:

  • group the rows by the all-uppercase form of the site sequence
  • build a pivot table whose columns are the kinases (UniProt ID and gene name)
  • each value in the pivot table is the count of records for that kinase and site sequence
  • replace the all-uppercase site sequence with the most phosphorylated version of that sequence
# Name of the column holding the phospho-site sequence.
SEQ_COL = 'site_seq'

# 'SUB' is the case-insensitive key: every variant of a sequence that differs
# only in which residues are lowercase collapses to the same uppercase string.
df['SUB'] = df[SEQ_COL].str.upper()
# Number of lowercase s/t/y characters in each sequence variant.
df['num_phospho'] = df[SEQ_COL].str.count(r'[sty]')

# Kinase label = UniProt accession + '_' + first space-separated gene name.
df['kinase_gene'] = df['kinase_genes'].str.split(' ').str.get(0)
df['kinase_uniprot_gene'] = df['kinase_uniprot'] + '_' + df['kinase_gene']

# For each uppercase key keep the variant with the highest num_phospho:
# sort descending, then keep the first row per key.
seq_map = (
    df.sort_values('num_phospho', ascending=False)
    .drop_duplicates(subset='SUB')
    .set_index('SUB')[SEQ_COL]
)

# Count the rows for every (uppercase sequence, kinase) pair and spread the
# kinases across columns; pairs that never occur are filled with 0.
pivot_data = df[['SUB', 'kinase_uniprot_gene']]
pivot_table = (
    pivot_data
    .groupby(['SUB', 'kinase_uniprot_gene'])
    .size()
    .unstack(fill_value=0)
)

# Swap the uppercase key for its most phosphorylated variant, then expose it
# as a regular 'site_seq' column.
pivot_table.index = pivot_table.index.map(seq_map)
pivot_table = pivot_table.reset_index().rename(columns={'SUB': 'site_seq'})
pivot_table
kinase_uniprot_gene site_seq O00141_SGK1 O00238_BMPR1B O00311_CDC7 O00329_PIK3CD O00418_EEF2K O00443_PIK3C2A O00444_PLK4 O00506_STK25 O14578_CIT ... Q9Y2K2_SIK3 Q9Y2U5_MAP3K2 Q9Y3S1_WNK2 Q9Y463_DYRK1B Q9Y4K4_MAP4K5 Q9Y572_RIPK3 Q9Y5S2_CDC42BPB Q9Y6E0_STK24 Q9Y6M4_CSNK1G3 Q9Y6R4_MAP3K4
0 AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29151 ___________________MstVHEILCKLsLEGDHstPPs 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29152 ___________________MsYRRELEKyRDLDEDEILGAL 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29153 ___________________MtAKMETtFYDDALNASFLPSE 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29154 ___________________MtSSyGHVLERQPALGGRLDsP 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29155 ___________________MttsQKHRDFVAEPMGEKPVGS 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

29156 rows × 456 columns

Info table

# Load the human site table and give it the same uppercase key used in df.
human = Data.get_human_site()
human['SUB'] = human['site_seq'].str.upper()

# 1 when the uppercase sequence also occurs in the human site table, else 0.
df['upper_in_human_phosphoproteome'] = df['SUB'].isin(human['SUB']).astype(int)
df['upper_in_human_phosphoproteome'].value_counts()
upper_in_human_phosphoproteome
1    106327
0     80739
Name: count, dtype: int64
# Prefix the original source with 'human_phosphoproteome|' for flagged rows.
# Unflagged rows get an empty prefix, which leaves a leading '|' that is
# stripped at the end of the chain.
df['source_all'] = (
    df['upper_in_human_phosphoproteome']
    .map({1: 'human_phosphoproteome', 0: ''})
    .str.cat(df['source'], sep='|')
    .str.lstrip('|')
)
df['source_all']
0                human_phosphoproteome|Sugiyama
1                                      Sugiyama
2                human_phosphoproteome|Sugiyama
3                human_phosphoproteome|Sugiyama
4                                      Sugiyama
                          ...                  
187061           human_phosphoproteome|Sugiyama
187062                                 Sugiyama
187063                                 Sugiyama
187064    human_phosphoproteome|SIGNOR|EPSD|PSP
187065                                 Sugiyama
Name: source_all, Length: 187066, dtype: object
# Keep only the first space-separated entry of 'substrate_genes'.
df['substrate_gene'] = df['substrate_genes'].str.split(' ').str.get(0)
# Number of rows with no substrate gene.
df['substrate_gene'].isna().sum()
7
def _merge_sources(values):
    """Combine '|'-delimited source strings into one de-duplicated string.

    The distinct sources are sorted before joining so the result is
    reproducible: joining a bare ``set`` yields an order that depends on
    Python's per-process string hash randomisation.
    """
    tokens = set()
    for entry in values.unique():
        tokens.update(entry.split('|'))
    return '|'.join(sorted(tokens))


# One row per uppercase sequence:
#   sub_site       - distinct substrate sites, comma-joined in first-seen order
#   substrate_gene - distinct non-missing genes, comma-joined in first-seen order
#   source_all     - sorted union of every source reported for the sequence
df_info = (
    df.groupby('SUB')
    .agg({'sub_site': lambda x: ','.join(x.unique()),
          'substrate_gene': lambda x: ','.join(x.dropna().unique()),
          'source_all': _merge_sources})
    .reset_index()
)
# Attach the most phosphorylated variant of each uppercase sequence.
df_info['site_seq'] = df_info['SUB'].map(seq_map)
df_info.columns
Index(['SUB', 'sub_site', 'substrate_gene', 'source_all', 'site_seq'], dtype='object')
# Keep the descriptive columns in their final order and rename 'source_all'
# to 'site_source_all'.
df_info = (
    df_info[['site_seq', 'source_all', 'substrate_gene', 'sub_site']]
    .rename(columns={'source_all': 'site_source_all'})
)
# Join the per-sequence info with the kinase count matrix on the sequence.
df_final = pd.merge(df_info, pivot_table, on='site_seq')
df_final
site_seq site_source_all substrate_gene sub_site O00141_SGK1 O00238_BMPR1B O00311_CDC7 O00329_PIK3CD O00418_EEF2K O00443_PIK3C2A ... Q9Y2K2_SIK3 Q9Y2U5_MAP3K2 Q9Y3S1_WNK2 Q9Y463_DYRK1B Q9Y4K4_MAP4K5 Q9Y572_RIPK3 Q9Y5S2_CDC42BPB Q9Y6E0_STK24 Q9Y6M4_CSNK1G3 Q9Y6R4_MAP3K4
0 AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ SIGNOR|human_phosphoproteome|PSP|iPTMNet MAZ P56270_S460 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP PSP|human_phosphoproteome|EPSD|Sugiyama ARID1A O14497_S363 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH human_phosphoproteome|PSP|EPSD RBM4 Q9BWF3_S309 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs human_phosphoproteome|Sugiyama ZFP91 Q96JP5_S83 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL Sugiyama RCN2 Q14257_S37 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29151 ___________________MstVHEILCKLsLEGDHstPPs SIGNOR|human_phosphoproteome|EPSD ANXA2 P07355_S2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29152 ___________________MsYRRELEKyRDLDEDEILGAL human_phosphoproteome|PSP|EPSD TMOD1 P28289_S2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29153 ___________________MtAKMETtFYDDALNASFLPSE SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 JUN P05412_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29154 ___________________MtSSyGHVLERQPALGGRLDsP Sugiyama PRRX1 P54821_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29155 ___________________MttsQKHRDFVAEPMGEKPVGS SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 BANF1 O75531_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

29156 rows × 459 columns

# df_final.to_parquet('~/katlas/dataset/CDDM/ks_datasets_seq_unique_20250407.parquet')

The data is available under Data.get_ks_unique()