Preprocess kinase-substrate datasets

Motif

Poll [Sugiyama & Douglass]: https://biosignaling.biomedcentral.com/articles/10.1186/s12964-023-01436-2

Poll web motif: https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/KinaseTree.html; https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/GeneSymbol.html

Sugiyama: https://pmc.ncbi.nlm.nih.gov/articles/PMC6642169/

Douglass: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012

RegPhos: Supplementary Table S4: motifs grouped by kinase family

PhosphoNetwork, logos of 300 motifs: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/; web app: https://phosphonetworks.org/

GPS: https://gps.biocuckoo.cn/links.php#l1

Review of database

Wilson, AACR cancer research: https://aacrjournals.org/cancerres/article/78/1/15/625062/New-Perspectives-Opportunities-and-Challenges-in

GPS 6.0, List of database: https://gps.biocuckoo.cn/links.php

paper: https://academic.oup.com/nar/article/51/W1/W243/7157529

phospho.ELM links: http://phospho.elm.eu.org/links.html

GPS, https://gps.biocuckoo.cn/links.php#l1

Set up

from katlas.core import *
import pandas as pd
def map_substrate(idmapping_fname, ori_df, sub_col,remove_nonhuman=True):
    """Attach UniProt entry, gene names, sequence and organism to each substrate row.

    Parameters
    ----------
    idmapping_fname : str, path-like or pandas.DataFrame
        Excel file read with ``pd.read_excel``, or an already-loaded table.
        It must contain the columns 'From', 'Entry', 'Gene Names',
        'Sequence' and 'Organism'.
    ori_df : pandas.DataFrame
        Table holding the substrate identifiers; it is not modified.
    sub_col : str
        Column of ``ori_df`` whose values are matched against 'From'.
    remove_nonhuman : bool, default True
        Keep only rows whose organism is exactly 'Homo sapiens (Human)'.

    Returns
    -------
    pandas.DataFrame
        ``ori_df`` without ``sub_col`` and with the added columns
        'substrate_uniprot', 'substrate_genes', 'substrate_sequence' and
        'substrate_species'. Rows with an unmapped ID or a missing sequence
        are dropped, and the index is reset.

    Raises
    ------
    KeyError
        If the mapping table lacks a required column or ``sub_col`` is not
        a column of ``ori_df``.
    """
    mapping_cols = ['From', 'Entry', 'Gene Names', 'Sequence', 'Organism']
    new_cols = ['substrate_uniprot', 'substrate_genes',
                'substrate_sequence', 'substrate_species']

    # Accept a pre-loaded table so the same mapping need not be re-read.
    if isinstance(idmapping_fname, pd.DataFrame):
        substrate_id = idmapping_fname
    else:
        substrate_id = pd.read_excel(idmapping_fname)

    missing = [c for c in mapping_cols if c not in substrate_id.columns]
    if missing:
        raise KeyError(f'ID-mapping table is missing columns: {missing}')
    if sub_col not in ori_df.columns:
        raise KeyError(f'{sub_col!r} is not a column of ori_df')

    # One mapping row per query ID: the first result wins when an ID has several.
    substrate_id = substrate_id.drop_duplicates('From')[mapping_cols]
    ori_df = ori_df.copy()

    # prevent name conflict: the key column must not share a name with any
    # of the columns added by the merge
    if sub_col in new_cols:
        tmp_col = f'{sub_col}_tmp'
        ori_df = ori_df.rename(columns={sub_col: tmp_col})
        sub_col = tmp_col

    substrate_id = substrate_id.rename(
        columns=dict(zip(mapping_cols, [sub_col] + new_cols)))

    print('Shape before processing', ori_df.shape)

    # The inner merge below discards unmapped IDs; report how many rows that is
    # so the loss is not mistaken for the non-human filter.
    n_unmapped = int((~ori_df[sub_col].isin(substrate_id[sub_col])).sum())
    if n_unmapped:
        print('Rows dropped because the ID was not mapped:', n_unmapped)

    ori_df = ori_df.merge(substrate_id, on=sub_col)

    print('Species counts:',ori_df.substrate_species.value_counts())

    if remove_nonhuman:
        print('Removing non-human substrates')
        ori_df = ori_df[ori_df.substrate_species=='Homo sapiens (Human)']

    ori_df = ori_df.drop(columns=[sub_col])  # Drop the original/temp key column after merging
    ori_df = ori_df.dropna(subset=['substrate_sequence'])
    print('Shape after processing', ori_df.shape)

    return ori_df.reset_index(drop=True)
kinase_uniprot=pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')

Sugiyama dataset

Data

Go to https://www.nature.com/articles/s41598-019-46385-4#Sec21, download table S2.

Modify the header (2 lines to 1 line) so that it can be read by pandas.

df = pd.read_csv('raw/Large_scale_S2.csv').iloc[:,:-2]
df.head()
Type Kinase Substrate_uniprot Position
0 TK ABL1 1433B_HUMAN S212
1 TK ABL1 1433B_HUMAN Y151
2 TK ABL1 1433B_HUMAN Y21
3 TK ABL1 1433B_HUMAN Y50
4 TK ABL1 1433E_HUMAN Y152
df.shape
(198536, 4)

Kinase mapping

# pd.DataFrame(df.Kinase.unique())[0].str.split('/').str[0].to_csv('raw/sugiyama_kinase_name.csv')

Map each kinase name with uniprot ID

kinase_id = pd.read_csv('raw/LS_info2.csv').iloc[:,:3]
df = df.merge(kinase_id)
df
Type Kinase Substrate_uniprot Position kinase_uniprot kinase_paper
0 TK ABL1 1433B_HUMAN S212 P00519 ABL1
1 TK ABL1 1433B_HUMAN Y151 P00519 ABL1
2 TK ABL1 1433B_HUMAN Y21 P00519 ABL1
3 TK ABL1 1433B_HUMAN Y50 P00519 ABL1
4 TK ABL1 1433E_HUMAN Y152 P00519 ABL1
... ... ... ... ... ... ...
198531 LK SPHK2 TICN3_HUMAN T118 Q9NRA0 NaN
198532 LK SPHK2 TPM4_HUMAN T241 Q9NRA0 NaN
198533 LK SPHK2 ULK3_HUMAN S305 Q9NRA0 NaN
198534 LK SPHK2 ZRAB2_HUMAN S165 Q9NRA0 NaN
198535 LK SPHK2 ZRAB2_HUMAN S181 Q9NRA0 NaN

198536 rows × 6 columns

Substrate mapping

# pd.DataFrame(df.Substrate_uniprot.unique()).to_csv('raw/sugiyama_uniprot_id.csv')

ID mapping of Substrate_uniprot to uniprot ID in https://www.uniprot.org/id-mapping

3,753 IDs were mapped to 3,753 results

375 IDs were not mapped

df = map_substrate('raw/idmapping_2025_03_02.xlsx',df,'Substrate_uniprot')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (198536, 6)
Species counts: substrate_species
Homo sapiens (Human)    187129
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (187129, 9)
# lipid kinase
df[df.Type=='LK'].Kinase.unique()
array(['PIK3C3', 'PIK3CB/PIK3R1', 'PIK3CD/PIK3R1', 'PIK3CG/PIK3R5',
       'PIK3CG', 'PIK4CA', 'PIP4K2B', 'PIP5K1C', 'SPHK1', 'SPHK2'],
      dtype=object)
df.isna().sum()
Type                      0
Kinase                    0
Position                  0
kinase_uniprot        16591
kinase_paper          18891
substrate_uniprot         0
substrate_genes          11
substrate_sequence        0
substrate_species         0
dtype: int64
df.head()
Type Kinase Position kinase_uniprot kinase_paper substrate_uniprot substrate_genes substrate_sequence substrate_species
0 TK ABL1 S212 P00519 ABL1 P31946 YWHAB MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... Homo sapiens (Human)
1 TK ABL1 Y151 P00519 ABL1 P31946 YWHAB MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... Homo sapiens (Human)
2 TK ABL1 Y21 P00519 ABL1 P31946 YWHAB MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... Homo sapiens (Human)
3 TK ABL1 Y50 P00519 ABL1 P31946 YWHAB MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... Homo sapiens (Human)
4 TK ABL1 Y152 P00519 ABL1 P62258 YWHAE MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... Homo sapiens (Human)
df = df.rename(columns = {'Position':'site'})
df.shape
(187129, 9)

Validate site

df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1    184948
0      2181
Name: count, dtype: int64
# Keep only rows whose site residue matches the sequence. Take an explicit
# copy so later column assignments (e.g. df['source'] = ...) write to an
# independent frame instead of a slice of the unfiltered one.
df = df[df.site_match == 1].copy()
df.shape
(184948, 10)

Save

df['source']='Sugiyama'
# df.to_excel('raw/sugiyama.xlsx',index=False)

PhosphoSitePlus (PSP)

Data

PSP paper: https://academic.oup.com/nar/article/43/D1/D512/2439467

Go to https://www.phosphosite.org/staticDownloads, download Kinase_Substrate_Dataset.txt

import pandas as pd
psp = pd.read_csv('raw/Kinase_Substrate_Dataset_final.csv')
psp.head()
GENE Kinase KIN_ACC_ID KIN_ORGANISM SUBSTRATE SUB_GENE_ID SUB_ACC_ID SUB_GENE SUB_ORGANISM SUB_MOD_RSD SITE_GRP_ID substrate DOMAIN IN_VIVO_RXN IN_VITRO_RXN CST_CAT# kinase_uniprot kinase_paper
0 Dyrk2 DYRK2 Q5U4C9 mouse NDEL1 83431.0 Q9ERR1 Ndel1 mouse S336 1869686801 LGSsRPSsAPGMLPL NaN X NaN Q92630 DYRK2
1 DYRK2 DYRK2 Q92630 human GLI2 14633.0 Q0VGT2 Gli2 mouse S385 3339001 AEGLRPAsPLGLTQE NaN X NaN Q92630 DYRK2
2 DYRK2 DYRK2 Q92630 human SIAH2 6478.0 O43255 SIAH2 human S68 5393502 GGGAGPVsPQHHELT NaN X NaN Q92630 DYRK2
3 DYRK2 DYRK2 Q92630 human CARHSP1 23589.0 Q9Y2V2 CARHSP1 human S41 455702 LRGNVVPsPLPtRRt NaN X NaN Q92630 DYRK2
4 DYRK2 DYRK2 Q92630 human Doublecortin iso2 1641.0 O43602-2 DCX human S306 454122 GPMRRSKsPADSANG NaN X NaN Q92630 DYRK2
psp =psp[psp.KIN_ORGANISM=='human'].reset_index(drop=True) 

psp =psp[psp.SUB_ORGANISM=='human'].reset_index(drop=True)
psp.shape
(14081, 18)
psp = psp[['KIN_ACC_ID','kinase_paper','GENE','SUB_ACC_ID','SUB_GENE','SUB_MOD_RSD','substrate']]
psp.head()
KIN_ACC_ID kinase_paper GENE SUB_ACC_ID SUB_GENE SUB_MOD_RSD substrate
0 Q92630 DYRK2 DYRK2 O43255 SIAH2 S68 GGGAGPVsPQHHELT
1 Q92630 DYRK2 DYRK2 Q9Y2V2 CARHSP1 S41 LRGNVVPsPLPtRRt
2 Q92630 DYRK2 DYRK2 O43602-2 DCX S306 GPMRRSKsPADSANG
3 Q92630 DYRK2 DYRK2 P30304 CDC25A S283 PErsQEEsPPGSTKr
4 Q92630 DYRK2 DYRK2 O43255 SIAH2 T119 PTCRGALtPSIRNLA

Substrate mapping

# psp.SUB_ACC_ID.drop_duplicates().to_csv('raw/psp_substrate_id.csv')

4,441 IDs were mapped to 4,446 results

19 IDs were not mapped: NP_001184222 NP_001100737 AAA40678 HSBO22 NP_776683 NP_579829 P18433-2 NP_001076191 NP_001099740 NP_001005762 NP_001178533 XP_008773743 NP_001104263 ABR15760 AAB24204 AAB24205 BAA34185.2 NP_001103022 AAC50053

IDs with multiple results: P62991, P62988

psp = map_substrate('raw/idmapping_2025_03_02_psp.xlsx',psp,'SUB_ACC_ID')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (14081, 7)
Species counts: substrate_species
Homo sapiens (Human)    14069
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14069, 10)
psp.head()
KIN_ACC_ID kinase_paper GENE SUB_GENE SUB_MOD_RSD substrate substrate_uniprot substrate_genes substrate_sequence substrate_species
0 Q92630 DYRK2 DYRK2 SIAH2 S68 GGGAGPVsPQHHELT O43255 SIAH2 MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... Homo sapiens (Human)
1 Q92630 DYRK2 DYRK2 CARHSP1 S41 LRGNVVPsPLPtRRt Q9Y2V2 CARHSP1 MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR... Homo sapiens (Human)
2 Q92630 DYRK2 DYRK2 DCX S306 GPMRRSKsPADSANG O43602 DCX DBCN LISX MELDFGHFDERDKTSRNMRGSRMNGLPSPTHSAHCSFYRTRTLQAL... Homo sapiens (Human)
3 Q92630 DYRK2 DYRK2 CDC25A S283 PErsQEEsPPGSTKr P30304 CDC25A MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT... Homo sapiens (Human)
4 Q92630 DYRK2 DYRK2 SIAH2 T119 PTCRGALtPSIRNLA O43255 SIAH2 MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... Homo sapiens (Human)
psp.shape
(14069, 10)

Kinase mapping

# psp.KIN_ACC_ID.drop_duplicates().to_csv('raw/psp_kin_id.csv')

Map KIN_ACC_ID to UniProt ID

440 IDs were mapped to 440 results

1 ID was not mapped:

AAA58698

kinase_id = pd.read_excel('raw/idmapping_2025_03_02_psp_kinase.xlsx')[['From','Entry','Gene Names']]
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
kinase_id.columns = ['KIN_ACC_ID','kinase_uniprot','kinase_genes']
kinase_id[kinase_id.KIN_ACC_ID=="A9UF07"]
KIN_ACC_ID kinase_uniprot kinase_genes
124 A9UF07 A9UF07 BCR/ABL fusion
# A9UF07 is the BCR/ABL fusion entry; attribute its sites to P00519 instead.
psp['KIN_ACC_ID'] = psp['KIN_ACC_ID'].replace('A9UF07','P00519')
# Name the join key explicitly: a bare merge joins on every column name the two
# frames share, which would silently change if either frame gains a column.
psp = psp.merge(kinase_id, on='KIN_ACC_ID')
psp = psp.rename(columns={'SUB_MOD_RSD':'site'})
psp = psp[['kinase_uniprot','kinase_genes','kinase_paper','substrate_uniprot','substrate_genes','site','substrate_sequence','substrate','substrate_species']]
psp.isna().sum()
kinase_uniprot          0
kinase_genes            0
kinase_paper          134
substrate_uniprot       0
substrate_genes         0
site                    0
substrate_sequence      0
substrate               0
substrate_species       0
dtype: int64
psp = psp.dropna(subset='substrate_sequence').reset_index(drop=True)

Check if kinase-uniprot belongs to kinase class

Not mapped:

psp[~psp.kinase_uniprot.isin(kinase_uniprot.Entry)].kinase_genes.value_counts()
kinase_genes
CSNK2B CK2N G5A                13
PRKAG2                          9
PRKAB1 AMPK                     9
GTF2F1 RAP74                    4
BLVRA BLVR BVR                  3
TGM2                            2
RET/PTC2                        2
PHKA1 PHKA                      1
HSPA5 GRP78                     1
ENPP3 PDNP3                     1
PIK3R1 GRB1                     1
CERT1 CERT COL4A3BP STARD11     1
JMJD6 KIAA0585 PSR PTDSR        1
Name: count, dtype: int64

CSNK2B is a CK2 subunit, and PRKAB1 and PRKAG2 are AMPK subunits; since these account for only a few rows, we simply drop them.

psp = psp[psp.kinase_uniprot.isin(kinase_uniprot.Entry)].copy()
psp.shape
(14016, 9)

Validate site

Check whether the reported site window matches the substrate sequence

psp['position'] = psp['site'].str[1:].astype(int)
psp['site_seq']=extract_site_seq(psp,seq_col='substrate_sequence',position_col='position')
100%|██████████| 14016/14016 [00:00<00:00, 25600.07it/s]
(psp['site_seq']==psp['substrate'].str.upper()).value_counts()
True     13156
False      860
Name: count, dtype: int64
unmatch = psp[~(psp['site_seq']==psp['substrate'].str.upper())]
unmatch.kinase_genes.value_counts().head(10)
kinase_genes
PRKACA PKACA                                  40
MAPK1 ERK2 PRKM1 PRKM2                        40
SRC SRC1                                      39
CDK1 CDC2 CDC28A CDKN1 P34CDC2                38
GSK3B                                         37
MAPK3 ERK1 PRKM3                              35
AURKB AIK2 AIM1 AIRK2 ARK2 STK1 STK12 STK5    33
PRKCA PKCA PRKACA                             33
PRKCB PKCB PRKCB1                             25
CDK5 CDKN5 PSSALRE                            25
Name: count, dtype: int64

We’ll drop the rows whose site window does not match the substrate sequence

psp = psp[psp['site_seq']==psp['substrate'].str.upper()]

Drop sites whose central amino acid is not S, T or Y

psp['site_seq'].str[7].value_counts()
site_seq
S    8303
T    2812
Y    2029
H       6
K       4
R       2
Name: count, dtype: int64
# Keep only windows whose central residue (index 7) is S, T or Y. Copy the
# result so the following column assignment (psp['site_match'] = ...) does
# not write into a slice of the previous frame.
psp = psp[psp['site_seq'].str[7].str.upper().isin(list('STY'))].copy()

Drop site with unmatched residue and position

psp['site_match'] =validate_site_df(psp,'site','substrate_sequence')
psp['site_match'].value_counts()
site_match
1    13144
Name: count, dtype: int64
psp.shape
(13144, 12)

Save

psp['source']='PSP'
psp.head()
kinase_uniprot kinase_genes kinase_paper substrate_uniprot substrate_genes site substrate_sequence substrate substrate_species position site_seq site_match source
0 Q92630 DYRK2 DYRK2 O43255 SIAH2 S68 MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... GGGAGPVsPQHHELT Homo sapiens (Human) 68 GGGAGPVSPQHHELT 1 PSP
1 Q92630 DYRK2 DYRK2 Q9Y2V2 CARHSP1 S41 MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR... LRGNVVPsPLPtRRt Homo sapiens (Human) 41 LRGNVVPSPLPTRRT 1 PSP
3 Q92630 DYRK2 DYRK2 P30304 CDC25A S283 MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT... PErsQEEsPPGSTKr Homo sapiens (Human) 283 PERSQEESPPGSTKR 1 PSP
4 Q92630 DYRK2 DYRK2 O43255 SIAH2 T119 MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... PTCRGALtPSIRNLA Homo sapiens (Human) 119 PTCRGALTPSIRNLA 1 PSP
5 Q92630 DYRK2 DYRK2 O75449 KATNA1 S42 MSLLMISENVKLAREYALLGNYDSAMVYYQGVLDQMNKYLYSVKDT... QMNKYLYsVkDTYLQ Homo sapiens (Human) 42 QMNKYLYSVKDTYLQ 1 PSP
# psp.to_excel('raw/psp_human.xlsx',index=False)

KiNET (EPSD + iPTMNet)

an integration of PSP, iPTMNet, EPSD

Data

KiNET paper: https://www.nature.com/articles/s41540-024-00442-5

KiNET web: https://kinet.kinametrix.com/

iPTMnet paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC5753337/;

iPTMnet web: https://research.bioinformatics.udel.edu/iptmnet/ ; can’t directly download through Download

EPSD paper: https://academic.oup.com/bib/article/22/1/298/5686325?login=false

EPSD web: https://epsd.biocuckoo.cn/Download.php

Go to https://kinet.kinametrix.com/, and click Download the full KiNet interaction dataset

df = pd.read_csv('raw/ksi_source_full_dataset.csv')
df = df.dropna(subset='Kinase')
df['Source Database'].value_counts()
Source Database
PhosphoSitePlus    13135
EPSD               10442
iPTMNet             3846
Name: count, dtype: int64

As we already included PSP, we will drop it

df = df[df['Source Database']!='PhosphoSitePlus']
df.shape
(14288, 8)

Kinase mapping

df.Kinase.isin(kinase_uniprot.Entry).value_counts()
Kinase
True     14283
False        5
Name: count, dtype: int64
df = df[df.Kinase.isin(kinase_uniprot.Entry)]

Substrate mapping

# df.Substrate.drop_duplicates().to_csv('raw/KiNet_substrate_id.csv')

Map the id to uniprot seq

df = map_substrate('raw/idmapping_2025_03_02_KiNET_substrate.xlsx',df,'Substrate')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (14283, 8)
Species counts: substrate_species
Homo sapiens (Human)    14283
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14283, 11)
# substrate_id = pd.read_excel('raw/idmapping_2025_03_02_KiNET_substrate.xlsx')[['From','Entry','Gene Names','Sequence']]

# substrate_id.columns = ['Substrate','substrate_uniprot','substrate_genes','substrate_sequence']

# df = df.merge(substrate_id)
# Select the columns to keep. Copy the subset so the later renaming and the
# df['site_match'] assignment operate on an independent frame.
df = df[['Kinase','Kinase Name','substrate_uniprot','substrate_genes','Site','Source Database','Evidence','substrate_sequence']].copy()
df.columns
Index(['Kinase', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'Site',
       'Source Database', 'Evidence', 'substrate_sequence'],
      dtype='object')
df.columns = ['kinase_uniprot', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'site',

       'source', 'evidence', 'substrate_sequence']
df.shape
(14283, 8)

Validate site

We drop sites whose annotated residue does not match the residue found at that position in the protein sequence

df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1    13786
0      497
Name: count, dtype: int64
df[df.site_match==0].source.value_counts()
source
iPTMNet    368
EPSD       129
Name: count, dtype: int64
df[df.site_match==1].source.value_counts()
source
EPSD       10308
iPTMNet     3478
Name: count, dtype: int64
df=df[df.site_match==1]
df.shape
(13786, 9)

Save

df.head()
kinase_uniprot Kinase Name substrate_uniprot substrate_genes site source evidence substrate_sequence site_match
0 O00141 SGK1 O00213 APBB1 FE65 RIR S610 EPSD Unspecified experimental method MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG... 1
1 O00141 SGK1 O14920 IKBKB IKKB S181 EPSD Unspecified experimental method MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC... 1
2 O00141 SGK1 O15111 CHUK IKKA TCF16 S180 EPSD Unspecified experimental method MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC... 1
3 O00141 SGK1 O43524 FOXO3 FKHRL1 FOXO3A T32 EPSD Unspecified experimental method MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK... 1
4 O00141 SGK1 O43524 FOXO3 FKHRL1 FOXO3A T32 iPTMNet Text mining MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK... 1
# df.to_excel('raw/KiNet.xlsx',index=False)

# df[df['source'] =='EPSD'].to_excel('raw/EPSD.xlsx',index=False)

# df[df['source'] =='iPTMNet'].to_excel('raw/iPTMNet.xlsx',index=False)

Phospho.ELM

Data

paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC3013696/

Go to http://phospho.elm.eu.org/dataset.html, fill info–>accept, then download the dataset

Open dump in excel, filter human and non-blank from kinases

elm=pd.read_csv('raw/phosphoELM.csv')
elm.shape
(3599, 6)
elm.head()
substrate_uniprot position acceptor kinase LTP_HTP species
0 O14543 204 Y Lck LTP Homo sapiens
1 O14543 221 Y Lck LTP Homo sapiens
2 O14746 824 S PKB_group LTP Homo sapiens
3 O14746 227 S PKB_group LTP Homo sapiens
4 O14746 707 Y SRC LTP Homo sapiens
elm.kinase = elm.kinase.str.upper()

Kinase mapping

# Build a gene-name -> UniProt entry lookup from the human kinase table.
kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')

# 'Gene Names' holds space-separated names; turn each cell into a list.
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')

# One row per (entry, gene name) pair.
kinase_id = kinase_id.explode('Gene Names')
# NOTE(review): a gene name shared by several entries keeps only one of them
# in this dict; the ambiguous names are collected in dup_name just below.
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()
# Gene names that occur on more than one row, i.e. ambiguous names.
dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])
# PAK1 and PASK are taken out of the ambiguous set and pinned to a fixed entry
# below. set.remove raises KeyError if the name is not in the set.
dup_name.remove('PAK1')

dup_name.remove('PASK')
gene2uniprot['PAK1'] = 'Q13153'

gene2uniprot['PASK']='Q96RG2'
# Drop ELM rows whose kinase name is still ambiguous.
elm = elm[~elm.kinase.isin(dup_name)].copy()
elm[~elm.kinase.isin(gene2uniprot.keys())].kinase.unique()
array(['PKB_GROUP', 'IKK_GROUP', 'AURORA B', 'PDK-1', 'PKC_THETA',
       'SGK_GROUP', 'IKK_BETA', 'CDK_GROUP', 'CAM-KII_ALPHA', 'PKC_GROUP',
       'PKC_ALPHA', 'CK2_GROUP', 'CK2_ALPHA', 'GSK-3_GROUP',
       'GSK-3_ALPHA', 'GSK-3_BETA', 'PKA_GROUP', 'RSK_GROUP', 'PKB_BETA',
       'MAPK_GROUP', 'PAK_GROUP', 'DNA-PK', 'JNK_GROUP', 'CK1_GROUP',
       'AURORA A', 'P70S6KB', 'PKG/CGK_GROUP', 'PKC_DELTA', 'PDGFR_BETA',
       'CAM-KII_GROUP', 'GRK-2', 'GRK-5', 'GRK_GROUP', 'CK1_ALPHA',
       'PKC_EPSILON', 'ROCK_GROUP', 'PKG1/CGK-I', 'PDGFR_GROUP',
       'P70S6K_GROUP', 'P70S6K', 'PHK_GROUP', 'PKC_BETA', 'DMPK_GROUP',
       'CK1_DELTA', 'CK2_BETA', 'RSK-3', 'RSK-2', 'CAM-KIV', 'RSK-1',
       'IKK_ALPHA', 'CK1_EPSILON', 'MAP2K_GROUP', 'PIM-1', 'EG3 KINASE',
       'GRK-4', 'PKC_ZETA', 'PDKC', 'AMPK_GROUP', 'FGFR_GROUP',
       'JAK_GROUP', 'MAP3K_GROUP', 'RSK-5', 'IKK_EPSILON', 'CAM-KI_GROUP',
       'SRC_GROUP', 'PKA_ALPHA', 'PKC_ETA', 'PDK-2', 'GRK-6',
       'TITIN KINASE', 'CAM-KI_ALPHA', 'GRK-1', 'PDGFR_ALPHA', 'CCDPK',
       'MARK_GROUP', 'P38_GROUP', 'GRK-3', 'DAPK_GROUP', 'MRCKA',
       'PKG2/CGK-II', 'PKC_GAMMA', 'PKC_IOTA', 'CAM-KK_ALPHA'],
      dtype=object)

map them to genes

ids = pd.read_csv('raw/elm_kinase_id.csv')
ids = ids.set_index('kinase')['kinase_gene'].to_dict()
elm['kinase_genes'] = elm.kinase.map(ids).fillna(elm.kinase)
# for kinase group, we only consider the first two items

elm['kinase_genes'] = elm.kinase_genes.str.split(' ').str[:2]
elm = elm.explode('kinase_genes')
elm[elm.kinase_genes.isin(dup_name)].kinase_genes.unique()
array(['PRKACA'], dtype=object)
dup_name.remove('PRKACA')

gene2uniprot['PRKACA']= 'P17612'
elm[elm.kinase_genes.isin(dup_name)]
substrate_uniprot position acceptor kinase LTP_HTP species kinase_genes

unmapped:

elm[elm.kinase_genes.map(gene2uniprot).isna()].kinase_genes.unique()
array(['PHKA1', 'PHKA2', 'CSNK2B', 'CDPK1', 'CDPK2'], dtype=object)
elm['kinase_uniprot'] = elm.kinase_genes.map(gene2uniprot)
elm = elm.dropna(subset='kinase_uniprot')

Substrate mapping

# elm.substrate_uniprot.drop_duplicates().to_csv('raw/elm_substrate_id.csv')

916 IDs were mapped to 919 results

10 IDs were not mapped: ENSP00000328213 ENSP00000343690 ENSP00000352232 ENSP00000347528 ENSP00000357298 ENSP00000248996 ENSP00000357225 ENSP00000261937 ENSP00000248419 ENSP00000267569

Found two of the unmapped are kinases:

ensp = {'ENSP00000328213':'P06239','ENSP00000261937':'P35916'}
elm.substrate_uniprot = elm.substrate_uniprot.map(ensp).fillna(elm.substrate_uniprot)
elm = map_substrate('raw/idmapping_2025_03_12_elm.xlsx',elm,'substrate_uniprot')
Shape before processing (4699, 8)
Species counts: substrate_species
Homo sapiens (Human)    4675
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (4675, 11)
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
elm.shape
(4675, 11)

Validate site

elm.acceptor.value_counts()
acceptor
S    3018
T     908
Y     749
Name: count, dtype: int64
elm['site']=elm['acceptor']+elm['position'].astype(str)
elm['site_match'] =validate_site_df(elm,'site','substrate_sequence')
elm.site_match.value_counts()
site_match
1    4519
0     156
Name: count, dtype: int64
elm = elm[elm.site_match==1]
elm.shape
(4519, 13)

Save

elm.columns
Index(['position', 'acceptor', 'kinase', 'LTP_HTP', 'species', 'kinase_genes',
       'kinase_uniprot', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species', 'site', 'site_match'],
      dtype='object')
# Columns kept in the saved ELM table.
col = ['kinase','kinase_uniprot', 'kinase_genes',

       'substrate_uniprot', 'substrate_genes','site','LTP_HTP', 'species', 

       'substrate_sequence',]
# Copy the subset so adding 'source' writes to an independent frame rather
# than a slice of the filtered one.
elm = elm[col].copy()
elm['source']='ELM'
# elm.to_excel('raw/ELM.xlsx',index=False)

Signor

Data

paper: https://academic.oup.com/nar/article/51/D1/D631/6761728

web: https://signor.uniroma2.it/

Go to web link, Downloads –> Latest Release download (Jan 2025)

Open in excel, filter mechanism to be phosphorylation

sig = pd.read_excel('raw/signor_phosphorylation.xlsx')
sig.shape
(12973, 27)

Kinase mapping

The TYPEA column includes protein complexes; for those we need to identify which member is the kinase

comp = sig[sig.TYPEA=='complex'].copy()
# comp.ENTITYA.drop_duplicates().to_csv('raw/sig_complex.csv')
comp_id = pd.read_csv('raw/sig_complex_label.csv')
comp_id = comp_id.set_index('ENTITYA')['kinase_gene']
comp['kinase_gene'] = comp.ENTITYA.map(comp_id)
comp = comp.dropna(subset='kinase_gene')
comp.shape
(742, 28)

Fusion proteins

fus_id = {'BCR-ABL':'ABL1','EML4-ALK':'ALK'}
fus = sig[sig.TYPEA=='fusion protein'].copy()
fus['kinase_gene'] = fus.ENTITYA.map(fus_id)

For protein families, we assign each interaction to the first two family members only

fam = sig[sig.TYPEA=='proteinfamily'].copy()
# fam.ENTITYA.drop_duplicates().to_csv('raw/sig_fam.csv')
fam_id = pd.read_csv('raw/sig_fam_label.csv')
fam_id.head()
ENTITYA kinase_gene
0 ERK1/2 MAPK3 MAPK1
1 AKT AKT1 AKT2 AKT3
2 RPS6K RPS6KB1 RPS6KB2
3 p38 MAPK14 MAPK11 MAPK12 MAPK13
4 JNK MAPK8 MAPK9 MAPK10
fam_id = fam_id.set_index('ENTITYA')['kinase_gene']
fam['kinase_gene'] = fam.ENTITYA.map(fam_id)

fam = fam.dropna(subset='kinase_gene')
fam['kinase_gene'] = fam.kinase_gene.str.split(' ').str[:2]
fam = fam.explode('kinase_gene')

Protein:

pro = sig[sig.TYPEA=='protein'].copy()

for consistency:

pro['kinase_gene'] = pro['ENTITYA']

combine:

df = pd.concat([pro,comp,fus,fam])

Mapping:

kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')
kinase_id = kinase_id.explode('Gene Names')

Some kinases share same gene names:

# kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()

We’ll drop them to prevent confusion.

dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])
df.kinase_gene.isin(dup_name).sum()
np.int64(674)
# Drop rows whose kinase gene name is ambiguous. The lookup further down
# upper-cases kinase_gene before mapping, so compare the upper-cased name
# here as well; otherwise a mixed-case spelling of an ambiguous name would
# pass this filter and still be mapped to one arbitrary entry.
df = df[~df.kinase_gene.str.upper().isin(dup_name)].copy()
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()
df.shape
(13013, 28)

unmapped:

df[df.kinase_gene.str.upper().map(gene2uniprot).isna()].kinase_gene.unique()
array(['HRAS', 'IL4R', 'CSNK2B', 'TLR4', 'IL6R', 'THAP12', 'PRKAR2A',
       'RHOA', 'AREG', 'RET/PTC2', 'PCSK7', 'CCNC', 'TGM2', 'SLC12A1',
       'RALA', 'IL6ST', 'CRK', 'SMAD9', 'PLCG1', 'ELOC', 'PPP2CA', 'CCR5',
       'GHR', 'CAD', 'SMO', 'FRS2', 'LEPR', 'BLVRA', 'TAB1', 'PRKAB1',
       'PIAS4', 'IFNAR1', 'HSP90AA1', 'PTPRJ', 'PRKAG2', 'CSN1S1',
       'GTF2F1', 'VASP', 'KRAS', 'SLC12A3', 'PRPF4B', 'CDKN2A', 'IL1R1',
       'PRKAR2B', 'IL15RA', 'KRT1', 'SMAD1', 'GTF2H1', 'CCR2', 'IL5RA',
       'PLCG2', 'SLC12A2', 'GTF2H2', 'TMIGD2', 'IKBKG', 'BORA', 'MNAT1',
       'PHKA1', 'BGLF4', 'RIN1', 'DLG1', 'CDK5RAP2', 'CCR1', 'IL10RA',
       'SMAD5', 'CCN4'], dtype=object)
df['kinase_uniprot'] = df.kinase_gene.str.upper().map(gene2uniprot)
df = df.dropna(subset='kinase_uniprot')

Substrate mapping

df = df.dropna(subset='IDB')

We can’t trace the specific whole protein sequence based on SIGNOR ID, so we filter them out

df = df[~df.IDB.str.contains('SIGNOR')]
# df.IDB.drop_duplicates().to_csv('raw/sig_substrate_id.csv')

2,298 IDs were mapped to 2,298 results

1 ID was not mapped: CHEBI:15721

Sequences are all from human

df = map_substrate('raw/idmapping_2025_03_12_signor.xlsx',df,'IDB')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (12354, 29)
Species counts: substrate_species
Homo sapiens (Human)    12353
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (12353, 32)
df.shape
(12353, 32)

Validate site

# RESIDUE may list several ';'-separated residues; give each its own row.
df.RESIDUE = df.RESIDUE.str.split(';')
df = df.explode('RESIDUE')
# The first three characters are the three-letter residue code; only
# Ser/Thr/Tyr are translated, anything else becomes NaN.
df['acceptor'] = df['RESIDUE'].str[:3]
df.acceptor = df.acceptor.map({'Ser':'S','Thr':'T','Tyr':'Y'})
# The remainder of the string is kept as the position (still a string).
df['position'] = df['RESIDUE'].str[3:]
# Build the site label from the one-letter code and the position, e.g. 'S366'.
df['site']=df['acceptor']+df['position']
# Rows with a missing or untranslated residue have a NaN site and are dropped.
df = df.dropna(subset='site')
df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1    11212
0      118
Name: count, dtype: int64
df = df[df.site_match==1]
df.shape
(11212, 36)

Save

df.columns
Index(['ENTITYA', 'TYPEA', 'IDA', 'DATABASEA', 'ENTITYB', 'TYPEB', 'DATABASEB',
       'EFFECT', 'MECHANISM', 'RESIDUE', 'SEQUENCE', 'TAX_ID', 'CELL_DATA',
       'TISSUE_DATA', 'MODULATOR_COMPLEX', 'TARGET_COMPLEX', 'MODIFICATIONA',
       'MODASEQ', 'MODIFICATIONB', 'MODBSEQ', 'PMID', 'DIRECT', 'NOTES',
       'ANNOTATOR', 'SENTENCE', 'SIGNOR_ID', 'kinase_gene', 'kinase_uniprot',
       'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
       'substrate_species', 'acceptor', 'position', 'site', 'site_match'],
      dtype='object')
col = ['kinase_uniprot', 'kinase_gene', 'ENTITYA', 'TYPEA', 

       'substrate_uniprot', 'substrate_genes','site', 'substrate_sequence']
df = df[col].copy()
df.head()
kinase_uniprot kinase_gene ENTITYA TYPEA substrate_uniprot substrate_genes site substrate_sequence
0 P68400 CSNK2A1 CSNK2A1 protein P05455 SSB S366 MAENGDNEKMAALEAKICHQIEYYFGDFNLPRDKFLKEQIKLDEGW...
1 P28482 MAPK1 MAPK1 protein P43354 NR4A2 NOT NURR1 TINUR S126 MPCVQAQYGSSPQGASPASQSYSYHSSGEYSSDFLTPEFVKFSMDL...
2 P11362 FGFR1 FGFR1 protein P56945 BCAR1 CAS CASS1 CRKAS Y128 MNHLNVLAKALYDNVAESPDELSFRKGDIMTVLEQDTQGLDGWWLC...
3 P27361 MAPK3 MAPK3 protein P41182 BCL6 BCL5 LAZ3 ZBTB27 ZNF51 S343 MASPADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAH...
4 Q16539 MAPK14 MAPK14 protein Q02078 MEF2A MEF2 S408 MGRKKIQITRIMDERNRQVTFTKRKFGLMKKAYELSVLCDCEIALI...
df.shape
(11212, 8)
df['source']="SIGNOR"
# df.to_excel('raw/signor.xlsx',index=False)

GPS 6.0

Data

paper: https://academic.oup.com/nar/article/51/W1/W243/7157529#409532969

web: https://gps.biocuckoo.cn/index.php

Go to paper link, download supplementary data - zip file, get Table S5

import pandas as pd
gps = pd.read_csv('raw/GPS6_tableS5.csv')
gps = gps[gps.source!="PhosphositePlus"]
gps = gps[gps.species=="Homo sapiens"]
gps.shape
(6087, 6)
gps = gps[~gps.gene.str.contains('family')]

Kinase mapping

kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')
kinase_id = kinase_id.explode('Gene Names')

Some kinases share the same gene name:

kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()
Entry Entry Name Protein names Gene Names uniprot_keyword_kinase on_tree Organism Keywords Sequence
586 Q9UIJ7 KAD3_HUMAN GTP:AMP phosphotransferase AK3, mitochondrial ... AK3 1 0 Homo sapiens (Human) 3D-structure;Acetylation;Alternative splicing;... MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM...
152 P27144 KAD4_HUMAN Adenylate kinase 4, mitochondrial (EC 2.7.4.4)... AK3 1 0 Homo sapiens (Human) 3D-structure;Acetylation;ATP-binding;GTP-bindi... MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA...
586 Q9UIJ7 KAD3_HUMAN GTP:AMP phosphotransferase AK3, mitochondrial ... AK3L1 1 0 Homo sapiens (Human) 3D-structure;Acetylation;Alternative splicing;... MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM...
152 P27144 KAD4_HUMAN Adenylate kinase 4, mitochondrial (EC 2.7.4.4)... AK3L1 1 0 Homo sapiens (Human) 3D-structure;Acetylation;ATP-binding;GTP-bindi... MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA...
609 Q9Y3D8 KAD6_HUMAN Adenylate kinase isoenzyme 6 (AK6) (EC 2.7.4.3... AK6 1 0 Homo sapiens (Human) 3D-structure;Alternative splicing;ATP-binding;... MLLPNILLTGTPGVGKTTLGKELASKSGLKYINVGDLAREEQLYDG...

We’ll drop them to prevent confusion.

dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])
gps.gene.isin(dup_name).sum()
np.int64(76)
gps.shape
(4260, 6)
# Discard GPS rows whose gene symbol is shared by more than one UniProt kinase entry.
# NOTE(review): membership is tested on the raw gene text here, whereas the lookup
# further down upper-cases the gene first — an ambiguous symbol written in a
# different case would not be caught by this filter; confirm this is acceptable.
gps = gps[~gps.gene.isin(dup_name)].copy()
# Lookup table: gene symbol -> UniProt accession, built from the exploded kinase table.
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()
gps.shape
(4184, 6)

unmapped:

gps[gps.gene.str.upper().map(gene2uniprot).isna()].gene.unique()
array(['DMPK1', 'MRCKa', 'YPKA', 'PKACb', 'PKCt', 'PKCh', 'PKCi', 'PKCz',
       'PKG1', 'PKG2', 'PRKG1 Isoform Alpha', 'p70S6K', 'p70S6Kb',
       'CCDPK', 'AMPKa1', 'PKD3', 'CK1d', 'CK1e', 'CK1g1', 'CSNK2B',
       'p38b', 'p38d', 'p38g', 'AurB', 'AurC', 'TP53RK ', 'STLK3',
       'PAK3 Isoform 2', 'TAO1', 'ACK'], dtype=object)
# Map upper-cased GPS gene symbols to UniProt accessions with an exact dictionary lookup.
# NOTE(review): because the match is exact, aliases such as 'p38b' or 'PKCz' and
# symbols with stray whitespace such as 'TP53RK ' (see the unmapped list above)
# come back as NaN and are removed by the dropna below. Stripping whitespace or
# adding an alias table would recover them, but would change the row counts
# recorded in this notebook.
gps['kinase_uniprot'] = gps.gene.str.upper().map(gene2uniprot)
gps = gps.dropna(subset='kinase_uniprot')

Substrate mapping

# gps.uniprot.drop_duplicates().to_csv('raw/GPS_substrate_id.csv')

1,203 IDs were mapped to 1,208 results

1,202 active entries and 1 obsolete entry are found

gps.shape
(3750, 7)
gps = map_substrate('raw/idmapping_2025_03_12_GPS.xlsx',gps,'uniprot')
Shape before processing (3750, 7)
Species counts: substrate_species
Homo sapiens (Human)                                                                 3614
Rattus norvegicus (Rat)                                                                30
Mus musculus (Mouse)                                                                   19
Pongo abelii (Sumatran orangutan) (Pongo pygmaeus abelii)                              10
Pan troglodytes (Chimpanzee)                                                            6
Sus scrofa (Pig)                                                                        6
Bos taurus (Bovine)                                                                     5
Oryctolagus cuniculus (Rabbit)                                                          4
Macaca fascicularis (Crab-eating macaque) (Cynomolgus monkey)                           4
Mesocricetus auratus (Golden hamster)                                                   4
Tupaia belangeri (Common tree shrew) (Tupaia glis belangeri)                            2
Canis lupus familiaris (Dog) (Canis familiaris)                                         2
Phodopus roborovskii (Roborovski's desert hamster) (Cricetulus roborovskii)             2
Macaca mulatta (Rhesus macaque)                                                         2
Xenopus tropicalis (Western clawed frog) (Silurana tropicalis)                          2
Hylobates lar (Lar gibbon) (White-handed gibbon)                                        1
Cricetulus griseus (Chinese hamster) (Cricetulus barabensis griseus)                    1
Xenopus laevis (African clawed frog)                                                    1
Galeopterus variegatus (Malayan flying lemur) (Cynocephalus variegatus)                 1
Nannospalax galili (Northern Israeli blind subterranean mole rat) (Spalax galili)       1
Spermophilus citellus (European suslik) (Citellus citellus)                             1
Marmota monax (Woodchuck)                                                               1
Pan paniscus (Pygmy chimpanzee) (Bonobo)                                                1
Danio rerio (Zebrafish) (Brachydanio rerio)                                             1
Hepatitis delta virus genotype II (isolate 7/18/83) (HDV)                               1
Human T-cell leukemia virus 1 (strain Japan ATK-1 subtype A) (HTLV-1)                   1
Macaca fuscata fuscata (Japanese macaque)                                               1
Human T-cell leukemia virus 1 (isolate Caribbea HS-35 subtype A) (HTLV-1)               1
Human T-cell leukemia virus 1 (isolate Melanesia mel5 subtype C) (HTLV-1)               1
Ovis aries (Sheep)                                                                      1
Human immunodeficiency virus type 1 group M subtype B (isolate NY5) (HIV-1)             1
Human immunodeficiency virus type 2 subtype A (isolate BEN) (HIV-2)                     1
Human immunodeficiency virus type 1 group M subtype B (isolate YU-2) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype B (isolate WMJ22) (HIV-1)           1
Cavia porcellus (Guinea pig)                                                            1
Human immunodeficiency virus type 1 group M subtype A (isolate MAL) (HIV-1)             1
Human immunodeficiency virus type 1 group M subtype F2 (isolate MP257) (HIV-1)          1
Human immunodeficiency virus type 1 group M subtype B (isolate LW123) (HIV-1)           1
Human immunodeficiency virus type 1 group M subtype B (isolate JRCSF) (HIV-1)           1
Human immunodeficiency virus type 1 group M subtype B (isolate JH32) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype B (isolate HXB2) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype C (isolate ETH2220) (HIV-1)         1
Human immunodeficiency virus type 1 group M subtype D (isolate ELI) (HIV-1)             1
Human immunodeficiency virus type 1 group M subtype B (isolate CDC-451) (HIV-1)         1
Human immunodeficiency virus type 1 group O (isolate ANT70) (HIV-1)                     1
Human immunodeficiency virus type 1 group M subtype A (isolate U455) (HIV-1)            1
Human immunodeficiency virus type 2 subtype A (isolate KR) (HIV-2)                      1
Phodopus campbelli (Campbell's dwarf Russian hamster)                                   1
Human immunodeficiency virus type 2 subtype A (isolate ROD) (HIV-2)                     1
Human immunodeficiency virus type 1 group O (isolate MVP5180) (HIV-1)                   1
Felis catus (Cat) (Felis silvestris catus)                                              1
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (3614, 10)
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
gps.shape
(3614, 10)

Validate site

gps['site']=gps['code'] + gps['position'].astype(int).astype(str)
gps['site_match'] =validate_site_df(gps,'site','substrate_sequence')
gps['site_match'].value_counts()
site_match
1    3435
0     179
Name: count, dtype: int64
gps=gps[gps.site_match==1]
gps.shape
(3435, 12)

Save

gps.columns
Index(['position', 'code', 'gene', 'species', 'source', 'kinase_uniprot',
       'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
       'substrate_species', 'site', 'site_match'],
      dtype='object')
gps = gps.rename(columns={'source':'GPS_source'})
gps['source']='GPS6'
col = ['kinase_uniprot','gene', 

       'substrate_uniprot', 'substrate_genes', 'site','substrate_sequence', 'GPS_source','source']
gps = gps[col].copy()
gps.head()
kinase_uniprot gene substrate_uniprot substrate_genes site substrate_sequence GPS_source source
0 P31749 AKT1 Q9Y261 FOXA2 HNF3B TCF3B T156 MLGAVKMEGHEPSDWSSYYAEPEGYSSVSNMNAGLGMNGMNTYMSM... 14500912 GPS6
1 P31749 AKT1 P49760 CLK2 T127 MPHPRRYHSSERGSRGSYREHYRSRKHKRRRSRSWSSSSDRTRRRR... UniProt GPS6
2 P31749 AKT1 P49815 TSC2 TSC4 T1462 MAKPTSKDSGLKEKFKILLGLGTPRPNPRSAEGKQTEFIITAEILR... 12150915;15342917;12172553;UniProt GPS6
3 P31749 AKT1 P46527 CDKN1B KIP1 p27 T187 MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... 12042314;9192873;9399644;9311993;9388487;17254... GPS6
4 P31749 AKT1 O15111 CHUK IKKA TCF16 T23 MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC... 10485710;UniProt GPS6
# gps.to_excel('raw/GPS6.xlsx',index=False)

Other datasets

Douglass

Paper: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012

The data cannot be downloaded directly, but it is available from the authors upon request.

RegPhos 2.0

Paper: https://academic.oup.com/database/article/doi/10.1093/database/bau034/2634150

Go to the paper link, scroll down to Supplementary data, and download the zip file; Table S4 in the docx file contains the motifs.

Phosida

http://www.phosida.com/ is not accessible

Phosida paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC2258193/

PhosphoNetwork

2013 bioinformatics: https://pmc.ncbi.nlm.nih.gov/articles/PMC3866559/#btt627-T1

2013 Mol Syst Biol.: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/

web app: https://www.phosphonetworks.org/

The web app provides kinase-substrate pairs, but does not provide the specific phosphorylation site.

It does, however, provide logos and PSSMs under Download –> Motif Matrix, which can be used for comparison with the PSSM results.

BioGRID

Go to : https://downloads.thebiogrid.org/BioGRID, Current-Release–>BIOGRID-PTMS-4.4.242.ptm.zip

Extraction yields two files; however, neither contains kinase-substrate relationships.

Combine all

import pandas as pd
from pathlib import Path
paths = [
    'raw/GPS6.xlsx',
    'raw/signor.xlsx',
    'raw/ELM.xlsx',
    'raw/iPTMNet.xlsx','raw/EPSD.xlsx',# 'raw/KiNet.xlsx',
    'raw/psp_human.xlsx',
    'raw/sugiyama.xlsx'
]
dfs = [pd.read_excel(path) for path in paths]
[print(df.source.value_counts()) for df in dfs]
source
GPS6    3435
Name: count, dtype: int64
source
SIGNOR    11212
Name: count, dtype: int64
source
ELM    4519
Name: count, dtype: int64
source
iPTMNet    3478
Name: count, dtype: int64
source
EPSD    10308
Name: count, dtype: int64
source
PSP    13144
Name: count, dtype: int64
source
Sugiyama    184948
Name: count, dtype: int64
[None, None, None, None, None, None, None]

For each df, we need to drop duplicates of kinase(uniprot)-substrate(uniprot)-site

def get_key(df):
    "Add a `kin_sub_site` key (kinase_substrate_site) and keep the first row per key; the input is not modified"
    print('original shape:',df.shape)
    # Key is the three identifiers joined with underscores.
    key = df['kinase_uniprot'] + '_' + df['substrate_uniprot'] + '_' + df['site']
    # assign() returns a new frame, so the caller's frame is left untouched.
    deduped = df.assign(kin_sub_site=key).drop_duplicates(subset='kin_sub_site')
    print('after removing duplicates',deduped.shape)
    return deduped
dfs=[get_key(df) for df in dfs]
original shape: (3435, 8)
after removing duplicates (3326, 9)
original shape: (11212, 9)
after removing duplicates (9320, 10)
original shape: (4519, 10)
after removing duplicates (3807, 11)
original shape: (3478, 9)
after removing duplicates (3478, 10)
original shape: (10308, 9)
after removing duplicates (10308, 10)
original shape: (13144, 12)
after removing duplicates (13091, 13)
original shape: (184948, 11)
after removing duplicates (168342, 12)
for path,df in zip(paths,dfs):
    print(Path(path).stem, df.shape)
GPS6 (3326, 9)
signor (9320, 10)
ELM (3807, 11)
iPTMNet (3478, 10)
EPSD (10308, 10)
psp_human (13091, 13)
sugiyama (168342, 12)
common_cols = ['kinase_uniprot','substrate_uniprot','site','kin_sub_site','source','substrate_genes','substrate_sequence']
df_all = pd.concat(dfs,ignore_index=True)
df_all = df_all[common_cols].copy()
df_all.source.value_counts()
source
Sugiyama    168342
PSP          13091
EPSD         10308
SIGNOR        9320
ELM           3807
iPTMNet       3478
GPS6          3326
Name: count, dtype: int64
# df_all.to_parquet('raw/combine_source.parquet')
# Collapse rows that share a kinase-substrate-site key into a single row:
# identifier and substrate columns take the group's first (non-null) value,
# while the source labels of all contributing rows are joined with '|'.
agg_spec = {
    "kinase_uniprot": "first",
    "substrate_uniprot": "first",
    "site": "first",
    "source": '|'.join,
    "substrate_genes": "first",
    "substrate_sequence": "first",
}
per_key = df_all.groupby("kin_sub_site")
df_grouped = per_key.agg(agg_spec).reset_index()
df_grouped.shape
(187066, 7)
df_grouped.head()
kin_sub_site kinase_uniprot substrate_uniprot site source substrate_genes substrate_sequence
0 O00141_A4FU28_S140 O00141 A4FU28 S140 Sugiyama CTAGE9 MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC...
1 O00141_O00141_S252 O00141 O00141 S252 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
2 O00141_O00141_S255 O00141 O00141 S255 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
3 O00141_O00141_S397 O00141 O00141 S397 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
4 O00141_O00141_S404 O00141 O00141 S404 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Human phosphoproteome

from katlas.core import *
import pandas as pd

Data

human = Data.get_combine_site_psp_ochoa()
human.shape
(121419, 8)

Substrate mapping

# human.uniprot.drop_duplicates().to_csv('raw/human_phosphoproteome_uniprot.csv')

11,243 IDs were mapped to 11,241 results

5 IDs were not mapped: AAA58698, P18433-2, AAC50053, AAA60149, NP_001184222

11,242 active entries and 1 obsolete entry are found

human = map_substrate('raw/idmapping_2025_03_20_human_phosphoproteome.xlsx',human,'uniprot')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (121419, 8)
Species counts: substrate_species
Homo sapiens (Human)    121332
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (121332, 11)

Validate site

human.columns
Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
       'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species'],
      dtype='object')
human['site_match'] =validate_site_df(human,'site','substrate_sequence')
human['site_match'].value_counts()
site_match
1    120084
0      1248
Name: count, dtype: int64
human=human[human.site_match==1].copy()
human.shape
(120084, 12)

Remove duplicates

human['sub_site'] = human['substrate_uniprot']+'_'+human['site']
human = human.drop_duplicates(subset='sub_site')
human.shape
(119955, 13)

Save

human.columns
Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
       'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species', 'site_match', 'sub_site'],
      dtype='object')
cols = ['substrate_uniprot', 'substrate_genes',

        'site', 'source', 'AM_pathogenicity', 

       'substrate_sequence', 'substrate_species', 'sub_site']
human = human[cols]
# human.to_parquet('raw/human_phosphoproteome.parquet')

Phosphorylate sequence

Combine human phosphoproteome and KS dataset site info

# Only these three columns are needed to build the phosphorylated sequences.
cols = ['substrate_uniprot','site','substrate_sequence']
human = pd.read_parquet('raw/human_phosphoproteome.parquet')[cols]
df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')[cols]

# Stack both tables and keep one row per substrate/site pair; phosphoproteome
# rows come first in the concat, so they are the ones kept on a duplicate key.
stacked = pd.concat([human,df_grouped])
comb = stacked.assign(
    sub_site=stacked['substrate_uniprot'] + '_' + stacked['site']
).drop_duplicates('sub_site')

Phosphorylate sequence

seq = phosphorylate_seq_df(comb)
seq.head()
substrate_uniprot site substrate_sequence substrate_phosphoseq
0 A0A024R4G9 [S20] MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...
1 A0A075B6Q4 [S24, S35, S57, S68, S71, S72] MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...
2 A0A075B6T3 [S24, S26] XLKRAYRGLEEVQWCLEQLLTSPSPS XLKRAYRGLEEVQWCLEQLLTSPsPs
3 A0A075B759 [T68] MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
4 A0A087WTJ2 [T8, Y14, Y213, T215, S221, S421, S424] MGGRKMATDEENVYGLEENAQSRQESTRRLILVGRTGAGKSATGNS... MGGRKMAtDEENVyGLEENAQSRQESTRRLILVGRTGAGKSATGNS...
# seq.to_csv('raw/phosphoseq_map.csv',index=False)

Map to df

human = pd.read_parquet('raw/human_phosphoproteome.parquet')

df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')
seq_map = seq.set_index('substrate_uniprot')['substrate_phosphoseq']
human['substrate_phosphoseq'] = human.substrate_uniprot.map(seq_map)

df_grouped['substrate_phosphoseq'] = df_grouped.substrate_uniprot.map(seq_map)
human['substrate_phosphoseq'].isna().sum()
np.int64(0)
df_grouped['substrate_phosphoseq'].isna().sum()
np.int64(0)
# human.to_parquet('raw/human_phosphoproteome.parquet')

# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Extract site sequence

human['position'] = human['site'].str[1:].astype(int)

df_grouped['position']=df_grouped['site'].str[1:].astype(int)
human['site_seq'] = extract_site_seq(human,
                                  seq_col='substrate_phosphoseq',
                                  position_col='position',
                                  length=20)
100%|██████████| 119955/119955 [00:05<00:00, 22986.11it/s]
df_grouped['site_seq'] = extract_site_seq(df_grouped,
                                  seq_col='substrate_phosphoseq',
                                  position_col='position',
                                  length=20)
100%|██████████| 187066/187066 [00:07<00:00, 23764.80it/s]
df_grouped['sub_site'] = df_grouped['substrate_uniprot']+'_'+ df_grouped['site']

human['sub_site'] = human['substrate_uniprot']+'_'+ human['site']
# human.to_parquet('raw/human_phosphoproteome.parquet')

# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Add kinase info

# Annotate a copy so df_grouped itself stays unchanged.
df = df_grouped.copy()

# Kinase annotation table reduced to one row per UniProt accession
# (removes pseudokinase duplicates; after sorting by `kinase`, the first row wins).
info = Data.get_kinase_info().sort_values('kinase').drop_duplicates('uniprot')
info_indexed = info.set_index('uniprot')

# Lookup key: the accession text before the first '-', i.e. without isoform suffix.
base_accession = df['kinase_uniprot'].str.split('-').str[0]

# 1 when the accession is present in the annotation table, otherwise 0.
df['kinase_on_tree'] = base_accession.isin(info['uniprot']).astype(int)

# Gene names are taken from the UniProt kinase listing.
kinase_gene_map = Data.get_kinase_uniprot().set_index('Entry')['Gene Names']
df['kinase_genes'] = base_accession.map(kinase_gene_map)

# The remaining annotations are plain lookups into the indexed table;
# dict order fixes the order in which the new columns are appended.
annotation_columns = {
    'kinase_group': 'group',
    'kinase_family': 'family',
    'kinase_pspa_big': 'pspa_category_big',
    'kinase_pspa_small': 'pspa_category_small',
}
for target_col, info_col in annotation_columns.items():
    df[target_col] = base_accession.map(info_indexed[info_col])

The code above has been added to the Data class and is applied when loading the KS dataset.