Preprocess kinase-substrate datasets

Motif

Poll [Sugiyama & Douglass]: https://biosignaling.biomedcentral.com/articles/10.1186/s12964-023-01436-2

Poll web motif: https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/KinaseTree.html; https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/GeneSymbol.html

Sugiyama: https://pmc.ncbi.nlm.nih.gov/articles/PMC6642169/

Douglass: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012

RegPhos: Supplementary Table S4: motifs grouped by kinase family

PhosphoNetwork, logos of 300 motifs: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/; web app: https://phosphonetworks.org/

GPS: https://gps.biocuckoo.cn/links.php#l1

Review of database

Wilson, AACR cancer research: https://aacrjournals.org/cancerres/article/78/1/15/625062/New-Perspectives-Opportunities-and-Challenges-in

GPS 6.0, List of database: https://gps.biocuckoo.cn/links.php

paper: https://academic.oup.com/nar/article/51/W1/W243/7157529

phospho.ELM links: http://phospho.elm.eu.org/links.html

GPS, https://gps.biocuckoo.cn/links.php#l1

Set up

from katlas.core import *
import pandas as pd

def map_substrate(idmapping_fname, ori_df, sub_col, remove_nonhuman=True):
    """Attach UniProt entry, gene names, sequence and organism to substrate rows.

    Parameters
    ----------
    idmapping_fname : str, path-like or pandas.DataFrame
        UniProt ID-mapping result. Either an Excel file readable by
        ``pd.read_excel`` or an already-loaded DataFrame. It must contain the
        columns 'From', 'Entry', 'Gene Names', 'Sequence' and 'Organism'.
        When several rows share the same 'From' value only the first is used.
    ori_df : pandas.DataFrame
        Kinase-substrate table to annotate. It is copied, never modified.
    sub_col : str
        Column of ``ori_df`` holding the IDs to match against 'From'.
    remove_nonhuman : bool, default True
        Keep only rows whose organism is exactly 'Homo sapiens (Human)'.

    Returns
    -------
    pandas.DataFrame
        Rows of ``ori_df`` that found a mapping (inner join), without
        ``sub_col`` and with the added columns 'substrate_uniprot',
        'substrate_genes', 'substrate_sequence' and 'substrate_species'.
        Rows lacking a sequence are dropped and the index is reset.

    Raises
    ------
    KeyError
        If the mapping table lacks a required column or ``sub_col`` is not a
        column of ``ori_df``.
    """
    mapping_cols = ['From', 'Entry', 'Gene Names', 'Sequence', 'Organism']

    # Accept a pre-loaded table so callers can reuse a mapping without re-reading it.
    if isinstance(idmapping_fname, pd.DataFrame):
        substrate_id = idmapping_fname
    else:
        substrate_id = pd.read_excel(idmapping_fname)

    missing = [c for c in mapping_cols if c not in substrate_id.columns]
    if missing:
        raise KeyError(f"ID-mapping table is missing column(s): {missing}")
    if sub_col not in ori_df.columns:
        raise KeyError(f"{sub_col!r} is not a column of ori_df")

    # One mapping per input ID: the first hit wins when an ID has several results.
    substrate_id = substrate_id.drop_duplicates('From')[mapping_cols]
    ori_df = ori_df.copy()

    # prevent name conflict: the output column is also called 'substrate_uniprot'
    if sub_col == 'substrate_uniprot':
        sub_col = 'substrate_uniprot_tmp'
        ori_df = ori_df.rename(columns={'substrate_uniprot': sub_col})

    new_names = [sub_col, 'substrate_uniprot', 'substrate_genes',
                 'substrate_sequence', 'substrate_species']
    substrate_id = substrate_id.rename(columns=dict(zip(mapping_cols, new_names)))

    print('Shape before processing', ori_df.shape)

    # Inner join: rows whose ID has no mapping are dropped here.
    ori_df = ori_df.merge(substrate_id, on=sub_col)

    print('Species counts:',ori_df.substrate_species.value_counts())

    if remove_nonhuman:
        print('Removing non-human substrates')
        ori_df = ori_df[ori_df.substrate_species=='Homo sapiens (Human)']

    ori_df = ori_df.drop(columns=[sub_col])  # Drop temp column after merging
    ori_df = ori_df.dropna(subset=['substrate_sequence'])
    print('Shape after processing', ori_df.shape)

    return ori_df.reset_index(drop=True)

kinase_uniprot=pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')

Sugiyama dataset

Data

Go to https://www.nature.com/articles/s41598-019-46385-4#Sec21, download table S2.

Modify the header (2 lines to 1 line) so that it can be read by pandas.

df = pd.read_csv('raw/Large_scale_S2.csv').iloc[:,:-2]

df.head()

	Type	Kinase	Substrate_uniprot	Position
0	TK	ABL1	1433B_HUMAN	S212
1	TK	ABL1	1433B_HUMAN	Y151
2	TK	ABL1	1433B_HUMAN	Y21
3	TK	ABL1	1433B_HUMAN	Y50
4	TK	ABL1	1433E_HUMAN	Y152

df.shape

(198536, 4)

Kinase mapping

# pd.DataFrame(df.Kinase.unique())[0].str.split('/').str[0].to_csv('raw/sugiyama_kinase_name.csv')

Map each kinase name with uniprot ID

kinase_id = pd.read_csv('raw/LS_info2.csv').iloc[:,:3]

df = df.merge(kinase_id)

df

	Type	Kinase	Substrate_uniprot	Position	kinase_uniprot	kinase_paper
0	TK	ABL1	1433B_HUMAN	S212	P00519	ABL1
1	TK	ABL1	1433B_HUMAN	Y151	P00519	ABL1
2	TK	ABL1	1433B_HUMAN	Y21	P00519	ABL1
3	TK	ABL1	1433B_HUMAN	Y50	P00519	ABL1
4	TK	ABL1	1433E_HUMAN	Y152	P00519	ABL1
...	...	...	...	...	...	...
198531	LK	SPHK2	TICN3_HUMAN	T118	Q9NRA0	NaN
198532	LK	SPHK2	TPM4_HUMAN	T241	Q9NRA0	NaN
198533	LK	SPHK2	ULK3_HUMAN	S305	Q9NRA0	NaN
198534	LK	SPHK2	ZRAB2_HUMAN	S165	Q9NRA0	NaN
198535	LK	SPHK2	ZRAB2_HUMAN	S181	Q9NRA0	NaN

198536 rows × 6 columns

Substrate mapping

# pd.DataFrame(df.Substrate_uniprot.unique()).to_csv('raw/sugiyama_uniprot_id.csv')

ID mapping of Substrate_uniprot to uniprot ID in https://www.uniprot.org/id-mapping

3,753 IDs were mapped to 3,753 results

375 IDs were not mapped

df = map_substrate('raw/idmapping_2025_03_02.xlsx',df,'Substrate_uniprot')

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Shape before processing (198536, 6)
Species counts: substrate_species
Homo sapiens (Human)    187129
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (187129, 9)

# lipid kinase
df[df.Type=='LK'].Kinase.unique()

array(['PIK3C3', 'PIK3CB/PIK3R1', 'PIK3CD/PIK3R1', 'PIK3CG/PIK3R5',
       'PIK3CG', 'PIK4CA', 'PIP4K2B', 'PIP5K1C', 'SPHK1', 'SPHK2'],
      dtype=object)

df.isna().sum()

Type                      0
Kinase                    0
Position                  0
kinase_uniprot        16591
kinase_paper          18891
substrate_uniprot         0
substrate_genes          11
substrate_sequence        0
substrate_species         0
dtype: int64

df.head()

	Type	Kinase	Position	kinase_uniprot	kinase_paper	substrate_uniprot	substrate_genes	substrate_sequence	substrate_species
0	TK	ABL1	S212	P00519	ABL1	P31946	YWHAB	MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...	Homo sapiens (Human)
1	TK	ABL1	Y151	P00519	ABL1	P31946	YWHAB	MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...	Homo sapiens (Human)
2	TK	ABL1	Y21	P00519	ABL1	P31946	YWHAB	MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...	Homo sapiens (Human)
3	TK	ABL1	Y50	P00519	ABL1	P31946	YWHAB	MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...	Homo sapiens (Human)
4	TK	ABL1	Y152	P00519	ABL1	P62258	YWHAE	MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...	Homo sapiens (Human)

df = df.rename(columns = {'Position':'site'})

df.shape

(187129, 9)

Validate site

df['site_match'] =validate_site_df(df,'site','substrate_sequence')

df.site_match.value_counts()

site_match
1    184948
0      2181
Name: count, dtype: int64

# Keep only rows whose annotated residue matches the sequence. Take a copy so
# that the later `df['source'] = ...` assignment does not write into a slice
# of the unfiltered frame (SettingWithCopyWarning).
df = df[df.site_match == 1].copy()

df.shape

(184948, 10)

Save

df['source']='Sugiyama'

# df.to_excel('raw/sugiyama.xlsx',index=False)

PhosphoSitePlus (PSP)

Data

PSP paper: https://academic.oup.com/nar/article/43/D1/D512/2439467

Go to https://www.phosphosite.org/staticDownloads, download Kinase_Substrate_Dataset.txt

import pandas as pd

psp = pd.read_csv('raw/Kinase_Substrate_Dataset_final.csv')

psp.head()

	GENE	Kinase	KIN_ACC_ID	KIN_ORGANISM	SUBSTRATE	SUB_GENE_ID	SUB_ACC_ID	SUB_GENE	SUB_ORGANISM	SUB_MOD_RSD	SITE_GRP_ID	substrate	DOMAIN	IN_VIVO_RXN	IN_VITRO_RXN	CST_CAT#	kinase_uniprot	kinase_paper
0	Dyrk2	DYRK2	Q5U4C9	mouse	NDEL1	83431.0	Q9ERR1	Ndel1	mouse	S336	1869686801	LGSsRPSsAPGMLPL	NaN		X	NaN	Q92630	DYRK2
1	DYRK2	DYRK2	Q92630	human	GLI2	14633.0	Q0VGT2	Gli2	mouse	S385	3339001	AEGLRPAsPLGLTQE	NaN		X	NaN	Q92630	DYRK2
2	DYRK2	DYRK2	Q92630	human	SIAH2	6478.0	O43255	SIAH2	human	S68	5393502	GGGAGPVsPQHHELT	NaN	X		NaN	Q92630	DYRK2
3	DYRK2	DYRK2	Q92630	human	CARHSP1	23589.0	Q9Y2V2	CARHSP1	human	S41	455702	LRGNVVPsPLPtRRt	NaN		X	NaN	Q92630	DYRK2
4	DYRK2	DYRK2	Q92630	human	Doublecortin iso2	1641.0	O43602-2	DCX	human	S306	454122	GPMRRSKsPADSANG	NaN	X		NaN	Q92630	DYRK2

psp =psp[psp.KIN_ORGANISM=='human'].reset_index(drop=True) 

psp =psp[psp.SUB_ORGANISM=='human'].reset_index(drop=True)

psp.shape

(14081, 18)

psp = psp[['KIN_ACC_ID','kinase_paper','GENE','SUB_ACC_ID','SUB_GENE','SUB_MOD_RSD','substrate']]

psp.head()

	KIN_ACC_ID	kinase_paper	GENE	SUB_ACC_ID	SUB_GENE	SUB_MOD_RSD	substrate
0	Q92630	DYRK2	DYRK2	O43255	SIAH2	S68	GGGAGPVsPQHHELT
1	Q92630	DYRK2	DYRK2	Q9Y2V2	CARHSP1	S41	LRGNVVPsPLPtRRt
2	Q92630	DYRK2	DYRK2	O43602-2	DCX	S306	GPMRRSKsPADSANG
3	Q92630	DYRK2	DYRK2	P30304	CDC25A	S283	PErsQEEsPPGSTKr
4	Q92630	DYRK2	DYRK2	O43255	SIAH2	T119	PTCRGALtPSIRNLA

Substrate mapping

# psp.SUB_ACC_ID.drop_duplicates().to_csv('raw/psp_substrate_id.csv')

4,441 IDs were mapped to 4,446 results

19 IDs were not mapped: NP_001184222 NP_001100737 AAA40678 HSBO22 NP_776683 NP_579829 P18433-2 NP_001076191 NP_001099740 NP_001005762 NP_001178533 XP_008773743 NP_001104263 ABR15760 AAB24204 AAB24205 BAA34185.2 NP_001103022 AAC50053

IDs with multiple results: P62991, P62988

psp = map_substrate('raw/idmapping_2025_03_02_psp.xlsx',psp,'SUB_ACC_ID')

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Shape before processing (14081, 7)
Species counts: substrate_species
Homo sapiens (Human)    14069
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14069, 10)

psp.head()

	KIN_ACC_ID	kinase_paper	GENE	SUB_GENE	SUB_MOD_RSD	substrate	substrate_uniprot	substrate_genes	substrate_sequence	substrate_species
0	Q92630	DYRK2	DYRK2	SIAH2	S68	GGGAGPVsPQHHELT	O43255	SIAH2	MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS...	Homo sapiens (Human)
1	Q92630	DYRK2	DYRK2	CARHSP1	S41	LRGNVVPsPLPtRRt	Q9Y2V2	CARHSP1	MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR...	Homo sapiens (Human)
2	Q92630	DYRK2	DYRK2	DCX	S306	GPMRRSKsPADSANG	O43602	DCX DBCN LISX	MELDFGHFDERDKTSRNMRGSRMNGLPSPTHSAHCSFYRTRTLQAL...	Homo sapiens (Human)
3	Q92630	DYRK2	DYRK2	CDC25A	S283	PErsQEEsPPGSTKr	P30304	CDC25A	MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT...	Homo sapiens (Human)
4	Q92630	DYRK2	DYRK2	SIAH2	T119	PTCRGALtPSIRNLA	O43255	SIAH2	MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS...	Homo sapiens (Human)

psp.shape

(14069, 10)

Kinase mapping

# psp.KIN_ACC_ID.drop_duplicates().to_csv('raw/psp_kin_id.csv')

Map KIN_ACC_ID to UniProt ID

440 IDs were mapped to 440 results

1 ID was not mapped:

AAA58698

kinase_id = pd.read_excel('raw/idmapping_2025_03_02_psp_kinase.xlsx')[['From','Entry','Gene Names']]

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

kinase_id.columns = ['KIN_ACC_ID','kinase_uniprot','kinase_genes']

kinase_id[kinase_id.KIN_ACC_ID=="A9UF07"]

	KIN_ACC_ID	kinase_uniprot	kinase_genes
124	A9UF07	A9UF07	BCR/ABL fusion

psp['KIN_ACC_ID'] = psp['KIN_ACC_ID'].replace('A9UF07','P00519')

psp = psp.merge(kinase_id)

psp = psp.rename(columns={'SUB_MOD_RSD':'site'})

psp = psp[['kinase_uniprot','kinase_genes','kinase_paper','substrate_uniprot','substrate_genes','site','substrate_sequence','substrate','substrate_species']]

psp.isna().sum()

kinase_uniprot          0
kinase_genes            0
kinase_paper          134
substrate_uniprot       0
substrate_genes         0
site                    0
substrate_sequence      0
substrate               0
substrate_species       0
dtype: int64

psp = psp.dropna(subset='substrate_sequence').reset_index(drop=True)

Check if kinase-uniprot belongs to kinase class

Not mapped:

psp[~psp.kinase_uniprot.isin(kinase_uniprot.Entry)].kinase_genes.value_counts()

kinase_genes
CSNK2B CK2N G5A                13
PRKAG2                          9
PRKAB1 AMPK                     9
GTF2F1 RAP74                    4
BLVRA BLVR BVR                  3
TGM2                            2
RET/PTC2                        2
PHKA1 PHKA                      1
HSPA5 GRP78                     1
ENPP3 PDNP3                     1
PIK3R1 GRB1                     1
CERT1 CERT COL4A3BP STARD11     1
JMJD6 KIAA0585 PSR PTDSR        1
Name: count, dtype: int64

CSNK2B is a CK2 subunit, and PRKAB1 and PRKAG2 are AMPK subunits; since they account for only a few rows, we simply ignore them

psp = psp[psp.kinase_uniprot.isin(kinase_uniprot.Entry)].copy()

psp.shape

(14016, 9)

Validate site

Check whether the extracted site sequence matches the reported substrate sequence

psp['position'] = psp['site'].str[1:].astype(int)

psp['site_seq']=extract_site_seq(psp,seq_col='substrate_sequence',position_col='position')

100%|██████████| 14016/14016 [00:00<00:00, 25600.07it/s]

(psp['site_seq']==psp['substrate'].str.upper()).value_counts()

True     13156
False      860
Name: count, dtype: int64

unmatch = psp[~(psp['site_seq']==psp['substrate'].str.upper())]

unmatch.kinase_genes.value_counts().head(10)

kinase_genes
PRKACA PKACA                                  40
MAPK1 ERK2 PRKM1 PRKM2                        40
SRC SRC1                                      39
CDK1 CDC2 CDC28A CDKN1 P34CDC2                38
GSK3B                                         37
MAPK3 ERK1 PRKM3                              35
AURKB AIK2 AIM1 AIRK2 ARK2 STK1 STK12 STK5    33
PRKCA PKCA PRKACA                             33
PRKCB PKCB PRKCB1                             25
CDK5 CDKN5 PSSALRE                            25
Name: count, dtype: int64

We’ll drop the rows whose site sequence does not match the substrate sequence

psp = psp[psp['site_seq']==psp['substrate'].str.upper()]

Drop sites with center aa not in S,T or Y

psp['site_seq'].str[7].value_counts()

site_seq
S    8303
T    2812
Y    2029
H       6
K       4
R       2
Name: count, dtype: int64

# Keep rows whose center residue (index 7 of the site sequence) is S, T or Y.
# Take a copy so the column assignments that follow ('site_match', 'source')
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
psp = psp[psp['site_seq'].str[7].str.upper().isin(list('STY'))].copy()

Drop site with unmatched residue and position

psp['site_match'] =validate_site_df(psp,'site','substrate_sequence')

psp['site_match'].value_counts()

site_match
1    13144
Name: count, dtype: int64

psp.shape

(13144, 12)

Save

psp['source']='PSP'

psp.head()

	kinase_uniprot	kinase_genes	kinase_paper	substrate_uniprot	substrate_genes	site	substrate_sequence	substrate	substrate_species	position	site_seq	site_match	source
0	Q92630	DYRK2	DYRK2	O43255	SIAH2	S68	MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS...	GGGAGPVsPQHHELT	Homo sapiens (Human)	68	GGGAGPVSPQHHELT	1	PSP
1	Q92630	DYRK2	DYRK2	Q9Y2V2	CARHSP1	S41	MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR...	LRGNVVPsPLPtRRt	Homo sapiens (Human)	41	LRGNVVPSPLPTRRT	1	PSP
3	Q92630	DYRK2	DYRK2	P30304	CDC25A	S283	MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT...	PErsQEEsPPGSTKr	Homo sapiens (Human)	283	PERSQEESPPGSTKR	1	PSP
4	Q92630	DYRK2	DYRK2	O43255	SIAH2	T119	MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS...	PTCRGALtPSIRNLA	Homo sapiens (Human)	119	PTCRGALTPSIRNLA	1	PSP
5	Q92630	DYRK2	DYRK2	O75449	KATNA1	S42	MSLLMISENVKLAREYALLGNYDSAMVYYQGVLDQMNKYLYSVKDT...	QMNKYLYsVkDTYLQ	Homo sapiens (Human)	42	QMNKYLYSVKDTYLQ	1	PSP

# psp.to_excel('raw/psp_human.xlsx',index=False)

KiNET (EPSD + iPTMNet)

an integration of PSP, iPTMNet, EPSD

Data

KiNET paper: https://www.nature.com/articles/s41540-024-00442-5

KiNET web: https://kinet.kinametrix.com/

iPTMnet paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC5753337/;

iPTMnet web: https://research.bioinformatics.udel.edu/iptmnet/ ; can’t directly download through Download

EPSD paper: https://academic.oup.com/bib/article/22/1/298/5686325?login=false

EPSD web: https://epsd.biocuckoo.cn/Download.php

Go to https://kinet.kinametrix.com/, and click Download the full KiNet interaction dataset

df = pd.read_csv('raw/ksi_source_full_dataset.csv')

df = df.dropna(subset='Kinase')

df['Source Database'].value_counts()

Source Database
PhosphoSitePlus    13135
EPSD               10442
iPTMNet             3846
Name: count, dtype: int64

As we already included PSP, we will drop it

df = df[df['Source Database']!='PhosphoSitePlus']

df.shape

(14288, 8)

Kinase mapping

df.Kinase.isin(kinase_uniprot.Entry).value_counts()

Kinase
True     14283
False        5
Name: count, dtype: int64

df = df[df.Kinase.isin(kinase_uniprot.Entry)]

Substrate mapping

# df.Substrate.drop_duplicates().to_csv('raw/KiNet_substrate_id.csv')

Map the id to uniprot seq

df = map_substrate('raw/idmapping_2025_03_02_KiNET_substrate.xlsx',df,'Substrate')

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Shape before processing (14283, 8)
Species counts: substrate_species
Homo sapiens (Human)    14283
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14283, 11)

# substrate_id = pd.read_excel('raw/idmapping_2025_03_02_KiNET_substrate.xlsx')[['From','Entry','Gene Names','Sequence']]

# substrate_id.columns = ['Substrate','substrate_uniprot','substrate_genes','substrate_sequence']

# df = df.merge(substrate_id)

df = df[['Kinase','Kinase Name','substrate_uniprot','substrate_genes','Site','Source Database','Evidence','substrate_sequence']]

df.columns

Index(['Kinase', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'Site',
       'Source Database', 'Evidence', 'substrate_sequence'],
      dtype='object')

df.columns = ['kinase_uniprot', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'site',

       'source', 'evidence', 'substrate_sequence']

df.shape

(14283, 8)

Validate site

We drop sites whose residue does not match the amino acid at the given position in the protein sequence

df['site_match'] =validate_site_df(df,'site','substrate_sequence')

df.site_match.value_counts()

site_match
1    13786
0      497
Name: count, dtype: int64

df[df.site_match==0].source.value_counts()

source
iPTMNet    368
EPSD       129
Name: count, dtype: int64

df[df.site_match==1].source.value_counts()

source
EPSD       10308
iPTMNet     3478
Name: count, dtype: int64

df=df[df.site_match==1]

df.shape

(13786, 9)

Save

df.head()

	kinase_uniprot	Kinase Name	substrate_uniprot	substrate_genes	site	source	evidence	substrate_sequence	site_match
0	O00141	SGK1	O00213	APBB1 FE65 RIR	S610	EPSD	Unspecified experimental method	MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG...	1
1	O00141	SGK1	O14920	IKBKB IKKB	S181	EPSD	Unspecified experimental method	MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...	1
2	O00141	SGK1	O15111	CHUK IKKA TCF16	S180	EPSD	Unspecified experimental method	MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...	1
3	O00141	SGK1	O43524	FOXO3 FKHRL1 FOXO3A	T32	EPSD	Unspecified experimental method	MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...	1
4	O00141	SGK1	O43524	FOXO3 FKHRL1 FOXO3A	T32	iPTMNet	Text mining	MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...	1

# df.to_excel('raw/KiNet.xlsx',index=False)

# df[df['source'] =='EPSD'].to_excel('raw/EPSD.xlsx',index=False)

# df[df['source'] =='iPTMNet'].to_excel('raw/iPTMNet.xlsx',index=False)

Phospho.ELM

Data

paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC3013696/

Go to http://phospho.elm.eu.org/dataset.html, fill info–>accept, then download the dataset

Open dump in excel, filter human and non-blank from kinases

elm=pd.read_csv('raw/phosphoELM.csv')

elm.shape

(3599, 6)

elm.head()

	substrate_uniprot	position	acceptor	kinase	LTP_HTP	species
0	O14543	204	Y	Lck	LTP	Homo sapiens
1	O14543	221	Y	Lck	LTP	Homo sapiens
2	O14746	824	S	PKB_group	LTP	Homo sapiens
3	O14746	227	S	PKB_group	LTP	Homo sapiens
4	O14746	707	Y	SRC	LTP	Homo sapiens

elm.kinase = elm.kinase.str.upper()

Kinase mapping

# Load the UniProt human kinase table (the same file read into kinase_uniprot above).
kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')

# Turn the space-separated gene-name string into a list of names per entry.
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')

# One row per (entry, gene name) pair.
kinase_id = kinase_id.explode('Gene Names')

# Gene name -> UniProt entry. If a name occurs on several rows, the last row wins in the dict.
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()

# Gene names that occur on more than one exploded row, i.e. ambiguous lookups.
dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])

# PAK1 and PASK are taken out of the ambiguous set (set.remove raises KeyError if absent) ...
dup_name.remove('PAK1')

dup_name.remove('PASK')

# ... and pinned to a fixed UniProt entry instead.
gene2uniprot['PAK1'] = 'Q13153'

gene2uniprot['PASK']='Q96RG2'

# Drop rows whose kinase name is still ambiguous.
elm = elm[~elm.kinase.isin(dup_name)].copy()

elm[~elm.kinase.isin(gene2uniprot.keys())].kinase.unique()

array(['PKB_GROUP', 'IKK_GROUP', 'AURORA B', 'PDK-1', 'PKC_THETA',
       'SGK_GROUP', 'IKK_BETA', 'CDK_GROUP', 'CAM-KII_ALPHA', 'PKC_GROUP',
       'PKC_ALPHA', 'CK2_GROUP', 'CK2_ALPHA', 'GSK-3_GROUP',
       'GSK-3_ALPHA', 'GSK-3_BETA', 'PKA_GROUP', 'RSK_GROUP', 'PKB_BETA',
       'MAPK_GROUP', 'PAK_GROUP', 'DNA-PK', 'JNK_GROUP', 'CK1_GROUP',
       'AURORA A', 'P70S6KB', 'PKG/CGK_GROUP', 'PKC_DELTA', 'PDGFR_BETA',
       'CAM-KII_GROUP', 'GRK-2', 'GRK-5', 'GRK_GROUP', 'CK1_ALPHA',
       'PKC_EPSILON', 'ROCK_GROUP', 'PKG1/CGK-I', 'PDGFR_GROUP',
       'P70S6K_GROUP', 'P70S6K', 'PHK_GROUP', 'PKC_BETA', 'DMPK_GROUP',
       'CK1_DELTA', 'CK2_BETA', 'RSK-3', 'RSK-2', 'CAM-KIV', 'RSK-1',
       'IKK_ALPHA', 'CK1_EPSILON', 'MAP2K_GROUP', 'PIM-1', 'EG3 KINASE',
       'GRK-4', 'PKC_ZETA', 'PDKC', 'AMPK_GROUP', 'FGFR_GROUP',
       'JAK_GROUP', 'MAP3K_GROUP', 'RSK-5', 'IKK_EPSILON', 'CAM-KI_GROUP',
       'SRC_GROUP', 'PKA_ALPHA', 'PKC_ETA', 'PDK-2', 'GRK-6',
       'TITIN KINASE', 'CAM-KI_ALPHA', 'GRK-1', 'PDGFR_ALPHA', 'CCDPK',
       'MARK_GROUP', 'P38_GROUP', 'GRK-3', 'DAPK_GROUP', 'MRCKA',
       'PKG2/CGK-II', 'PKC_GAMMA', 'PKC_IOTA', 'CAM-KK_ALPHA'],
      dtype=object)

map them to genes

ids = pd.read_csv('raw/elm_kinase_id.csv')

ids = ids.set_index('kinase')['kinase_gene'].to_dict()

elm['kinase_genes'] = elm.kinase.map(ids).fillna(elm.kinase)

# for kinase group, we only consider the first two items

elm['kinase_genes'] = elm.kinase_genes.str.split(' ').str[:2]

elm = elm.explode('kinase_genes')

elm[elm.kinase_genes.isin(dup_name)].kinase_genes.unique()

array(['PRKACA'], dtype=object)

dup_name.remove('PRKACA')

gene2uniprot['PRKACA']= 'P17612'

elm[elm.kinase_genes.isin(dup_name)]

	substrate_uniprot	position	acceptor	kinase	LTP_HTP	species	kinase_genes

unmapped:

elm[elm.kinase_genes.map(gene2uniprot).isna()].kinase_genes.unique()

array(['PHKA1', 'PHKA2', 'CSNK2B', 'CDPK1', 'CDPK2'], dtype=object)

elm['kinase_uniprot'] = elm.kinase_genes.map(gene2uniprot)

elm = elm.dropna(subset='kinase_uniprot')

Substrate mapping

# elm.substrate_uniprot.drop_duplicates().to_csv('raw/elm_substrate_id.csv')

916 IDs were mapped to 919 results

10 IDs were not mapped: ENSP00000328213 ENSP00000343690 ENSP00000352232 ENSP00000347528 ENSP00000357298 ENSP00000248996 ENSP00000357225 ENSP00000261937 ENSP00000248419 ENSP00000267569

Found two of the unmapped are kinases:

ensp = {'ENSP00000328213':'P06239','ENSP00000261937':'P35916'}

elm.substrate_uniprot = elm.substrate_uniprot.map(ensp).fillna(elm.substrate_uniprot)

elm = map_substrate('raw/idmapping_2025_03_12_elm.xlsx',elm,'substrate_uniprot')

Shape before processing (4699, 8)
Species counts: substrate_species
Homo sapiens (Human)    4675
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (4675, 11)

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

elm.shape

(4675, 11)

Validate site

elm.acceptor.value_counts()

acceptor
S    3018
T     908
Y     749
Name: count, dtype: int64

elm['site']=elm['acceptor']+elm['position'].astype(str)

elm['site_match'] =validate_site_df(elm,'site','substrate_sequence')

elm.site_match.value_counts()

site_match
1    4519
0     156
Name: count, dtype: int64

elm = elm[elm.site_match==1]

elm.shape

(4519, 13)

Save

elm.columns

Index(['position', 'acceptor', 'kinase', 'LTP_HTP', 'species', 'kinase_genes',
       'kinase_uniprot', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species', 'site', 'site_match'],
      dtype='object')

col = ['kinase','kinase_uniprot', 'kinase_genes',

       'substrate_uniprot', 'substrate_genes','site','LTP_HTP', 'species', 

       'substrate_sequence',]

# Keep only the output columns. Take a copy so the following
# `elm['source'] = ...` assignment does not write into a slice of the
# filtered frame (SettingWithCopyWarning).
elm = elm[col].copy()

elm['source']='ELM'

# elm.to_excel('raw/ELM.xlsx',index=False)

Signor

Data

paper: https://academic.oup.com/nar/article/51/D1/D631/6761728

web: https://signor.uniroma2.it/

Go to web link, Downloads –> Latest Release download (Jan 2025)

Open in excel, filter mechanism to be phosphorylation

sig = pd.read_excel('raw/signor_phosphorylation.xlsx')

sig.shape

(12973, 27)

Kinase mapping

The TYPEA column includes protein complexes; for these we need to extract the kinase gene name

comp = sig[sig.TYPEA=='complex'].copy()

# comp.ENTITYA.drop_duplicates().to_csv('raw/sig_complex.csv')

comp_id = pd.read_csv('raw/sig_complex_label.csv')

comp_id = comp_id.set_index('ENTITYA')['kinase_gene']

comp['kinase_gene'] = comp.ENTITYA.map(comp_id)

comp = comp.dropna(subset='kinase_gene')

comp.shape

(742, 28)

Fusion proteins

fus_id = {'BCR-ABL':'ABL1','EML4-ALK':'ALK'}

fus = sig[sig.TYPEA=='fusion protein'].copy()

fus['kinase_gene'] = fus.ENTITYA.map(fus_id)

For protein families, we’ll apply the annotation to the first two family members only

fam = sig[sig.TYPEA=='proteinfamily'].copy()

# fam.ENTITYA.drop_duplicates().to_csv('raw/sig_fam.csv')

fam_id = pd.read_csv('raw/sig_fam_label.csv')

fam_id.head()

	ENTITYA	kinase_gene
0	ERK1/2	MAPK3 MAPK1
1	AKT	AKT1 AKT2 AKT3
2	RPS6K	RPS6KB1 RPS6KB2
3	p38	MAPK14 MAPK11 MAPK12 MAPK13
4	JNK	MAPK8 MAPK9 MAPK10

fam_id = fam_id.set_index('ENTITYA')['kinase_gene']

fam['kinase_gene'] = fam.ENTITYA.map(fam_id)

fam = fam.dropna(subset='kinase_gene')

fam['kinase_gene'] = fam.kinase_gene.str.split(' ').str[:2]

fam = fam.explode('kinase_gene')

Protein:

pro = sig[sig.TYPEA=='protein'].copy()

for consistency:

pro['kinase_gene'] = pro['ENTITYA']

combine:

df = pd.concat([pro,comp,fus,fam])

Mapping:

kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')

kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')

kinase_id = kinase_id.explode('Gene Names')

Some kinases share the same gene names:

# kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()

We’ll drop them to prevent confusion.

dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])

df.kinase_gene.isin(dup_name).sum()

np.int64(674)

# Drop rows whose kinase gene is an ambiguous (duplicated) gene name.
# The lookup below upper-cases kinase_gene before mapping, so the ambiguity
# filter must compare the upper-cased name too; otherwise a lower- or
# mixed-case ambiguous name slips through and is mapped to an arbitrary entry.
df = df[~df.kinase_gene.str.upper().isin(dup_name)].copy()

gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()

df.shape

(13013, 28)

unmapped:

df[df.kinase_gene.str.upper().map(gene2uniprot).isna()].kinase_gene.unique()

array(['HRAS', 'IL4R', 'CSNK2B', 'TLR4', 'IL6R', 'THAP12', 'PRKAR2A',
       'RHOA', 'AREG', 'RET/PTC2', 'PCSK7', 'CCNC', 'TGM2', 'SLC12A1',
       'RALA', 'IL6ST', 'CRK', 'SMAD9', 'PLCG1', 'ELOC', 'PPP2CA', 'CCR5',
       'GHR', 'CAD', 'SMO', 'FRS2', 'LEPR', 'BLVRA', 'TAB1', 'PRKAB1',
       'PIAS4', 'IFNAR1', 'HSP90AA1', 'PTPRJ', 'PRKAG2', 'CSN1S1',
       'GTF2F1', 'VASP', 'KRAS', 'SLC12A3', 'PRPF4B', 'CDKN2A', 'IL1R1',
       'PRKAR2B', 'IL15RA', 'KRT1', 'SMAD1', 'GTF2H1', 'CCR2', 'IL5RA',
       'PLCG2', 'SLC12A2', 'GTF2H2', 'TMIGD2', 'IKBKG', 'BORA', 'MNAT1',
       'PHKA1', 'BGLF4', 'RIN1', 'DLG1', 'CDK5RAP2', 'CCR1', 'IL10RA',
       'SMAD5', 'CCN4'], dtype=object)

df['kinase_uniprot'] = df.kinase_gene.str.upper().map(gene2uniprot)

df = df.dropna(subset='kinase_uniprot')

Substrate mapping

df = df.dropna(subset='IDB')

We can’t trace the specific whole protein sequence based on SIGNOR ID, so we filter them out

df = df[~df.IDB.str.contains('SIGNOR')]

# df.IDB.drop_duplicates().to_csv('raw/sig_substrate_id.csv')

2,298 IDs were mapped to 2,298 results

1 ID was not mapped: CHEBI:15721

Sequences are all from human

df = map_substrate('raw/idmapping_2025_03_12_signor.xlsx',df,'IDB')

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Shape before processing (12354, 29)
Species counts: substrate_species
Homo sapiens (Human)    12353
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (12353, 32)

df.shape

(12353, 32)

Validate site

# A RESIDUE cell may hold several ';'-separated entries: split into a list ...
df.RESIDUE = df.RESIDUE.str.split(';')

# ... and give every entry its own row.
df = df.explode('RESIDUE')

# The first three characters are read as a three-letter residue code.
df['acceptor'] = df['RESIDUE'].str[:3]

# Convert Ser/Thr/Tyr to one-letter codes; any other value becomes NaN.
df.acceptor = df.acceptor.map({'Ser':'S','Thr':'T','Tyr':'Y'})

# The remainder of the string is kept as the position (still a string).
df['position'] = df['RESIDUE'].str[3:]

# Build the site label, e.g. 'S366'; NaN when acceptor or position is NaN.
df['site']=df['acceptor']+df['position']

# Drop rows without a usable site (missing RESIDUE or a residue other than Ser/Thr/Tyr).
df = df.dropna(subset='site')

df['site_match'] =validate_site_df(df,'site','substrate_sequence')

df.site_match.value_counts()

site_match
1    11212
0      118
Name: count, dtype: int64

df = df[df.site_match==1]

df.shape

(11212, 36)

Save

df.columns

Index(['ENTITYA', 'TYPEA', 'IDA', 'DATABASEA', 'ENTITYB', 'TYPEB', 'DATABASEB',
       'EFFECT', 'MECHANISM', 'RESIDUE', 'SEQUENCE', 'TAX_ID', 'CELL_DATA',
       'TISSUE_DATA', 'MODULATOR_COMPLEX', 'TARGET_COMPLEX', 'MODIFICATIONA',
       'MODASEQ', 'MODIFICATIONB', 'MODBSEQ', 'PMID', 'DIRECT', 'NOTES',
       'ANNOTATOR', 'SENTENCE', 'SIGNOR_ID', 'kinase_gene', 'kinase_uniprot',
       'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
       'substrate_species', 'acceptor', 'position', 'site', 'site_match'],
      dtype='object')

col = ['kinase_uniprot', 'kinase_gene', 'ENTITYA', 'TYPEA', 

       'substrate_uniprot', 'substrate_genes','site', 'substrate_sequence']

df = df[col].copy()

df.head()

	kinase_uniprot	kinase_gene	ENTITYA	TYPEA	substrate_uniprot	substrate_genes	site	substrate_sequence
0	P68400	CSNK2A1	CSNK2A1	protein	P05455	SSB	S366	MAENGDNEKMAALEAKICHQIEYYFGDFNLPRDKFLKEQIKLDEGW...
1	P28482	MAPK1	MAPK1	protein	P43354	NR4A2 NOT NURR1 TINUR	S126	MPCVQAQYGSSPQGASPASQSYSYHSSGEYSSDFLTPEFVKFSMDL...
2	P11362	FGFR1	FGFR1	protein	P56945	BCAR1 CAS CASS1 CRKAS	Y128	MNHLNVLAKALYDNVAESPDELSFRKGDIMTVLEQDTQGLDGWWLC...
3	P27361	MAPK3	MAPK3	protein	P41182	BCL6 BCL5 LAZ3 ZBTB27 ZNF51	S343	MASPADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAH...
4	Q16539	MAPK14	MAPK14	protein	Q02078	MEF2A MEF2	S408	MGRKKIQITRIMDERNRQVTFTKRKFGLMKKAYELSVLCDCEIALI...

df.shape

(11212, 8)

df['source']="SIGNOR"

# df.to_excel('raw/signor.xlsx',index=False)

GPS 6.0

Data

paper: https://academic.oup.com/nar/article/51/W1/W243/7157529#409532969

web: https://gps.biocuckoo.cn/index.php

Go to paper link, download supplementary data - zip file, get Table S5

import pandas as pd

# Load GPS 6.0 Table S5, then keep the rows that are human and whose
# GPS source field is not PhosphositePlus.
gps_raw = pd.read_csv('raw/GPS6_tableS5.csv')

not_from_psp = gps_raw['source'].ne("PhosphositePlus")
is_human = gps_raw['species'].eq("Homo sapiens")

gps = gps_raw[not_from_psp & is_human]

gps.shape

(6087, 6)

gps = gps[~gps.gene.str.contains('family')]

Kinase mapping

# One row per (kinase entry, gene name): the space-separated 'Gene Names'
# field is split into a list and then exploded.
kinase_id = (
    pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
    .assign(**{'Gene Names': lambda tbl: tbl['Gene Names'].str.split(' ')})
    .explode('Gene Names')
)

Some kinases share the same gene names:

kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()

	Entry	Entry Name	Protein names	Gene Names	uniprot_keyword_kinase	Organism	Keywords	Sequence
586	Q9UIJ7	KAD3_HUMAN	GTP:AMP phosphotransferase AK3, mitochondrial ...	AK3	1	Homo sapiens (Human)	3D-structure;Acetylation;Alternative splicing;...	MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM...
152	P27144	KAD4_HUMAN	Adenylate kinase 4, mitochondrial (EC 2.7.4.4)...	AK3	1	Homo sapiens (Human)	3D-structure;Acetylation;ATP-binding;GTP-bindi...	MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA...
586	Q9UIJ7	KAD3_HUMAN	GTP:AMP phosphotransferase AK3, mitochondrial ...	AK3L1	1	Homo sapiens (Human)	3D-structure;Acetylation;Alternative splicing;...	MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM...
152	P27144	KAD4_HUMAN	Adenylate kinase 4, mitochondrial (EC 2.7.4.4)...	AK3L1	1	Homo sapiens (Human)	3D-structure;Acetylation;ATP-binding;GTP-bindi...	MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA...
609	Q9Y3D8	KAD6_HUMAN	Adenylate kinase isoenzyme 6 (AK6) (EC 2.7.4.3...	AK6	1	Homo sapiens (Human)	3D-structure;Alternative splicing;ATP-binding;...	MLLPNILLTGTPGVGKTTLGKELASKSGLKYINVGDLAREEQLYDG...

We’ll drop them to prevent confusion.

dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])

gps.gene.isin(dup_name).sum()

np.int64(76)

gps.shape

(4260, 6)

# Drop GPS rows whose kinase label is one of the ambiguous gene names in dup_name
# (the comparison is on the raw label, exactly as written in the GPS table)
gps = gps[~gps.gene.isin(dup_name)].copy()

# Lookup table: gene name (one key per exploded name) -> UniProt accession ('Entry')
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()

gps.shape

(4184, 6)

GPS kinase labels that could not be mapped to a UniProt entry:

gps[gps.gene.str.upper().map(gene2uniprot).isna()].gene.unique()

array(['DMPK1', 'MRCKa', 'YPKA', 'PKACb', 'PKCt', 'PKCh', 'PKCi', 'PKCz',
       'PKG1', 'PKG2', 'PRKG1 Isoform Alpha', 'p70S6K', 'p70S6Kb',
       'CCDPK', 'AMPKa1', 'PKD3', 'CK1d', 'CK1e', 'CK1g1', 'CSNK2B',
       'p38b', 'p38d', 'p38g', 'AurB', 'AurC', 'TP53RK ', 'STLK3',
       'PAK3 Isoform 2', 'TAO1', 'ACK'], dtype=object)

# Normalise the GPS kinase label before the lookup: strip surrounding
# whitespace (the raw table contains labels such as 'TP53RK ' that can never
# match otherwise) and upper-case it.
gene_norm = gps.gene.str.strip().str.upper()

# Map to UniProt, but leave ambiguous gene names unmapped: the earlier
# dup_name filter ran on the raw label, so a label that only becomes
# ambiguous after normalisation must be excluded here as well.
# NOTE: row counts recorded further down were produced before this
# normalisation and may change when the notebook is re-run.
gps['kinase_uniprot'] = gene_norm.map(gene2uniprot).where(~gene_norm.isin(dup_name))

# Rows whose kinase label has no UniProt accession are discarded.
gps = gps.dropna(subset='kinase_uniprot')

Substrate mapping

# gps.uniprot.drop_duplicates().to_csv('raw/GPS_substrate_id.csv')

1,203 IDs were mapped to 1,208 results

1,202 active entries and 1 obsolete entry are found

gps.shape

(3750, 7)

gps = map_substrate('raw/idmapping_2025_03_12_GPS.xlsx',gps,'uniprot')

Shape before processing (3750, 7)
Species counts: substrate_species
Homo sapiens (Human)                                                                 3614
Rattus norvegicus (Rat)                                                                30
Mus musculus (Mouse)                                                                   19
Pongo abelii (Sumatran orangutan) (Pongo pygmaeus abelii)                              10
Pan troglodytes (Chimpanzee)                                                            6
Sus scrofa (Pig)                                                                        6
Bos taurus (Bovine)                                                                     5
Oryctolagus cuniculus (Rabbit)                                                          4
Macaca fascicularis (Crab-eating macaque) (Cynomolgus monkey)                           4
Mesocricetus auratus (Golden hamster)                                                   4
Tupaia belangeri (Common tree shrew) (Tupaia glis belangeri)                            2
Canis lupus familiaris (Dog) (Canis familiaris)                                         2
Phodopus roborovskii (Roborovski's desert hamster) (Cricetulus roborovskii)             2
Macaca mulatta (Rhesus macaque)                                                         2
Xenopus tropicalis (Western clawed frog) (Silurana tropicalis)                          2
Hylobates lar (Lar gibbon) (White-handed gibbon)                                        1
Cricetulus griseus (Chinese hamster) (Cricetulus barabensis griseus)                    1
Xenopus laevis (African clawed frog)                                                    1
Galeopterus variegatus (Malayan flying lemur) (Cynocephalus variegatus)                 1
Nannospalax galili (Northern Israeli blind subterranean mole rat) (Spalax galili)       1
Spermophilus citellus (European suslik) (Citellus citellus)                             1
Marmota monax (Woodchuck)                                                               1
Pan paniscus (Pygmy chimpanzee) (Bonobo)                                                1
Danio rerio (Zebrafish) (Brachydanio rerio)                                             1
Hepatitis delta virus genotype II (isolate 7/18/83) (HDV)                               1
Human T-cell leukemia virus 1 (strain Japan ATK-1 subtype A) (HTLV-1)                   1
Macaca fuscata fuscata (Japanese macaque)                                               1
Human T-cell leukemia virus 1 (isolate Caribbea HS-35 subtype A) (HTLV-1)               1
Human T-cell leukemia virus 1 (isolate Melanesia mel5 subtype C) (HTLV-1)               1
Ovis aries (Sheep)                                                                      1
Human immunodeficiency virus type 1 group M subtype B (isolate NY5) (HIV-1)             1
Human immunodeficiency virus type 2 subtype A (isolate BEN) (HIV-2)                     1
Human immunodeficiency virus type 1 group M subtype B (isolate YU-2) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype B (isolate WMJ22) (HIV-1)           1
Cavia porcellus (Guinea pig)                                                            1
Human immunodeficiency virus type 1 group M subtype A (isolate MAL) (HIV-1)             1
Human immunodeficiency virus type 1 group M subtype F2 (isolate MP257) (HIV-1)          1
Human immunodeficiency virus type 1 group M subtype B (isolate LW123) (HIV-1)           1
Human immunodeficiency virus type 1 group M subtype B (isolate JRCSF) (HIV-1)           1
Human immunodeficiency virus type 1 group M subtype B (isolate JH32) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype B (isolate HXB2) (HIV-1)            1
Human immunodeficiency virus type 1 group M subtype C (isolate ETH2220) (HIV-1)         1
Human immunodeficiency virus type 1 group M subtype D (isolate ELI) (HIV-1)             1
Human immunodeficiency virus type 1 group M subtype B (isolate CDC-451) (HIV-1)         1
Human immunodeficiency virus type 1 group O (isolate ANT70) (HIV-1)                     1
Human immunodeficiency virus type 1 group M subtype A (isolate U455) (HIV-1)            1
Human immunodeficiency virus type 2 subtype A (isolate KR) (HIV-2)                      1
Phodopus campbelli (Campbell's dwarf Russian hamster)                                   1
Human immunodeficiency virus type 2 subtype A (isolate ROD) (HIV-2)                     1
Human immunodeficiency virus type 1 group O (isolate MVP5180) (HIV-1)                   1
Felis catus (Cat) (Felis silvestris catus)                                              1
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (3614, 10)

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

gps.shape

(3614, 10)

Validate site

gps['site']=gps['code'] + gps['position'].astype(int).astype(str)

gps['site_match'] =validate_site_df(gps,'site','substrate_sequence')

gps['site_match'].value_counts()

site_match
1    3435
0     179
Name: count, dtype: int64

gps=gps[gps.site_match==1]

gps.shape

(3435, 12)

Save

gps.columns

Index(['position', 'code', 'gene', 'species', 'source', 'kinase_uniprot',
       'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
       'substrate_species', 'site', 'site_match'],
      dtype='object')

# Columns written out for the GPS table, in output order
col = [
    'kinase_uniprot', 'gene',
    'substrate_uniprot', 'substrate_genes', 'site', 'substrate_sequence',
    'GPS_source', 'source',
]

# Keep GPS's own 'source' field under GPS_source, label every row with the
# database name, and reduce the frame to the columns above.
gps = (
    gps.rename(columns={'source': 'GPS_source'})
       .assign(source='GPS6')
       .loc[:, col]
       .copy()
)

gps.head()

	kinase_uniprot	gene	substrate_uniprot	substrate_genes	site	substrate_sequence	GPS_source	source
0	P31749	AKT1	Q9Y261	FOXA2 HNF3B TCF3B	T156	MLGAVKMEGHEPSDWSSYYAEPEGYSSVSNMNAGLGMNGMNTYMSM...	14500912	GPS6
1	P31749	AKT1	P49760	CLK2	T127	MPHPRRYHSSERGSRGSYREHYRSRKHKRRRSRSWSSSSDRTRRRR...	UniProt	GPS6
2	P31749	AKT1	P49815	TSC2 TSC4	T1462	MAKPTSKDSGLKEKFKILLGLGTPRPNPRSAEGKQTEFIITAEILR...	12150915;15342917;12172553;UniProt	GPS6
3	P31749	AKT1	P46527	CDKN1B KIP1 p27	T187	MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...	12042314;9192873;9399644;9311993;9388487;17254...	GPS6
4	P31749	AKT1	O15111	CHUK IKKA TCF16	T23	MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...	10485710;UniProt	GPS6

# gps.to_excel('raw/GPS6.xlsx',index=False)

Other datasets

Douglass

Paper: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012

The data are not available for download, but can be obtained from the authors upon request.

RegPhos 2.0

Paper: https://academic.oup.com/database/article/doi/10.1093/database/bau034/2634150

Go to the paper link, scroll down to Supplementary data, and download the zip file; Table S4 in the docx file contains the motifs.

Phosida

http://www.phosida.com/ is not accessible

Phosida paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC2258193/

PhosphoNetwork

2013 bioinformatics: https://pmc.ncbi.nlm.nih.gov/articles/PMC3866559/#btt627-T1

2013 Mol Syst Biol.: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/

web app: https://www.phosphonetworks.org/

The web app provides kinase-substrate pairs, but does not provide the specific phosphorylation sites.

It does provide logos and PSSMs, though, under Download –> Motif Matrix, which can be used for comparison with the PSSM results.

BioGRID

Go to https://downloads.thebiogrid.org/BioGRID, then Current-Release –> BIOGRID-PTMS-4.4.242.ptm.zip

After extraction there are two files; however, they don’t contain kinase-substrate relationships.

Combine all

import pandas as pd
from pathlib import Path

paths = [
    'raw/GPS6.xlsx',
    'raw/signor.xlsx',
    'raw/ELM.xlsx',
    'raw/iPTMNet.xlsx','raw/EPSD.xlsx',# 'raw/KiNet.xlsx',
    'raw/psp_human.xlsx',
    'raw/sugiyama.xlsx'
]

dfs = [pd.read_excel(path) for path in paths]

# Show the per-source row counts of every loaded table. A plain loop is used
# because print() is a side effect; a list comprehension would build (and the
# notebook would echo) a throwaway list of None values.
for loaded in dfs:
    print(loaded.source.value_counts())

source
GPS6    3435
Name: count, dtype: int64
source
SIGNOR    11212
Name: count, dtype: int64
source
ELM    4519
Name: count, dtype: int64
source
iPTMNet    3478
Name: count, dtype: int64
source
EPSD    10308
Name: count, dtype: int64
source
PSP    13144
Name: count, dtype: int64
source
Sugiyama    184948
Name: count, dtype: int64

[None, None, None, None, None, None, None]

For each df, we need to drop duplicates of kinase(uniprot)-substrate(uniprot)-site

def get_key(df):
    "Add a `kin_sub_site` key (kinase_substrate_site) and keep the first row of each key"
    print('original shape:',df.shape)

    # Join the three identifier columns with '_' from left to right
    key = df['kinase_uniprot']
    for part in (df['substrate_uniprot'], df['site']):
        key = key + '_' + part

    # assign() returns a new frame, so the caller's dataframe is not modified
    deduped = df.assign(kin_sub_site=key).drop_duplicates(subset='kin_sub_site')
    print('after removing duplicates',deduped.shape)
    return deduped

dfs=[get_key(df) for df in dfs]

original shape: (3435, 8)
after removing duplicates (3326, 9)
original shape: (11212, 9)
after removing duplicates (9320, 10)
original shape: (4519, 10)
after removing duplicates (3807, 11)
original shape: (3478, 9)
after removing duplicates (3478, 10)
original shape: (10308, 9)
after removing duplicates (10308, 10)
original shape: (13144, 12)
after removing duplicates (13091, 13)
original shape: (184948, 11)
after removing duplicates (168342, 12)

for path,df in zip(paths,dfs):
    print(Path(path).stem, df.shape)

GPS6 (3326, 9)
signor (9320, 10)
ELM (3807, 11)
iPTMNet (3478, 10)
EPSD (10308, 10)
psp_human (13091, 13)
sugiyama (168342, 12)

common_cols = [
    'kinase_uniprot', 'substrate_uniprot', 'site', 'kin_sub_site',
    'source', 'substrate_genes', 'substrate_sequence',
]

# Stack all source tables with a fresh index, then keep only the columns
# listed in common_cols.
df_all = pd.concat(dfs, ignore_index=True).loc[:, common_cols].copy()

df_all.source.value_counts()

source
Sugiyama    168342
PSP          13091
EPSD         10308
SIGNOR        9320
ELM           3807
iPTMNet       3478
GPS6          3326
Name: count, dtype: int64

# df_all.to_parquet('raw/combine_source.parquet')

# Collapse rows that share the same kin_sub_site key into one row.
# The aggregation spec is built in output-column order: identifier columns
# take the "first" aggregate, sources are concatenated with '|'.
agg_spec = {name: "first" for name in ("kinase_uniprot", "substrate_uniprot", "site")}
agg_spec["source"] = '|'.join
agg_spec.update({name: "first" for name in ("substrate_genes", "substrate_sequence")})

df_grouped = df_all.groupby("kin_sub_site").agg(agg_spec).reset_index()

df_grouped.shape

(187066, 7)

df_grouped.head()

	kin_sub_site	kinase_uniprot	substrate_uniprot	site	source	substrate_genes	substrate_sequence
0	O00141_A4FU28_S140	O00141	A4FU28	S140	Sugiyama	CTAGE9	MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC...
1	O00141_O00141_S252	O00141	O00141	S252	Sugiyama	SGK1 SGK	MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
2	O00141_O00141_S255	O00141	O00141	S255	Sugiyama	SGK1 SGK	MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
3	O00141_O00141_S397	O00141	O00141	S397	Sugiyama	SGK1 SGK	MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...
4	O00141_O00141_S404	O00141	O00141	S404	Sugiyama	SGK1 SGK	MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...

# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Human phosphoproteome

from katlas.core import *
import pandas as pd

Data

human = Data.get_combine_site_psp_ochoa()

human.shape

(121419, 8)

Substrate mapping

# human.uniprot.drop_duplicates().to_csv('raw/human_phosphoproteome_uniprot.csv')

11,243 IDs were mapped to 11,241 results

5 IDs were not mapped: AAA58698 P18433-2 AAC50053 AAA60149 NP_001184222

11,242 active entries and 1 obsolete entry are found

human = map_substrate('raw/idmapping_2025_03_20_human_phosphoproteome.xlsx',human,'uniprot')

f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Shape before processing (121419, 8)
Species counts: substrate_species
Homo sapiens (Human)    121332
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (121332, 11)

Validate site

human.columns

Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
       'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species'],
      dtype='object')

human['site_match'] =validate_site_df(human,'site','substrate_sequence')

human['site_match'].value_counts()

site_match
1    120084
0      1248
Name: count, dtype: int64

human=human[human.site_match==1].copy()

human.shape

(120084, 12)

Remove duplicates

human['sub_site'] = human['substrate_uniprot']+'_'+human['site']

human = human.drop_duplicates(subset='sub_site')

human.shape

(119955, 13)

Save

human.columns

Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
       'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
       'substrate_sequence', 'substrate_species', 'site_match', 'sub_site'],
      dtype='object')

cols = ['substrate_uniprot', 'substrate_genes',

        'site', 'source', 'AM_pathogenicity', 

       'substrate_sequence', 'substrate_species', 'sub_site']

human = human[cols]

# human.to_parquet('raw/human_phosphoproteome.parquet')

Phosphorylate sequence

Combine human phosphoproteome and KS dataset site info

# Only the substrate accession, site label and sequence are needed here
cols = ['substrate_uniprot','site','substrate_sequence']

human = pd.read_parquet('raw/human_phosphoproteome.parquet')[cols]

df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')[cols]

# Stack both tables, key each row by substrate_site, and keep the first row
# of every key.
comb = (
    pd.concat([human, df_grouped])
      .assign(sub_site=lambda tbl: tbl['substrate_uniprot'] + '_' + tbl['site'])
      .drop_duplicates('sub_site')
)

Phosphorylate sequence

seq = phosphorylate_seq_df(comb)

seq.head()

	substrate_uniprot	site	substrate_sequence	substrate_phosphoseq
0	A0A024R4G9	[S20]	MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...	MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...
1	A0A075B6Q4	[S24, S35, S57, S68, S71, S72]	MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...	MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...
2	A0A075B6T3	[S24, S26]	XLKRAYRGLEEVQWCLEQLLTSPSPS	XLKRAYRGLEEVQWCLEQLLTSPsPs
3	A0A075B759	[T68]	MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...	MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
4	A0A087WTJ2	[T8, Y14, Y213, T215, S221, S421, S424]	MGGRKMATDEENVYGLEENAQSRQESTRRLILVGRTGAGKSATGNS...	MGGRKMAtDEENVyGLEENAQSRQESTRRLILVGRTGAGKSATGNS...

# seq.to_csv('raw/phosphoseq_map.csv',index=False)

Map to df

human = pd.read_parquet('raw/human_phosphoproteome.parquet')

df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')

seq_map = seq.set_index('substrate_uniprot')['substrate_phosphoseq']

human['substrate_phosphoseq'] = human.substrate_uniprot.map(seq_map)

df_grouped['substrate_phosphoseq'] = df_grouped.substrate_uniprot.map(seq_map)

human['substrate_phosphoseq'].isna().sum()

np.int64(0)

df_grouped['substrate_phosphoseq'].isna().sum()

np.int64(0)

# human.to_parquet('raw/human_phosphoproteome.parquet')

# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Extract site sequence

human['position'] = human['site'].str[1:].astype(int)

df_grouped['position']=df_grouped['site'].str[1:].astype(int)

human['site_seq'] = extract_site_seq(human,
                                  seq_col='substrate_phosphoseq',
                                  position_col='position',
                                  length=20)

100%|██████████| 119955/119955 [00:05<00:00, 22986.11it/s]

df_grouped['site_seq'] = extract_site_seq(df_grouped,
                                  seq_col='substrate_phosphoseq',
                                  position_col='position',
                                  length=20)

100%|██████████| 187066/187066 [00:07<00:00, 23764.80it/s]

df_grouped['sub_site'] = df_grouped['substrate_uniprot']+'_'+ df_grouped['site']

human['sub_site'] = human['substrate_uniprot']+'_'+ human['site']

# human.to_parquet('raw/human_phosphoproteome.parquet')

# df_grouped.to_parquet('raw/combine_source_grouped.parquet')

Add kinase info

# Annotate a copy so df_grouped itself is left untouched
df = df_grouped.copy()

# Remove pseudokinase duplicates by UniProt ID, keep only one entry per kinase
info = Data.get_kinase_info().sort_values('kinase').drop_duplicates('uniprot')
info_indexed = info.set_index('uniprot')

# Accession with the isoform suffix removed (text before the first '-');
# kept as a standalone Series so no temporary column has to be dropped later
uniprot_clean = df['kinase_uniprot'].str.split('-').str[0]

# 1 when the accession is present in the kinase info table, otherwise 0
df['kinase_on_tree'] = uniprot_clean.isin(info['uniprot']).astype(int)

kinase_gene_map = Data.get_kinase_uniprot().set_index('Entry')['Gene Names']
df['kinase_genes'] = uniprot_clean.map(kinase_gene_map)

# New column in df -> column of the kinase info table, in output order
info_lookup = {
    'kinase_group': 'group',
    'kinase_family': 'family',
    'kinase_pspa_big': 'pspa_category_big',
    'kinase_pspa_small': 'pspa_category_small',
}
for new_col, info_col in info_lookup.items():
    df[new_col] = uniprot_clean.map(info_indexed[info_col])

The code above has been added to the Data class and is applied when the KS dataset is loaded.