from katlas.core import *
import pandas as pd
Preprocess kinase-substrate datasets
Motif
Poll [Sugiyama & Douglass]: https://biosignaling.biomedcentral.com/articles/10.1186/s12964-023-01436-2
Poll web motif: https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/KinaseTree.html; https://esbl.nhlbi.nih.gov/Databases/Kinase_Logos/GeneSymbol.html
Sugiyama: https://pmc.ncbi.nlm.nih.gov/articles/PMC6642169/
Douglass: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012
RegPhos: Supplementary Table S4: motifs grouped by kinase family
PhosphoNetwork, logos of 300 motifs: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/; web app: https://phosphonetworks.org/
GPS: https://gps.biocuckoo.cn/links.php#l1
Review of database
Wilson, AACR cancer research: https://aacrjournals.org/cancerres/article/78/1/15/625062/New-Perspectives-Opportunities-and-Challenges-in
GPS 6.0, List of database: https://gps.biocuckoo.cn/links.php
paper: https://academic.oup.com/nar/article/51/W1/W243/7157529
phospho.ELM links: http://phospho.elm.eu.org/links.html
GPS, https://gps.biocuckoo.cn/links.php#l1
Set up
def map_substrate(idmapping_fname, ori_df, sub_col,remove_nonhuman=True):
    """Attach UniProt entry, gene names, sequence and organism to each substrate row.

    Parameters
    ----------
    idmapping_fname : str, path-like or pandas.DataFrame
        Excel file (read with ``pd.read_excel``) holding an ID-mapping table
        with the columns ``From``, ``Entry``, ``Gene Names``, ``Sequence`` and
        ``Organism``. An already-loaded DataFrame with those columns is
        accepted as well.
    ori_df : pandas.DataFrame
        Table whose substrates should be annotated; it is copied, not modified.
    sub_col : str
        Column of ``ori_df`` holding the identifiers listed under ``From``.
    remove_nonhuman : bool, default True
        Keep only rows whose organism is ``'Homo sapiens (Human)'``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``ori_df`` that have a mapping (inner merge), with ``sub_col``
        removed and the columns ``substrate_uniprot``, ``substrate_genes``,
        ``substrate_sequence`` and ``substrate_species`` added. Rows without a
        sequence are dropped and the index is reset.
    """
    if isinstance(idmapping_fname, pd.DataFrame):
        substrate_id = idmapping_fname
    else:
        substrate_id = pd.read_excel(idmapping_fname)

    # Keep one mapping row per submitted ID: the first result wins when an ID
    # maps to several entries.
    substrate_id = substrate_id.drop_duplicates('From')
    substrate_id = substrate_id[['From', 'Entry', 'Gene Names', 'Sequence','Organism']]
    ori_df = ori_df.copy()

    # prevent name conflict: the merge adds a 'substrate_uniprot' column, so an
    # input key column of that name is moved to a temporary name first
    if sub_col == 'substrate_uniprot':
        sub_col = 'substrate_uniprot_tmp'
        ori_df = ori_df.rename(columns={'substrate_uniprot':'substrate_uniprot_tmp'})

    substrate_id.columns = [sub_col, 'substrate_uniprot', 'substrate_genes', 'substrate_sequence','substrate_species']

    print('Shape before processing', ori_df.shape)
    ori_df = ori_df.merge(substrate_id, on=sub_col)

    print('Species counts:',ori_df.substrate_species.value_counts())
    if remove_nonhuman:
        print('Removing non-human substrates')
        ori_df = ori_df[ori_df.substrate_species=='Homo sapiens (Human)']

    ori_df = ori_df.drop(columns=[sub_col]) # Drop temp column after merging
    ori_df = ori_df.dropna(subset=['substrate_sequence'])
    print('Shape after processing', ori_df.shape)
    return ori_df.reset_index(drop=True)
# Reference kinase table; its `Entry` column is used in later cells to check kinase IDs.
kinase_uniprot = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
Sugiyama dataset
Data
Go to https://www.nature.com/articles/s41598-019-46385-4#Sec21, download table S2.
Modify the header (2 lines to 1 line) so that it can be read by pandas.
# Load table S2 and discard its last two columns.
df = pd.read_csv('raw/Large_scale_S2.csv').iloc[:,:-2]
df.head()
Type | Kinase | Substrate_uniprot | Position | |
---|---|---|---|---|
0 | TK | ABL1 | 1433B_HUMAN | S212 |
1 | TK | ABL1 | 1433B_HUMAN | Y151 |
2 | TK | ABL1 | 1433B_HUMAN | Y21 |
3 | TK | ABL1 | 1433B_HUMAN | Y50 |
4 | TK | ABL1 | 1433E_HUMAN | Y152 |
df.shape
(198536, 4)
Kinase mapping
# pd.DataFrame(df.Kinase.unique())[0].str.split('/').str[0].to_csv('raw/sugiyama_kinase_name.csv')
Map each kinase name with uniprot ID
# Kinase lookup table (first three columns), merged on the columns it shares with df.
kinase_id = pd.read_csv('raw/LS_info2.csv').iloc[:,:3]
df = df.merge(kinase_id)
df
Type | Kinase | Substrate_uniprot | Position | kinase_uniprot | kinase_paper | |
---|---|---|---|---|---|---|
0 | TK | ABL1 | 1433B_HUMAN | S212 | P00519 | ABL1 |
1 | TK | ABL1 | 1433B_HUMAN | Y151 | P00519 | ABL1 |
2 | TK | ABL1 | 1433B_HUMAN | Y21 | P00519 | ABL1 |
3 | TK | ABL1 | 1433B_HUMAN | Y50 | P00519 | ABL1 |
4 | TK | ABL1 | 1433E_HUMAN | Y152 | P00519 | ABL1 |
... | ... | ... | ... | ... | ... | ... |
198531 | LK | SPHK2 | TICN3_HUMAN | T118 | Q9NRA0 | NaN |
198532 | LK | SPHK2 | TPM4_HUMAN | T241 | Q9NRA0 | NaN |
198533 | LK | SPHK2 | ULK3_HUMAN | S305 | Q9NRA0 | NaN |
198534 | LK | SPHK2 | ZRAB2_HUMAN | S165 | Q9NRA0 | NaN |
198535 | LK | SPHK2 | ZRAB2_HUMAN | S181 | Q9NRA0 | NaN |
198536 rows × 6 columns
Substrate mapping
# pd.DataFrame(df.Substrate_uniprot.unique()).to_csv('raw/sugiyama_uniprot_id.csv')
Map the Substrate_uniprot values to UniProt IDs at https://www.uniprot.org/id-mapping:
3,753 IDs were mapped to 3,753 results
375 IDs were not mapped
# Annotate each substrate with its UniProt entry, genes, sequence and species.
df = map_substrate('raw/idmapping_2025_03_02.xlsx',df,'Substrate_uniprot')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (198536, 6)
Species counts: substrate_species
Homo sapiens (Human) 187129
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (187129, 9)
# lipid kinase: list the kinases whose Type is 'LK'
df[df.Type=='LK'].Kinase.unique()
array(['PIK3C3', 'PIK3CB/PIK3R1', 'PIK3CD/PIK3R1', 'PIK3CG/PIK3R5',
'PIK3CG', 'PIK4CA', 'PIP4K2B', 'PIP5K1C', 'SPHK1', 'SPHK2'],
dtype=object)
# Count missing values per column.
df.isna().sum()
Type 0
Kinase 0
Position 0
kinase_uniprot 16591
kinase_paper 18891
substrate_uniprot 0
substrate_genes 11
substrate_sequence 0
substrate_species 0
dtype: int64
df.head()
Type | Kinase | Position | kinase_uniprot | kinase_paper | substrate_uniprot | substrate_genes | substrate_sequence | substrate_species | |
---|---|---|---|---|---|---|---|---|---|
0 | TK | ABL1 | S212 | P00519 | ABL1 | P31946 | YWHAB | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... | Homo sapiens (Human) |
1 | TK | ABL1 | Y151 | P00519 | ABL1 | P31946 | YWHAB | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... | Homo sapiens (Human) |
2 | TK | ABL1 | Y21 | P00519 | ABL1 | P31946 | YWHAB | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... | Homo sapiens (Human) |
3 | TK | ABL1 | Y50 | P00519 | ABL1 | P31946 | YWHAB | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... | Homo sapiens (Human) |
4 | TK | ABL1 | Y152 | P00519 | ABL1 | P62258 | YWHAE | MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... | Homo sapiens (Human) |
# Use the shared column name 'site' for the phosphosite.
df = df.rename(columns = {'Position':'site'})
df.shape
(187129, 9)
Validate site
# Store the result of validate_site_df (0/1 in the output below) for each site.
df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1 184948
0 2181
Name: count, dtype: int64
# Keep only the rows whose site was validated.
df=df[df.site_match==1]
df.shape
(184948, 10)
Save
# Tag every row with the dataset it came from.
df['source']='Sugiyama'
# df.to_excel('raw/sugiyama.xlsx',index=False)
PhosphoSitePlus (PSP)
Data
PSP paper: https://academic.oup.com/nar/article/43/D1/D512/2439467
Go to https://www.phosphosite.org/staticDownloads, download Kinase_Substrate_Dataset.txt
import pandas as pd
# Load the PhosphoSitePlus kinase-substrate table.
psp = pd.read_csv('raw/Kinase_Substrate_Dataset_final.csv')
psp.head()
GENE | Kinase | KIN_ACC_ID | KIN_ORGANISM | SUBSTRATE | SUB_GENE_ID | SUB_ACC_ID | SUB_GENE | SUB_ORGANISM | SUB_MOD_RSD | SITE_GRP_ID | substrate | DOMAIN | IN_VIVO_RXN | IN_VITRO_RXN | CST_CAT# | kinase_uniprot | kinase_paper | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Dyrk2 | DYRK2 | Q5U4C9 | mouse | NDEL1 | 83431.0 | Q9ERR1 | Ndel1 | mouse | S336 | 1869686801 | LGSsRPSsAPGMLPL | NaN | X | NaN | Q92630 | DYRK2 | |
1 | DYRK2 | DYRK2 | Q92630 | human | GLI2 | 14633.0 | Q0VGT2 | Gli2 | mouse | S385 | 3339001 | AEGLRPAsPLGLTQE | NaN | X | NaN | Q92630 | DYRK2 | |
2 | DYRK2 | DYRK2 | Q92630 | human | SIAH2 | 6478.0 | O43255 | SIAH2 | human | S68 | 5393502 | GGGAGPVsPQHHELT | NaN | X | NaN | Q92630 | DYRK2 | |
3 | DYRK2 | DYRK2 | Q92630 | human | CARHSP1 | 23589.0 | Q9Y2V2 | CARHSP1 | human | S41 | 455702 | LRGNVVPsPLPtRRt | NaN | X | NaN | Q92630 | DYRK2 | |
4 | DYRK2 | DYRK2 | Q92630 | human | Doublecortin iso2 | 1641.0 | O43602-2 | DCX | human | S306 | 454122 | GPMRRSKsPADSANG | NaN | X | NaN | Q92630 | DYRK2 |
# Keep human kinases, then human substrates.
psp=psp[psp.KIN_ORGANISM=='human'].reset_index(drop=True)
psp=psp[psp.SUB_ORGANISM=='human'].reset_index(drop=True)
psp.shape
(14081, 18)
# Keep only the columns used downstream.
psp = psp[['KIN_ACC_ID','kinase_paper','GENE','SUB_ACC_ID','SUB_GENE','SUB_MOD_RSD','substrate']]
psp.head()
KIN_ACC_ID | kinase_paper | GENE | SUB_ACC_ID | SUB_GENE | SUB_MOD_RSD | substrate | |
---|---|---|---|---|---|---|---|
0 | Q92630 | DYRK2 | DYRK2 | O43255 | SIAH2 | S68 | GGGAGPVsPQHHELT |
1 | Q92630 | DYRK2 | DYRK2 | Q9Y2V2 | CARHSP1 | S41 | LRGNVVPsPLPtRRt |
2 | Q92630 | DYRK2 | DYRK2 | O43602-2 | DCX | S306 | GPMRRSKsPADSANG |
3 | Q92630 | DYRK2 | DYRK2 | P30304 | CDC25A | S283 | PErsQEEsPPGSTKr |
4 | Q92630 | DYRK2 | DYRK2 | O43255 | SIAH2 | T119 | PTCRGALtPSIRNLA |
Substrate mapping
# psp.SUB_ACC_ID.drop_duplicates().to_csv('raw/psp_substrate_id.csv')
4,441 IDs were mapped to 4,446 results
19 IDs were not mapped: NP_001184222 NP_001100737 AAA40678 HSBO22 NP_776683 NP_579829 P18433-2 NP_001076191 NP_001099740 NP_001005762 NP_001178533 XP_008773743 NP_001104263 ABR15760 AAB24204 AAB24205 BAA34185.2 NP_001103022 AAC50053
IDs with multiple results: P62991, P62988
# Annotate each substrate with its UniProt entry, genes, sequence and species.
psp = map_substrate('raw/idmapping_2025_03_02_psp.xlsx',psp,'SUB_ACC_ID')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (14081, 7)
Species counts: substrate_species
Homo sapiens (Human) 14069
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14069, 10)
psp.head()
KIN_ACC_ID | kinase_paper | GENE | SUB_GENE | SUB_MOD_RSD | substrate | substrate_uniprot | substrate_genes | substrate_sequence | substrate_species | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Q92630 | DYRK2 | DYRK2 | SIAH2 | S68 | GGGAGPVsPQHHELT | O43255 | SIAH2 | MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... | Homo sapiens (Human) |
1 | Q92630 | DYRK2 | DYRK2 | CARHSP1 | S41 | LRGNVVPsPLPtRRt | Q9Y2V2 | CARHSP1 | MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR... | Homo sapiens (Human) |
2 | Q92630 | DYRK2 | DYRK2 | DCX | S306 | GPMRRSKsPADSANG | O43602 | DCX DBCN LISX | MELDFGHFDERDKTSRNMRGSRMNGLPSPTHSAHCSFYRTRTLQAL... | Homo sapiens (Human) |
3 | Q92630 | DYRK2 | DYRK2 | CDC25A | S283 | PErsQEEsPPGSTKr | P30304 | CDC25A | MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT... | Homo sapiens (Human) |
4 | Q92630 | DYRK2 | DYRK2 | SIAH2 | T119 | PTCRGALtPSIRNLA | O43255 | SIAH2 | MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... | Homo sapiens (Human) |
psp.shape
(14069, 10)
Kinase mapping
# psp.KIN_ACC_ID.drop_duplicates().to_csv('raw/psp_kin_id.csv')
Map KIN_ACC_ID to UniProt IDs
440 IDs were mapped to 440 results
1 ID was not mapped:
AAA58698
# ID-mapping result for the PSP kinase accessions.
kinase_id = pd.read_excel('raw/idmapping_2025_03_02_psp_kinase.xlsx')[['From','Entry','Gene Names']]
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
# Rename the mapping columns so the table can be merged on KIN_ACC_ID.
kinase_id.columns = ['KIN_ACC_ID','kinase_uniprot','kinase_genes']
kinase_id[kinase_id.KIN_ACC_ID=="A9UF07"]
KIN_ACC_ID | kinase_uniprot | kinase_genes | |
---|---|---|---|
124 | A9UF07 | A9UF07 | BCR/ABL fusion |
# Replace accession A9UF07 (gene names 'BCR/ABL fusion' in the lookup above) with P00519.
psp['KIN_ACC_ID'] = psp['KIN_ACC_ID'].replace('A9UF07','P00519')
psp = psp.merge(kinase_id)
psp = psp.rename(columns={'SUB_MOD_RSD':'site'})
psp = psp[['kinase_uniprot','kinase_genes','kinase_paper','substrate_uniprot','substrate_genes','site','substrate_sequence','substrate','substrate_species']]
psp.isna().sum()
kinase_uniprot 0
kinase_genes 0
kinase_paper 134
substrate_uniprot 0
substrate_genes 0
site 0
substrate_sequence 0
substrate 0
substrate_species 0
dtype: int64
# Drop rows without a substrate sequence.
psp = psp.dropna(subset='substrate_sequence').reset_index(drop=True)
Check if kinase-uniprot belongs to kinase class
Not mapped:
# Kinase IDs that are absent from the reference kinase table.
psp[~psp.kinase_uniprot.isin(kinase_uniprot.Entry)].kinase_genes.value_counts()
kinase_genes
CSNK2B CK2N G5A 13
PRKAG2 9
PRKAB1 AMPK 9
GTF2F1 RAP74 4
BLVRA BLVR BVR 3
TGM2 2
RET/PTC2 2
PHKA1 PHKA 1
HSPA5 GRP78 1
ENPP3 PDNP3 1
PIK3R1 GRB1 1
CERT1 CERT COL4A3BP STARD11 1
JMJD6 KIAA0585 PSR PTDSR 1
Name: count, dtype: int64
CSNK2B belongs to CK2, and PRKAB1 and PRKAG2 are AMPK subunits; since there are only a few such entries, we simply ignore them
# Keep only rows whose kinase is in the reference kinase table.
psp = psp[psp.kinase_uniprot.isin(kinase_uniprot.Entry)].copy()
psp.shape
(14016, 9)
Validate site
Check whether the reported site sequence matches the substrate sequence
# Numeric position = site string without its leading residue letter.
psp['position'] = psp['site'].str[1:].astype(int)
psp['site_seq']=extract_site_seq(psp,seq_col='substrate_sequence',position_col='position')
100%|██████████| 14016/14016 [00:00<00:00, 25600.07it/s]
# Compare the extracted site sequence with the upper-cased PSP `substrate` peptide.
(psp['site_seq']==psp['substrate'].str.upper()).value_counts()
True 13156
False 860
Name: count, dtype: int64
# Rows whose extracted site sequence differs from the PSP peptide, counted per kinase.
unmatch = psp[~(psp['site_seq']==psp['substrate'].str.upper())]
unmatch.kinase_genes.value_counts().head(10)
kinase_genes
PRKACA PKACA 40
MAPK1 ERK2 PRKM1 PRKM2 40
SRC SRC1 39
CDK1 CDC2 CDC28A CDKN1 P34CDC2 38
GSK3B 37
MAPK3 ERK1 PRKM3 35
AURKB AIK2 AIM1 AIRK2 ARK2 STK1 STK12 STK5 33
PRKCA PKCA PRKACA 33
PRKCB PKCB PRKCB1 25
CDK5 CDKN5 PSSALRE 25
Name: count, dtype: int64
We’ll drop the rows whose site sequence does not match the substrate sequence
# Keep only rows where the extracted site sequence equals the PSP peptide.
psp = psp[psp['site_seq']==psp['substrate'].str.upper()]
Drop sites whose center amino acid is not S, T, or Y
# Residue at index 7 of each site sequence.
psp['site_seq'].str[7].value_counts()
site_seq
S 8303
T 2812
Y 2029
H 6
K 4
R 2
Name: count, dtype: int64
# Keep only sites whose index-7 residue is S, T or Y.
psp = psp[psp['site_seq'].str[7].str.upper().isin(list('STY'))]
Drop sites whose residue and position do not match
# Store the result of validate_site_df for each site.
psp['site_match'] =validate_site_df(psp,'site','substrate_sequence')
psp['site_match'].value_counts()
site_match
1 13144
Name: count, dtype: int64
psp.shape
(13144, 12)
Save
# Tag every row with the dataset it came from.
psp['source']='PSP'
psp.head()
kinase_uniprot | kinase_genes | kinase_paper | substrate_uniprot | substrate_genes | site | substrate_sequence | substrate | substrate_species | position | site_seq | site_match | source | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Q92630 | DYRK2 | DYRK2 | O43255 | SIAH2 | S68 | MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... | GGGAGPVsPQHHELT | Homo sapiens (Human) | 68 | GGGAGPVSPQHHELT | 1 | PSP |
1 | Q92630 | DYRK2 | DYRK2 | Q9Y2V2 | CARHSP1 | S41 | MSSEPPPPPQPPTHQASVGLLDTPRSRERSPSPLRGNVVPSPLPTR... | LRGNVVPsPLPtRRt | Homo sapiens (Human) | 41 | LRGNVVPSPLPTRRT | 1 | PSP |
3 | Q92630 | DYRK2 | DYRK2 | P30304 | CDC25A | S283 | MELGPEPPHRRRLLFACSPPPASQPVVKALFGASAAGGLSPVTNLT... | PErsQEEsPPGSTKr | Homo sapiens (Human) | 283 | PERSQEESPPGSTKR | 1 | PSP |
4 | Q92630 | DYRK2 | DYRK2 | O43255 | SIAH2 | T119 | MSRPSSTGPSANKPCSKQPPPQPQHTPSPAAPPAAATISAAGPGSS... | PTCRGALtPSIRNLA | Homo sapiens (Human) | 119 | PTCRGALTPSIRNLA | 1 | PSP |
5 | Q92630 | DYRK2 | DYRK2 | O75449 | KATNA1 | S42 | MSLLMISENVKLAREYALLGNYDSAMVYYQGVLDQMNKYLYSVKDT... | QMNKYLYsVkDTYLQ | Homo sapiens (Human) | 42 | QMNKYLYSVKDTYLQ | 1 | PSP |
# psp.to_excel('raw/psp_human.xlsx',index=False)
KiNET (EPSD + iPTMNet)
KiNET is an integration of PSP, iPTMNet, and EPSD
Data
KiNET paper: https://www.nature.com/articles/s41540-024-00442-5
KiNET web: https://kinet.kinametrix.com/
iPTMnet paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC5753337/;
iPTMnet web: https://research.bioinformatics.udel.edu/iptmnet/ ; the dataset can’t be downloaded directly through its Download page
EPSD paper: https://academic.oup.com/bib/article/22/1/298/5686325?login=false
EPSD web: https://epsd.biocuckoo.cn/Download.php
Go to https://kinet.kinametrix.com/, and click Download the full KiNet interaction dataset
# Load the KiNET dataset and drop rows without a kinase.
df = pd.read_csv('raw/ksi_source_full_dataset.csv')
df = df.dropna(subset='Kinase')
df['Source Database'].value_counts()
Source Database
PhosphoSitePlus 13135
EPSD 10442
iPTMNet 3846
Name: count, dtype: int64
Since PSP is already included above, we drop the PhosphoSitePlus entries
# Exclude rows whose source database is PhosphoSitePlus.
df = df[df['Source Database']!='PhosphoSitePlus']
df.shape
(14288, 8)
Kinase mapping
df.Kinase.isin(kinase_uniprot.Entry).value_counts()
Kinase
True 14283
False 5
Name: count, dtype: int64
# Keep only rows whose kinase is in the reference kinase table.
df = df[df.Kinase.isin(kinase_uniprot.Entry)]
Substrate mapping
# df.Substrate.drop_duplicates().to_csv('raw/KiNet_substrate_id.csv')
Map the id to uniprot seq
# Annotate each substrate with its UniProt entry, genes, sequence and species.
df = map_substrate('raw/idmapping_2025_03_02_KiNET_substrate.xlsx',df,'Substrate')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (14283, 8)
Species counts: substrate_species
Homo sapiens (Human) 14283
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (14283, 11)
# substrate_id = pd.read_excel('raw/idmapping_2025_03_02_KiNET_substrate.xlsx')[['From','Entry','Gene Names','Sequence']]
# substrate_id.columns = ['Substrate','substrate_uniprot','substrate_genes','substrate_sequence']
# df = df.merge(substrate_id)
# Keep only the columns used downstream.
df = df[['Kinase','Kinase Name','substrate_uniprot','substrate_genes','Site','Source Database','Evidence','substrate_sequence']]
df.columns
Index(['Kinase', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'Site',
'Source Database', 'Evidence', 'substrate_sequence'],
dtype='object')
# Rename columns positionally to the shared naming scheme.
df.columns = ['kinase_uniprot', 'Kinase Name', 'substrate_uniprot', 'substrate_genes', 'site',
              'source', 'evidence', 'substrate_sequence']
df.shape
(14283, 8)
Validate site
We drop sites whose residue does not match the amino acid at the given position in the protein sequence
# Store the result of validate_site_df for each site.
df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1 13786
0 497
Name: count, dtype: int64
# Unvalidated sites per source database.
df[df.site_match==0].source.value_counts()
source
iPTMNet 368
EPSD 129
Name: count, dtype: int64
# Validated sites per source database.
df[df.site_match==1].source.value_counts()
source
EPSD 10308
iPTMNet 3478
Name: count, dtype: int64
# Keep only the rows whose site was validated.
df=df[df.site_match==1]
df.shape
(13786, 9)
Save
df.head()
kinase_uniprot | Kinase Name | substrate_uniprot | substrate_genes | site | source | evidence | substrate_sequence | site_match | |
---|---|---|---|---|---|---|---|---|---|
0 | O00141 | SGK1 | O00213 | APBB1 FE65 RIR | S610 | EPSD | Unspecified experimental method | MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG... | 1 |
1 | O00141 | SGK1 | O14920 | IKBKB IKKB | S181 | EPSD | Unspecified experimental method | MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC... | 1 |
2 | O00141 | SGK1 | O15111 | CHUK IKKA TCF16 | S180 | EPSD | Unspecified experimental method | MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC... | 1 |
3 | O00141 | SGK1 | O43524 | FOXO3 FKHRL1 FOXO3A | T32 | EPSD | Unspecified experimental method | MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK... | 1 |
4 | O00141 | SGK1 | O43524 | FOXO3 FKHRL1 FOXO3A | T32 | iPTMNet | Text mining | MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK... | 1 |
# df.to_excel('raw/KiNet.xlsx',index=False)
# df[df['source'] =='EPSD'].to_excel('raw/EPSD.xlsx',index=False)
# df[df['source'] =='iPTMNet'].to_excel('raw/iPTMNet.xlsx',index=False)
Phospho.ELM
Data
paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC3013696/
Go to http://phospho.elm.eu.org/dataset.html, fill info–>accept, then download the dataset
Open dump in excel, filter human and non-blank from kinases
# Load the Phospho.ELM table.
elm=pd.read_csv('raw/phosphoELM.csv')
elm.shape
(3599, 6)
elm.head()
substrate_uniprot | position | acceptor | kinase | LTP_HTP | species | |
---|---|---|---|---|---|---|
0 | O14543 | 204 | Y | Lck | LTP | Homo sapiens |
1 | O14543 | 221 | Y | Lck | LTP | Homo sapiens |
2 | O14746 | 824 | S | PKB_group | LTP | Homo sapiens |
3 | O14746 | 227 | S | PKB_group | LTP | Homo sapiens |
4 | O14746 | 707 | Y | SRC | LTP | Homo sapiens |
# Upper-case kinase labels before they are looked up by gene name.
elm.kinase = elm.kinase.str.upper()
Kinase mapping
# Build a gene-name -> UniProt entry lookup with one row per individual gene name.
kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')
kinase_id = kinase_id.explode('Gene Names')

gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()

# Gene names shared by more than one entry are ambiguous.
dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])

# PAK1 and PASK are taken off the ambiguous list and pinned to a fixed entry.
dup_name.remove('PAK1')
dup_name.remove('PASK')
gene2uniprot['PAK1'] = 'Q13153'
gene2uniprot['PASK']='Q96RG2'

# Drop rows whose kinase label is an ambiguous gene name.
elm = elm[~elm.kinase.isin(dup_name)].copy()

# Kinase labels that are not gene names in the lookup.
elm[~elm.kinase.isin(gene2uniprot.keys())].kinase.unique()
array(['PKB_GROUP', 'IKK_GROUP', 'AURORA B', 'PDK-1', 'PKC_THETA',
'SGK_GROUP', 'IKK_BETA', 'CDK_GROUP', 'CAM-KII_ALPHA', 'PKC_GROUP',
'PKC_ALPHA', 'CK2_GROUP', 'CK2_ALPHA', 'GSK-3_GROUP',
'GSK-3_ALPHA', 'GSK-3_BETA', 'PKA_GROUP', 'RSK_GROUP', 'PKB_BETA',
'MAPK_GROUP', 'PAK_GROUP', 'DNA-PK', 'JNK_GROUP', 'CK1_GROUP',
'AURORA A', 'P70S6KB', 'PKG/CGK_GROUP', 'PKC_DELTA', 'PDGFR_BETA',
'CAM-KII_GROUP', 'GRK-2', 'GRK-5', 'GRK_GROUP', 'CK1_ALPHA',
'PKC_EPSILON', 'ROCK_GROUP', 'PKG1/CGK-I', 'PDGFR_GROUP',
'P70S6K_GROUP', 'P70S6K', 'PHK_GROUP', 'PKC_BETA', 'DMPK_GROUP',
'CK1_DELTA', 'CK2_BETA', 'RSK-3', 'RSK-2', 'CAM-KIV', 'RSK-1',
'IKK_ALPHA', 'CK1_EPSILON', 'MAP2K_GROUP', 'PIM-1', 'EG3 KINASE',
'GRK-4', 'PKC_ZETA', 'PDKC', 'AMPK_GROUP', 'FGFR_GROUP',
'JAK_GROUP', 'MAP3K_GROUP', 'RSK-5', 'IKK_EPSILON', 'CAM-KI_GROUP',
'SRC_GROUP', 'PKA_ALPHA', 'PKC_ETA', 'PDK-2', 'GRK-6',
'TITIN KINASE', 'CAM-KI_ALPHA', 'GRK-1', 'PDGFR_ALPHA', 'CCDPK',
'MARK_GROUP', 'P38_GROUP', 'GRK-3', 'DAPK_GROUP', 'MRCKA',
'PKG2/CGK-II', 'PKC_GAMMA', 'PKC_IOTA', 'CAM-KK_ALPHA'],
dtype=object)
Map these labels to gene names:
# Label -> gene-name(s) lookup; labels without an entry keep their original value.
ids = pd.read_csv('raw/elm_kinase_id.csv')
ids = ids.set_index('kinase')['kinase_gene'].to_dict()
elm['kinase_genes'] = elm.kinase.map(ids).fillna(elm.kinase)
# for kinase group, we only consider the first two items
elm['kinase_genes'] = elm.kinase_genes.str.split(' ').str[:2]
elm = elm.explode('kinase_genes')
elm[elm.kinase_genes.isin(dup_name)].kinase_genes.unique()
array(['PRKACA'], dtype=object)
# PRKACA is taken off the ambiguous list and pinned to a fixed entry.
dup_name.remove('PRKACA')
gene2uniprot['PRKACA']= 'P17612'
elm[elm.kinase_genes.isin(dup_name)]
substrate_uniprot | position | acceptor | kinase | LTP_HTP | species | kinase_genes |
---|
unmapped:
# Gene names with no entry in the lookup.
elm[elm.kinase_genes.map(gene2uniprot).isna()].kinase_genes.unique()
array(['PHKA1', 'PHKA2', 'CSNK2B', 'CDPK1', 'CDPK2'], dtype=object)
# Assign UniProt IDs by gene name and drop rows that could not be mapped.
elm['kinase_uniprot'] = elm.kinase_genes.map(gene2uniprot)
elm = elm.dropna(subset='kinase_uniprot')
Substrate mapping
# elm.substrate_uniprot.drop_duplicates().to_csv('raw/elm_substrate_id.csv')
916 IDs were mapped to 919 results
10 IDs were not mapped: ENSP00000328213 ENSP00000343690 ENSP00000352232 ENSP00000347528 ENSP00000357298 ENSP00000248996 ENSP00000357225 ENSP00000261937 ENSP00000248419 ENSP00000267569
Found two of the unmapped are kinases:
# Replace the two listed Ensembl protein IDs with UniProt IDs; other IDs stay unchanged.
ensp = {'ENSP00000328213':'P06239','ENSP00000261937':'P35916'}
elm.substrate_uniprot = elm.substrate_uniprot.map(ensp).fillna(elm.substrate_uniprot)
elm = map_substrate('raw/idmapping_2025_03_12_elm.xlsx',elm,'substrate_uniprot')
Shape before processing (4699, 8)
Species counts: substrate_species
Homo sapiens (Human) 4675
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (4675, 11)
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
elm.shape
(4675, 11)
Validate site
elm.acceptor.value_counts()
acceptor
S 3018
T 908
Y 749
Name: count, dtype: int64
# Build the site string as residue letter followed by position.
elm['site']=elm['acceptor']+elm['position'].astype(str)
elm['site_match'] =validate_site_df(elm,'site','substrate_sequence')
elm.site_match.value_counts()
site_match
1 4519
0 156
Name: count, dtype: int64
# Keep only the rows whose site was validated.
elm = elm[elm.site_match==1]
elm.shape
(4519, 13)
Save
elm.columns
Index(['position', 'acceptor', 'kinase', 'LTP_HTP', 'species', 'kinase_genes',
'kinase_uniprot', 'substrate_uniprot', 'substrate_genes',
'substrate_sequence', 'substrate_species', 'site', 'site_match'],
dtype='object')
# Select and order the columns to keep, then tag the dataset source.
col = ['kinase','kinase_uniprot', 'kinase_genes',
       'substrate_uniprot', 'substrate_genes','site','LTP_HTP', 'species',
       'substrate_sequence',]
elm = elm[col]
elm['source']='ELM'
# elm.to_excel('raw/ELM.xlsx',index=False)
Signor
Data
paper: https://academic.oup.com/nar/article/51/D1/D631/6761728
web: https://signor.uniroma2.it/
Go to web link, Downloads –> Latest Release download (Jan 2025)
Open in excel, filter mechanism to be phosphorylation
# Load the SIGNOR export.
sig = pd.read_excel('raw/signor_phosphorylation.xlsx')
sig.shape
(12973, 27)
Kinase mapping
The TYPEA column includes protein complexes; for these we need to pick out the kinase name
# Complexes: map each ENTITYA to a kinase gene via the label file and drop unlabeled rows.
comp = sig[sig.TYPEA=='complex'].copy()
# comp.ENTITYA.drop_duplicates().to_csv('raw/sig_complex.csv')
comp_id = pd.read_csv('raw/sig_complex_label.csv')
comp_id = comp_id.set_index('ENTITYA')['kinase_gene']
comp['kinase_gene'] = comp.ENTITYA.map(comp_id)
comp = comp.dropna(subset='kinase_gene')
comp.shape
(742, 28)
Fusion proteins
# Fusion proteins: assign the kinase gene given in fus_id.
fus_id = {'BCR-ABL':'ABL1','EML4-ALK':'ALK'}
fus = sig[sig.TYPEA=='fusion protein'].copy()
fus['kinase_gene'] = fus.ENTITYA.map(fus_id)
For protein families, we assign each entry to the first two family members
# Protein families and their label file.
fam = sig[sig.TYPEA=='proteinfamily'].copy()
# fam.ENTITYA.drop_duplicates().to_csv('raw/sig_fam.csv')
fam_id = pd.read_csv('raw/sig_fam_label.csv')
fam_id.head()
ENTITYA | kinase_gene | |
---|---|---|
0 | ERK1/2 | MAPK3 MAPK1 |
1 | AKT | AKT1 AKT2 AKT3 |
2 | RPS6K | RPS6KB1 RPS6KB2 |
3 | p38 | MAPK14 MAPK11 MAPK12 MAPK13 |
4 | JNK | MAPK8 MAPK9 MAPK10 |
# Map each family to its member genes, keep the first two, and emit one row per gene.
fam_id = fam_id.set_index('ENTITYA')['kinase_gene']
fam['kinase_gene'] = fam.ENTITYA.map(fam_id)
fam = fam.dropna(subset='kinase_gene')
fam['kinase_gene'] = fam.kinase_gene.str.split(' ').str[:2]
fam = fam.explode('kinase_gene')
Protein:
# Rows whose TYPEA is a single protein.
pro = sig[sig.TYPEA=='protein'].copy()
for consistency:
# Give single proteins the same kinase_gene column as the other groups.
pro['kinase_gene'] = pro['ENTITYA']
combine:
# Stack the four groups into one table.
df = pd.concat([pro,comp,fus,fam])
Mapping:
# Reference kinase table with one row per individual gene name.
kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')
kinase_id = kinase_id.explode('Gene Names')
Some kinases share same gene names:
# kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()
We’ll drop them to prevent confusion.
# Gene names shared by more than one entry, and how many rows use them.
dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])
df.kinase_gene.isin(dup_name).sum()
np.int64(674)
# Drop rows with an ambiguous gene name, then build the gene-name -> entry lookup.
df = df[~df.kinase_gene.isin(dup_name)].copy()
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()
df.shape
(13013, 28)
unmapped:
# Kinase genes (upper-cased for the lookup) with no entry in gene2uniprot.
df[df.kinase_gene.str.upper().map(gene2uniprot).isna()].kinase_gene.unique()
array(['HRAS', 'IL4R', 'CSNK2B', 'TLR4', 'IL6R', 'THAP12', 'PRKAR2A',
'RHOA', 'AREG', 'RET/PTC2', 'PCSK7', 'CCNC', 'TGM2', 'SLC12A1',
'RALA', 'IL6ST', 'CRK', 'SMAD9', 'PLCG1', 'ELOC', 'PPP2CA', 'CCR5',
'GHR', 'CAD', 'SMO', 'FRS2', 'LEPR', 'BLVRA', 'TAB1', 'PRKAB1',
'PIAS4', 'IFNAR1', 'HSP90AA1', 'PTPRJ', 'PRKAG2', 'CSN1S1',
'GTF2F1', 'VASP', 'KRAS', 'SLC12A3', 'PRPF4B', 'CDKN2A', 'IL1R1',
'PRKAR2B', 'IL15RA', 'KRT1', 'SMAD1', 'GTF2H1', 'CCR2', 'IL5RA',
'PLCG2', 'SLC12A2', 'GTF2H2', 'TMIGD2', 'IKBKG', 'BORA', 'MNAT1',
'PHKA1', 'BGLF4', 'RIN1', 'DLG1', 'CDK5RAP2', 'CCR1', 'IL10RA',
'SMAD5', 'CCN4'], dtype=object)
# Assign UniProt IDs by upper-cased gene name and drop rows that could not be mapped.
df['kinase_uniprot'] = df.kinase_gene.str.upper().map(gene2uniprot)
df = df.dropna(subset='kinase_uniprot')
Substrate mapping
# Drop rows without a substrate ID.
df = df.dropna(subset='IDB')
We can’t trace the full protein sequence from a SIGNOR ID, so we filter those rows out
# Exclude substrate IDs that contain 'SIGNOR'.
df = df[~df.IDB.str.contains('SIGNOR')]
# df.IDB.drop_duplicates().to_csv('raw/sig_substrate_id.csv')
2,298 IDs were mapped to 2,298 results
1 ID was not mapped: CHEBI:15721
Sequences are all from human
# Annotate each substrate with its UniProt entry, genes, sequence and species.
df = map_substrate('raw/idmapping_2025_03_12_signor.xlsx',df,'IDB')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (12354, 29)
Species counts: substrate_species
Homo sapiens (Human) 12353
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (12353, 32)
df.shape
(12353, 32)
Validate site
# A RESIDUE cell may list several ';'-separated residues: emit one row per residue.
df.RESIDUE = df.RESIDUE.str.split(';')
df = df.explode('RESIDUE')
# Split e.g. 'Ser366' into a three-letter residue code and a position string.
df['acceptor'] = df['RESIDUE'].str[:3]
# Codes other than Ser/Thr/Tyr map to NaN and are dropped below via 'site'.
df.acceptor = df.acceptor.map({'Ser':'S','Thr':'T','Tyr':'Y'})
df['position'] = df['RESIDUE'].str[3:]
df['site']=df['acceptor']+df['position']
df = df.dropna(subset='site')
df['site_match'] =validate_site_df(df,'site','substrate_sequence')
df.site_match.value_counts()
site_match
1 11212
0 118
Name: count, dtype: int64
# Keep only the rows whose site was validated.
df = df[df.site_match==1]
df.shape
(11212, 36)
Save
df.columns
Index(['ENTITYA', 'TYPEA', 'IDA', 'DATABASEA', 'ENTITYB', 'TYPEB', 'DATABASEB',
'EFFECT', 'MECHANISM', 'RESIDUE', 'SEQUENCE', 'TAX_ID', 'CELL_DATA',
'TISSUE_DATA', 'MODULATOR_COMPLEX', 'TARGET_COMPLEX', 'MODIFICATIONA',
'MODASEQ', 'MODIFICATIONB', 'MODBSEQ', 'PMID', 'DIRECT', 'NOTES',
'ANNOTATOR', 'SENTENCE', 'SIGNOR_ID', 'kinase_gene', 'kinase_uniprot',
'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
'substrate_species', 'acceptor', 'position', 'site', 'site_match'],
dtype='object')
# Select and order the columns to keep.
col = ['kinase_uniprot', 'kinase_gene', 'ENTITYA', 'TYPEA',
       'substrate_uniprot', 'substrate_genes','site', 'substrate_sequence']
df = df[col].copy()
df.head()
kinase_uniprot | kinase_gene | ENTITYA | TYPEA | substrate_uniprot | substrate_genes | site | substrate_sequence | |
---|---|---|---|---|---|---|---|---|
0 | P68400 | CSNK2A1 | CSNK2A1 | protein | P05455 | SSB | S366 | MAENGDNEKMAALEAKICHQIEYYFGDFNLPRDKFLKEQIKLDEGW... |
1 | P28482 | MAPK1 | MAPK1 | protein | P43354 | NR4A2 NOT NURR1 TINUR | S126 | MPCVQAQYGSSPQGASPASQSYSYHSSGEYSSDFLTPEFVKFSMDL... |
2 | P11362 | FGFR1 | FGFR1 | protein | P56945 | BCAR1 CAS CASS1 CRKAS | Y128 | MNHLNVLAKALYDNVAESPDELSFRKGDIMTVLEQDTQGLDGWWLC... |
3 | P27361 | MAPK3 | MAPK3 | protein | P41182 | BCL6 BCL5 LAZ3 ZBTB27 ZNF51 | S343 | MASPADSCIQFTRHASDVLLNLNRLRSRDILTDVVIVVSREQFRAH... |
4 | Q16539 | MAPK14 | MAPK14 | protein | Q02078 | MEF2A MEF2 | S408 | MGRKKIQITRIMDERNRQVTFTKRKFGLMKKAYELSVLCDCEIALI... |
df.shape
(11212, 8)
# Tag every row with the dataset it came from.
df['source']="SIGNOR"
# df.to_excel('raw/signor.xlsx',index=False)
GPS 6.0
Data
paper: https://academic.oup.com/nar/article/51/W1/W243/7157529#409532969
web: https://gps.biocuckoo.cn/index.php
Go to paper link, download supplementary data - zip file, get Table S5
import pandas as pd
gps = pd.read_csv('raw/GPS6_tableS5.csv')
# Exclude records that GPS attributes to PhosphositePlus.
gps = gps[gps.source != "PhosphositePlus"]
# Keep human records only.
gps = gps[gps.species == "Homo sapiens"]
gps.shape
(6087, 6)
# Drop rows whose kinase label contains 'family' rather than naming a single gene.
gps = gps[~gps.gene.str.contains('family')]
Kinase mapping
kinase_id = pd.read_excel('raw/uniprot_human_keyword_kinase.xlsx')
# Split the space-separated gene names and give each name its own row.
kinase_id['Gene Names'] = kinase_id['Gene Names'].str.split(' ')
kinase_id = kinase_id.explode('Gene Names')
Some kinases share same gene names:
# keep=False marks every occurrence of a repeated gene name, not just the later ones.
kinase_id[kinase_id['Gene Names'].duplicated(keep=False)].sort_values('Gene Names').head()
Entry | Entry Name | Protein names | Gene Names | uniprot_keyword_kinase | on_tree | Organism | Keywords | Sequence | |
---|---|---|---|---|---|---|---|---|---|
586 | Q9UIJ7 | KAD3_HUMAN | GTP:AMP phosphotransferase AK3, mitochondrial ... | AK3 | 1 | 0 | Homo sapiens (Human) | 3D-structure;Acetylation;Alternative splicing;... | MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM... |
152 | P27144 | KAD4_HUMAN | Adenylate kinase 4, mitochondrial (EC 2.7.4.4)... | AK3 | 1 | 0 | Homo sapiens (Human) | 3D-structure;Acetylation;ATP-binding;GTP-bindi... | MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA... |
586 | Q9UIJ7 | KAD3_HUMAN | GTP:AMP phosphotransferase AK3, mitochondrial ... | AK3L1 | 1 | 0 | Homo sapiens (Human) | 3D-structure;Acetylation;Alternative splicing;... | MGASARLLRAVIMGAPGSGKGTVSSRITTHFELKHLSSGDLLRDNM... |
152 | P27144 | KAD4_HUMAN | Adenylate kinase 4, mitochondrial (EC 2.7.4.4)... | AK3L1 | 1 | 0 | Homo sapiens (Human) | 3D-structure;Acetylation;ATP-binding;GTP-bindi... | MASKLLRAVILGPPGSGKGTVCQRIAQNFGLQHLSSGHFLRENIKA... |
609 | Q9Y3D8 | KAD6_HUMAN | Adenylate kinase isoenzyme 6 (AK6) (EC 2.7.4.3... | AK6 | 1 | 0 | Homo sapiens (Human) | 3D-structure;Alternative splicing;ATP-binding;... | MLLPNILLTGTPGVGKTTLGKELASKSGLKYINVGDLAREEQLYDG... |
We’ll drop them to prevent confusion.
# Gene names that appear on more than one row of the exploded kinase table.
dup_name = set(kinase_id[kinase_id['Gene Names'].duplicated(keep=False)]['Gene Names'])
# Number of GPS rows whose kinase label is one of those ambiguous names.
# NOTE(review): this membership test is case-sensitive, whereas the UniProt mapping
# further down upper-cases gps.gene first — confirm no mixed-case ambiguous names slip through.
gps.gene.isin(dup_name).sum()
np.int64(76)
gps.shape
(4260, 6)
# Remove rows with an ambiguous kinase name; .copy() so columns can be added safely later.
gps = gps[~gps.gene.isin(dup_name)].copy()
# Lookup from individual gene name to UniProt entry.
gene2uniprot = kinase_id.set_index('Gene Names')['Entry'].to_dict()
gps.shape
(4184, 6)
unmapped:
# Kinase labels that have no UniProt entry after upper-casing.
gps[gps.gene.str.upper().map(gene2uniprot).isna()].gene.unique()
array(['DMPK1', 'MRCKa', 'YPKA', 'PKACb', 'PKCt', 'PKCh', 'PKCi', 'PKCz',
'PKG1', 'PKG2', 'PRKG1 Isoform Alpha', 'p70S6K', 'p70S6Kb',
'CCDPK', 'AMPKa1', 'PKD3', 'CK1d', 'CK1e', 'CK1g1', 'CSNK2B',
'p38b', 'p38d', 'p38g', 'AurB', 'AurC', 'TP53RK ', 'STLK3',
'PAK3 Isoform 2', 'TAO1', 'ACK'], dtype=object)
# Map the upper-cased kinase label to its UniProt entry.
# NOTE(review): labels are not stripped, so 'TP53RK ' (trailing space) stays unmapped
# and is dropped on the next line — consider adding .str.strip() before .str.upper().
gps['kinase_uniprot'] = gps.gene.str.upper().map(gene2uniprot)
# Drop rows whose kinase could not be mapped.
gps = gps.dropna(subset='kinase_uniprot')
Substrate mapping
# gps.uniprot.drop_duplicates().to_csv('raw/GPS_substrate_id.csv')
1,203 IDs were mapped to 1,208 results
1,202 active entries and 1 obsolete entry are found
gps.shape
(3750, 7)
# Attach UniProt info to each substrate; map_substrate removes non-human substrates by default.
gps = map_substrate('raw/idmapping_2025_03_12_GPS.xlsx', gps, 'uniprot')
Shape before processing (3750, 7)
Species counts: substrate_species
Homo sapiens (Human) 3614
Rattus norvegicus (Rat) 30
Mus musculus (Mouse) 19
Pongo abelii (Sumatran orangutan) (Pongo pygmaeus abelii) 10
Pan troglodytes (Chimpanzee) 6
Sus scrofa (Pig) 6
Bos taurus (Bovine) 5
Oryctolagus cuniculus (Rabbit) 4
Macaca fascicularis (Crab-eating macaque) (Cynomolgus monkey) 4
Mesocricetus auratus (Golden hamster) 4
Tupaia belangeri (Common tree shrew) (Tupaia glis belangeri) 2
Canis lupus familiaris (Dog) (Canis familiaris) 2
Phodopus roborovskii (Roborovski's desert hamster) (Cricetulus roborovskii) 2
Macaca mulatta (Rhesus macaque) 2
Xenopus tropicalis (Western clawed frog) (Silurana tropicalis) 2
Hylobates lar (Lar gibbon) (White-handed gibbon) 1
Cricetulus griseus (Chinese hamster) (Cricetulus barabensis griseus) 1
Xenopus laevis (African clawed frog) 1
Galeopterus variegatus (Malayan flying lemur) (Cynocephalus variegatus) 1
Nannospalax galili (Northern Israeli blind subterranean mole rat) (Spalax galili) 1
Spermophilus citellus (European suslik) (Citellus citellus) 1
Marmota monax (Woodchuck) 1
Pan paniscus (Pygmy chimpanzee) (Bonobo) 1
Danio rerio (Zebrafish) (Brachydanio rerio) 1
Hepatitis delta virus genotype II (isolate 7/18/83) (HDV) 1
Human T-cell leukemia virus 1 (strain Japan ATK-1 subtype A) (HTLV-1) 1
Macaca fuscata fuscata (Japanese macaque) 1
Human T-cell leukemia virus 1 (isolate Caribbea HS-35 subtype A) (HTLV-1) 1
Human T-cell leukemia virus 1 (isolate Melanesia mel5 subtype C) (HTLV-1) 1
Ovis aries (Sheep) 1
Human immunodeficiency virus type 1 group M subtype B (isolate NY5) (HIV-1) 1
Human immunodeficiency virus type 2 subtype A (isolate BEN) (HIV-2) 1
Human immunodeficiency virus type 1 group M subtype B (isolate YU-2) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate WMJ22) (HIV-1) 1
Cavia porcellus (Guinea pig) 1
Human immunodeficiency virus type 1 group M subtype A (isolate MAL) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype F2 (isolate MP257) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate LW123) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate JRCSF) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate JH32) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate HXB2) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype C (isolate ETH2220) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype D (isolate ELI) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype B (isolate CDC-451) (HIV-1) 1
Human immunodeficiency virus type 1 group O (isolate ANT70) (HIV-1) 1
Human immunodeficiency virus type 1 group M subtype A (isolate U455) (HIV-1) 1
Human immunodeficiency virus type 2 subtype A (isolate KR) (HIV-2) 1
Phodopus campbelli (Campbell's dwarf Russian hamster) 1
Human immunodeficiency virus type 2 subtype A (isolate ROD) (HIV-2) 1
Human immunodeficiency virus type 1 group O (isolate MVP5180) (HIV-1) 1
Felis catus (Cat) (Felis silvestris catus) 1
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (3614, 10)
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
gps.shape
(3614, 10)
Validate site
# Site label = residue code + integer position (cast via int so no decimal part is rendered).
gps['site'] = gps['code'] + gps['position'].astype(int).astype(str)
gps['site_match'] = validate_site_df(gps, 'site', 'substrate_sequence')
gps['site_match'].value_counts()
site_match
1 3435
0 179
Name: count, dtype: int64
# Keep only rows whose site passed validation.
gps = gps[gps.site_match == 1]
gps.shape
(3435, 12)
Save
gps.columns
Index(['position', 'code', 'gene', 'species', 'source', 'kinase_uniprot',
'substrate_uniprot', 'substrate_genes', 'substrate_sequence',
'substrate_species', 'site', 'site_match'],
dtype='object')
# Keep GPS's own provenance under a new name so 'source' can hold the dataset label.
gps = gps.rename(columns={'source': 'GPS_source'})
gps['source'] = 'GPS6'
col = ['kinase_uniprot', 'gene',
       'substrate_uniprot', 'substrate_genes', 'site', 'substrate_sequence', 'GPS_source', 'source']
gps = gps[col].copy()
gps.head()
kinase_uniprot | gene | substrate_uniprot | substrate_genes | site | substrate_sequence | GPS_source | source | |
---|---|---|---|---|---|---|---|---|
0 | P31749 | AKT1 | Q9Y261 | FOXA2 HNF3B TCF3B | T156 | MLGAVKMEGHEPSDWSSYYAEPEGYSSVSNMNAGLGMNGMNTYMSM... | 14500912 | GPS6 |
1 | P31749 | AKT1 | P49760 | CLK2 | T127 | MPHPRRYHSSERGSRGSYREHYRSRKHKRRRSRSWSSSSDRTRRRR... | UniProt | GPS6 |
2 | P31749 | AKT1 | P49815 | TSC2 TSC4 | T1462 | MAKPTSKDSGLKEKFKILLGLGTPRPNPRSAEGKQTEFIITAEILR... | 12150915;15342917;12172553;UniProt | GPS6 |
3 | P31749 | AKT1 | P46527 | CDKN1B KIP1 p27 | T187 | MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE... | 12042314;9192873;9399644;9311993;9388487;17254... | GPS6 |
4 | P31749 | AKT1 | O15111 | CHUK IKKA TCF16 | T23 | MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC... | 10485710;UniProt | GPS6 |
# gps.to_excel('raw/GPS6.xlsx',index=False)
Other datasets
Douglass
Paper: https://journals.physiology.org/doi/full/10.1152/ajpcell.00166.2012
Data is not available to download, but available upon request to authors
RegPhos 2.0
Paper: https://academic.oup.com/database/article/doi/10.1093/database/bau034/2634150
Go to the paper link, scroll down to Supplementary data, and download the zip file; Table S4 in the docx file contains the motifs
Phosida
http://www.phosida.com/ is not accessible
Phosida paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC2258193/
PhosphoNetwork
2013 bioinformatics: https://pmc.ncbi.nlm.nih.gov/articles/PMC3866559/#btt627-T1
2013 Mol Syst Biol.: https://pmc.ncbi.nlm.nih.gov/articles/PMC3658267/
web app: https://www.phosphonetworks.org/
The web app provides kinase-substrate pairs, but does not provide the specific phosphorylation sites.
It does, however, provide logos and PSSMs under Download –> Motif Matrix, which can be used to compare PSSM results.
BioGRID
Go to : https://downloads.thebiogrid.org/BioGRID, Current-Release–>BIOGRID-PTMS-4.4.242.ptm.zip
There are two files after extraction; however, they don’t contain kinase-substrate relationships.
Combine all
import pandas as pd
from pathlib import Path
# Per-source tables to combine.
paths = [
    'raw/GPS6.xlsx',
    'raw/signor.xlsx',
    'raw/ELM.xlsx',
    'raw/iPTMNet.xlsx', 'raw/EPSD.xlsx',  # 'raw/KiNet.xlsx',
    'raw/psp_human.xlsx',
    'raw/sugiyama.xlsx'
]
dfs = [pd.read_excel(path) for path in paths]
# Print the row count per source label for each table.
[print(df.source.value_counts()) for df in dfs]
source
GPS6 3435
Name: count, dtype: int64
source
SIGNOR 11212
Name: count, dtype: int64
source
ELM 4519
Name: count, dtype: int64
source
iPTMNet 3478
Name: count, dtype: int64
source
EPSD 10308
Name: count, dtype: int64
source
PSP 13144
Name: count, dtype: int64
source
Sugiyama 184948
Name: count, dtype: int64
[None, None, None, None, None, None, None]
For each df, we need to drop duplicates of kinase(uniprot)-substrate(uniprot)-site
def get_key(df):
    """Add a `kin_sub_site` key and keep one row per kinase-substrate-site.

    The key is '<kinase_uniprot>_<substrate_uniprot>_<site>'. The input frame
    is copied, so the caller's DataFrame is left unmodified. The shape before
    and after de-duplication is printed.

    Returns the de-duplicated copy (first occurrence of each key is kept).
    """
    df = df.copy()
    print('original shape:', df.shape)
    df['kin_sub_site'] = df['kinase_uniprot'] + '_' + df['substrate_uniprot'] + '_' + df['site']
    df = df.drop_duplicates(subset='kin_sub_site')
    print('after removing duplicates', df.shape)
    return df
# De-duplicate every source table on its kinase-substrate-site key.
dfs = [get_key(df) for df in dfs]
original shape: (3435, 8)
after removing duplicates (3326, 9)
original shape: (11212, 9)
after removing duplicates (9320, 10)
original shape: (4519, 10)
after removing duplicates (3807, 11)
original shape: (3478, 9)
after removing duplicates (3478, 10)
original shape: (10308, 9)
after removing duplicates (10308, 10)
original shape: (13144, 12)
after removing duplicates (13091, 13)
original shape: (184948, 11)
after removing duplicates (168342, 12)
# Report the de-duplicated shape of each source, labelled by file stem.
for path, df in zip(paths, dfs):
    print(Path(path).stem, df.shape)
GPS6 (3326, 9)
signor (9320, 10)
ELM (3807, 11)
iPTMNet (3478, 10)
EPSD (10308, 10)
psp_human (13091, 13)
sugiyama (168342, 12)
# Columns kept from every source table after stacking.
common_cols = ['kinase_uniprot', 'substrate_uniprot', 'site', 'kin_sub_site', 'source', 'substrate_genes', 'substrate_sequence']
# Stack all sources, then restrict to the shared columns.
df_all = pd.concat(dfs, ignore_index=True)
df_all = df_all[common_cols].copy()
df_all.source.value_counts()
source
Sugiyama 168342
PSP 13091
EPSD 10308
SIGNOR 9320
ELM 3807
iPTMNet 3478
GPS6 3326
Name: count, dtype: int64
# df_all.to_parquet('raw/combine_source.parquet')
# Collapse rows that share a kinase-substrate-site key across sources.
df_grouped = df_all.groupby("kin_sub_site").agg({
    "kinase_uniprot": "first",
    "substrate_uniprot": "first",
    "site": "first",
    "source": '|'.join,  # Concatenate sources with '|'
    "substrate_genes": "first",
    "substrate_sequence": "first"
}).reset_index()
df_grouped.shape
(187066, 7)
df_grouped.head()
kin_sub_site | kinase_uniprot | substrate_uniprot | site | source | substrate_genes | substrate_sequence | |
---|---|---|---|---|---|---|---|
0 | O00141_A4FU28_S140 | O00141 | A4FU28 | S140 | Sugiyama | CTAGE9 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... |
1 | O00141_O00141_S252 | O00141 | O00141 | S252 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... |
2 | O00141_O00141_S255 | O00141 | O00141 | S255 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... |
3 | O00141_O00141_S397 | O00141 | O00141 | S397 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... |
4 | O00141_O00141_S404 | O00141 | O00141 | S404 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... |
# df_grouped.to_parquet('raw/combine_source_grouped.parquet')
Human phosphoproteome
from katlas.core import *
import pandas as pd
Data
human = Data.get_combine_site_psp_ochoa()
human.shape
(121419, 8)
Substrate mapping
# human.uniprot.drop_duplicates().to_csv('raw/human_phosphoproteome_uniprot.csv')
11,243 IDs were mapped to 11,241 results
5 ID were not mapped: AAA58698 P18433-2 AAC50053 AAA60149 NP_001184222
11,242 active entries and 1 obsolete entry are found
# Attach UniProt info to each substrate; map_substrate removes non-human substrates by default.
human = map_substrate('raw/idmapping_2025_03_20_human_phosphoproteome.xlsx', human, 'uniprot')
f:\git\katlas\.venv\Lib\site-packages\openpyxl\styles\stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Shape before processing (121419, 8)
Species counts: substrate_species
Homo sapiens (Human) 121332
Name: count, dtype: int64
Removing non-human substrates
Shape after processing (121332, 11)
Validate site
human.columns
Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
'substrate_sequence', 'substrate_species'],
dtype='object')
human['site_match'] = validate_site_df(human, 'site', 'substrate_sequence')
human['site_match'].value_counts()
site_match
1 120084
0 1248
Name: count, dtype: int64
# Keep only rows whose site passed validation.
human = human[human.site_match == 1].copy()
human.shape
(120084, 12)
Remove duplicates
# One row per substrate-site pair, keyed as '<substrate_uniprot>_<site>'.
human['sub_site'] = human['substrate_uniprot'] + '_' + human['site']
human = human.drop_duplicates(subset='sub_site')
human.shape
(119955, 13)
Save
human.columns
Index(['gene', 'site', 'site_seq', 'source', 'AM_pathogenicity', 'CDDM_upper',
'CDDM_max_score', 'substrate_uniprot', 'substrate_genes',
'substrate_sequence', 'substrate_species', 'site_match', 'sub_site'],
dtype='object')
# Columns retained for the saved human phosphoproteome table.
cols = ['substrate_uniprot', 'substrate_genes',
        'site', 'source', 'AM_pathogenicity',
        'substrate_sequence', 'substrate_species', 'sub_site']
human = human[cols]
# human.to_parquet('raw/human_phosphoproteome.parquet')
Phosphorylate sequence
Combine human phosphoproteome and KS dataset site info
human = pd.read_parquet('raw/human_phosphoproteome.parquet')
cols = ['substrate_uniprot', 'site', 'substrate_sequence']
df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')

# Reduce both tables to the same three columns so they can be stacked.
human = human[cols]
df_grouped = df_grouped[cols]

# Union of sites from both tables, one row per substrate-site pair.
comb = pd.concat([human, df_grouped])
comb['sub_site'] = comb['substrate_uniprot'] + '_' + comb['site']
comb = comb.drop_duplicates('sub_site')
Phosphorylate sequence
seq = phosphorylate_seq_df(comb)
seq.head()
substrate_uniprot | site | substrate_sequence | substrate_phosphoseq | |
---|---|---|---|---|
0 | A0A024R4G9 | [S20] | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... |
1 | A0A075B6Q4 | [S24, S35, S57, S68, S71, S72] | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... |
2 | A0A075B6T3 | [S24, S26] | XLKRAYRGLEEVQWCLEQLLTSPSPS | XLKRAYRGLEEVQWCLEQLLTSPsPs |
3 | A0A075B759 | [T68] | MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... | MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... |
4 | A0A087WTJ2 | [T8, Y14, Y213, T215, S221, S421, S424] | MGGRKMATDEENVYGLEENAQSRQESTRRLILVGRTGAGKSATGNS... | MGGRKMAtDEENVyGLEENAQSRQESTRRLILVGRTGAGKSATGNS... |
# seq.to_csv('raw/phosphoseq_map.csv',index=False)
Map to df
# Reload the full tables (the previous step reduced them to three columns).
human = pd.read_parquet('raw/human_phosphoproteome.parquet')
df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')

# Lookup: substrate UniProt ID -> phosphorylated sequence.
seq_map = seq.set_index('substrate_uniprot')['substrate_phosphoseq']
human['substrate_phosphoseq'] = human.substrate_uniprot.map(seq_map)
df_grouped['substrate_phosphoseq'] = df_grouped.substrate_uniprot.map(seq_map)

# Count substrates that received no phosphorylated sequence.
human['substrate_phosphoseq'].isna().sum()
np.int64(0)
df_grouped['substrate_phosphoseq'].isna().sum()
np.int64(0)
# human.to_parquet('raw/human_phosphoproteome.parquet')
# df_grouped.to_parquet('raw/combine_source_grouped.parquet')
Extract site sequence
# Strip the leading residue letter from the site label and parse the rest as an integer.
human['position'] = human['site'].str[1:].astype(int)
df_grouped['position'] = df_grouped['site'].str[1:].astype(int)

human['site_seq'] = extract_site_seq(human,
                                     seq_col='substrate_phosphoseq',
                                     position_col='position',
                                     length=20)
100%|██████████| 119955/119955 [00:05<00:00, 22986.11it/s]
df_grouped['site_seq'] = extract_site_seq(df_grouped,
                                          seq_col='substrate_phosphoseq',
                                          position_col='position',
                                          length=20)
100%|██████████| 187066/187066 [00:07<00:00, 23764.80it/s]
# Substrate-site key, '<substrate_uniprot>_<site>', on both tables.
df_grouped['sub_site'] = df_grouped['substrate_uniprot'] + '_' + df_grouped['site']
human['sub_site'] = human['substrate_uniprot'] + '_' + human['site']
# human.to_parquet('raw/human_phosphoproteome.parquet')
# df_grouped.to_parquet('raw/combine_source_grouped.parquet')
Add kinase info
df = df_grouped.copy()
# Remove pseudokinase duplicates by UniProt ID, keep only one entry per kinase
info = Data.get_kinase_info().sort_values('kinase').drop_duplicates('uniprot')

# Pre-extract UniProt ID without isoform for matching
df['uniprot_clean'] = df['kinase_uniprot'].str.split('-').str[0]

# Per-kinase lookups indexed by UniProt ID.
info_indexed = info.set_index('uniprot')
group_map = info_indexed['group']
family_map = info_indexed['family']
pspa_small_map = info_indexed['pspa_category_small']
pspa_big_map = info_indexed['pspa_category_big']

# 1 if the kinase appears in the kinase info table, else 0.
df['kinase_on_tree'] = df['uniprot_clean'].isin(info['uniprot']).astype(int)

kinase_gene_map = Data.get_kinase_uniprot().set_index('Entry')['Gene Names']
df['kinase_genes'] = df['uniprot_clean'].map(kinase_gene_map)

df['kinase_group'] = df['uniprot_clean'].map(group_map)
df['kinase_family'] = df['uniprot_clean'].map(family_map)
df['kinase_pspa_big'] = df['uniprot_clean'].map(pspa_big_map)
df['kinase_pspa_small'] = df['uniprot_clean'].map(pspa_small_map)

# The helper column was only needed for the lookups above.
df.drop(columns='uniprot_clean', inplace=True)
The above code has been added to the Data class and runs when loading the KS dataset.