from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
Prepare kinase-substrate dataset
Large scale dataset
# large_scale data
= pd.read_excel('raw/large_scale_final2.xlsx')
LS
'gene_site'] = LS['S_position']
LS[
= LS[['Kinase','gene_site','substrate']]
LS
= pd.read_csv('raw/LS_info.csv')
LS_info
= LS.merge(LS_info).drop(columns = ['count']) LS
PhosphoSitePlus dataset
= pd.read_csv('raw/Kinase_Substrate_Dataset_final.csv')
KS
# only keep human kinase substrate pairs
= KS.query('KIN_ORGANISM=="human"')
KS
# remove the substrate that does not have sty in the middle
= KS[KS.substrate.str[7].isin(['s','t','y'])].reset_index(drop=True)
KS
'gene_site'] = KS.SUB_GENE.str.upper() + "_"+KS.SUB_MOD_RSD
KS[
= KS[['Kinase','gene_site','substrate','kinase_uniprot','kinase_paper']] KS
Combine
'source'] = 'pplus'
KS[
'source'] = 'large_scale' LS[
= pd.concat([KS,LS]).reset_index(drop=True) df
df
Kinase | gene_site | substrate | kinase_uniprot | kinase_paper | source | |
---|---|---|---|---|---|---|
0 | DYRK2 | GLI2_S385 | AEGLRPAsPLGLTQE | Q92630 | DYRK2 | pplus |
1 | DYRK2 | SIAH2_S68 | GGGAGPVsPQHHELT | Q92630 | DYRK2 | pplus |
2 | DYRK2 | CARHSP1_S41 | LRGNVVPsPLPtRRt | Q92630 | DYRK2 | pplus |
3 | DYRK2 | DCX_S306 | GPMRRSKsPADSANG | Q92630 | DYRK2 | pplus |
4 | DYRK2 | CDC25A_S283 | PErsQEEsPPGSTKr | Q92630 | DYRK2 | pplus |
... | ... | ... | ... | ... | ... | ... |
216464 | SPHK1 | HERC2_S2928 | AAVPFLAsDNEEEED | Q9NYA1 | NaN | large_scale |
216465 | SPHK1 | PARG_Y832 | DALHFRRyLDQFVPE | Q9NYA1 | NaN | large_scale |
216466 | SPHK1 | G6PI_T109 | ALRNRsNtPILVDGK | Q9NYA1 | NaN | large_scale |
216467 | PRPK | TITIN_S20761 | DAERKSWsTVTTECs | Q96S44 | PRPK | large_scale |
216468 | PRPK | TITIN_S20768 | sTVTTECsKTSFRVA | Q96S44 | PRPK | large_scale |
216469 rows × 6 columns
# kinase without mapping
df.loc[df.kinase_paper.isna()].Kinase.value_counts()
Kinase
LYNb 1694
ABL1[T315I] 1320
ABL1[E255K] 1274
RET[M918T] 1221
FGFR3[K650M] 1220
...
GRP78 1
TAO3 1
PIK3CD 1
PIK3CB 1
G11 1
Name: count, Length: 80, dtype: int64
str[7].value_counts() df.substrate.
substrate
y 109225
s 74830
t 32414
Name: count, dtype: int64
df.source.value_counts()
source
large_scale 198248
pplus 18221
Name: count, dtype: int64
len(df)
216469
'substrate'] = df['substrate'].\
df[str.replace('r', 'R').\
str.replace('k', 'K').\
str.replace('n', 'N').\
str.replace('d', 'D').\
str.replace('h', 'H').\
str.replace('c', 'C').\
str.replace('X', '_').\
str.replace('p', 'P').\
str.replace('a', 'A').\
str.replace('v', 'V').\
str.replace('g', 'G').\
str.replace('m', 'M').\
str.replace('l', 'L').\
str.replace('q', 'Q').\
str.replace('f', 'F')
Split phosphosite sequence to single amino acid
df
Kinase | gene_site | substrate | kinase_uniprot | kinase_paper | source | |
---|---|---|---|---|---|---|
0 | DYRK2 | GLI2_S385 | AEGLRPAsPLGLTQE | Q92630 | DYRK2 | pplus |
1 | DYRK2 | SIAH2_S68 | GGGAGPVsPQHHELT | Q92630 | DYRK2 | pplus |
2 | DYRK2 | CARHSP1_S41 | LRGNVVPsPLPtRRt | Q92630 | DYRK2 | pplus |
3 | DYRK2 | DCX_S306 | GPMRRSKsPADSANG | Q92630 | DYRK2 | pplus |
4 | DYRK2 | CDC25A_S283 | PERsQEEsPPGSTKR | Q92630 | DYRK2 | pplus |
... | ... | ... | ... | ... | ... | ... |
216464 | SPHK1 | HERC2_S2928 | AAVPFLAsDNEEEED | Q9NYA1 | NaN | large_scale |
216465 | SPHK1 | PARG_Y832 | DALHFRRyLDQFVPE | Q9NYA1 | NaN | large_scale |
216466 | SPHK1 | G6PI_T109 | ALRNRsNtPILVDGK | Q9NYA1 | NaN | large_scale |
216467 | PRPK | TITIN_S20761 | DAERKSWsTVTTECs | Q96S44 | PRPK | large_scale |
216468 | PRPK | TITIN_S20768 | sTVTTECsKTSFRVA | Q96S44 | PRPK | large_scale |
216469 rows × 6 columns
Split site sequence to individual columns
# Split the 'SITE_+/-7_AA' column into individual letters
= df['substrate'].apply(list).apply(pd.Series).fillna('_')
split_cols
# Rename the new columns from -7 to 7
= range(-7, 8)
split_cols.columns
# Concatenate the original dataframe with the split columns
= pd.concat([df, split_cols], axis=1) df
df
Kinase | gene_site | substrate | kinase_uniprot | kinase_paper | source | -7 | -6 | -5 | -4 | ... | -2 | -1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | DYRK2 | GLI2_S385 | AEGLRPAsPLGLTQE | Q92630 | DYRK2 | pplus | A | E | G | L | ... | P | A | s | P | L | G | L | T | Q | E |
1 | DYRK2 | SIAH2_S68 | GGGAGPVsPQHHELT | Q92630 | DYRK2 | pplus | G | G | G | A | ... | P | V | s | P | Q | H | H | E | L | T |
2 | DYRK2 | CARHSP1_S41 | LRGNVVPsPLPtRRt | Q92630 | DYRK2 | pplus | L | R | G | N | ... | V | P | s | P | L | P | t | R | R | t |
3 | DYRK2 | DCX_S306 | GPMRRSKsPADSANG | Q92630 | DYRK2 | pplus | G | P | M | R | ... | S | K | s | P | A | D | S | A | N | G |
4 | DYRK2 | CDC25A_S283 | PERsQEEsPPGSTKR | Q92630 | DYRK2 | pplus | P | E | R | s | ... | E | E | s | P | P | G | S | T | K | R |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
216464 | SPHK1 | HERC2_S2928 | AAVPFLAsDNEEEED | Q9NYA1 | NaN | large_scale | A | A | V | P | ... | L | A | s | D | N | E | E | E | E | D |
216465 | SPHK1 | PARG_Y832 | DALHFRRyLDQFVPE | Q9NYA1 | NaN | large_scale | D | A | L | H | ... | R | R | y | L | D | Q | F | V | P | E |
216466 | SPHK1 | G6PI_T109 | ALRNRsNtPILVDGK | Q9NYA1 | NaN | large_scale | A | L | R | N | ... | s | N | t | P | I | L | V | D | G | K |
216467 | PRPK | TITIN_S20761 | DAERKSWsTVTTECs | Q96S44 | PRPK | large_scale | D | A | E | R | ... | S | W | s | T | V | T | T | E | C | s |
216468 | PRPK | TITIN_S20768 | sTVTTECsKTSFRVA | Q96S44 | PRPK | large_scale | s | T | V | T | ... | E | C | s | K | T | S | F | R | V | A |
216469 rows × 21 columns
Add extra columns
'kinase'] = df['kinase_paper'].fillna(df['Kinase'])
df[
'on_tree']=df['kinase_paper'].notna().astype(int) df[
Save
Uncheck below
= df.columns.astype(str) # Parquet can only save df with string column name
df.columns 'raw/df.parquet') df.to_parquet(
Load data
Need to convert the number in the columns to integer type
= pd.read_parquet('raw/df.parquet')
df #Convert the number in the column name into integer
= [int(col) if col.lstrip('-').isdigit() else col for col in df.columns] df.columns
Visualize counts
=100)
df.kinase.value_counts().hist(bins=100, color='r', linestyle='--', linewidth=1)
plt.axvline(x'count per kinase')
plt.xlabel('# kinase'); plt.ylabel(
= df.kinase.value_counts().reset_index() cnt
= ['kinase','count'] cnt.columns
cnt
kinase | count | |
---|---|---|
0 | SRC | 2605 |
1 | EPHA3 | 2120 |
2 | FES | 2013 |
3 | NTRK3 | 1927 |
4 | ALK | 1889 |
... | ... | ... |
488 | CAMK1G | 1 |
489 | CAMK2D iso8 | 1 |
490 | PHKA1 | 1 |
491 | VPRBP | 1 |
492 | NEK10 | 1 |
493 rows × 2 columns