import pandas as pd
Preprocess Large-scale dataset
df = pd.read_csv('raw/Large_scale_S2.csv')
# Add substrate and phosphosite identifiers:
# 'Substrate' is the part of Substrate_uniprot before the first '_';
# 'substrate_position' joins it with the site label, e.g. "1433B_S212".
df['Substrate'] = df.Substrate_uniprot.str.split('_').str[0]
df['substrate_position'] = df['Substrate'] + "_" + df['Position']
EDA - Pivot table
Index is the unique phosphosites; columns are the kinases.
# Count rows per (phosphosite, kinase) pair; pairs that never occur are filled with 0.
p = df.pivot_table(index='substrate_position',
                   columns='Kinase',
                   values='Substrate',  # any column works: we only count rows
                   aggfunc='count',
                   fill_value=0)
p
Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
substrate_position | |||||||||||||||||||||
1433B_S132 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1433B_S145 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1433B_S147 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1433B_S158 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1433B_S212 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZW10_S611 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ZYX_S259 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
ZYX_S267 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
ZYX_S505 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ZYX_T270 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
21449 rows × 385 columns
# Check mutation
# For each kinase family with mutant variants, print the per-column site counts.
# NOTE: matching is by substring (str.contains), so e.g. 'ALK' also selects
# ALK1 / ALK2 / ALK4, which then show up in the printed table.
for kinase_with_mutation in ['ABL','ALK','BRAF','EGFR','FGFR3',
                             'KIT','LRRK2','MET','PDGFRa','RET']:
    matching_columns = p.columns[p.columns.str.contains(kinase_with_mutation)]
    print(p[matching_columns].sum().reset_index(name='#substrates'))
    print('---------------')
Kinase #substrates
0 ABL1 1577
1 ABL1[E255K] 1274
2 ABL1[T315I] 1322
3 ABL2 1004
---------------
Kinase #substrates
0 ALK 1876
1 ALK1 29
2 ALK2 331
3 ALK4 463
4 ALK[F1174L] 1111
5 ALK[R1275Q] 1044
---------------
Kinase #substrates
0 BRAF 22
1 BRAF[V600E] 195
---------------
Kinase #substrates
0 EGFR 796
1 EGFR[L858R] 648
2 EGFR[L861Q] 981
3 EGFR[T790M/L858R] 1112
4 EGFR[T790M] 1086
---------------
Kinase #substrates
0 FGFR3 775
1 FGFR3[K650E] 827
2 FGFR3[K650M] 1220
---------------
Kinase #substrates
0 KIT 786
1 KIT[T670I] 674
2 KIT[V560G] 172
---------------
Kinase #substrates
0 LRRK2 59
1 LRRK2[G2019S] 11
---------------
Kinase #substrates
0 MET 1457
1 MET[Y1235D] 511
---------------
Kinase #substrates
0 PDGFRa 1194
1 PDGFRa[T674I] 249
2 PDGFRa[V561D] 1175
---------------
Kinase #substrates
0 RET 1735
1 RET[G691S] 1165
2 RET[M918T] 1221
3 RET[S891A] 876
4 RET[Y791F] 860
---------------
# Number of substrates of kinase
# Column sums of the pivot table: number of phosphosites recorded for each kinase.
p.sum().sort_values(ascending=False)
Kinase
EPHA3 2102
FES 2008
TRKC 1928
SRC 1899
EPHA8 1878
...
PKN3 6
SPHK1 4
PIK3CD/PIK3R1 3
PFTAIRE1/cycD3 2
PRPK 2
Length: 385, dtype: int64
# Most phosphorylated substrates
# Row sums of the pivot table: number of kinase entries recorded for each phosphosite.
p.sum(axis=1).sort_values(ascending=False)
substrate_position
HSPB1_S82 307
ALDOA_S39 191
HS90B_S462 189
RS29_Y7 185
RS16_S9 180
...
ZN768_Y359 1
ZN687_S316 1
ZN706_Y39 1
1433F_T210 1
1433F_T231 1
Length: 21449, dtype: int64
Index is the unique substrates; columns are the kinases.
# Count rows per (substrate, kinase) pair; pairs that never occur are filled with 0.
p2 = df.pivot_table(index='Substrate',
                    columns='Kinase',
                    values='Type',  # any column works: we only count rows
                    aggfunc='count',
                    fill_value=0)
p2
Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Substrate | |||||||||||||||||||||
1433B | 4 | 4 | 3 | 4 | 2 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
1433E | 3 | 4 | 3 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 4 | 1 | 1 | 2 | 2 | 1 | 0 | 0 | 0 |
1433F | 4 | 3 | 4 | 4 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | 0 |
1433G | 4 | 4 | 4 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 0 | 0 | 0 |
1433S | 5 | 4 | 3 | 3 | 2 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 2 | 1 | 2 | 2 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ZO1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
ZO2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ZRAB2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
ZW10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ZYX | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 |
4128 rows × 385 columns
Find substrates that share the same kinase pattern
# Keep every substrate whose full row of kinase counts is shared with at least one
# other substrate, and sort so that identical rows end up next to each other.
p2[p2.duplicated(keep=False)].sort_values(list(p2.columns))
Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Substrate | |||||||||||||||||||||
ATS1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
CARM1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
CLN3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
DPOE1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
EHMT2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
PP2AB | 4 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
ACTB | 6 | 7 | 5 | 3 | 4 | 2 | 0 | 2 | 2 | 2 | ... | 11 | 4 | 6 | 4 | 7 | 5 | 3 | 2 | 4 | 0 |
ACTG | 6 | 7 | 5 | 3 | 4 | 2 | 0 | 2 | 2 | 2 | ... | 11 | 4 | 6 | 4 | 7 | 5 | 3 | 2 | 4 | 0 |
TBA1A | 7 | 4 | 4 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 2 | 3 | 5 | 4 | 3 | 4 | 0 | 1 | 0 | 0 |
TBA1B | 7 | 4 | 4 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 2 | 3 | 5 | 4 | 3 | 4 | 0 | 1 | 0 | 0 |
869 rows × 385 columns
ID mapping
Query the UniProt IDs of the substrates on the UniProt website
# ID-mapping export: rename its 'From' / 'Sequence' columns to the names used below.
seq = pd.read_excel('raw/id_mapping.xlsx')

seq = seq.rename(columns={'From':'uniprot','Sequence':'sequence'})
# Reload the raw table with the substrate column renamed to 'uniprot'.
df = pd.read_csv('raw/Large_scale_S2.csv').rename(columns={'Substrate_uniprot':'uniprot'})
df
Type | Kinase | uniprot | Position | SIDIC | PTMscore\n(P > 075) | |
---|---|---|---|---|---|---|
0 | TK | ABL1 | 1433B_HUMAN | S212 | 1 | 1 |
1 | TK | ABL1 | 1433B_HUMAN | Y151 | 1 | 1 |
2 | TK | ABL1 | 1433B_HUMAN | Y21 | 1 | 1 |
3 | TK | ABL1 | 1433B_HUMAN | Y50 | 1 | 1 |
4 | TK | ABL1 | 1433E_HUMAN | Y152 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... |
198531 | LK | SPHK2 | TICN3_HUMAN | T118 | 1 | 1 |
198532 | LK | SPHK2 | TPM4_HUMAN | T241 | 1 | 1 |
198533 | LK | SPHK2 | ULK3_HUMAN | S305 | 1 | 1 |
198534 | LK | SPHK2 | ZRAB2_HUMAN | S165 | 1 | 1 |
198535 | LK | SPHK2 | ZRAB2_HUMAN | S181 | 1 | 1 |
198536 rows × 6 columns
# Attach the sequences (merge uses the columns the two frames have in common).
df = df.merge(seq)
# Extract amino acid and position from "Position", e.g. "S212" -> "S" and 212
df['phospho_receptor'] = df['Position'].str[0]

df['phospho_position'] = df['Position'].str[1:].astype(int)
df.phospho_receptor.value_counts()
phospho_receptor
Y 106611
S 63242
T 28683
Name: count, dtype: int64
def validate_position(row):
    """Check that the sequence carries the reported residue at the reported site.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'phospho_receptor' (one-letter residue), 'phospho_position'
        (1-based position) and 'sequence' (the protein sequence string).

    Returns
    -------
    int
        1 if ``sequence[position - 1]`` equals the reported residue, otherwise 0.
        Positions outside ``1..len(sequence)`` return 0.
    """
    amino_acid = row['phospho_receptor']
    position = int(row['phospho_position'])
    sequence = row['sequence']

    # Positions are 1-based. Reject out-of-range values explicitly: a position
    # of 0 or below would otherwise wrap around through negative indexing and
    # be compared against a residue counted from the end of the sequence.
    if not 1 <= position <= len(sequence):
        return 0

    return 1 if sequence[position - 1] == amino_acid else 0
# Flag every row: 1 when the sequence has the reported residue at the reported position.
df['Validated'] = df.apply(validate_position, axis=1)
df.Validated.value_counts()
Validated
1 196126
0 2410
Name: count, dtype: int64
# Number of failed sites per substrate entry.
df.query('Validated==0').uniprot.value_counts()
uniprot
EFTU_HUMAN 339
DDX17_HUMAN 217
ARI1B_HUMAN 114
ANM1_HUMAN 95
MIER1_HUMAN 87
...
PAR14_HUMAN 1
ZNF30_HUMAN 1
RGS10_HUMAN 1
RAB4A_HUMAN 1
WDR81_HUMAN 1
Name: count, Length: 97, dtype: int64
Find substrates whose sequence does not have the expected amino acid at the reported position
# Per-substrate count of failed sites, joined with the sequence that was checked.
invalid = (
    df.query('Validated==0').uniprot.value_counts().reset_index(name='count')
    .rename(columns={'index':'uniprot'})
    .merge(seq)
)
invalid
uniprot | count | sequence | |
---|---|---|---|
0 | EFTU_HUMAN | 339 | MTTMAAATLLRATPHFSGLAAGRTFLLQGLLRLLKAPALPLLCRGL... |
1 | DDX17_HUMAN | 217 | MPTGFVAPILCVLLPSPTREAATVASATGDSASERESAAPAAAPTA... |
2 | ARI1B_HUMAN | 114 | MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG... |
3 | ANM1_HUMAN | 95 | MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED... |
4 | MIER1_HUMAN | 87 | MAEPSVESSSPGGSATSDDHEFDPSADMLVHDFDDERTLEEEEMME... |
... | ... | ... | ... |
92 | PAR14_HUMAN | 1 | MAVPGSFPLLVEGSWGPDPPKNLNTKLQMYFQSPKRSGGGECEVRQ... |
93 | ZNF30_HUMAN | 1 | MAHKYVGLQYHGSVTFEDVAIAFSQQEWESLDSSQRGLYRDVMLEN... |
94 | RGS10_HUMAN | 1 | MFNRAVSRLSRKRPPSDIHDSDGSSSSSHQSLKSTAKWAASLENLL... |
95 | RAB4A_HUMAN | 1 | MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI... |
96 | WDR81_HUMAN | 1 | MAQGSGGREGALRTPAGGWHSPPSPDMQELLRSVERDLSIDPRQLA... |
97 rows × 3 columns
Replace the invalid sequences with updated sequences
# updated with valid sequences from old uniprot database
update1 = pd.read_excel('raw/seq.xlsx')
# full sequence
seq = pd.read_csv('raw/large_scale_sequence.csv').drop(columns=['uniprot_updated'])
# get a new column "sequence_new" that shows the updated sequence
# (left join: entries without an update get NaN in "sequence_new")
result = seq.merge(update1,on='uniprot',how='left',suffixes=('', '_new'))

# Where a new sequence is available, replace the old sequence
result['sequence'] = result['sequence_new'].combine_first(result['sequence'])

# Drop the auxiliary columns
result = result.drop(columns=['sequence_new'])

seq = result.copy()
seq
uniprot | sequence | |
---|---|---|
0 | 1433B_HUMAN | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
1 | 1433E_HUMAN | MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... |
2 | 1433F_HUMAN | MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS... |
3 | 1433G_HUMAN | MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS... |
4 | 1433S_HUMAN | MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV... |
... | ... | ... |
4123 | 1B42_HUMAN | MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... |
4124 | FA21D_HUMAN | MRGKRRPQTRAARRLAAQESSEAEDMSVPRGPIAQWADGAISPNGH... |
4125 | 1C06_HUMAN | MRVMAPRTLILLLSGALALTETWACSHSMRYFDTAVSRPGRGEPRF... |
4126 | MPP6_HUMAN | MQQVLENLTELPSSTGAEEIDLIFLKGIMENPIVKSLAKAHERLED... |
4127 | 1C16_HUMAN | MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF... |
4128 rows × 2 columns
Validate again
import pandas as pd
# Start again from the raw table, this time attaching the updated sequences.
df = pd.read_csv('raw/Large_scale_S2.csv').rename(columns={'Substrate_uniprot':'uniprot'})
df = df.merge(result)
# Extract amino acid and position from "Position"
df['phospho_receptor'] = df['Position'].str[0]
df['phospho_position'] = df['Position'].str[1:].astype(int)
def validate_position(row):
    """Check that the sequence carries the reported residue at the reported site.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'phospho_receptor' (one-letter residue), 'phospho_position'
        (1-based position) and 'sequence' (the protein sequence string).

    Returns
    -------
    int
        1 if ``sequence[position - 1]`` equals the reported residue, otherwise 0.
        Positions outside ``1..len(sequence)`` return 0.
    """
    amino_acid = row['phospho_receptor']
    position = int(row['phospho_position'])
    sequence = row['sequence']

    # Positions are 1-based. Reject out-of-range values explicitly: a position
    # of 0 or below would otherwise wrap around through negative indexing and
    # be compared against a residue counted from the end of the sequence.
    if not 1 <= position <= len(sequence):
        return 0

    return 1 if sequence[position - 1] == amino_acid else 0
df['Validated'] = df.apply(validate_position, axis=1)
# Rows that still fail validation with the updated sequences
df.query('Validated==0').shape
(215, 10)
Far fewer than before (2410)!
invalid = df.query('Validated==0')
# Remove every uniprot entry that has at least one invalid phospho-acceptor
# (all rows of that entry are dropped), and drop the sequence column
df = df[~df.uniprot.isin(invalid.uniprot)].drop(columns=['sequence'])
# as we modify the sequence, we need to replace with new sequence
df = df.merge(result)
df
Type | Kinase | uniprot | Position | SIDIC | PTMscore\n(P > 075) | phospho_receptor | phospho_position | Validated | sequence | |
---|---|---|---|---|---|---|---|---|---|---|
0 | TK | ABL1 | 1433B_HUMAN | S212 | 1 | 1 | S | 212 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
1 | TK | ABL1 | 1433B_HUMAN | Y151 | 1 | 1 | Y | 151 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
2 | TK | ABL1 | 1433B_HUMAN | Y21 | 1 | 1 | Y | 21 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
3 | TK | ABL1 | 1433B_HUMAN | Y50 | 1 | 1 | Y | 50 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
4 | TK | ABL2 | 1433B_HUMAN | Y106 | 1 | 1 | Y | 106 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
198243 | LK | SPHK2 | SPHK2_HUMAN | T389 | 1 | 1 | T | 389 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198244 | LK | SPHK2 | SPHK2_HUMAN | T402 | 1 | 1 | T | 402 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198245 | LK | SPHK2 | SPHK2_HUMAN | T404 | 1 | 1 | T | 404 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198246 | LK | SPHK2 | SPHK2_HUMAN | T503 | 1 | 1 | T | 503 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198247 | LK | SPHK2 | SPHK2_HUMAN | T614 | 1 | 1 | T | 614 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198248 rows × 10 columns
Convert the phosphorylated residues in each substrate sequence to lower case
# For each substrate, collect the unique phosphosite labels, then attach its sequence
modify = df.groupby('uniprot').agg({'Position':lambda r: r.unique()}).reset_index()
modify = modify.merge(result)
modify
uniprot | Position | sequence | |
---|---|---|---|
0 | 1433B_HUMAN | [S212, Y151, Y21, Y50, Y106, Y213, S47, S39, T... | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
1 | 1433E_HUMAN | [Y152, Y20, Y49, Y214, S148, S46, Y131, Y121, ... | MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... |
2 | 1433F_HUMAN | [S215, Y154, Y20, Y49, S46, S150, Y216, S145, ... | MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS... |
3 | 1433G_HUMAN | [S215, Y20, Y216, Y49, S46, S38, T31, S64, Y12... | MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS... |
4 | 1433S_HUMAN | [S149, Y151, Y19, Y213, Y48, S45, S209, S37, T... | MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV... |
... | ... | ... | ... |
4094 | ZO1_HUMAN | [Y1165, Y669, S1064, Y1354, Y1355, Y1066, S106... | MSARAAAAKSTAMEETAIWEQHTVTLHRAPGFGFGIAISGGRDNPH... |
4095 | ZO2_HUMAN | [Y506, S430, Y426, Y1166] | MPVRGDRGFPPRRELSGWLRAPGMEELIWEQYTVTLQKDSKRGFGI... |
4096 | ZRAB2_HUMAN | [S188, T100, Y167, S120, Y114, Y124, S181, S15... | MSTKNFRVSDGDWICPDKKCGNVNFARRTSCNRCGREKTTEAKMMK... |
4097 | ZW10_HUMAN | [S605, S611] | MASFVTEVLAHSGRLEKEDLGTRISRLTRRVEEIKGEVCNMISKKY... |
4098 | ZYX_HUMAN | [S505, S259, S267, T270] | MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP... |
4099 rows × 3 columns
# Modify sequence based on position
def modify_sequence(row):
    """Return the sequence with every listed phosphosite written in lower case.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'sequence' (protein sequence string) and 'Position'
        (iterable of site labels such as "S212": residue letter followed by
        the 1-based position).

    Returns
    -------
    str
        The sequence with each listed position replaced by the lower-cased
        residue letter taken from the site label.

    Raises
    ------
    IndexError
        If a site label points outside the sequence.
    """
    residues = list(row['sequence'])
    for site in row['Position']:
        # Split the label into residue letter and 0-based index
        letter = site[0]
        index = int(site[1:]) - 1

        # Positions below 1 would otherwise wrap around through negative
        # indexing and silently modify a residue counted from the end.
        if not 0 <= index < len(residues):
            raise IndexError(
                f"phosphosite {site!r} is outside the sequence (length {len(residues)})"
            )

        # The residue is overwritten with the label's letter rather than
        # lower-cased in place; the caller is expected to have validated that
        # the sequence carries this residue at this position.
        residues[index] = letter.lower()
    return ''.join(residues)
modify['sequence2'] = modify.apply(modify_sequence, axis=1)
seq2 = modify[['uniprot','sequence2']]
# Replace the plain sequence with the transformed one (phosphosites in lower case)
df = df.drop(columns=['sequence']).merge(seq2)
df.Kinase.value_counts()
Kinase
EPHA3 2101
FES 2007
TRKC 1926
SRC 1896
EPHA8 1877
...
PKN3 6
SPHK1 4
PIK3CD/PIK3R1 3
PFTAIRE1/cycD3 2
PRPK 2
Name: count, Length: 385, dtype: int64
Extract phosphosite sequence
def _phosphosite_window(sequence, position, flank=7, pad="_"):
    """Return the residues around a 1-based position, padded to a fixed width.

    The window spans `flank` residues on each side of the site (2 * flank + 1
    characters in total). Where the window runs past either end of the
    sequence, the missing positions are filled with `pad`.
    """
    center = position - 1          # 0-based index of the phosphosite
    start = center - flank
    end = center + flank + 1       # exclusive

    # Extract the part of the window that lies inside the sequence
    window = sequence[max(0, start):min(len(sequence), end)]

    # Pad whatever fell outside the sequence
    if start < 0:
        window = pad * abs(start) + window
    if end > len(sequence):
        window = window + pad * (end - len(sequence))
    return window


# Only two columns are needed, so iterate them directly instead of building a
# full row object per record with iterrows().
data = [
    _phosphosite_window(sequence, position)
    for sequence, position in zip(df['sequence2'], df['phospho_position'])
]
df['substrate'] = data
# check if the middle position (index 7 of the window) belongs to s,t,y
df.substrate.str[7].value_counts()
substrate
y 106558
s 63030
t 28660
Name: count, dtype: int64
# check that every extracted window has the same length
df.substrate.apply(len).value_counts()
substrate
15 198248
Name: count, dtype: int64
# Ten most frequent window sequences
df.substrate.value_counts()[:10]
substrate
IEQEGPEyWDRNTQI 1085
RKEsysVyVyKVLKQ 765
SSSLEKsyELPDGQV 765
VGMGQKDsyVGDEAQ 750
EsysVyVyKVLKQVH 729
TLNNKFAsFIDKVRF 708
MWISKQEyDEsGPsI 693
LPDGQVItIGNERFR 610
EyDEsGPsIVHRKCF 594
QGNRttPsyVAFtDt 556
Name: count, dtype: int64
# Identifier columns built from the uniprot entry name (the part before '_'):
# kinase + substrate + site, substrate + site, and substrate alone.
df['KS_pairs'] = df.Kinase+"_"+df.uniprot.str.split('_').str[0]+"_"+df.Position

df['S_position'] = df.uniprot.str.split('_').str[0]+"_"+df.Position

df['Uniprot'] = df.uniprot.str.split('_').str[0]
Check the most frequent substrate sequences
# Which substrate sites produce this window sequence?
df.query('substrate == "RKEsysVyVyKVLKQ"').S_position.value_counts()
S_position
H2B1C_Y41 85
H2B1D_Y41 85
H2B1H_Y41 85
H2B1K_Y41 85
H2B1L_Y41 85
H2B1M_Y41 85
H2B1N_Y41 85
H2B2F_Y41 85
H2BFS_Y41 85
Name: count, dtype: int64
Check if the sequences of these substrates are identical
name = df.query('substrate == "RKEsysVyVyKVLKQ"').S_position.value_counts().index

# Distinct full-length sequences among the sites sharing this window
df[df.S_position.isin(name)].sequence2.unique()
array(['MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPTKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK',
'MPELAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIASEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPVKSAPVPKKGSKKAINKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPSKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPDPAKSAPAPKKGSKKAVTKVQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLPHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK'],
dtype=object)
Save
# Size (rows, columns) of the final table
df.shape
(198248, 14)
# df.to_excel('raw/large_scale_final2.xlsx',index=False)