import pandas as pd
Preprocess Large-scale dataset
df = pd.read_csv('raw/Large_scale_S2.csv')# Add substrate and phos position info
df['Substrate'] = df.Substrate_uniprot.str.split('_').str[0]
df['substrate_position'] = df['Substrate'] + "_"+ df['Position']EDA - Pivot table
The index is the unique phosphosites and the columns are the kinases
p = df.pivot_table(index='substrate_position',
columns='Kinase',
values='Substrate', # any column
aggfunc='count',
fill_value=0)p| Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| substrate_position | |||||||||||||||||||||
| 1433B_S132 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1433B_S145 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1433B_S147 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1433B_S158 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1433B_S212 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZW10_S611 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ZYX_S259 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ZYX_S267 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| ZYX_S505 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ZYX_T270 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
21449 rows × 385 columns
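# Check mutation: count substrates of every kinase whose name contains the family name (substring match, so 'ALK' also picks up ALK1/ALK2/ALK4)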
for kinase_with_mutation in ['ABL','ALK','BRAF','EGFR','FGFR3',
'KIT','LRRK2','MET','PDGFRa','RET']:
print(p[p.columns[p.columns.str.contains(kinase_with_mutation)]].sum().reset_index(name='#substrates'))
print('---------------') Kinase #substrates
0 ABL1 1577
1 ABL1[E255K] 1274
2 ABL1[T315I] 1322
3 ABL2 1004
---------------
Kinase #substrates
0 ALK 1876
1 ALK1 29
2 ALK2 331
3 ALK4 463
4 ALK[F1174L] 1111
5 ALK[R1275Q] 1044
---------------
Kinase #substrates
0 BRAF 22
1 BRAF[V600E] 195
---------------
Kinase #substrates
0 EGFR 796
1 EGFR[L858R] 648
2 EGFR[L861Q] 981
3 EGFR[T790M/L858R] 1112
4 EGFR[T790M] 1086
---------------
Kinase #substrates
0 FGFR3 775
1 FGFR3[K650E] 827
2 FGFR3[K650M] 1220
---------------
Kinase #substrates
0 KIT 786
1 KIT[T670I] 674
2 KIT[V560G] 172
---------------
Kinase #substrates
0 LRRK2 59
1 LRRK2[G2019S] 11
---------------
Kinase #substrates
0 MET 1457
1 MET[Y1235D] 511
---------------
Kinase #substrates
0 PDGFRa 1194
1 PDGFRa[T674I] 249
2 PDGFRa[V561D] 1175
---------------
Kinase #substrates
0 RET 1735
1 RET[G691S] 1165
2 RET[M918T] 1221
3 RET[S891A] 876
4 RET[Y791F] 860
---------------
# Number of substrates of kinase
p.sum().sort_values(ascending=False)Kinase
EPHA3 2102
FES 2008
TRKC 1928
SRC 1899
EPHA8 1878
...
PKN3 6
SPHK1 4
PIK3CD/PIK3R1 3
PFTAIRE1/cycD3 2
PRPK 2
Length: 385, dtype: int64
# Most phosphorylated substrates
p.sum(1).sort_values(ascending=False)substrate_position
HSPB1_S82 307
ALDOA_S39 191
HS90B_S462 189
RS29_Y7 185
RS16_S9 180
...
ZN768_Y359 1
ZN687_S316 1
ZN706_Y39 1
1433F_T210 1
1433F_T231 1
Length: 21449, dtype: int64
The index is the unique substrates and the columns are the kinases
p2 = df.pivot_table(index='Substrate',
columns='Kinase',
values='Type', # any column
aggfunc='count',
fill_value=0)p2| Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Substrate | |||||||||||||||||||||
| 1433B | 4 | 4 | 3 | 4 | 2 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
| 1433E | 3 | 4 | 3 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 4 | 1 | 1 | 2 | 2 | 1 | 0 | 0 | 0 |
| 1433F | 4 | 3 | 4 | 4 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | 0 |
| 1433G | 4 | 4 | 4 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 0 | 0 | 0 |
| 1433S | 5 | 4 | 3 | 3 | 2 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 3 | 2 | 1 | 2 | 2 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZO1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| ZO2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ZRAB2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ZW10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ZYX | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 |
4128 rows × 385 columns
Find substrates that share the same kinase pattern
p2[p2.duplicated(keep=False)].sort_values(list(p2.columns))| Kinase | ABL1 | ABL1[E255K] | ABL1[T315I] | ABL2 | ACK | ACTR2 | ACTR2B | AKT1 | AKT2 | AKT3 | ... | ZAK | ZAP70 | p38a | p38b | p38d | p38g | p70S6K | p70S6Kb | skMLCK | smMLCK |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Substrate | |||||||||||||||||||||
| ATS1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| CARM1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| CLN3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| DPOE1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| EHMT2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| PP2AB | 4 | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ACTB | 6 | 7 | 5 | 3 | 4 | 2 | 0 | 2 | 2 | 2 | ... | 11 | 4 | 6 | 4 | 7 | 5 | 3 | 2 | 4 | 0 |
| ACTG | 6 | 7 | 5 | 3 | 4 | 2 | 0 | 2 | 2 | 2 | ... | 11 | 4 | 6 | 4 | 7 | 5 | 3 | 2 | 4 | 0 |
| TBA1A | 7 | 4 | 4 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 2 | 3 | 5 | 4 | 3 | 4 | 0 | 1 | 0 | 0 |
| TBA1B | 7 | 4 | 4 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 2 | 3 | 5 | 4 | 3 | 4 | 0 | 1 | 0 | 0 |
869 rows × 385 columns
ID mapping
Query the UniProt IDs of the substrates on the UniProt website
seq=pd.read_excel('raw/id_mapping.xlsx')
seq = seq.rename(columns={'From':'uniprot','Sequence':'sequence'})df = pd.read_csv('raw/Large_scale_S2.csv').rename(columns={'Substrate_uniprot':'uniprot'})df| Type | Kinase | uniprot | Position | SIDIC | PTMscore\n(P > 075) | |
|---|---|---|---|---|---|---|
| 0 | TK | ABL1 | 1433B_HUMAN | S212 | 1 | 1 |
| 1 | TK | ABL1 | 1433B_HUMAN | Y151 | 1 | 1 |
| 2 | TK | ABL1 | 1433B_HUMAN | Y21 | 1 | 1 |
| 3 | TK | ABL1 | 1433B_HUMAN | Y50 | 1 | 1 |
| 4 | TK | ABL1 | 1433E_HUMAN | Y152 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 198531 | LK | SPHK2 | TICN3_HUMAN | T118 | 1 | 1 |
| 198532 | LK | SPHK2 | TPM4_HUMAN | T241 | 1 | 1 |
| 198533 | LK | SPHK2 | ULK3_HUMAN | S305 | 1 | 1 |
| 198534 | LK | SPHK2 | ZRAB2_HUMAN | S165 | 1 | 1 |
| 198535 | LK | SPHK2 | ZRAB2_HUMAN | S181 | 1 | 1 |
198536 rows × 6 columns
df = df.merge(seq)# Extract amino acid and position from "Position"
df['phospho_receptor'] = df['Position'].str[0]
df['phospho_position'] = df['Position'].str[1:].astype(int)df.phospho_receptor.value_counts()phospho_receptor
Y 106611
S 63242
T 28683
Name: count, dtype: int64
def validate_position(row):
    """Check that the reported phospho-acceptor residue is present in the sequence.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'phospho_receptor'`` (the expected residue letter),
        ``'phospho_position'`` (1-based position) and ``'sequence'``.

    Returns
    -------
    int
        1 if ``sequence[position - 1]`` equals the expected residue, else 0.
        0 is also returned when the position lies outside the sequence or
        when the sequence is not a string (e.g. a missing value).
    """
    expected_residue = row['phospho_receptor']
    index = int(row['phospho_position']) - 1  # 1-based position -> 0-based index
    sequence = row['sequence']
    # Bounds are checked explicitly: a position of 0 (or below) would otherwise
    # wrap around through negative indexing and be compared with the *end* of
    # the sequence instead of being reported as invalid.
    if not isinstance(sequence, str) or not 0 <= index < len(sequence):
        return 0
    return int(sequence[index] == expected_residue)


df['Validated'] = df.apply(validate_position, axis=1)
df.Validated.value_counts()
Validated
1 196126
0 2410
Name: count, dtype: int64
df.query('Validated==0').uniprot.value_counts()uniprot
EFTU_HUMAN 339
DDX17_HUMAN 217
ARI1B_HUMAN 114
ANM1_HUMAN 95
MIER1_HUMAN 87
...
PAR14_HUMAN 1
ZNF30_HUMAN 1
RGS10_HUMAN 1
RAB4A_HUMAN 1
WDR81_HUMAN 1
Name: count, Length: 97, dtype: int64
Find substrates whose sequence does not have the expected amino acid at the reported position
invalid = df.query('Validated==0').uniprot.value_counts().reset_index(name='count')\
.rename(columns={'index':'uniprot'})\
.merge(seq)
invalid| uniprot | count | sequence | |
|---|---|---|---|
| 0 | EFTU_HUMAN | 339 | MTTMAAATLLRATPHFSGLAAGRTFLLQGLLRLLKAPALPLLCRGL... |
| 1 | DDX17_HUMAN | 217 | MPTGFVAPILCVLLPSPTREAATVASATGDSASERESAAPAAAPTA... |
| 2 | ARI1B_HUMAN | 114 | MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG... |
| 3 | ANM1_HUMAN | 95 | MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED... |
| 4 | MIER1_HUMAN | 87 | MAEPSVESSSPGGSATSDDHEFDPSADMLVHDFDDERTLEEEEMME... |
| ... | ... | ... | ... |
| 92 | PAR14_HUMAN | 1 | MAVPGSFPLLVEGSWGPDPPKNLNTKLQMYFQSPKRSGGGECEVRQ... |
| 93 | ZNF30_HUMAN | 1 | MAHKYVGLQYHGSVTFEDVAIAFSQQEWESLDSSQRGLYRDVMLEN... |
| 94 | RGS10_HUMAN | 1 | MFNRAVSRLSRKRPPSDIHDSDGSSSSSHQSLKSTAKWAASLENLL... |
| 95 | RAB4A_HUMAN | 1 | MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI... |
| 96 | WDR81_HUMAN | 1 | MAQGSGGREGALRTPAGGWHSPPSPDMQELLRSVERDLSIDPRQLA... |
97 rows × 3 columns
Replace the invalidated sequence with updated sequence
# Sequences that were re-checked against an older UniProt release
update1 = pd.read_excel('raw/seq.xlsx')
# Full sequence table for the substrates
seq = pd.read_csv('raw/large_scale_sequence.csv').drop(columns=['uniprot_updated'])
# Left-join the updates; the updated sequence arrives as "sequence_new"
# and is missing for every uniprot entry without an update.
result = seq.merge(update1, how='left', on='uniprot', suffixes=('', '_new'))
# Prefer the updated sequence and fall back to the original one otherwise
result['sequence'] = result['sequence_new'].fillna(result['sequence'])
# The helper column is no longer needed
result = result.drop(columns=['sequence_new'])
seq = result.copy()seq| uniprot | sequence | |
|---|---|---|
| 0 | 1433B_HUMAN | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 1 | 1433E_HUMAN | MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... |
| 2 | 1433F_HUMAN | MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS... |
| 3 | 1433G_HUMAN | MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS... |
| 4 | 1433S_HUMAN | MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV... |
| ... | ... | ... |
| 4123 | 1B42_HUMAN | MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF... |
| 4124 | FA21D_HUMAN | MRGKRRPQTRAARRLAAQESSEAEDMSVPRGPIAQWADGAISPNGH... |
| 4125 | 1C06_HUMAN | MRVMAPRTLILLLSGALALTETWACSHSMRYFDTAVSRPGRGEPRF... |
| 4126 | MPP6_HUMAN | MQQVLENLTELPSSTGAEEIDLIFLKGIMENPIVKSLAKAHERLED... |
| 4127 | 1C16_HUMAN | MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF... |
4128 rows × 2 columns
Validate again
import pandas as pd

# Reload the raw kinase-substrate table and attach the corrected sequences
df = pd.read_csv('raw/Large_scale_S2.csv').rename(columns={'Substrate_uniprot':'uniprot'})
df = df.merge(result)

# Extract amino acid and position from "Position"
df['phospho_receptor'] = df['Position'].str[0]
df['phospho_position'] = df['Position'].str[1:].astype(int)


def validate_position(row):
    """Check that the reported phospho-acceptor residue is present in the sequence.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'phospho_receptor'`` (the expected residue letter),
        ``'phospho_position'`` (1-based position) and ``'sequence'``.

    Returns
    -------
    int
        1 if ``sequence[position - 1]`` equals the expected residue, else 0.
        0 is also returned when the position lies outside the sequence or
        when the sequence is not a string (e.g. a missing value).
    """
    expected_residue = row['phospho_receptor']
    index = int(row['phospho_position']) - 1  # 1-based position -> 0-based index
    sequence = row['sequence']
    # Bounds are checked explicitly: a position of 0 (or below) would otherwise
    # wrap around through negative indexing and be compared with the *end* of
    # the sequence instead of being reported as invalid.
    if not isinstance(sequence, str) or not 0 <= index < len(sequence):
        return 0
    return int(sequence[index] == expected_residue)


df['Validated'] = df.apply(validate_position, axis=1)
# invalidated
df.query('Validated==0').shape(215, 10)
Far fewer than before (2410)!
invalid = df.query('Validated==0')# remove uniprot that has invalid phosphoreceptor, and drop sequence column
df = df[~df.uniprot.isin(invalid.uniprot)].drop(columns=['sequence'])# as we modify the sequence, we need to replace with new sequence
df = df.merge(result)df| Type | Kinase | uniprot | Position | SIDIC | PTMscore\n(P > 075) | phospho_receptor | phospho_position | Validated | sequence | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | TK | ABL1 | 1433B_HUMAN | S212 | 1 | 1 | S | 212 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 1 | TK | ABL1 | 1433B_HUMAN | Y151 | 1 | 1 | Y | 151 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 2 | TK | ABL1 | 1433B_HUMAN | Y21 | 1 | 1 | Y | 21 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 3 | TK | ABL1 | 1433B_HUMAN | Y50 | 1 | 1 | Y | 50 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 4 | TK | ABL2 | 1433B_HUMAN | Y106 | 1 | 1 | Y | 106 | 1 | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 198243 | LK | SPHK2 | SPHK2_HUMAN | T389 | 1 | 1 | T | 389 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
| 198244 | LK | SPHK2 | SPHK2_HUMAN | T402 | 1 | 1 | T | 402 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
| 198245 | LK | SPHK2 | SPHK2_HUMAN | T404 | 1 | 1 | T | 404 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
| 198246 | LK | SPHK2 | SPHK2_HUMAN | T503 | 1 | 1 | T | 503 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
| 198247 | LK | SPHK2 | SPHK2_HUMAN | T614 | 1 | 1 | T | 614 | 1 | MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA... |
198248 rows × 10 columns
Convert the phosphorylated residues of each substrate sequence to lower case
# for each substrate, find their phosphorylation point
modify = df.groupby('uniprot').agg({'Position':lambda r: r.unique()}).reset_index()modify = modify.merge(result)modify| uniprot | Position | sequence | |
|---|---|---|---|
| 0 | 1433B_HUMAN | [S212, Y151, Y21, Y50, Y106, Y213, S47, S39, T... | MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL... |
| 1 | 1433E_HUMAN | [Y152, Y20, Y49, Y214, S148, S46, Y131, Y121, ... | MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS... |
| 2 | 1433F_HUMAN | [S215, Y154, Y20, Y49, S46, S150, Y216, S145, ... | MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS... |
| 3 | 1433G_HUMAN | [S215, Y20, Y216, Y49, S46, S38, T31, S64, Y12... | MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS... |
| 4 | 1433S_HUMAN | [S149, Y151, Y19, Y213, Y48, S45, S209, S37, T... | MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV... |
| ... | ... | ... | ... |
| 4094 | ZO1_HUMAN | [Y1165, Y669, S1064, Y1354, Y1355, Y1066, S106... | MSARAAAAKSTAMEETAIWEQHTVTLHRAPGFGFGIAISGGRDNPH... |
| 4095 | ZO2_HUMAN | [Y506, S430, Y426, Y1166] | MPVRGDRGFPPRRELSGWLRAPGMEELIWEQYTVTLQKDSKRGFGI... |
| 4096 | ZRAB2_HUMAN | [S188, T100, Y167, S120, Y114, Y124, S181, S15... | MSTKNFRVSDGDWICPDKKCGNVNFARRTSCNRCGREKTTEAKMMK... |
| 4097 | ZW10_HUMAN | [S605, S611] | MASFVTEVLAHSGRLEKEDLGTRISRLTRRVEEIKGEVCNMISKKY... |
| 4098 | ZYX_HUMAN | [S505, S259, S267, T270] | MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP... |
4099 rows × 3 columns
# Modify sequence based on position
def modify_sequence(row):
    """Lower-case every listed phosphosite residue of a substrate sequence.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        ``row['sequence']`` is the sequence string and ``row['Position']`` an
        iterable of site labels such as ``'S212'`` (residue letter followed
        by the 1-based position).

    Returns
    -------
    str
        The sequence with the residue at each listed position lower-cased.

    Raises
    ------
    IndexError
        If a listed position lies outside the sequence.
    """
    residues = list(row['sequence'])
    for site in row['Position']:
        index = int(site[1:]) - 1  # 1-based position -> 0-based index
        # Reject out-of-range positions explicitly; position 0 would otherwise
        # wrap around to the last residue via negative indexing.
        if not 0 <= index < len(residues):
            raise IndexError(
                f"site {site!r} is outside the sequence (length {len(residues)})"
            )
        # Lower-case the residue that is actually present rather than writing
        # the label's letter, so a mismatching label can never silently
        # rewrite the sequence. For validated sites the result is identical.
        residues[index] = residues[index].lower()
    return ''.join(residues)


modify['sequence2'] = modify.apply(modify_sequence, axis=1)
seq2 = modify[['uniprot','sequence2']]
# merge with transformed sequence
df = df.drop(columns=['sequence']).merge(seq2)df.Kinase.value_counts()Kinase
EPHA3 2101
FES 2007
TRKC 1926
SRC 1896
EPHA8 1877
...
PKN3 6
SPHK1 4
PIK3CD/PIK3R1 3
PFTAIRE1/cycD3 2
PRPK 2
Name: count, Length: 385, dtype: int64
Extract phosphosite sequence
def _site_window(sequence, position, flank=7, pad="_"):
    """Return the residues within ``flank`` positions of a 1-based ``position``.

    For a position inside ``sequence`` the result is ``2 * flank + 1``
    characters long with the site in the middle; ``pad`` fills the slots that
    fall before the start or after the end of the sequence.
    """
    center = int(position) - 1  # 1-based position -> 0-based index
    start = center - flank
    end = center + flank + 1  # exclusive upper bound
    window = sequence[max(0, start):min(len(sequence), end)]
    return pad * max(0, -start) + window + pad * max(0, end - len(sequence))


# Build the 15-residue window (7 on each side) around every phosphosite.
# Zipping the two columns avoids the per-row Series that ``iterrows``
# constructs.
data = [
    _site_window(sequence, position)
    for sequence, position in zip(df['sequence2'], df['phospho_position'])
]
df['substrate'] = data
# check if the middle position belongs to s,t,y
df.substrate.str[7].value_counts()substrate
y 106558
s 63030
t 28660
Name: count, dtype: int64
# check that every extracted window is 15 residues long
df.substrate.apply(len).value_counts()substrate
15 198248
Name: count, dtype: int64
df.substrate.value_counts()[:10]substrate
IEQEGPEyWDRNTQI 1085
RKEsysVyVyKVLKQ 765
SSSLEKsyELPDGQV 765
VGMGQKDsyVGDEAQ 750
EsysVyVyKVLKQVH 729
TLNNKFAsFIDKVRF 708
MWISKQEyDEsGPsI 693
LPDGQVItIGNERFR 610
EyDEsGPsIVHRKCF 594
QGNRttPsyVAFtDt 556
Name: count, dtype: int64
df['KS_pairs'] = df.Kinase+"_"+df.uniprot.str.split('_').str[0]+"_"+df.Position
df['S_position'] = df.uniprot.str.split('_').str[0]+"_"+df.Position
df['Uniprot'] = df.uniprot.str.split('_').str[0]
Check the most frequent substrate sequence
df.query('substrate == "RKEsysVyVyKVLKQ"').S_position.value_counts()S_position
H2B1C_Y41 85
H2B1D_Y41 85
H2B1H_Y41 85
H2B1K_Y41 85
H2B1L_Y41 85
H2B1M_Y41 85
H2B1N_Y41 85
H2B2F_Y41 85
H2BFS_Y41 85
Name: count, dtype: int64
Check if the sequences of these substrates are identical
name = df.query('substrate == "RKEsysVyVyKVLKQ"').S_position.value_counts().index
df[df.S_position.isin(name)].sequence2.unique()array(['MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPTKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK',
'MPELAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIASEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPVKSAPVPKKGSKKAINKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPSKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPDPAKSAPAPKKGSKKAVTKVQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
'MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLPHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK'],
dtype=object)
Save
df.shape(198248, 14)
# df.to_excel('raw/large_scale_final2.xlsx',index=False)