Preprocess Large-scale dataset

import pandas as pd
# Load the raw kinase-substrate table.
df = pd.read_csv('raw/Large_scale_S2.csv')

# Substrate: the text before the first underscore of "Substrate_uniprot".
# substrate_position: that substrate name joined to the site label by "_".
substrate_name = df['Substrate_uniprot'].str.split('_').str.get(0)
df = df.assign(
    Substrate=substrate_name,
    substrate_position=substrate_name + "_" + df['Position'],
)

EDA - Pivot table

Rows (index) are unique phosphosites; columns are kinases.

# Count the rows of df for every (phosphosite, kinase) pair; pairs that never
# occur get 0. "Substrate" is just the column being counted - any column works.
p = pd.pivot_table(
    df,
    values='Substrate',
    index='substrate_position',
    columns='Kinase',
    aggfunc='count',
    fill_value=0,
)
p
Kinase ABL1 ABL1[E255K] ABL1[T315I] ABL2 ACK ACTR2 ACTR2B AKT1 AKT2 AKT3 ... ZAK ZAP70 p38a p38b p38d p38g p70S6K p70S6Kb skMLCK smMLCK
substrate_position
1433B_S132 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1433B_S145 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1433B_S147 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1433B_S158 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1433B_S212 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZW10_S611 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ZYX_S259 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
ZYX_S267 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
ZYX_S505 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ZYX_T270 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0

21449 rows × 385 columns

# Check mutation: print the column totals of p for kinases that have mutant
# variants. The name is matched as a substring of the column label, so 'ALK'
# also selects ALK1/ALK2/ALK4 and 'ABL' also selects ABL2.
mutated_kinase_names = ['ABL', 'ALK', 'BRAF', 'EGFR', 'FGFR3',
                        'KIT', 'LRRK2', 'MET', 'PDGFRa', 'RET']
for kinase_with_mutation in mutated_kinase_names:
    label_matches = p.columns.str.contains(kinase_with_mutation)
    matching_columns = p.columns[label_matches]
    column_totals = p[matching_columns].sum()
    print(column_totals.reset_index(name='#substrates'))
    print('---------------')
        Kinase  #substrates
0         ABL1         1577
1  ABL1[E255K]         1274
2  ABL1[T315I]         1322
3         ABL2         1004
---------------
        Kinase  #substrates
0          ALK         1876
1         ALK1           29
2         ALK2          331
3         ALK4          463
4  ALK[F1174L]         1111
5  ALK[R1275Q]         1044
---------------
        Kinase  #substrates
0         BRAF           22
1  BRAF[V600E]          195
---------------
              Kinase  #substrates
0               EGFR          796
1        EGFR[L858R]          648
2        EGFR[L861Q]          981
3  EGFR[T790M/L858R]         1112
4        EGFR[T790M]         1086
---------------
         Kinase  #substrates
0         FGFR3          775
1  FGFR3[K650E]          827
2  FGFR3[K650M]         1220
---------------
       Kinase  #substrates
0         KIT          786
1  KIT[T670I]          674
2  KIT[V560G]          172
---------------
          Kinase  #substrates
0          LRRK2           59
1  LRRK2[G2019S]           11
---------------
        Kinase  #substrates
0          MET         1457
1  MET[Y1235D]          511
---------------
          Kinase  #substrates
0         PDGFRa         1194
1  PDGFRa[T674I]          249
2  PDGFRa[V561D]         1175
---------------
       Kinase  #substrates
0         RET         1735
1  RET[G691S]         1165
2  RET[M918T]         1221
3  RET[S891A]          876
4  RET[Y791F]          860
---------------
# Total count per kinase (column sums of p), largest first
p.sum(axis=0).sort_values(ascending=False)
Kinase
EPHA3             2102
FES               2008
TRKC              1928
SRC               1899
EPHA8             1878
                  ... 
PKN3                 6
SPHK1                4
PIK3CD/PIK3R1        3
PFTAIRE1/cycD3       2
PRPK                 2
Length: 385, dtype: int64
# Most phosphorylated sites: total count per site across all kinase columns
# (row sums of p), largest first
p.sum(axis=1).sort_values(ascending=False)
substrate_position
HSPB1_S82     307
ALDOA_S39     191
HS90B_S462    189
RS29_Y7       185
RS16_S9       180
             ... 
ZN768_Y359      1
ZN687_S316      1
ZN706_Y39       1
1433F_T210      1
1433F_T231      1
Length: 21449, dtype: int64

Rows (index) are unique substrates; columns are kinases.

# Count the rows of df for every (substrate, kinase) pair; pairs that never
# occur get 0. "Type" is just the column being counted - any column works.
p2 = pd.pivot_table(
    df,
    values='Type',
    index='Substrate',
    columns='Kinase',
    aggfunc='count',
    fill_value=0,
)
p2
Kinase ABL1 ABL1[E255K] ABL1[T315I] ABL2 ACK ACTR2 ACTR2B AKT1 AKT2 AKT3 ... ZAK ZAP70 p38a p38b p38d p38g p70S6K p70S6Kb skMLCK smMLCK
Substrate
1433B 4 4 3 4 2 0 0 1 0 0 ... 0 3 1 1 1 1 0 0 0 0
1433E 3 4 3 4 2 1 0 1 0 0 ... 1 4 1 1 2 2 1 0 0 0
1433F 4 3 4 4 1 0 0 1 0 0 ... 0 3 1 1 1 2 1 0 0 0
1433G 4 4 4 3 1 0 0 1 0 0 ... 1 2 2 2 2 2 1 0 0 0
1433S 5 4 3 3 2 0 0 1 0 0 ... 0 3 2 1 2 2 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZO1 2 1 0 1 0 0 0 0 0 0 ... 0 0 1 0 0 1 1 0 0 0
ZO2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ZRAB2 0 0 2 0 0 0 0 0 0 0 ... 2 4 0 0 0 0 1 0 0 0
ZW10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ZYX 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 3 2 0 0 0 0

4128 rows × 385 columns

Find substrates that share the same kinase pattern

# Keep substrates whose whole row of kinase counts equals at least one other
# row, sorted on every column so identical rows end up next to each other.
shares_pattern = p2.duplicated(keep=False)
p2.loc[shares_pattern].sort_values(by=p2.columns.tolist())
Kinase ABL1 ABL1[E255K] ABL1[T315I] ABL2 ACK ACTR2 ACTR2B AKT1 AKT2 AKT3 ... ZAK ZAP70 p38a p38b p38d p38g p70S6K p70S6Kb skMLCK smMLCK
Substrate
ATS1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
CARM1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
CLN3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
DPOE1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
EHMT2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PP2AB 4 2 4 1 0 0 0 0 0 0 ... 0 0 1 0 0 1 0 0 0 0
ACTB 6 7 5 3 4 2 0 2 2 2 ... 11 4 6 4 7 5 3 2 4 0
ACTG 6 7 5 3 4 2 0 2 2 2 ... 11 4 6 4 7 5 3 2 4 0
TBA1A 7 4 4 6 1 1 0 0 0 0 ... 2 3 5 4 3 4 0 1 0 0
TBA1B 7 4 4 6 1 1 0 0 0 0 ... 2 3 5 4 3 4 0 1 0 0

869 rows × 385 columns

ID mapping

Query the UniProt IDs of the substrates on the UniProt website

# ID-mapping spreadsheet; rename its columns to the names used below.
seq = pd.read_excel('raw/id_mapping.xlsx').rename(
    columns={'From': 'uniprot', 'Sequence': 'sequence'}
)

# Reload the raw table with "Substrate_uniprot" renamed to "uniprot" so that it
# shares a column name with seq.
raw_table = pd.read_csv('raw/Large_scale_S2.csv')
df = raw_table.rename(columns={'Substrate_uniprot': 'uniprot'})
df
Type Kinase uniprot Position SIDIC PTMscore\n(P > 075)
0 TK ABL1 1433B_HUMAN S212 1 1
1 TK ABL1 1433B_HUMAN Y151 1 1
2 TK ABL1 1433B_HUMAN Y21 1 1
3 TK ABL1 1433B_HUMAN Y50 1 1
4 TK ABL1 1433E_HUMAN Y152 1 1
... ... ... ... ... ... ...
198531 LK SPHK2 TICN3_HUMAN T118 1 1
198532 LK SPHK2 TPM4_HUMAN T241 1 1
198533 LK SPHK2 ULK3_HUMAN S305 1 1
198534 LK SPHK2 ZRAB2_HUMAN S165 1 1
198535 LK SPHK2 ZRAB2_HUMAN S181 1 1

198536 rows × 6 columns

# Attach the sequences; with no `on=` the join uses the columns both frames share.
df = pd.merge(df, seq)

# "Position" looks like "S212": the first character is the residue letter and
# the remainder is the position number.
site_label = df['Position']
df['phospho_receptor'] = site_label.str.get(0)
df['phospho_position'] = site_label.str.slice(1).astype(int)
df['phospho_receptor'].value_counts()
phospho_receptor
Y    106611
S     63242
T     28683
Name: count, dtype: int64
def validate_position(row):
    """Check that the sequence holds the reported residue at the reported site.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            'phospho_receptor' (one-letter residue), 'phospho_position'
            (1-based position) and 'sequence' (protein sequence).

    Returns:
        1 if ``sequence[position - 1]`` equals the residue, otherwise 0.
        Positions outside ``1..len(sequence)`` return 0.
    """
    amino_acid = row['phospho_receptor']
    position = int(row['phospho_position'])
    sequence = row['sequence']

    # Reject out-of-range positions explicitly. Relying on IndexError alone
    # misses position <= 0, whose negative index wraps around and is compared
    # against a residue counted from the end of the sequence.
    if not 1 <= position <= len(sequence):
        return 0
    return int(sequence[position - 1] == amino_acid)
# Flag every row with the result of validate_position (1 or 0), then tally.
validation_flags = df.apply(validate_position, axis='columns')
df['Validated'] = validation_flags
df['Validated'].value_counts()
Validated
1    196126
0      2410
Name: count, dtype: int64
# Number of failed rows per protein
df.loc[df['Validated'] == 0, 'uniprot'].value_counts()
uniprot
EFTU_HUMAN     339
DDX17_HUMAN    217
ARI1B_HUMAN    114
ANM1_HUMAN      95
MIER1_HUMAN     87
              ... 
PAR14_HUMAN      1
ZNF30_HUMAN      1
RGS10_HUMAN      1
RAB4A_HUMAN      1
WDR81_HUMAN      1
Name: count, Length: 97, dtype: int64

Find substrates that do not have the expected amino acid at the reported position

# Failed-row count per protein, joined with the sequence that was checked.
failed_counts = df.query('Validated==0')['uniprot'].value_counts()
invalid = (
    failed_counts
    .reset_index(name='count')
    # Only matters if reset_index labelled the former index "index"
    # (presumably older pandas versions); otherwise a no-op.
    .rename(columns={'index': 'uniprot'})
    .merge(seq)
)

invalid
uniprot count sequence
0 EFTU_HUMAN 339 MTTMAAATLLRATPHFSGLAAGRTFLLQGLLRLLKAPALPLLCRGL...
1 DDX17_HUMAN 217 MPTGFVAPILCVLLPSPTREAATVASATGDSASERESAAPAAAPTA...
2 ARI1B_HUMAN 114 MAARAAAAAAAAAARARARAGSGERRAPPGPRPAPGARDLEAGARG...
3 ANM1_HUMAN 95 MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED...
4 MIER1_HUMAN 87 MAEPSVESSSPGGSATSDDHEFDPSADMLVHDFDDERTLEEEEMME...
... ... ... ...
92 PAR14_HUMAN 1 MAVPGSFPLLVEGSWGPDPPKNLNTKLQMYFQSPKRSGGGECEVRQ...
93 ZNF30_HUMAN 1 MAHKYVGLQYHGSVTFEDVAIAFSQQEWESLDSSQRGLYRDVMLEN...
94 RGS10_HUMAN 1 MFNRAVSRLSRKRPPSDIHDSDGSSSSSHQSLKSTAKWAASLENLL...
95 RAB4A_HUMAN 1 MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...
96 WDR81_HUMAN 1 MAQGSGGREGALRTPAGGWHSPPSPDMQELLRSVERDLSIDPRQLA...

97 rows × 3 columns

Replace the invalidated sequence with updated sequence

# Corrected sequences (taken from an old UniProt database release).
update1 = pd.read_excel('raw/seq.xlsx')
# Full sequence table; the "uniprot_updated" column is not needed here.
seq = pd.read_csv('raw/large_scale_sequence.csv').drop(columns=['uniprot_updated'])

# Left-join so every protein is kept; the corrected sequence arrives as
# "sequence_new" and is missing where there is no correction.
merged = pd.merge(seq, update1, on='uniprot', how='left', suffixes=('', '_new'))

# Prefer the corrected sequence and fall back to the original one.
corrected_sequence = merged['sequence_new'].fillna(merged['sequence'])
result = merged.assign(sequence=corrected_sequence).drop(columns=['sequence_new'])

seq = result.copy()
seq
uniprot sequence
0 1433B_HUMAN MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
1 1433E_HUMAN MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
2 1433F_HUMAN MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...
3 1433G_HUMAN MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
4 1433S_HUMAN MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...
... ... ...
4123 1B42_HUMAN MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4124 FA21D_HUMAN MRGKRRPQTRAARRLAAQESSEAEDMSVPRGPIAQWADGAISPNGH...
4125 1C06_HUMAN MRVMAPRTLILLLSGALALTETWACSHSMRYFDTAVSRPGRGEPRF...
4126 MPP6_HUMAN MQQVLENLTELPSSTGAEEIDLIFLKGIMENPIVKSLAKAHERLED...
4127 1C16_HUMAN MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...

4128 rows × 2 columns

Validate again

import pandas as pd
# Reload the raw table, attach the corrected sequences and split "Position"
# into its residue letter (first character) and its position number (the rest).
df = (
    pd.read_csv('raw/Large_scale_S2.csv')
    .rename(columns={'Substrate_uniprot': 'uniprot'})
    .merge(result)
    .assign(
        phospho_receptor=lambda table: table['Position'].str.get(0),
        phospho_position=lambda table: table['Position'].str.slice(1).astype(int),
    )
)
def validate_position(row):
    """Check that the sequence holds the reported residue at the reported site.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            'phospho_receptor' (one-letter residue), 'phospho_position'
            (1-based position) and 'sequence' (protein sequence).

    Returns:
        1 if ``sequence[position - 1]`` equals the residue, otherwise 0.
        Positions outside ``1..len(sequence)`` return 0.
    """
    amino_acid = row['phospho_receptor']
    position = int(row['phospho_position'])
    sequence = row['sequence']

    # Reject out-of-range positions explicitly. Relying on IndexError alone
    # misses position <= 0, whose negative index wraps around and is compared
    # against a residue counted from the end of the sequence.
    if not 1 <= position <= len(sequence):
        return 0
    return int(sequence[position - 1] == amino_acid)
# Re-run the check against the corrected sequences.
df['Validated'] = df.apply(validate_position, axis='columns')
# (rows, columns) of the records that still fail
df[df['Validated'] == 0].shape
(215, 10)

Far fewer than before (2410)!

invalid = df[df['Validated'] == 0]
# Drop every row of a protein that has at least one failing site (not only the
# failing rows themselves), and drop the sequence column.
has_failing_site = df['uniprot'].isin(invalid['uniprot'])
df = df.loc[~has_failing_site].drop(columns=['sequence'])
# Re-attach the sequence column from the corrected table.
df = pd.merge(df, result)
df
Type Kinase uniprot Position SIDIC PTMscore\n(P > 075) phospho_receptor phospho_position Validated sequence
0 TK ABL1 1433B_HUMAN S212 1 1 S 212 1 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
1 TK ABL1 1433B_HUMAN Y151 1 1 Y 151 1 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
2 TK ABL1 1433B_HUMAN Y21 1 1 Y 21 1 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
3 TK ABL1 1433B_HUMAN Y50 1 1 Y 50 1 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
4 TK ABL2 1433B_HUMAN Y106 1 1 Y 106 1 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
... ... ... ... ... ... ... ... ... ... ...
198243 LK SPHK2 SPHK2_HUMAN T389 1 1 T 389 1 MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
198244 LK SPHK2 SPHK2_HUMAN T402 1 1 T 402 1 MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
198245 LK SPHK2 SPHK2_HUMAN T404 1 1 T 404 1 MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
198246 LK SPHK2 SPHK2_HUMAN T503 1 1 T 503 1 MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
198247 LK SPHK2 SPHK2_HUMAN T614 1 1 T 614 1 MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...

198248 rows × 10 columns

Convert the phosphorylated residues of each substrate sequence to lower case

# One row per protein: its distinct site labels, plus its sequence.
modify = (
    df.groupby('uniprot')['Position']
    .unique()
    .reset_index()
    .merge(result)
)
modify
uniprot Position sequence
0 1433B_HUMAN [S212, Y151, Y21, Y50, Y106, Y213, S47, S39, T... MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...
1 1433E_HUMAN [Y152, Y20, Y49, Y214, S148, S46, Y131, Y121, ... MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...
2 1433F_HUMAN [S215, Y154, Y20, Y49, S46, S150, Y216, S145, ... MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...
3 1433G_HUMAN [S215, Y20, Y216, Y49, S46, S38, T31, S64, Y12... MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...
4 1433S_HUMAN [S149, Y151, Y19, Y213, Y48, S45, S209, S37, T... MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...
... ... ... ...
4094 ZO1_HUMAN [Y1165, Y669, S1064, Y1354, Y1355, Y1066, S106... MSARAAAAKSTAMEETAIWEQHTVTLHRAPGFGFGIAISGGRDNPH...
4095 ZO2_HUMAN [Y506, S430, Y426, Y1166] MPVRGDRGFPPRRELSGWLRAPGMEELIWEQYTVTLQKDSKRGFGI...
4096 ZRAB2_HUMAN [S188, T100, Y167, S120, Y114, Y124, S181, S15... MSTKNFRVSDGDWICPDKKCGNVNFARRTSCNRCGREKTTEAKMMK...
4097 ZW10_HUMAN [S605, S611] MASFVTEVLAHSGRLEKEDLGTRISRLTRRVEEIKGEVCNMISKKY...
4098 ZYX_HUMAN [S505, S259, S267, T270] MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...

4099 rows × 3 columns

# Modify sequence based on position
def modify_sequence(row):
    """Lower-case the phosphorylated residues of a protein sequence.

    Args:
        row: Mapping-like record with 'sequence' (protein sequence string) and
            'Position' (iterable of site labels such as "S212": one residue
            letter followed by a 1-based position).

    Returns:
        The sequence with every listed site lower-cased. A site is skipped
        when its position lies outside the sequence or when the residue found
        there is not the labelled letter, so the result never contains a
        residue that the input sequence did not have.
    """
    residues = list(row['sequence'])
    for site in row['Position']:
        letter = site[0]
        index = int(site[1:]) - 1  # labels are 1-based, Python indexes from 0

        # Guard the write: an unguarded assignment would replace the real
        # residue with the labelled letter on a mismatch, and index -1
        # (position 0) would wrap around to the last residue.
        if 0 <= index < len(residues) and residues[index] == letter:
            residues[index] = letter.lower()
    return ''.join(residues)
# sequence2: the sequence as returned by modify_sequence for each protein.
modify['sequence2'] = modify.apply(modify_sequence, axis='columns')
seq2 = modify.loc[:, ['uniprot', 'sequence2']]
# Swap the plain sequence column for the transformed one.
without_sequence = df.drop(columns=['sequence'])
df = pd.merge(without_sequence, seq2)
df['Kinase'].value_counts()
Kinase
EPHA3             2101
FES               2007
TRKC              1926
SRC               1896
EPHA8             1877
                  ... 
PKN3                 6
SPHK1                4
PIK3CD/PIK3R1        3
PFTAIRE1/cycD3       2
PRPK                 2
Name: count, Length: 385, dtype: int64

Extract phosphosite sequence

def _site_window(sequence, position, flank=7, pad="_"):
    """Return the residues around a site, padded to a fixed width.

    Args:
        sequence: Protein sequence string.
        position: 1-based position of the central residue.
        flank: Number of residues kept on each side of the centre.
        pad: Character used to fill the window where it runs past either end
            of the sequence.

    Returns:
        A string of length ``2 * flank + 1`` with the site in the middle.
    """
    center = int(position) - 1  # 0-based index of the site
    start = center - flank
    end = center + flank + 1
    window = sequence[max(0, start):min(len(sequence), end)]
    # Pad by exactly the amount the window overshoots each end.
    left_pad = pad * max(0, -start)
    right_pad = pad * max(0, end - len(sequence))
    return left_pad + window + right_pad


# Build the windows from the two needed columns directly; iterating with
# DataFrame.iterrows constructs a Series per row and is slow on a table this size.
data = [
    _site_window(sequence, position)
    for sequence, position in zip(df['sequence2'], df['phospho_position'])
]
df['substrate'] = data
# check that the middle residue (index 7) of every window is s, t or y
df['substrate'].str.get(7).value_counts()
substrate
y    106558
s     63030
t     28660
Name: count, dtype: int64
# check that every extracted window has the same length (7 + 1 + 7 = 15)
df['substrate'].map(len).value_counts()
substrate
15    198248
Name: count, dtype: int64
# The ten most frequent windows
df['substrate'].value_counts().head(10)
substrate
IEQEGPEyWDRNTQI    1085
RKEsysVyVyKVLKQ     765
SSSLEKsyELPDGQV     765
VGMGQKDsyVGDEAQ     750
EsysVyVyKVLKQVH     729
TLNNKFAsFIDKVRF     708
MWISKQEyDEsGPsI     693
LPDGQVItIGNERFR     610
EyDEsGPsIVHRKCF     594
QGNRttPsyVAFtDt     556
Name: count, dtype: int64
# Text before the first underscore of "uniprot" (e.g. "1433B" from
# "1433B_HUMAN"), computed once and reused by the three columns below.
entry_prefix = df['uniprot'].str.split('_').str.get(0)
site_id = entry_prefix + "_" + df['Position']

# kinase_substrate_site
df['KS_pairs'] = df['Kinase'] + "_" + site_id

# substrate_site
df['S_position'] = site_id

df['Uniprot'] = entry_prefix

Check the most frequent substrate sequence

# Which sites produce this window, and how many rows each contributes
df.loc[df['substrate'] == "RKEsysVyVyKVLKQ", 'S_position'].value_counts()
S_position
H2B1C_Y41    85
H2B1D_Y41    85
H2B1H_Y41    85
H2B1K_Y41    85
H2B1L_Y41    85
H2B1M_Y41    85
H2B1N_Y41    85
H2B2F_Y41    85
H2BFS_Y41    85
Name: count, dtype: int64

Check if the sequences of these substrates are identical

# Site IDs that carry this window, then the distinct full sequences behind them.
has_window = df['substrate'] == "RKEsysVyVyKVLKQ"
name = df.loc[has_window, 'S_position'].value_counts().index

df.loc[df['S_position'].isin(name), 'sequence2'].unique()
array(['MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPEPTKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPEPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK',
       'MPELAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIASEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPEPVKSAPVPKKGSKKAINKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPEPSKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPDPAKSAPAPKKGSKKAVTKVQKKDGKKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
       'MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKEsysVyVyKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLPHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK'],
      dtype=object)

Save

df.shape
(198248, 14)
# df.to_excel('raw/large_scale_final2.xlsx',index=False)