Build human phosphoproteome dataset

Setup

import pandas as pd
from matplotlib import pyplot as plt
from katlas.core import *
import seaborn as sns
from tqdm import tqdm
import numpy as np

tqdm.pandas()

Load PhosphoSitePlus and Ochoa et al.

# Load the two source site tables through the `Data` helper
# (presumably exposed by the `from katlas.core import *` star import — confirm).
ochoa = Data.get_ochoa_site()  # sites from Ochoa et al.
psp = Data.get_psp_human_site()  # human sites from PhosphoSitePlus

The gene column in both datasets contains NaN values.

The PSP gene column holds gene names, whereas the Ochoa gene column holds protein names.

PSP

# Keep only sites whose acceptor residue (first character of `site`) is S, T or Y.
psp = psp[psp.site.str[0].isin(['S','T','Y'])]
# Keep only rows with a non-missing LT_LIT value.
# NOTE(review): LT_LIT is presumably PSP's low-throughput literature count — confirm.
psp = psp[psp.LT_LIT.notna()].reset_index(drop=True)
# Normalise the site window to uppercase.
psp['site_seq'] = psp['site_seq'].str.upper()
# Take an explicit copy of the column subset so the later in-place assignment
# of the 'source' column does not raise SettingWithCopyWarning.
psp = psp[['gene','uniprot','site','site_seq']].copy()

Ochoa

# Build the site label (residue letter + position, e.g. 'S20'), then keep the
# shared columns and rename the accession column to match the PSP table.
ochoa['site'] = ochoa['residue'] + ochoa['position'].astype(str)
ochoa = (
    ochoa[['gene', 'current_uniprot', 'site', 'site_seq']]
    .rename(columns={'current_uniprot': 'uniprot'})
)

Combine

# Tag each table with its origin, then outer-join on (uniprot, site) so that
# sites reported by only one of the two sources are retained.
psp['source'] = 'psp'
ochoa['source'] = 'ochoa'
comb = pd.merge(
    psp,
    ochoa,
    how='outer',
    on=['uniprot', 'site'],
    suffixes=('_psp', '_ochoa'),
)
def join_columns(row, columns):
    """Join the distinct non-missing values of `columns` in `row` with '|'.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Object indexable by the names in `columns`.
    columns : iterable of str
        Column names whose values are combined.

    Returns
    -------
    str or float
        The distinct values joined by '|' in sorted order, or NaN when every
        value is missing.
    """
    valid_values = {row[col] for col in columns if pd.notna(row[col])}
    if not valid_values:
        return np.nan
    # Sets are unordered, and string hashing varies between interpreter
    # sessions, so sort to make the joined label reproducible
    # (always 'ochoa|psp', never 'psp|ochoa').
    return '|'.join(sorted(valid_values))
# Collapse each pair of suffixed columns produced by the outer merge into one
# column holding the '|'-joined non-missing values.
for merged_name in ('gene', 'source', 'site_seq'):
    suffixed = [f'{merged_name}_psp', f'{merged_name}_ochoa']
    comb[merged_name] = comb.apply(
        lambda r, cols=suffixed: join_columns(r, cols), axis=1
    )

comb = comb.loc[:, ['uniprot', 'gene', 'site', 'site_seq', 'source']]
comb.source.value_counts()
source
ochoa        106327
psp            9138
ochoa|psp      5954
Name: count, dtype: int64
comb.shape
(121419, 5)
# Numeric residue position parsed from the site label (e.g. 'S20' -> 20);
# used only as a temporary sort key.
comb['position'] = comb.site.str[1:].astype(int)

# Order sites by protein accession, then by position along the protein.
comb = comb.sort_values(by=['uniprot', 'position']).reset_index(drop=True)

# Select the output columns by name instead of by position, so the result
# does not silently depend on column order; this also drops the helper
# 'position' column.
comb = comb[['uniprot', 'gene', 'site', 'site_seq', 'source']]

To save and load:

# comb.to_parquet('combine_site_psp_ochoa.parquet')

# comb = pd.read_parquet('combine_site_psp_ochoa.parquet')

Query UniProt for protein sequences and map them to the sites

Uncomment the line below to export the unique UniProt IDs to a CSV for UniProt ID mapping

# comb.drop_duplicates(subset='uniprot').to_csv('uniprot.csv',index=False)
# unmapped = pd.Series(['AAC50053',
# 'P18433-2',
# 'AAA58698',
# 'NP_001184222',
# 'AAA60149'])
# Load the UniProt ID-mapping export and keep one row per accession; a few
# accessions appear more than once (historical UniProt entries).
sequence = (
    pd.read_excel('raw/idmapping_2024_06_17.xlsx')
    .drop_duplicates(subset='uniprot')
)
seq = sequence.loc[:, ['uniprot', 'sequence']].copy()
# Inner join: sites whose accession has no row in `seq` are dropped.
comb = pd.merge(comb, seq, how='inner', on='uniprot')
comb.shape
(121272, 6)
121419-121272 # unmatched
147

Validate position

# Split the site label into its numeric position and its acceptor residue letter.
comb['position'] = comb['site'].str.slice(start=1).astype(int)
comb['acceptor'] = comb['site'].str.get(0)
def validate_position(row):
    """Check that the residue at a 1-based position matches the acceptor.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'acceptor' (single residue letter), 'position'
        (1-based index into the sequence) and 'sequence'.

    Returns
    -------
    int
        1 if `sequence[position - 1]` equals the acceptor, otherwise 0.
        Positions outside `1..len(sequence)` return 0.
    """
    amino_acid = row['acceptor']
    position = int(row['position'])
    sequence = row['sequence']

    # An explicit bounds check is needed: catching IndexError alone misses
    # position <= 0, which would wrap around via negative indexing and could
    # wrongly report a match against the end of the sequence.
    if position < 1 or position > len(sequence):
        return 0
    return int(sequence[position - 1] == amino_acid)
# Flag every row (1 = residue matches the sequence, 0 = mismatch) and tally the flags.
comb['is_valid'] = comb.apply(validate_position, axis='columns')
comb['is_valid'].value_counts()
is_valid
1    120104
0      1168
Name: count, dtype: int64
# Keep only the rows that passed validation, then show the remaining sites per source.
comb = comb.loc[comb['is_valid'].eq(1)]
comb['source'].value_counts()
source
ochoa        105775
psp            8382
ochoa|psp      5947
Name: count, dtype: int64
comb.shape
(120104, 9)

Phosphorylate sequence

# Collect the unique site labels of each protein, then attach its sequence.
unique_sites = {'site': lambda s: s.unique()}
modify = comb.groupby('uniprot').agg(unique_sites).reset_index()
modify = modify.merge(seq, on='uniprot')
def phosphorylate_seq(row):
    """Return `row['sequence']` with every residue listed in `row['site']` lowercased.

    Each site label is a residue letter followed by a 1-based position
    (e.g. 'S20'); only the numeric part is used to locate the residue.
    """
    residues = list(row['sequence'])
    # Convert the 1-based positions in the labels to 0-based list indices.
    zero_based = (int(label[1:]) - 1 for label in row['site'])
    for idx in zero_based:
        residues[idx] = residues[idx].lower()
    return ''.join(residues)
# Build the lowercase-marked sequence of every protein and keep only the lookup columns.
modify['phospho_seq'] = modify.apply(phosphorylate_seq, axis='columns')
seq2 = modify.loc[:, ['uniprot', 'phospho_seq']]

Extract sequence

# Attach each protein's phosphorylated sequence to its sites
# ('uniprot' is the only column the two frames share).
comb = pd.merge(comb, seq2, on='uniprot')
comb.shape
(120104, 10)
# Extract the site sequence of every row from the 'phospho_seq' column at 'position'.
# NOTE(review): extract_site_seq is not defined in this file (presumably from
# katlas.core); its window size and return type should be confirmed there.
site_seq = extract_site_seq(comb,'phospho_seq','position')
100%|██████████| 120104/120104 [00:03<00:00, 31171.64it/s]
# Overwrite the original site_seq with the window taken from the phosphorylated
# sequence. NOTE(review): assumes site_seq is aligned row-for-row with comb — confirm.
comb['site_seq'] = site_seq
comb.shape
(120104, 10)

Reorder

# Numeric residue position parsed from the site label (e.g. 'S20' -> 20);
# used as the secondary sort key.
comb['position'] = comb.site.str[1:].astype(int)

# Order sites by protein accession, then by position along the protein.
comb = comb.sort_values(by=['uniprot', 'position']).reset_index(drop=True)

# Select the output columns by name instead of by position, so the result
# does not silently depend on column order; this drops the helper columns
# (sequence, position, acceptor, is_valid, phospho_seq).
comb = comb[['uniprot', 'gene', 'site', 'site_seq', 'source']]

To save and load:

# comb.to_parquet('phosphorylated_combine_site.parquet')

# comb=pd.read_parquet('phosphorylated_combine_site.parquet')
comb
uniprot gene site site_seq source
0 A0A024R4G9 C19orf48 S20 ITGSRLLsMVPGPAR psp
1 A0A075B6Q4 NaN S24 VDDEKGDsNDDYDSA ochoa
2 A0A075B6Q4 NaN S35 YDSAGLLsDEDCMSV ochoa
3 A0A075B6Q4 NaN S57 IADHLFWsEETKSRF ochoa
4 A0A075B6Q4 NaN S68 KSRFTEYsMTssVMR ochoa
... ... ... ... ... ...
120099 V9GYY5 NaN S127 EGGAGDRsEEEAsst ochoa
120100 V9GYY5 NaN S132 DRsEEEAsstEKPtK ochoa
120101 V9GYY5 NaN S133 RsEEEAsstEKPtKA ochoa
120102 V9GYY5 NaN T134 sEEEAsstEKPtKAL ochoa
120103 V9GYY5 NaN T138 AsstEKPtKALPRKS ochoa

120104 rows × 5 columns

Access the dataset through Data

Data.get_combine_site_psp_ochoa()
uniprot gene site site_seq source AM_pathogenicity CDDM_upper CDDM_max_score
0 A0A024R4G9 C19orf48 S20 ITGSRLLSMVPGPAR psp NaN PRKX,AKT1,PKG1,P90RSK,HIPK4,AKT3,HIPK1,PKACB,H... 2.407041
1 A0A075B6Q4 None S24 VDDEKGDSNDDYDSA ochoa NaN CK2A2,CK2A1,GRK7,GRK5,CK1G1,CK1A,IKKA,CK1G2,CA... 2.295654
2 A0A075B6Q4 None S35 YDSAGLLSDEDCMSV ochoa NaN CK2A2,CK2A1,IKKA,ATM,IKKB,CAMK1D,MARK2,GRK7,IK... 2.488683
3 A0A075B6Q4 None S57 IADHLFWSEETKSRF ochoa NaN GRK7,CK2A1,CK2A2,PKN2,GRK1,GRK5,MARK1,MARK2,UL... 1.851894
4 A0A075B6Q4 None S68 KSRFTEYSMTSSVMR ochoa NaN AKT1,P90RSK,AKT3,SGK1,AKT2,NDR2,RSK2,P70S6K,RS... 2.026384
... ... ... ... ... ... ... ... ...
121414 V9GYY5 None S127 EGGAGDRSEEEASST ochoa NaN CK2A1,CK2A2,GRK7,GRK5,ALK2,GRK1,CK1E,PLK3,CK1A... 2.665606
121415 V9GYY5 None S132 DRSEEEASSTEKPTK ochoa NaN CK2A2,CK2A1,GRK7,TGFBR1,GRK2,ALK2,PLK3,CLK3,BM... 2.445179
121416 V9GYY5 None S133 RSEEEASSTEKPTKA ochoa NaN CK2A1,ATR,GRK1,CK1G1,PLK3,CLK3,GRK7,CK1G2,MARK... 2.090739
121417 V9GYY5 None T134 SEEEASSTEKPTKAL ochoa NaN ASK1,PERK,EEF2K,MAP2K4,MEKK2,MST1,BMPR1B,OSR1,... 1.832532
121418 V9GYY5 None T138 ASSTEKPTKALPRKS ochoa NaN ASK1,MEK2,MPSK1,TNIK,PBK,MST2,MINK,NEK4,LKB1,MEK5 1.807565

121419 rows × 8 columns

# Load the phosphorylated combined dataset through `Data` and show the rows of one
# example gene (CTNNB1).
comb2 = Data.get_combine_site_phosphorylated()
comb2[comb2.gene=='CTNNB1']
uniprot gene site site_seq source AM_pathogenicity CDDM PSPA CDDM_max_score PSPA_max_score
28253 P35222 CTNNB1 T3 _____MAtQADLMEL ochoa 0.350216 ATR,ATM,DNAPK,CAMKK1,CAMKK2,PBK,ASK1,OSR1,TNIK... MARK1,MARK2,DNAPK,ATR,SMG1,HUNK,QSK,MARK3,MARK... 1.685871 2.839518
28254 P35222 CTNNB1 S23 PDRKAAVsHWQQQsy psp 0.434195 P90RSK,RSK4,MARK1,RSK2,AKT1,TSSK2,SGK1,P70S6K,... SSTK,BRSK1,PRKD3,BRSK2,P70S6K,SNRK,MARK3,MAPKA... 1.795827 3.255439
28255 P35222 CTNNB1 S29 VsHWQQQsyLDsGIH ochoa|psp 0.628389 PAK4,CAMK1D,NIM1,LATS2,PAK5,TBK1,TSSK1,GRK7,NU... GSK3A,GSK3B,LATS2,MAPKAPK2,CAMK2A,CAMK2B,LATS1... 1.693298 6.465129
28256 P35222 CTNNB1 Y30 sHWQQQsyLDsGIHs ochoa|psp 0.780358 ERBB4,FGFR4,TNK1,JAK3,CSK,KIT,EPHA5,EGFR,JAK2,... BMPR2_TYR,PTK2,SYK,ERBB4,PDHK1_TYR,EPHA3,PDHK4... 1.596045 2.796830
28257 P35222 CTNNB1 S33 QQQsyLDsGIHsGAT psp 0.978753 GSK3B,IKKB,IKKA,GSK3A,TBK1,PAK4,GRK1,IKKE,P90R... CK1G2,CK1A,CK1G3,GSK3A,GSK3B,GRK3,CK1A2,CK1D,J... 1.797053 8.479370
28258 P35222 CTNNB1 S37 yLDsGIHsGATtTAP ochoa|psp 0.954689 GSK3A,PAK6,PAK5,GSK3B,TBK1,PAK4,PRKX,IKKB,ULK3... GSK3A,GSK3B,CK1A,CK1G2,GRK7,IKKA,GRK4,GRK5,IKK... 1.795597 8.083523
28259 P35222 CTNNB1 T41 GIHsGATtTAPsLsG psp 0.903105 MPSK1,GSK3B,PBK,GSK3A,MEK2,ASK1,LKB1,TNIK,MEKK... GSK3A,GSK3B,PRP4,PASK,CK1G2,CK1A,GRK7,CK1D,CK1... 1.649193 7.866048
28260 P35222 CTNNB1 S45 GATtTAPsLsGKGNP ochoa|psp 0.915674 TBK1,IKKE,PAK6,MTOR,PAK5,PRKX,RSK4,RSK2,CK1A,ULK3 CK1A,CK1G1,CK1D,CK1G2,MTOR,CK1E,CK1A2,CLK3,IKK... 2.044766 4.275749
28261 P35222 CTNNB1 S47 TtTAPsLsGKGNPEE ochoa 0.556963 CK1G2,PAK5,PAK6,PRKX,PKACB,CK2A1,PRKD3,ULK3,RS... MPSK1,CK1E,ERK5,GRK1,CK1D,COT,MOS,NLK,MLK3,ERK1 1.823853 2.982430
28262 P35222 CTNNB1 S60 EEEDVDTsQVLyEWE psp 0.208932 ATM,ATR,GRK7,GRK1,BUB1B,CK2A2,GRK5,DNAPK,CK2A1... ATM,ACVR2B,ACVR2A,GRK4,GRK7,CK1G2,PLK1,TLK2,PL... 2.656530 5.809839
28263 P35222 CTNNB1 Y64 VDTsQVLyEWEQGFS psp 0.294942 TEC,BLK,EPHA4,BTK,SRMS,SYK,FES,EPHA7,EPHA6,FYN SYK,MERTK,EPHA5,SRMS,EPHA4,FER,EPHA3,PTK6,FES,TEK 1.804839 2.871988
28264 P35222 CTNNB1 S73 WEQGFSQsFTQEQVA psp 0.347105 BRSK2,IKKB,ULK3,TSSK1,NUAK1,PKACB,PRKX,NDR2,TB... CAMK2G,ULK1,PLK1,PLK3,IKKA,CAMK2B,DSTYK,NEK2,I... 1.725664 2.099936
28265 P35222 CTNNB1 Y86 VADIDGQyAMTRAQR psp 0.688526 JAK3,FGFR4,EGFR,TEK,KIT,FYN,BLK,LTK,TNK2,ABL1 ABL1,ABL2,MATK,BLK,FYN,FGR,MERTK,ZAP70,LCK,HCK 1.819756 2.467626
28266 P35222 CTNNB1 T102 RAAMFPEtLDEGMQI psp 0.894711 TNIK,CK2A2,HGK,CAMKK1,MEKK3,NEK4,MINK,GCK,KHS1... DNAPK,CK2A1,CK2A2,CAMK2G,PLK2,PLK3,CAMK2A,TGFB... 1.607649 3.374482
28267 P35222 CTNNB1 T112 EGMQIPStQFDAAHP psp 0.663232 ATR,ATM,DNAPK,EEF2K,CAMKK2,PBK,ICK,MPSK1,OSR1,... ATR,DNAPK,SMG1,P38D,MPSK1,ATM,ICK,EEF2K,TLK2,A... 1.758952 2.576788
28268 P35222 CTNNB1 T120 QFDAAHPtNVQRLAE psp 0.878526 PBK,LKB1,CAMKK1,OSR1,TNIK,EEF2K,ROCK2,NEK4,MIN... MPSK1,TAK1,PINK1,VRK1,YANK2,GAK,GSK3A,PASK,ROC... 1.595444 1.445671
28269 P35222 CTNNB1 Y142 AVVNLINyQDDAELA psp 0.980268 EPHA4,ATR,EPHA6,EPHA7,EPHB1,EPHA3,EPHB2,EPHA8,... PDHK3_TYR,EPHA4,PDHK4_TYR,EPHA5,EPHA3,PDHK1_TY... 1.930682 0.982576
28270 P35222 CTNNB1 S184 QLSKKEAsRHAIMRs psp 0.927547 PAK4,TSSK1,BRSK2,MNK2,PAK6,PKN2,MARK2,NUAK1,DC... SSTK,DCAMKL1,BRSK2,TSSK1,BRSK1,TLK1,TSSK2,AMPK... 1.935281 4.978343
28271 P35222 CTNNB1 S191 sRHAIMRsPQMVSAI ochoa|psp 0.811158 CDK4,ERK2,DYRK1A,CDK1,HIPK3,CDK2,CDK5,ERK1,DYR... CDK17,CDK16,CDK14,CDK18,ERK1,ERK2,P38G,CDK9,CD... 2.070521 4.814151
28272 P35222 CTNNB1 S246 ALVKMLGsPVDSVLF psp 0.772711 CDK4,CDK1,CDK2,CDK5,ERK2,CDK3,JNK1,ERK1,DYRK4,... JNK1,JNK2,P38D,JNK3,P38G,P38B,P38A,CDK8,NLK,CDK14 2.267036 6.378471
28273 P35222 CTNNB1 T298 VKFLAITtDCLQILA psp 0.972658 PBK,LKB1,TNIK,NEK4,MST2,MINK,CAMKK1,MST1,NEK1,HGK MEK1,GSK3B,MOS,DAPK2,ALK4,ACVR2B,ACVR2A,MEK2,B... 1.589268 0.414628
28274 P35222 CTNNB1 S311 LAYGNQEsKLIILAS psp 0.972500 TSSK1,NIM1,QIK,TSSK2,MARK2,MARK1,CAMK1D,MARK3,... IRE2,SKMLCK,TSSK1,SSTK,SMMLCK,TLK1,TSSK2,MELK,... 1.821695 2.632692
28275 P35222 CTNNB1 Y331 LVNIMRTytyEKLLW psp 0.837516 DDR2,FGFR4,EPHB1,LCK,RET,EPHA8,FES,EPHA4,EPHA5... PTK2,EPHA4,EPHA7,EPHA3,EPHA6,PTK2B,BMPR2_TYR,S... 1.629153 4.126294
28276 P35222 CTNNB1 T332 VNIMRTytyEKLLWT psp 0.408153 AKT1,PBK,AMPKA1,LRRK2,CHK2,LKB1,TNIK,GCK,ROCK2... ALPHAK3,YANK2,PASK,GRK7,SBK,YANK3,MAPKAPK2,NUA... 1.434642 4.102963
28277 P35222 CTNNB1 Y333 NIMRTytyEKLLWTT psp 0.982195 EPHA1,EPHA7,EPHA2,BLK,TNK2,SRMS,EPHA4,LTK,EPHA... SYK,BLK,EPHA6,EPHA4,FRK,PTK2,ZAP70,ERBB4,EPHA8... 1.711815 3.995439
28278 P35222 CTNNB1 S352 KVLSVCSsNKPAIVE psp 0.839116 PRKD2,PRKD3,HIPK4,PRKD1,MAPKAPK3,TSSK2,CAMK4,M... BUB1,HIPK4,PRKD1,PRKD2,PRKD3,CDK18,KIS,PERK,ML... 2.003260 2.875428
28279 P35222 CTNNB1 T371 QALGLHLtDPsQRLV psp 0.286979 PBK,LKB1,NEK4,TNIK,CAMKK1,MEKK3,CAMKK2,MINK,GC... SMG1,MEK1,MEK2,ACVR2B,CK2A1,GSK3B,MOS,DAPK2,BU... 1.658914 1.683408
28280 P35222 CTNNB1 S374 GLHLtDPsQRLVQNC psp 0.960511 ATR,ATM,DNAPK,ERK7,NIM1,TSSK2,LATS1,PKCA,DSTYK... CK1G3,CK1D,CK1A,CK1E,CK1A2,CK1G2,CK1G1,KIS,TGF... 2.421310 5.546010
28281 P35222 CTNNB1 T384 LVQNCLWtLRNLSDA psp 0.945400 CAMKK1,MINK,TNIK,CAMKK2,MST1,ERK7,NEK4,HGK,LKB... ERK7,TAK1,NEK4,MST1,HPK1,NEK8,CAMKK1,IRAK1,NEK... 1.728943 2.678529
28282 P35222 CTNNB1 T393 RNLSDAAtKQEGMEG psp 0.846858 CK2A2,MEKK3,PBK,TNIK,CAMKK1,CAMKK2,MINK,MST1,M... GSK3A,CK1G3,TLK1,TSSK2,TLK2,GRK7,ALPHAK3,GSK3B... 1.609937 1.804045
28283 P35222 CTNNB1 T472 ICALRHLtSRHQEAE psp 0.988368 AKT1,ROCK1,ROCK2,SGK1,ERK7,MRCKB,PBK,AKT3,MST1... PIM2,PKN1,ROCK1,MRCKB,PKCE,AKT3,LOK,AKT1,SGK1,... 1.585884 4.709928
28284 P35222 CTNNB1 Y489 QNAVRLHyGLPVVVK ochoa|psp 0.690221 CSK,PTK6,TNK1,TNK2,ABL1,PTK2B,LTK,TYRO3,LCK,PD... ABL1,PTK2B,ABL2,PKMYT1_TYR,TNK2,FER,TNK1,PTK6,... 1.824113 2.370134
28285 P35222 CTNNB1 T547 LVRAHQDtQRRtsMG ochoa 0.646316 ATR,OSR1,ATM,DNAPK,TNIK,MINK,ERK7,HGK,EEF2K,LKB1 GSK3B,GSK3A,EEF2K,OSR1,PIM1,MLK4,PKCB,CAMK2A,E... 1.709029 5.511960
28286 P35222 CTNNB1 T551 HQDtQRRtsMGGtQQ ochoa|psp 0.402405 LKB1,PBK,ROCK1,OSR1,TNIK,ROCK2,AURB,ASK1,HASPI... DAPK1,DAPK3,CK1G2,GRK2,GRK3,AURA,CK1A,CK1A2,CK... 1.511777 4.392697
28287 P35222 CTNNB1 S552 QDtQRRtsMGGtQQQ ochoa|psp 0.596179 PRKX,PKACA,PKACB,AKT1,PKG2,PKG1,AURB,AURC,AKT3... AURA,PRKX,CLK2,GSK3B,PKACB,MSK1,AURC,GSK3A,DAP... 2.333742 7.778372
28288 P35222 CTNNB1 T556 RRtsMGGtQQQFVEG ochoa|psp 0.330547 ATR,ATM,DNAPK,OSR1,PBK,HASPIN,TNIK,GCK,HGK,MINK ATR,DNAPK,SMG1,CK1G3,ATM,TLK2,NEK5,CK1D,MEKK2,... 1.792496 3.552549
28289 P35222 CTNNB1 S605 LFVQLLYsPIENIQR psp 0.744484 CDK1,CDK4,CDK5,CDK2,CDK3,ERK2,ERK1,JNK1,P38G,JNK2 JNK1,JNK3,JNK2,P38D,P38G,P38B,ERK5,ERK2,ERK1,D... 2.161258 5.347646
28290 P35222 CTNNB1 S646 PLTELLHsRNEGVAt psp 0.956311 CK2A2,CK2A1,IKKB,GRK1,GRK7,PAK4,IKKA,ULK3,P90R... GRK6,PINK1,NEK9,PLK2,CAMKK1,NEK7,PLK3,GRK5,COT... 2.018886 2.662771
28291 P35222 CTNNB1 T653 sRNEGVAtyAAAVLF ochoa|psp 0.986426 TNIK,PBK,OSR1,TAO2,MINK,MST1,KHS1,MST2,GCK,TAO1 GRK7,PRP4,TGFBR1,ALK4,JNK1,P38G,P38B,ACVR2B,BM... 1.631637 2.709797
28292 P35222 CTNNB1 Y654 RNEGVAtyAAAVLFR ochoa|psp 0.993016 EPHA7,EPHA4,TXK,CSK,LTK,BTK,EPHB3,LYN,EPHA1,BLK PTK2,EPHA6,EPHB2,BLK,FYN,ZAP70,ABL2,ABL1,SYK,LCK 1.929364 3.188853
28293 P35222 CTNNB1 Y670 SEDKPQDyKKRLsVE psp 0.954805 FLT1,SYK,PTK2,KIT,CSF1R,EPHA4,EPHA6,JAK2,EPHA2... PDHK4_TYR,WEE1_TYR,TNNI3K_TYR,BMPR2_TYR,PDHK3_... 1.951307 1.536509
28294 P35222 CTNNB1 S675 QDyKKRLsVELtssL ochoa|psp 0.980063 PAK4,PAK6,PAK5,PKACA,AURB,PAK1,PKG2,PRKX,PAK2,... GSK3B,AURA,MYLK4,MSK1,CLK4,PKACA,DAPK1,SKMLCK,... 2.388692 6.234278
28295 P35222 CTNNB1 T679 KRLsVELtssLFRTE ochoa 0.953053 TGFBR1,BMPR1B,ALK2,PBK,CAMKK2,ALK4,OSR1,PERK,T... BMPR1B,BMPR1A,ACVR2B,ACVR2A,TGFBR1,GRK2,GRK3,A... 1.719372 6.180493
28296 P35222 CTNNB1 S680 RLsVELtssLFRTEP ochoa 0.810021 IKKB,GRK7,CLK3,CK1G1,CDK7,CK1G3,P90RSK,RSK4,CK... CK1G2,CK1A,GRK2,CK1E,GRK3,CK1A2,CK1D,CK1G3,ACV... 1.845225 4.125982
28297 P35222 CTNNB1 S681 LsVELtssLFRTEPM ochoa 0.905768 ULK3,CLK3,TBK1,MTOR,IKKE,DSTYK,IKKB,NEK7,NEK6,... CK1G2,CK1D,CK1E,CK1A2,CK1A,GRK1,CK1G3,GRK3,GRK... 1.881727 4.856114
28298 P35222 CTNNB1 S715 GYRQDDPsyRsFHSG ochoa|psp 0.276474 CK1G2,CK1G3,ERK7,CK1G1,AKT1,CK1A,RSK4,P90RSK,T... GRK7,CK1G2,CLK3,GRK3,CK1G3,GRK2,ACVR2A,GSK3A,L... 1.845451 5.621210
28299 P35222 CTNNB1 Y716 YRQDDPsyRsFHSGG ochoa 0.276789 KIT,EGFR,JAK1,FGFR4,FLT3,CSF1R,FYN,ABL1,TEK,FLT1 PTK2,FYN,ERBB4,FLT1,SYK,ZAP70,SRC,BLK,MET,LCK 1.730527 6.357552
28300 P35222 CTNNB1 S718 QDDPsyRsFHSGGYG psp 0.365442 IKKA,TBK1,IKKB,CK1A,PAK5,CK1G2,PAK6,ULK3,MNK2,... CK1G2,CK1G3,CK1A,YANK2,GRK3,CK1G1,YANK3,CK1A2,... 1.670313 10.234688

(code only) Add AlphaMissense score to each site

The AM_mean.parquet file is too big to upload to the current repository. To generate the file, refer to others_01_Process_AM.ipynb notebook.

# Load the AlphaMissense table, give its four columns explicit names, and keep
# the first three (the 'position' column is dropped).
am_columns = ['uniprot', 'site', 'AM_pathogenicity', 'position']
df = pd.read_parquet('raw/AM_mean.parquet')
df.columns = am_columns
df = df[am_columns[:3]]

The original:

# Left-join the AlphaMissense score onto the original combined sites;
# sites without a score keep NaN.
comb1 = Data.get_combine_site_psp_ochoa().merge(
    df, how='left', on=['uniprot', 'site']
)

The phosphorylated

# Left-join the AlphaMissense score onto the phosphorylated combined sites;
# sites without a score keep NaN.
comb2 = Data.get_combine_site_phosphorylated().merge(
    df, how='left', on=['uniprot', 'site']
)

Add kinase prediction

For the uppercase site sequences, only CDDM predictions are computed.

# Where the two sources disagreed, site_seq holds two '|'-joined windows; keep
# the first (only one site is affected, with a single amino-acid difference).
comb1['site_seq'] = comb1['site_seq'].str.split('|').str.get(0)
cddm_upper = predict_kinase_df(comb1, 'site_seq', **param_CDDM_upper)
input dataframe has a length 121419
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [02:22<00:00,  2.03it/s]
def get_top(r, n):
    """Return the index labels of the `n` highest values in `r`, comma-joined.

    `r` is a Series (one row of a score table); labels are ordered from the
    highest score to the lowest.
    """
    ranked = r.sort_values(ascending=False)
    return ','.join(ranked.iloc[:n].index)
# For each site, store the ten highest-scoring column labels and the highest score.
cddm_upper_rnk = cddm_upper.apply(get_top, axis=1, n=10)
comb1['CDDM_upper'] = cddm_upper_rnk
comb1['CDDM_max_score'] = cddm_upper.max(axis=1)

Uncomment below to save:

# comb1.to_parquet('raw/combine_site_psp_ochoa.parquet')

For phosphorylated:

# Score the phosphorylated site sequences with the CDDM parameter set.
# NOTE(review): predict_kinase_df and param_CDDM are not defined in this file
# (presumably from katlas.core) — confirm their contract there.
cddm = predict_kinase_df(comb2,'site_seq',**param_CDDM)
input dataframe has a length 120104
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [02:17<00:00,  2.11it/s]
# Score the phosphorylated site sequences with the PSPA parameter set.
# NOTE(review): predict_kinase_df and param_PSPA are not defined in this file
# (presumably from katlas.core) — confirm their contract there.
pspa = predict_kinase_df(comb2,'site_seq',**param_PSPA)
input dataframe has a length 120104
Preprocessing
Finish preprocessing
Calculating position: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
100%|██████████| 396/396 [04:16<00:00,  1.55it/s]
# For each site, store the ten highest-scoring column labels and the highest
# score from both score tables.
cddm_rnk = cddm.apply(get_top, axis=1, n=10)
pspa_rnk = pspa.apply(get_top, axis=1, n=10)
comb2['CDDM'] = cddm_rnk
comb2['PSPA'] = pspa_rnk
comb2['CDDM_max_score'] = cddm.max(axis=1)
comb2['PSPA_max_score'] = pspa.max(axis=1)

To save and load:

# comb2.to_parquet('raw/phosphorylated_combine_site.parquet')

# comb2 = pd.read_parquet('raw/phosphorylated_combine_site.parquet')