import pandas as pd
from matplotlib import pyplot as plt
from katlas.core import *
import seaborn as sns
from tqdm import tqdm
import numpy as np
tqdm.pandas()
Build human phosphoproteome dataset
Setup
Load PhosphoSitePlus and Ochoa et al.
# Load the two source site tables through the Data helper
ochoa = Data.get_ochoa_site()
psp = Data.get_psp_human_site()
The gene column of both datasets contains NaN values.
The PSP gene column holds gene names, whereas the Ochoa gene column holds protein names.
PSP
# Keep only sites whose first character is S, T or Y
psp = psp[psp.site.str[0].isin(['S','T','Y'])]
# Keep rows that have a non-null LT_LIT value
psp = psp[psp.LT_LIT.notna()].reset_index(drop=True)
psp['site_seq'] = psp['site_seq'].str.upper()
# .copy() so columns can be added later without writing to a slice of the original frame
psp = psp[['gene','uniprot','site','site_seq']].copy()
Ochoa
# Build the site label by concatenating residue and position (as text)
ochoa['site'] = ochoa.residue+ochoa.position.astype(str)
ochoa = ochoa[['gene','current_uniprot','site','site_seq']]
# Align the ID column name with the PSP table
ochoa = ochoa.rename(columns={'current_uniprot':'uniprot'})
Combine
# Tag each table with its origin before merging
ochoa['source']='ochoa'
psp['source']='psp'
# Outer merge keeps sites found in either table; overlapping columns get a _psp / _ochoa suffix
comb = psp.merge(ochoa,how='outer',on=['uniprot','site'],suffixes=('_psp','_ochoa'))
def join_columns(row, columns):
    """Join the distinct non-NA values of `columns` in `row` with '|'.

    Values are de-duplicated and sorted, so the result does not depend on
    set iteration order. Returns `np.nan` when every value is missing.
    """
    valid_values = {row[col] for col in columns if pd.notna(row[col])}
    # return nan if there is nothing to join
    if not valid_values:
        return np.nan
    return '|'.join(sorted(valid_values))
# Collapse each pair of suffixed columns from the merge into a single column
comb['gene'] = comb.apply(lambda r: join_columns(r, ['gene_psp','gene_ochoa']), axis=1)
comb['source'] = comb.apply(lambda r: join_columns(r, ['source_psp','source_ochoa']), axis=1)
comb['site_seq'] = comb.apply(lambda r: join_columns(r, ['site_seq_psp','site_seq_ochoa']), axis=1)
# .copy() so columns can be added later without writing to a slice of the merged frame
comb = comb[['uniprot','gene','site','site_seq','source']].copy()
comb.source.value_counts()
source
ochoa 106327
psp 9138
ochoa|psp 5954
Name: count, dtype: int64
comb.shape
(121419, 5)
# Numeric position (site label without its leading residue letter) for sorting
comb['position'] = comb.site.str[1:].astype(int)
comb = comb.sort_values(by=['uniprot', 'position']).reset_index(drop=True)
# Keep only the first five columns
comb = comb.iloc[:,:5]
To save and load:
# comb.to_parquet('combine_site_psp_ochoa.parquet')
# comb = pd.read_parquet('combine_site_psp_ochoa.parquet')
Query UniProt for protein sequences and map them to the sites
Uncomment the line below to export a CSV of unique UniProt IDs for ID mapping
# comb.drop_duplicates(subset='uniprot').to_csv('uniprot.csv',index=False)
# unmapped = pd.Series(['AAC50053',
# 'P18433-2',
# 'AAA58698',
# 'NP_001184222',
# 'AAA60149'])
# Load the ID-mapping spreadsheet
sequence = pd.read_excel('raw/idmapping_2024_06_17.xlsx')
# there are a few duplicated uniprot IDs (history uniprot); keep the first row of each
sequence = sequence.drop_duplicates(subset='uniprot')
seq = sequence[['uniprot','sequence']].copy()
# Inner merge drops sites whose uniprot ID has no row in seq
comb = comb.merge(seq,how='inner',on='uniprot')
comb.shape
(121272, 6)
121419-121272 # unmatched
147
Validate position
# Split the site label into its numeric position and its leading residue letter
comb['position'] = comb.site.str[1:].astype(int)
comb['acceptor'] = comb.site.str[0]
def validate_position(row):
    """Return 1 if `row['sequence']` has `row['acceptor']` at 1-based `row['position']`, else 0.

    Positions outside the sequence (including 0 and negative values, which
    would otherwise wrap around through negative indexing) and rows whose
    sequence is not a string return 0.
    """
    # Extract amino acid and position from the new columns
    amino_acid = row['acceptor']
    position = int(row['position'])
    sequence = row['sequence']

    # A missing sequence (e.g. NaN) cannot be indexed
    if not isinstance(sequence, str):
        return 0
    # Reject positions that fall outside the sequence on either side
    if not 1 <= position <= len(sequence):
        return 0
    # Check if the amino acid at the given position matches the specified amino acid
    return int(sequence[position - 1] == amino_acid)
# Flag each site: 1 when the sequence has the acceptor residue at that position, else 0
comb['is_valid'] = comb.apply(validate_position,axis=1)
comb.is_valid.value_counts()
is_valid
1 120104
0 1168
Name: count, dtype: int64
# Keep only the sites that passed validation
comb = comb[comb.is_valid==1]
comb.source.value_counts()
source
ochoa 105775
psp 8382
ochoa|psp 5947
Name: count, dtype: int64
comb.shape
(120104, 9)
Phosphorylate sequence
# Collect the unique sites of each uniprot ID, then attach its sequence from seq
modify = comb.groupby('uniprot').agg({'site':lambda r: r.unique()}).reset_index()
modify = modify.merge(seq)
def phosphorylate_seq(row):
    """Return `row['sequence']` with the residue at every site in `row['site']` lowercased.

    Each site is a string whose first character is the residue letter and
    whose remainder is the 1-based position (e.g. 'S20').

    Raises IndexError when a site position lies outside the sequence.
    """
    residues = list(row['sequence'])
    for pos in row['site']:
        # Subtract 1 because Python uses 0-based indexing
        index = int(pos[1:]) - 1
        # Without this check a position of 0 would wrap around to the last residue
        if not 0 <= index < len(residues):
            raise IndexError(
                f"site {pos!r} is outside the sequence (length {len(residues)})"
            )
        residues[index] = residues[index].lower()
    return ''.join(residues)
# Lowercase every listed site in each protein sequence
modify['phospho_seq'] = modify.apply(phosphorylate_seq,axis=1)
seq2 = modify[['uniprot','phospho_seq']]
Extract sequence
# Attach the phosphorylated sequence to every site row (merges on the shared column names)
comb = comb.merge(seq2)
comb.shape
(120104, 10)
# Extract a site-centred sequence window from phospho_seq at each position
site_seq = extract_site_seq(comb,'phospho_seq','position')
100%|██████████| 120104/120104 [00:03<00:00, 31171.64it/s]
# Overwrite site_seq with the windows extracted from the phosphorylated sequence
comb['site_seq'] = site_seq
comb.shape
(120104, 10)
Reorder
# Numeric position (site label without its leading residue letter) for sorting
comb['position'] = comb.site.str[1:].astype(int)
comb = comb.sort_values(by=['uniprot', 'position']).reset_index(drop=True)
# Keep only the first five columns
comb = comb.iloc[:,:5]
To save and load:
# comb.to_parquet('phosphorylated_combine_site.parquet')
# comb=pd.read_parquet('phosphorylated_combine_site.parquet')
comb
uniprot | gene | site | site_seq | source | |
---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 | S20 | ITGSRLLsMVPGPAR | psp |
1 | A0A075B6Q4 | NaN | S24 | VDDEKGDsNDDYDSA | ochoa |
2 | A0A075B6Q4 | NaN | S35 | YDSAGLLsDEDCMSV | ochoa |
3 | A0A075B6Q4 | NaN | S57 | IADHLFWsEETKSRF | ochoa |
4 | A0A075B6Q4 | NaN | S68 | KSRFTEYsMTssVMR | ochoa |
... | ... | ... | ... | ... | ... |
120099 | V9GYY5 | NaN | S127 | EGGAGDRsEEEAsst | ochoa |
120100 | V9GYY5 | NaN | S132 | DRsEEEAsstEKPtK | ochoa |
120101 | V9GYY5 | NaN | S133 | RsEEEAsstEKPtKA | ochoa |
120102 | V9GYY5 | NaN | T134 | sEEEAsstEKPtKAL | ochoa |
120103 | V9GYY5 | NaN | T138 | AsstEKPtKALPRKS | ochoa |
120104 rows × 5 columns
Access the dataset through Data
Data.get_combine_site_psp_ochoa()
uniprot | gene | site | site_seq | source | AM_pathogenicity | CDDM_upper | CDDM_max_score | |
---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 | S20 | ITGSRLLSMVPGPAR | psp | NaN | PRKX,AKT1,PKG1,P90RSK,HIPK4,AKT3,HIPK1,PKACB,H... | 2.407041 |
1 | A0A075B6Q4 | None | S24 | VDDEKGDSNDDYDSA | ochoa | NaN | CK2A2,CK2A1,GRK7,GRK5,CK1G1,CK1A,IKKA,CK1G2,CA... | 2.295654 |
2 | A0A075B6Q4 | None | S35 | YDSAGLLSDEDCMSV | ochoa | NaN | CK2A2,CK2A1,IKKA,ATM,IKKB,CAMK1D,MARK2,GRK7,IK... | 2.488683 |
3 | A0A075B6Q4 | None | S57 | IADHLFWSEETKSRF | ochoa | NaN | GRK7,CK2A1,CK2A2,PKN2,GRK1,GRK5,MARK1,MARK2,UL... | 1.851894 |
4 | A0A075B6Q4 | None | S68 | KSRFTEYSMTSSVMR | ochoa | NaN | AKT1,P90RSK,AKT3,SGK1,AKT2,NDR2,RSK2,P70S6K,RS... | 2.026384 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
121414 | V9GYY5 | None | S127 | EGGAGDRSEEEASST | ochoa | NaN | CK2A1,CK2A2,GRK7,GRK5,ALK2,GRK1,CK1E,PLK3,CK1A... | 2.665606 |
121415 | V9GYY5 | None | S132 | DRSEEEASSTEKPTK | ochoa | NaN | CK2A2,CK2A1,GRK7,TGFBR1,GRK2,ALK2,PLK3,CLK3,BM... | 2.445179 |
121416 | V9GYY5 | None | S133 | RSEEEASSTEKPTKA | ochoa | NaN | CK2A1,ATR,GRK1,CK1G1,PLK3,CLK3,GRK7,CK1G2,MARK... | 2.090739 |
121417 | V9GYY5 | None | T134 | SEEEASSTEKPTKAL | ochoa | NaN | ASK1,PERK,EEF2K,MAP2K4,MEKK2,MST1,BMPR1B,OSR1,... | 1.832532 |
121418 | V9GYY5 | None | T138 | ASSTEKPTKALPRKS | ochoa | NaN | ASK1,MEK2,MPSK1,TNIK,PBK,MST2,MINK,NEK4,LKB1,MEK5 | 1.807565 |
121419 rows × 8 columns
comb2 = Data.get_combine_site_phosphorylated()
# Show the rows whose gene is CTNNB1
comb2[comb2.gene=='CTNNB1']
uniprot | gene | site | site_seq | source | AM_pathogenicity | CDDM | PSPA | CDDM_max_score | PSPA_max_score | |
---|---|---|---|---|---|---|---|---|---|---|
28253 | P35222 | CTNNB1 | T3 | _____MAtQADLMEL | ochoa | 0.350216 | ATR,ATM,DNAPK,CAMKK1,CAMKK2,PBK,ASK1,OSR1,TNIK... | MARK1,MARK2,DNAPK,ATR,SMG1,HUNK,QSK,MARK3,MARK... | 1.685871 | 2.839518 |
28254 | P35222 | CTNNB1 | S23 | PDRKAAVsHWQQQsy | psp | 0.434195 | P90RSK,RSK4,MARK1,RSK2,AKT1,TSSK2,SGK1,P70S6K,... | SSTK,BRSK1,PRKD3,BRSK2,P70S6K,SNRK,MARK3,MAPKA... | 1.795827 | 3.255439 |
28255 | P35222 | CTNNB1 | S29 | VsHWQQQsyLDsGIH | ochoa|psp | 0.628389 | PAK4,CAMK1D,NIM1,LATS2,PAK5,TBK1,TSSK1,GRK7,NU... | GSK3A,GSK3B,LATS2,MAPKAPK2,CAMK2A,CAMK2B,LATS1... | 1.693298 | 6.465129 |
28256 | P35222 | CTNNB1 | Y30 | sHWQQQsyLDsGIHs | ochoa|psp | 0.780358 | ERBB4,FGFR4,TNK1,JAK3,CSK,KIT,EPHA5,EGFR,JAK2,... | BMPR2_TYR,PTK2,SYK,ERBB4,PDHK1_TYR,EPHA3,PDHK4... | 1.596045 | 2.796830 |
28257 | P35222 | CTNNB1 | S33 | QQQsyLDsGIHsGAT | psp | 0.978753 | GSK3B,IKKB,IKKA,GSK3A,TBK1,PAK4,GRK1,IKKE,P90R... | CK1G2,CK1A,CK1G3,GSK3A,GSK3B,GRK3,CK1A2,CK1D,J... | 1.797053 | 8.479370 |
28258 | P35222 | CTNNB1 | S37 | yLDsGIHsGATtTAP | ochoa|psp | 0.954689 | GSK3A,PAK6,PAK5,GSK3B,TBK1,PAK4,PRKX,IKKB,ULK3... | GSK3A,GSK3B,CK1A,CK1G2,GRK7,IKKA,GRK4,GRK5,IKK... | 1.795597 | 8.083523 |
28259 | P35222 | CTNNB1 | T41 | GIHsGATtTAPsLsG | psp | 0.903105 | MPSK1,GSK3B,PBK,GSK3A,MEK2,ASK1,LKB1,TNIK,MEKK... | GSK3A,GSK3B,PRP4,PASK,CK1G2,CK1A,GRK7,CK1D,CK1... | 1.649193 | 7.866048 |
28260 | P35222 | CTNNB1 | S45 | GATtTAPsLsGKGNP | ochoa|psp | 0.915674 | TBK1,IKKE,PAK6,MTOR,PAK5,PRKX,RSK4,RSK2,CK1A,ULK3 | CK1A,CK1G1,CK1D,CK1G2,MTOR,CK1E,CK1A2,CLK3,IKK... | 2.044766 | 4.275749 |
28261 | P35222 | CTNNB1 | S47 | TtTAPsLsGKGNPEE | ochoa | 0.556963 | CK1G2,PAK5,PAK6,PRKX,PKACB,CK2A1,PRKD3,ULK3,RS... | MPSK1,CK1E,ERK5,GRK1,CK1D,COT,MOS,NLK,MLK3,ERK1 | 1.823853 | 2.982430 |
28262 | P35222 | CTNNB1 | S60 | EEEDVDTsQVLyEWE | psp | 0.208932 | ATM,ATR,GRK7,GRK1,BUB1B,CK2A2,GRK5,DNAPK,CK2A1... | ATM,ACVR2B,ACVR2A,GRK4,GRK7,CK1G2,PLK1,TLK2,PL... | 2.656530 | 5.809839 |
28263 | P35222 | CTNNB1 | Y64 | VDTsQVLyEWEQGFS | psp | 0.294942 | TEC,BLK,EPHA4,BTK,SRMS,SYK,FES,EPHA7,EPHA6,FYN | SYK,MERTK,EPHA5,SRMS,EPHA4,FER,EPHA3,PTK6,FES,TEK | 1.804839 | 2.871988 |
28264 | P35222 | CTNNB1 | S73 | WEQGFSQsFTQEQVA | psp | 0.347105 | BRSK2,IKKB,ULK3,TSSK1,NUAK1,PKACB,PRKX,NDR2,TB... | CAMK2G,ULK1,PLK1,PLK3,IKKA,CAMK2B,DSTYK,NEK2,I... | 1.725664 | 2.099936 |
28265 | P35222 | CTNNB1 | Y86 | VADIDGQyAMTRAQR | psp | 0.688526 | JAK3,FGFR4,EGFR,TEK,KIT,FYN,BLK,LTK,TNK2,ABL1 | ABL1,ABL2,MATK,BLK,FYN,FGR,MERTK,ZAP70,LCK,HCK | 1.819756 | 2.467626 |
28266 | P35222 | CTNNB1 | T102 | RAAMFPEtLDEGMQI | psp | 0.894711 | TNIK,CK2A2,HGK,CAMKK1,MEKK3,NEK4,MINK,GCK,KHS1... | DNAPK,CK2A1,CK2A2,CAMK2G,PLK2,PLK3,CAMK2A,TGFB... | 1.607649 | 3.374482 |
28267 | P35222 | CTNNB1 | T112 | EGMQIPStQFDAAHP | psp | 0.663232 | ATR,ATM,DNAPK,EEF2K,CAMKK2,PBK,ICK,MPSK1,OSR1,... | ATR,DNAPK,SMG1,P38D,MPSK1,ATM,ICK,EEF2K,TLK2,A... | 1.758952 | 2.576788 |
28268 | P35222 | CTNNB1 | T120 | QFDAAHPtNVQRLAE | psp | 0.878526 | PBK,LKB1,CAMKK1,OSR1,TNIK,EEF2K,ROCK2,NEK4,MIN... | MPSK1,TAK1,PINK1,VRK1,YANK2,GAK,GSK3A,PASK,ROC... | 1.595444 | 1.445671 |
28269 | P35222 | CTNNB1 | Y142 | AVVNLINyQDDAELA | psp | 0.980268 | EPHA4,ATR,EPHA6,EPHA7,EPHB1,EPHA3,EPHB2,EPHA8,... | PDHK3_TYR,EPHA4,PDHK4_TYR,EPHA5,EPHA3,PDHK1_TY... | 1.930682 | 0.982576 |
28270 | P35222 | CTNNB1 | S184 | QLSKKEAsRHAIMRs | psp | 0.927547 | PAK4,TSSK1,BRSK2,MNK2,PAK6,PKN2,MARK2,NUAK1,DC... | SSTK,DCAMKL1,BRSK2,TSSK1,BRSK1,TLK1,TSSK2,AMPK... | 1.935281 | 4.978343 |
28271 | P35222 | CTNNB1 | S191 | sRHAIMRsPQMVSAI | ochoa|psp | 0.811158 | CDK4,ERK2,DYRK1A,CDK1,HIPK3,CDK2,CDK5,ERK1,DYR... | CDK17,CDK16,CDK14,CDK18,ERK1,ERK2,P38G,CDK9,CD... | 2.070521 | 4.814151 |
28272 | P35222 | CTNNB1 | S246 | ALVKMLGsPVDSVLF | psp | 0.772711 | CDK4,CDK1,CDK2,CDK5,ERK2,CDK3,JNK1,ERK1,DYRK4,... | JNK1,JNK2,P38D,JNK3,P38G,P38B,P38A,CDK8,NLK,CDK14 | 2.267036 | 6.378471 |
28273 | P35222 | CTNNB1 | T298 | VKFLAITtDCLQILA | psp | 0.972658 | PBK,LKB1,TNIK,NEK4,MST2,MINK,CAMKK1,MST1,NEK1,HGK | MEK1,GSK3B,MOS,DAPK2,ALK4,ACVR2B,ACVR2A,MEK2,B... | 1.589268 | 0.414628 |
28274 | P35222 | CTNNB1 | S311 | LAYGNQEsKLIILAS | psp | 0.972500 | TSSK1,NIM1,QIK,TSSK2,MARK2,MARK1,CAMK1D,MARK3,... | IRE2,SKMLCK,TSSK1,SSTK,SMMLCK,TLK1,TSSK2,MELK,... | 1.821695 | 2.632692 |
28275 | P35222 | CTNNB1 | Y331 | LVNIMRTytyEKLLW | psp | 0.837516 | DDR2,FGFR4,EPHB1,LCK,RET,EPHA8,FES,EPHA4,EPHA5... | PTK2,EPHA4,EPHA7,EPHA3,EPHA6,PTK2B,BMPR2_TYR,S... | 1.629153 | 4.126294 |
28276 | P35222 | CTNNB1 | T332 | VNIMRTytyEKLLWT | psp | 0.408153 | AKT1,PBK,AMPKA1,LRRK2,CHK2,LKB1,TNIK,GCK,ROCK2... | ALPHAK3,YANK2,PASK,GRK7,SBK,YANK3,MAPKAPK2,NUA... | 1.434642 | 4.102963 |
28277 | P35222 | CTNNB1 | Y333 | NIMRTytyEKLLWTT | psp | 0.982195 | EPHA1,EPHA7,EPHA2,BLK,TNK2,SRMS,EPHA4,LTK,EPHA... | SYK,BLK,EPHA6,EPHA4,FRK,PTK2,ZAP70,ERBB4,EPHA8... | 1.711815 | 3.995439 |
28278 | P35222 | CTNNB1 | S352 | KVLSVCSsNKPAIVE | psp | 0.839116 | PRKD2,PRKD3,HIPK4,PRKD1,MAPKAPK3,TSSK2,CAMK4,M... | BUB1,HIPK4,PRKD1,PRKD2,PRKD3,CDK18,KIS,PERK,ML... | 2.003260 | 2.875428 |
28279 | P35222 | CTNNB1 | T371 | QALGLHLtDPsQRLV | psp | 0.286979 | PBK,LKB1,NEK4,TNIK,CAMKK1,MEKK3,CAMKK2,MINK,GC... | SMG1,MEK1,MEK2,ACVR2B,CK2A1,GSK3B,MOS,DAPK2,BU... | 1.658914 | 1.683408 |
28280 | P35222 | CTNNB1 | S374 | GLHLtDPsQRLVQNC | psp | 0.960511 | ATR,ATM,DNAPK,ERK7,NIM1,TSSK2,LATS1,PKCA,DSTYK... | CK1G3,CK1D,CK1A,CK1E,CK1A2,CK1G2,CK1G1,KIS,TGF... | 2.421310 | 5.546010 |
28281 | P35222 | CTNNB1 | T384 | LVQNCLWtLRNLSDA | psp | 0.945400 | CAMKK1,MINK,TNIK,CAMKK2,MST1,ERK7,NEK4,HGK,LKB... | ERK7,TAK1,NEK4,MST1,HPK1,NEK8,CAMKK1,IRAK1,NEK... | 1.728943 | 2.678529 |
28282 | P35222 | CTNNB1 | T393 | RNLSDAAtKQEGMEG | psp | 0.846858 | CK2A2,MEKK3,PBK,TNIK,CAMKK1,CAMKK2,MINK,MST1,M... | GSK3A,CK1G3,TLK1,TSSK2,TLK2,GRK7,ALPHAK3,GSK3B... | 1.609937 | 1.804045 |
28283 | P35222 | CTNNB1 | T472 | ICALRHLtSRHQEAE | psp | 0.988368 | AKT1,ROCK1,ROCK2,SGK1,ERK7,MRCKB,PBK,AKT3,MST1... | PIM2,PKN1,ROCK1,MRCKB,PKCE,AKT3,LOK,AKT1,SGK1,... | 1.585884 | 4.709928 |
28284 | P35222 | CTNNB1 | Y489 | QNAVRLHyGLPVVVK | ochoa|psp | 0.690221 | CSK,PTK6,TNK1,TNK2,ABL1,PTK2B,LTK,TYRO3,LCK,PD... | ABL1,PTK2B,ABL2,PKMYT1_TYR,TNK2,FER,TNK1,PTK6,... | 1.824113 | 2.370134 |
28285 | P35222 | CTNNB1 | T547 | LVRAHQDtQRRtsMG | ochoa | 0.646316 | ATR,OSR1,ATM,DNAPK,TNIK,MINK,ERK7,HGK,EEF2K,LKB1 | GSK3B,GSK3A,EEF2K,OSR1,PIM1,MLK4,PKCB,CAMK2A,E... | 1.709029 | 5.511960 |
28286 | P35222 | CTNNB1 | T551 | HQDtQRRtsMGGtQQ | ochoa|psp | 0.402405 | LKB1,PBK,ROCK1,OSR1,TNIK,ROCK2,AURB,ASK1,HASPI... | DAPK1,DAPK3,CK1G2,GRK2,GRK3,AURA,CK1A,CK1A2,CK... | 1.511777 | 4.392697 |
28287 | P35222 | CTNNB1 | S552 | QDtQRRtsMGGtQQQ | ochoa|psp | 0.596179 | PRKX,PKACA,PKACB,AKT1,PKG2,PKG1,AURB,AURC,AKT3... | AURA,PRKX,CLK2,GSK3B,PKACB,MSK1,AURC,GSK3A,DAP... | 2.333742 | 7.778372 |
28288 | P35222 | CTNNB1 | T556 | RRtsMGGtQQQFVEG | ochoa|psp | 0.330547 | ATR,ATM,DNAPK,OSR1,PBK,HASPIN,TNIK,GCK,HGK,MINK | ATR,DNAPK,SMG1,CK1G3,ATM,TLK2,NEK5,CK1D,MEKK2,... | 1.792496 | 3.552549 |
28289 | P35222 | CTNNB1 | S605 | LFVQLLYsPIENIQR | psp | 0.744484 | CDK1,CDK4,CDK5,CDK2,CDK3,ERK2,ERK1,JNK1,P38G,JNK2 | JNK1,JNK3,JNK2,P38D,P38G,P38B,ERK5,ERK2,ERK1,D... | 2.161258 | 5.347646 |
28290 | P35222 | CTNNB1 | S646 | PLTELLHsRNEGVAt | psp | 0.956311 | CK2A2,CK2A1,IKKB,GRK1,GRK7,PAK4,IKKA,ULK3,P90R... | GRK6,PINK1,NEK9,PLK2,CAMKK1,NEK7,PLK3,GRK5,COT... | 2.018886 | 2.662771 |
28291 | P35222 | CTNNB1 | T653 | sRNEGVAtyAAAVLF | ochoa|psp | 0.986426 | TNIK,PBK,OSR1,TAO2,MINK,MST1,KHS1,MST2,GCK,TAO1 | GRK7,PRP4,TGFBR1,ALK4,JNK1,P38G,P38B,ACVR2B,BM... | 1.631637 | 2.709797 |
28292 | P35222 | CTNNB1 | Y654 | RNEGVAtyAAAVLFR | ochoa|psp | 0.993016 | EPHA7,EPHA4,TXK,CSK,LTK,BTK,EPHB3,LYN,EPHA1,BLK | PTK2,EPHA6,EPHB2,BLK,FYN,ZAP70,ABL2,ABL1,SYK,LCK | 1.929364 | 3.188853 |
28293 | P35222 | CTNNB1 | Y670 | SEDKPQDyKKRLsVE | psp | 0.954805 | FLT1,SYK,PTK2,KIT,CSF1R,EPHA4,EPHA6,JAK2,EPHA2... | PDHK4_TYR,WEE1_TYR,TNNI3K_TYR,BMPR2_TYR,PDHK3_... | 1.951307 | 1.536509 |
28294 | P35222 | CTNNB1 | S675 | QDyKKRLsVELtssL | ochoa|psp | 0.980063 | PAK4,PAK6,PAK5,PKACA,AURB,PAK1,PKG2,PRKX,PAK2,... | GSK3B,AURA,MYLK4,MSK1,CLK4,PKACA,DAPK1,SKMLCK,... | 2.388692 | 6.234278 |
28295 | P35222 | CTNNB1 | T679 | KRLsVELtssLFRTE | ochoa | 0.953053 | TGFBR1,BMPR1B,ALK2,PBK,CAMKK2,ALK4,OSR1,PERK,T... | BMPR1B,BMPR1A,ACVR2B,ACVR2A,TGFBR1,GRK2,GRK3,A... | 1.719372 | 6.180493 |
28296 | P35222 | CTNNB1 | S680 | RLsVELtssLFRTEP | ochoa | 0.810021 | IKKB,GRK7,CLK3,CK1G1,CDK7,CK1G3,P90RSK,RSK4,CK... | CK1G2,CK1A,GRK2,CK1E,GRK3,CK1A2,CK1D,CK1G3,ACV... | 1.845225 | 4.125982 |
28297 | P35222 | CTNNB1 | S681 | LsVELtssLFRTEPM | ochoa | 0.905768 | ULK3,CLK3,TBK1,MTOR,IKKE,DSTYK,IKKB,NEK7,NEK6,... | CK1G2,CK1D,CK1E,CK1A2,CK1A,GRK1,CK1G3,GRK3,GRK... | 1.881727 | 4.856114 |
28298 | P35222 | CTNNB1 | S715 | GYRQDDPsyRsFHSG | ochoa|psp | 0.276474 | CK1G2,CK1G3,ERK7,CK1G1,AKT1,CK1A,RSK4,P90RSK,T... | GRK7,CK1G2,CLK3,GRK3,CK1G3,GRK2,ACVR2A,GSK3A,L... | 1.845451 | 5.621210 |
28299 | P35222 | CTNNB1 | Y716 | YRQDDPsyRsFHSGG | ochoa | 0.276789 | KIT,EGFR,JAK1,FGFR4,FLT3,CSF1R,FYN,ABL1,TEK,FLT1 | PTK2,FYN,ERBB4,FLT1,SYK,ZAP70,SRC,BLK,MET,LCK | 1.730527 | 6.357552 |
28300 | P35222 | CTNNB1 | S718 | QDDPsyRsFHSGGYG | psp | 0.365442 | IKKA,TBK1,IKKB,CK1A,PAK5,CK1G2,PAK6,ULK3,MNK2,... | CK1G2,CK1G3,CK1A,YANK2,GRK3,CK1G1,YANK3,CK1A2,... | 1.670313 | 10.234688 |
(code only) Add AlphaMissense score to each site
The AM_mean.parquet file is too big to upload to the current repository. To generate the file, refer to others_01_Process_AM.ipynb
notebook.
df = pd.read_parquet('raw/AM_mean.parquet')
df.columns = ['uniprot','site','AM_pathogenicity','position']
# Keep the first three columns (drops 'position')
df = df.iloc[:,:3]
The original:
comb1 = Data.get_combine_site_psp_ochoa()
# Left merge keeps every site; sites without a match in df get NaN
comb1 = comb1.merge(df,how='left',on=['uniprot','site'])
The phosphorylated
comb2 = Data.get_combine_site_phosphorylated()
# Left merge keeps every site; sites without a match in df get NaN
comb2 = comb2.merge(df,how='left',on=['uniprot','site'])
Add kinase prediction
For the uppercase site sequences, only the CDDM predictions are computed:
# Where site_seq holds '|'-joined sequences, keep the first one
comb1['site_seq'] = comb1['site_seq'].str.split('|').str[0] # only one site with one aa difference
cddm_upper = predict_kinase_df(comb1,'site_seq',**param_CDDM_upper)
input dataframe has a length 121419
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [02:22<00:00, 2.03it/s]
def get_top(r, n):
    """Return the labels of the `n` largest values of Series `r`, comma-joined.

    Labels are ordered from the highest value to the lowest and converted to
    strings, so non-string labels can be joined as well.
    """
    top = r.sort_values(ascending=False).index[:n]
    return ','.join(map(str, top))
# For every site (row), list the 10 highest-scoring columns of cddm_upper
cddm_upper_rnk = cddm_upper.apply(lambda r: get_top(r,10),axis=1)
comb1['CDDM_upper'] = cddm_upper_rnk
# Highest score in each row
comb1['CDDM_max_score'] = cddm_upper.max(axis=1)
Uncomment the line below to save:
# comb1.to_parquet('raw/combine_site_psp_ochoa.parquet')
For phosphorylated:
# Score the phosphorylated site sequences with the CDDM parameters
cddm = predict_kinase_df(comb2,'site_seq',**param_CDDM)
input dataframe has a length 120104
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [02:17<00:00, 2.11it/s]
# Score the phosphorylated site sequences with the PSPA parameters
pspa = predict_kinase_df(comb2,'site_seq',**param_PSPA)
input dataframe has a length 120104
Preprocessing
Finish preprocessing
Calculating position: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
100%|██████████| 396/396 [04:16<00:00, 1.55it/s]
# For every site (row), list the 10 highest-scoring columns of each score table
cddm_rnk = cddm.apply(lambda r: get_top(r,10),axis=1)
pspa_rnk = pspa.apply(lambda r: get_top(r,10),axis=1)
comb2['CDDM'] = cddm_rnk
comb2['PSPA'] = pspa_rnk
# Highest score in each row
comb2['CDDM_max_score'] = cddm.max(axis=1)
comb2['PSPA_max_score'] = pspa.max(axis=1)
To save and load:
# comb2.to_parquet('raw/phosphorylated_combine_site.parquet')
# comb2 = pd.read_parquet('raw/phosphorylated_combine_site.parquet')