= pd.DataFrame({'gene': ['A', 'B', 'C']})
df1 = pd.DataFrame({'gene': ['B', 'C', 'D']})
df2
= get_diff(df1, df2, 'gene') df1_unq, df2_unq
Preprocess
Setup
from katlas.utils import *
Data processing
get_diff
get_diff (df1, df2, col1, col2=None)
Get non-overlap parts of two dataframes.
df1_unq
gene | |
---|---|
0 | A |
df2_unq
gene | |
---|---|
2 | D |
Checker
In many phosphorylation datsets, there are amino acids in the site sequence that are in lower case but does not belong to s/t/y. Also, there are uncommon amino acids such as U or O that appear in the sequence. Therefore, it is essential to convert the sequence string for kinase ranking.
check_seq
check_seq (seq)
Convert non-s/t/y characters to uppercase and replace disallowed characters with underscores.
try:
'aaadaaa')
check_seq(except Exception as e:
print(e)
aaadaaa has d at position 3; need to have one of 's', 't', or 'y' in the center
'AAkUuPSFstTH') # if the center amino acid does not belong to sty/STY, will raise an error check_seq(
'AAK__PSFstTH'
check_seq_df
check_seq_df (df, col)
Convert non-s/t/y to upper case & replace with underscore if the character is not in the allowed set
=Data.get_human_site()
df df.head()
substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
'site_seq') check_seq_df(df.head(),
0 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: object
validate_site
validate_site (site_info, seq)
Validate site position residue match with site residue.
='S610'
site= 'MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVGPKDLRSAMGEGGGPEPGPANAKWLKEGQNQLRRAATAHRDQNRNVTLTLAEEASQEPEMAPLGPKGLIHLYSELELSAHNAANRGLRGPGLIISTQEQGPDEGEEKAAGEAEEEEEDDDDEEEEEDLSSPPGLPEPLESVEAPPRPQALTDGPREHSKSASLLFGMRNSAASDEDSSWATLSQGSPSYGSPEDTDSFWNPNAFETDSDLPAGWMRVQDTSGTYYWHIPTGTTQWEPPGRASPSQGSSPQEESQLTWTGFAHGEGFEDGEFWKDEPSDEAPMELGLKEPEEGTLTFPAQSLSPEPLPQEEEKLPPRNTNPGIKCFAVRSLGWVEMTEEELAPGRSSVAVNNCIRQLSYHKNNLHDPMSGGWGEGKDLLLQLEDETLKLVEPQSQALLHAQPIISIRVWGVGRDSGRERDFAYVARDKLTQMLKCHVFRCEAPAKNIATSLHEICSKIMAERRNARCLVNGLSLDHSKLVDVPFQVEFPAPKNELVQKFQVYYLGNVPVAKPVGVDVINGALESVLSSSSREQWTPSHVSVAPATLTILHQQTEAVLGECRVRFLSFLAVGRDVHTFAFIMAAGPASFCCHMFWCEPNAASLSEAVQAACMLRYQKCLDARSQASTSCLPAPPAESVARRVGWTVRRGVQSLWGSLKPKRLGAHTP' seq
validate_site(site,seq)
1
validate_site_df
validate_site_df (df, site_info_col, protein_seq_col)
Validate site position residue match with site residue in a dataframe.
'site','substrate_sequence') validate_site_df(df.head(),
0 1
1 1
2 1
3 1
4 1
dtype: int64
Phosphorylate protein seq
phosphorylate_seq
phosphorylate_seq (seq, *sites)
Phosphorylate protein sequence based on phosphosites (e.g.,S140).
Type | Details | |
---|---|---|
seq | full protein sequence | |
sites | VAR_POSITIONAL | site info, e.g., S140 |
= 'MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
seq *['S95', 'S22', 'T25', 'S6', 'S158']) phosphorylate_seq(seq,
'MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq_df
phosphorylate_seq_df (df, id_col='substrate_uniprot', seq_col='substrate_sequence', site_col='site')
Phosphorylate whole sequence based on phosphosites in a dataframe
Type | Default | Details | |
---|---|---|---|
df | |||
id_col | str | substrate_uniprot | column of sequence ID |
seq_col | str | substrate_sequence | column that contains protein sequence |
site_col | str | site | column that contains site info, e.g., S140 |
=Data.get_human_site()
df df.head()
substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
100),'substrate_uniprot','substrate_sequence','site') phosphorylate_seq_df(df.head(
substrate_uniprot | site | substrate_sequence | phosphoseq | |
---|---|---|---|---|
0 | A0A024R4G9 | [S20] | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... |
1 | A0A075B6Q4 | [S24, S35, S57, S68, S71, S72] | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... |
... | ... | ... | ... | ... |
22 | A0A0A6YYL6 | [S5, Y139, S141, S142] | MVRYSLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... | MVRYsLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... |
23 | A0A0B4J1R7 | [T6, S43, S45, S46] | MMATGTPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGSRSS... | MMATGtPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGsRss... |
24 rows × 4 columns
Extract site seq
extract_site_seq
extract_site_seq (df:pandas.core.frame.DataFrame, seq_col:str, site_col:str, n=7)
Extract -n to +n site sequence from protein sequence
Type | Default | Details | |
---|---|---|---|
df | DataFrame | dataframe that contains protein sequence | |
seq_col | str | column name of protein sequence | |
site_col | str | column name of site information (e.g., S10) | |
n | int | 7 | length of surrounding sequence (default -7 to +7) |
As some datasets only contains protein information and position of phosphorylation sites, but not phosphorylation site sequence, we can retreive protein sequence and use this function to get -7 to +7 phosphorylation site sequence (as numpy array).
Remember to validate the phospho-acceptor at position 0 before extract the site sequence, as there could be mismatch due to the protein sequence database updates.
df.head()
substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
extract_site_seq(df.head(),='substrate_sequence',
seq_col='site',
site_col=30
n )
100%|██████████| 5/5 [00:00<00:00, 4705.30it/s]
array(['___________MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSHTPRR',
'_______MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHL',
'KSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFT',
'DYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERF',
'DCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERFEKFYEQYDDDE'],
dtype='<U61')