df1 = pd.DataFrame({'gene': ['A', 'B', 'C']})
df2 = pd.DataFrame({'gene': ['B', 'C', 'D']})
df1_unq, df2_unq = get_diff(df1, df2, 'gene')Utils
Setup
Common funcs
prepare_path
prepare_path (path)
Ensure the parent directory exists and return the full file path.
pssm_df = get_prob(df_k,'site_seq')
plot_logo_heatmap(pssm_df,title=f'{k} (n={len(df_k):,})',figsize=(17,10))
# check if directory exist; if not, create one, then return the full path
path1=prepare_path(f'fig/cddm/{k}/pssm_freq.svg')
save_show(path1,show_only=SHOW)get_diff
get_diff (df1, df2, col1, col2=None)
Get non-overlap parts of two dataframes.
df1_unq| gene | |
|---|---|
| 0 | A |
df2_unq| gene | |
|---|---|
| 2 | D |
pSTY2sty
pSTY2sty (string)
Convert pS/pT/pY to s/t/y in a string.
sty2pSTY
sty2pSTY (string)
Convert s/t/y to pS/pT/pY in a string.
Checker
In many phosphorylation datsets, there are amino acids in the site sequence that are in lower case but does not belong to s/t/y. Also, there are uncommon amino acids such as U or O that appear in the sequence. Therefore, it is essential to convert the sequence string for kinase ranking.
check_seq
check_seq (seq)
Convert non-s/t/y characters to uppercase and replace disallowed characters with underscores.
try:
check_seq('aaadaaa')
except Exception as e:
print(e)aaadaaa has d at position 3; need to have one of 's', 't', or 'y' in the center
check_seq('AAkUuPSFstTH') # if the center amino acid does not belong to sty/STY, will raise an error'AAK__PSFstTH'
check_seqs
check_seqs (data, col=None)
Convert non-s/t/y to upper case & replace with underscore if the character is not in the allowed set
df=Data.get_human_site()
check_seqs(df.head(),'site_seq')0 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: object
check_seqs(df['site_seq'].head())0 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: object
validate_site
validate_site (site_info, seq)
Validate site position residue match with site residue.
site='S610'
seq = 'MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVGPKDLRSAMGEGGGPEPGPANAKWLKEGQNQLRRAATAHRDQNRNVTLTLAEEASQEPEMAPLGPKGLIHLYSELELSAHNAANRGLRGPGLIISTQEQGPDEGEEKAAGEAEEEEEDDDDEEEEEDLSSPPGLPEPLESVEAPPRPQALTDGPREHSKSASLLFGMRNSAASDEDSSWATLSQGSPSYGSPEDTDSFWNPNAFETDSDLPAGWMRVQDTSGTYYWHIPTGTTQWEPPGRASPSQGSSPQEESQLTWTGFAHGEGFEDGEFWKDEPSDEAPMELGLKEPEEGTLTFPAQSLSPEPLPQEEEKLPPRNTNPGIKCFAVRSLGWVEMTEEELAPGRSSVAVNNCIRQLSYHKNNLHDPMSGGWGEGKDLLLQLEDETLKLVEPQSQALLHAQPIISIRVWGVGRDSGRERDFAYVARDKLTQMLKCHVFRCEAPAKNIATSLHEICSKIMAERRNARCLVNGLSLDHSKLVDVPFQVEFPAPKNELVQKFQVYYLGNVPVAKPVGVDVINGALESVLSSSSREQWTPSHVSVAPATLTILHQQTEAVLGECRVRFLSFLAVGRDVHTFAFIMAAGPASFCCHMFWCEPNAASLSEAVQAACMLRYQKCLDARSQASTSCLPAPPAESVARRVGWTVRRGVQSLWGSLKPKRLGAHTP'validate_site(site,seq)1
validate_site_df
validate_site_df (df, site_info_col, protein_seq_col)
Validate site position residue match with site residue in a dataframe.
validate_site_df(df.head(),'site','substrate_sequence')0 1
1 1
2 1
3 1
4 1
dtype: int64
Phosphorylate protein seq
phosphorylate_seq
phosphorylate_seq (seq, *sites)
Phosphorylate protein sequence based on phosphosites (e.g.,S140).
| Type | Details | |
|---|---|---|
| seq | full protein sequence | |
| sites | VAR_POSITIONAL | site info, e.g., S140 |
seq = 'MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq(seq,*['S95', 'S22', 'T25', 'S6', 'S158'])'MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq_df
phosphorylate_seq_df (df, id_col='substrate_uniprot', seq_col='substrate_sequence', site_col='site')
Phosphorylate whole sequence based on phosphosites in a dataframe
| Type | Default | Details | |
|---|---|---|---|
| df | |||
| id_col | str | substrate_uniprot | column of sequence ID |
| seq_col | str | substrate_sequence | column that contains protein sequence |
| site_col | str | site | column that contains site info, e.g., S140 |
df=Data.get_human_site()
df.head()| substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
| 1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
| 2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
| 3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
| 4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
phosphorylate_seq_df(df.head(100),'substrate_uniprot','substrate_sequence','site')| substrate_uniprot | site | substrate_sequence | phosphoseq | |
|---|---|---|---|---|
| 0 | A0A024R4G9 | [S20] | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... |
| 1 | A0A075B6Q4 | [S24, S35, S57, S68, S71, S72] | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... |
| ... | ... | ... | ... | ... |
| 22 | A0A0A6YYL6 | [S5, Y139, S141, S142] | MVRYSLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... | MVRYsLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... |
| 23 | A0A0B4J1R7 | [T6, S43, S45, S46] | MMATGTPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGSRSS... | MMATGtPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGsRss... |
24 rows × 4 columns
Extract site seq
extract_site_seq
extract_site_seq (df:pandas.core.frame.DataFrame, seq_col:str, site_col:str, n=7)
Extract -n to +n site sequence from protein sequence
| Type | Default | Details | |
|---|---|---|---|
| df | DataFrame | dataframe that contains protein sequence | |
| seq_col | str | column name of protein sequence | |
| site_col | str | column name of site information (e.g., S10) | |
| n | int | 7 | length of surrounding sequence (default -7 to +7) |
As some datasets only contains protein information and position of phosphorylation sites, but not phosphorylation site sequence, we can retreive protein sequence and use this function to get -7 to +7 phosphorylation site sequence (as numpy array).
Remember to validate the phospho-acceptor at position 0 before extract the site sequence, as there could be mismatch due to the protein sequence database updates.
df.head()| substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
| 1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
| 2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
| 3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
| 4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
extract_site_seq(df.head(),
seq_col='substrate_sequence',
site_col='site',
n=30
)100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 9493.67it/s]
array(['___________MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSHTPRR',
'_______MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHL',
'KSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFT',
'DYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERF',
'DCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERFEKFYEQYDDDE'],
dtype='<U61')
Alignment
get_fasta
get_fasta (df, seq_col='kd_seq', id_col='kd_ID', path='out.fasta')
Generate fasta file from sequences.
get_fasta(kd,seq_col='kd_seq',id_col='kd_ID',path='raw/kinase_domains.fasta')To run clustalo alignment, can run either through terminal or the function
sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clurun_clustalo
run_clustalo (input_fasta, output_aln, outfmt='clu')
Run Clustal Omega to perform multiple sequence alignment.
| Type | Default | Details | |
|---|---|---|---|
| input_fasta | .fasta fname | ||
| output_aln | .aln output fname | ||
| outfmt | str | clu |
run_clustalo("kinase_domains.fasta", "raw/kinase_domains.aln")aln2df
aln2df (fname)
df = aln2df("raw/kinase_domains.aln")get_aln_freq
get_aln_freq (df)
Get frequency of each amino acid across each position from the aln2df output.
freq_df = get_aln_freq(df)