# prepare_path creates the parent directory if it does not exist, then returns the full path, so you can use it directly to save a file without worrying about whether the directory exists
# prepare_path('fig/results/my_plot.svg')
Utils
Setup
Utils
prepare_path
def prepare_path(
path
):
Ensure the parent directory exists and return the full file path.
info = Data.kinase_info()
info.modi_group.value_counts().indexIndex(['TK', 'CAMK', 'Other', 'CMGC', 'AGC', 'STE', 'TKL', 'Atypical', 'CK1',
'NEK', 'RGC'],
dtype='str', name='modi_group')
get_subfamily_color
def get_subfamily_color(
):
Return a dictionary keyed by kinase subfamily name (the values are presumably the colors assigned to each subfamily).
get_subfamily_color().keys()dict_keys(['NAK', 'Lmr', 'Abl', 'STKR1', 'STKR2', 'ABC1-B', 'ABC1-C', 'ABC1-D', 'Akt', 'ALK', 'Alpha', 'AMPK', 'RIPK', 'RAF', 'STE11', 'ATM', 'ATR', 'Aur', 'Axl', 'PDHK', 'Src', 'Tec', 'BRD', 'BRSK', 'BUB', 'CAMK1', 'CAMK2', 'CAMKK-Meta', 'CAMK-Unique', 'MLCK', 'CASK', 'GEK', 'CDC7', 'CDC2', 'CDK10', 'CDK11', 'CRK7', 'PFTAIRE', 'PCTAIRE', 'CDK8', 'CDK2', 'CDK', 'CDK4', 'CDK5', 'CDK7', 'CDK9', 'CDKL', 'ChaK', 'CHK1', 'RAD53', 'CK1', 'CK2', 'CLK', 'ABC1-A', 'STE-Unique', 'DMPK', 'PDGFR', 'Csk', 'DAPK', 'DCAMKL', 'DDR', 'LZK', 'DNAPK', 'Dusty', 'DYRK1', 'DYRK2', 'eEF2K', 'EGFR', 'Eph', 'ERK1', 'ERK5', 'ERK7', 'FAM20C', 'Fer', 'FGFR', 'VEGFR', 'KHS', 'GCN2', 'GRK', 'BARK', 'GSK', 'RGC', 'Haspin', 'MSN', 'HIPK', 'PEK', 'HUNK', 'RCK', 'InsR', 'IKK', 'ILK', 'IRAK', 'IRE', 'Jak', 'JakB', 'JNK', 'Trio', 'KIS', 'KSR', 'NDR', 'LIMK', 'LKB', 'SLK', 'LRRK', 'STE7', 'ERK3', 'MK2', 'MK5', 'MARK', 'MAST', 'MELK', 'Met', 'MLK', 'TKL-Unique', 'MNK', 'MOS', 'MSK', 'MSKb', 'MST', 'YSK', 'FRAP', 'Musk', 'NinaC', 'NEK', 'NIM1', 'nmo', 'NRBP', 'Trk', 'NuaK', 'FRAY', 'p38', 'RSKp70', 'RSKp90', 'RSKb', 'PAKA', 'PAKB', 'PASK', 'TOPK', 'NKF4', 'PDK1', 'NKF3', 'PHK', 'VPS15', 'PIM', 'NKF2', 'PKA', 'PKCa', 'PKCd', 'PKCh', 'PKCi', 'SgK493', 'PKG', 'WEE', 'PKN', 'PLK', 'Other-Unique', 'PKD', 'PRP4', 'Bud32', 'PSK', 'FAK', 'CCK4', 'Slob', 'QIK', 'Ret', 'RIO1', 'RIO2', 'RIO3', 'ROCK', 'Ror', 'Sev', 'RSKL', 'RSKR', 'Ryk', 'NKF1', 'SCY1', 'SGK', 'SMG1', 'SNRK', 'SRPK', 'TSSK', 'YANK', 'STK33', 'ULK', 'SgK495', 'SgK071', 'STLK', 'TK-Unique', 'Syk', 'TAK1', 'TAO', 'TBCK', 'Tie', 'TESK', 'NKF5', 'TLK', 'Ack', 'HH498', 'Trbl', 'TIF1', 'TRRAP', 'TTBK', 'TTK', 'VRK', 'WNK', 'ZAK'])
info.pspa_category_big.str.split('_').str[0].value_counts()[:20].indexIndex(['Basophilic', 'Pro-directed', 'Acidophilic', 'Map3k', 'Map4k',
'Alpha/mlk', 'Assorted', 'Fgf and vegf receptors', 'Ripk/wnk', 'Pkc',
'Ephrin receptors', 'Nek/ask', 'Eif2ak/tlk', 'Pdgf receptors', 'Src',
'Jak', 'Tam receptors', 'Tec', 'Cmgc', 'Ulk/ttbk'],
dtype='object', name='pspa_category_big')
pspa_category_color.keys()dict_keys(['Basophilic', 'Pro-directed', 'Acidophilic', 'Map3k', 'Map4k', 'Alpha/mlk', 'Fgf and vegf receptors', 'Assorted', 'Ripk/wnk', 'Pkc', 'Ephrin receptors', 'Eif2ak/tlk', 'Nek/ask', 'Pdgf receptors', 'Src', 'Jak', 'Ulk/ttbk', 'Cmgc', 'Tec', 'Tam receptors'])
remove_hi_corr
def remove_hi_corr(
df:DataFrame, thr:float=0.98, # threshold
):
Remove highly correlated features in a dataframe given a Pearson correlation threshold
remove_hi_corr removes highly correlated features, based on a threshold applied to the Pearson correlation between features.
# Load data
df = Data.aa_rdkit()
df.shape(25, 106)
remove_hi_corr(df,thr=0.9).shape(25, 78)
clean_feat
def clean_feat(
df:DataFrame, thr:float=0.98
):
Remove features with no variance, and highly correlated features based on threshold
This function is similar to remove_hi_corr, but can additionally remove features with zero variance (e.g., a feature that equals 1 across all samples).
clean_feat(df,thr=0.9).shaperemoving columns: {'Chi3v', 'fr_SH', 'Chi2v', 'Chi3n', 'Chi0v', 'Kappa1', 'VSA_EState6', 'Chi1v', 'fr_Ar_N', 'NumAromaticRings', 'fr_NH2', 'Chi2n', 'VSA_EState10', 'NumAromaticCarbocycles', 'Chi4n', 'SlogP_VSA5', 'RingCount', 'Ipc', 'NOCount', 'Chi0n', 'Chi4v', 'NumRotatableBonds', 'VSA_EState2', 'SMR_VSA9', 'SMR_VSA1', 'NumHDonors', 'Chi1', 'NumHeteroatoms'}
(25, 78)
standardize
def standardize(
df
):
Standardize features from a df
Checker
In many phosphorylation datasets, some amino acids in the site sequence are in lower case but do not belong to s/t/y. There are also uncommon amino acids, such as U or O, that appear in the sequence. Therefore, it is essential to convert the sequence string before kinase ranking.
Convert between uppercase phospho-acceptors, lowercase center residues, and pS/pT/pY labels.
STY2sty
def STY2sty(
input_string:str
):
Replace all uppercase S/T/Y with lowercase s/t/y in a sequence.
pSTY2sty
def pSTY2sty(
string
):
Convert pS/pT/pY to s/t/y in a string.
sty2pSTY
def sty2pSTY(
string
):
Convert s/t/y to pS/pT/pY in a string.
sty2pSTY_df
def sty2pSTY_df(
df
):
Apply sty to pSTY conversion to a dataframe index.
check_seq
def check_seq(
seq
):
Convert non-s/t/y characters to uppercase and replace disallowed characters with underscores.
try:
check_seq('aaadaaa')
except Exception as e:
print(e)Center must be s/t/y; got d in 'aaadaaa'
check_seq('AAkUuPSFstTH') # raises an error if the center amino acid does not belong to sty/STY
'AAK__PSFstTH'
check_seqs
def check_seqs(
data, col:NoneType=None
):
Convert non-s/t/y to upper case & replace with underscore if the character is not in the allowed set
df=Data.human_site()
check_seqs(df.head(),'site_seq')0 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: str
check_seqs(df['site_seq'].head())0 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: str
validate_site
def validate_site(
site_info, seq
):
Validate site position residue match with site residue.
site='S610'
seq = 'MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVGPKDLRSAMGEGGGPEPGPANAKWLKEGQNQLRRAATAHRDQNRNVTLTLAEEASQEPEMAPLGPKGLIHLYSELELSAHNAANRGLRGPGLIISTQEQGPDEGEEKAAGEAEEEEEDDDDEEEEEDLSSPPGLPEPLESVEAPPRPQALTDGPREHSKSASLLFGMRNSAASDEDSSWATLSQGSPSYGSPEDTDSFWNPNAFETDSDLPAGWMRVQDTSGTYYWHIPTGTTQWEPPGRASPSQGSSPQEESQLTWTGFAHGEGFEDGEFWKDEPSDEAPMELGLKEPEEGTLTFPAQSLSPEPLPQEEEKLPPRNTNPGIKCFAVRSLGWVEMTEEELAPGRSSVAVNNCIRQLSYHKNNLHDPMSGGWGEGKDLLLQLEDETLKLVEPQSQALLHAQPIISIRVWGVGRDSGRERDFAYVARDKLTQMLKCHVFRCEAPAKNIATSLHEICSKIMAERRNARCLVNGLSLDHSKLVDVPFQVEFPAPKNELVQKFQVYYLGNVPVAKPVGVDVINGALESVLSSSSREQWTPSHVSVAPATLTILHQQTEAVLGECRVRFLSFLAVGRDVHTFAFIMAAGPASFCCHMFWCEPNAASLSEAVQAACMLRYQKCLDARSQASTSCLPAPPAESVARRVGWTVRRGVQSLWGSLKPKRLGAHTP'validate_site(site,seq)1
validate_site_df
def validate_site_df(
df, site_info_col, protein_seq_col
):
Validate site position residue match with site residue in a dataframe.
validate_site_df(df.head(),'site','substrate_sequence')0 1
1 1
2 1
3 1
4 1
dtype: int64
Phosphorylate protein seq
phosphorylate_seq
def phosphorylate_seq(
seq, # full protein sequence
sites:VAR_POSITIONAL, # site info, e.g., S140
):
Phosphorylate protein sequence based on phosphosites (e.g.,S140).
seq = 'MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq(seq,*['S95', 'S22', 'T25', 'S6', 'S158'])'MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq_df
def phosphorylate_seq_df(
df, id_col:str='substrate_uniprot', # column of sequence ID
seq_col:str='substrate_sequence', # column that contains protein sequence
site_col:str='site', # column that contains site info, e.g., S140
):
Phosphorylate whole sequence based on phosphosites in a dataframe
df=Data.human_site()
df.head()| substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
| 1 | A0A075B6Q4 | NaN | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
| 2 | A0A075B6Q4 | NaN | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
| 3 | A0A075B6Q4 | NaN | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
| 4 | A0A075B6Q4 | NaN | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
phosphorylate_seq_df(df.head(100),'substrate_uniprot','substrate_sequence','site')| substrate_uniprot | site | substrate_sequence | phosphoseq | |
|---|---|---|---|---|
| 0 | A0A024R4G9 | [S20] | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... |
| 1 | A0A075B6Q4 | [S24, S35, S57, S68, S71, S72] | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... |
| 2 | A0A075B6T3 | [S24, S26] | XLKRAYRGLEEVQWCLEQLLTSPSPS | XLKRAYRGLEEVQWCLEQLLTSPsPs |
| 3 | A0A075B759 | [T68] | MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... | MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... |
| 4 | A0A087WTJ2 | [T8, Y14, Y213, T215, S221, S421, S424] | MGGRKMATDEENVYGLEENAQSRQESTRRLILVGRTGAGKSATGNS... | MGGRKMAtDEENVyGLEENAQSRQESTRRLILVGRTGAGKSATGNS... |
| 5 | A0A087WUL8 | [S364] | MVVSAGPWSSEKAEMNILEINETLRPQLAEKKQQFRNLKEKCFLTQ... | MVVSAGPWSSEKAEMNILEINETLRPQLAEKKQQFRNLKEKCFLTQ... |
| 6 | A0A087WUV0 | [T244] | MEPEGRGSLFEDSDLLHAGNPKENDVTAVLLTPGSQELMIRDMAEA... | MEPEGRGSLFEDSDLLHAGNPKENDVTAVLLTPGSQELMIRDMAEA... |
| 7 | A0A087WV48 | [S9] | MGVQVETISPGDGRTFPKRGQTCVVHYTGCGQFWPSCSRISSTFLF... | MGVQVETIsPGDGRTFPKRGQTCVVHYTGCGQFWPSCSRISSTFLF... |
| 8 | A0A087WV96 | [T138, S139] | MDLIPNLAVETWLLLAVSLILLYLYGTRTHGLFKKLGIPGPTPLPF... | MDLIPNLAVETWLLLAVSLILLYLYGTRTHGLFKKLGIPGPTPLPF... |
| 9 | A0A087WZ62 | [S246, S249, S250, T253] | MAPRRLLLVGEGNFSFAAALSETLDQSTQLTATCLQRPAELARDPL... | MAPRRLLLVGEGNFSFAAALSETLDQSTQLTATCLQRPAELARDPL... |
| 10 | A0A087X0R7 | [S99, T106, S111, S118, S142, S147, T159, S162... | DASASEEEEEEEEEEDEDEEEEVAAWRLPPRWSQLGTSQRPRPSRP... | DASASEEEEEEEEEEDEDEEEEVAAWRLPPRWSQLGTSQRPRPSRP... |
| 11 | A0A087X1C1 | [S2] | MSDKPDLSEVEKFDRSKLKKTNTEEKNTLPSKEKGVSLCRQAGVQR... | MsDKPDLSEVEKFDRSKLKKTNTEEKNTLPSKEKGVSLCRQAGVQR... |
| 12 | A0A088AWK7 | [T213] | MDAKSLTAWSRTLVTFKDVFVDFTREEWKLLDTAQQIVYRNVMLEN... | MDAKSLTAWSRTLVTFKDVFVDFTREEWKLLDTAQQIVYRNVMLEN... |
| 13 | A0A096LP49 | [S519, S592, S775, S777, S779, S1027] | MPTLVVGTPPTCLGDTPQPCHKNSQRQGPFSHGAPGRAADWKAVAK... | MPTLVVGTPPTCLGDTPQPCHKNSQRQGPFSHGAPGRAADWKAVAK... |
| 14 | A0A096LP55 | [T11, S13] | MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA... | MGLEDEQKMLtEsGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA... |
| 15 | A0A0A0MRY4 | [T180, S184, S252, S254, T255, S270, T271, S29... | MTQAAVRPWAPCLENMTTAPNGLGPGPAAPCAGSDLKDAKMVTSLA... | MTQAAVRPWAPCLENMTTAPNGLGPGPAAPCAGSDLKDAKMVTSLA... |
| 16 | A0A0A6YYC7 | [S82, S83, S85, S101, S103, S142, S146, S149, ... | MPGETEEPRPPEQQDQEGGEAAKAAPEEPQQRPPEAVAAAPAGTTS... | MPGETEEPRPPEQQDQEGGEAAKAAPEEPQQRPPEAVAAAPAGTTS... |
| 17 | A0A0A6YYG9 | [T2, T4, S42, S43] | MTATLRPYLSAVRATLQAALCLENFSSQVVERHNKPEVEVRSSKEL... | MtAtLRPYLSAVRATLQAALCLENFSSQVVERHNKPEVEVRssKEL... |
| 18 | A0A0A6YYH1 | [S2, Y5, S98] | MSRIYHDGALRNKAVQSVRLPGAWDPAAHQGGNGVLLEGELIDVSR... | MsRIyHDGALRNKAVQSVRLPGAWDPAAHQGGNGVLLEGELIDVSR... |
| 19 | A0A0A6YYK5 | [S23, S141, S189, S191, S218] | MSMLAERRRKQKWAVDPQNTAWSNDDSKFGQRMLEKMGWSKGKGLG... | MSMLAERRRKQKWAVDPQNTAWsNDDSKFGQRMLEKMGWSKGKGLG... |
| 20 | A0A0A6YYL0 | [S2, S4] | MSDSDLGEDEGLLSLAGKRKRRGNLPKESVKILRDWLYLHRYNAYP... | MsDsDLGEDEGLLSLAGKRKRRGNLPKESVKILRDWLYLHRYNAYP... |
| 21 | A0A0A6YYL1 | [Y59, S73] | MARSRLTATSVSQVQVIAHSEYQKSKRISIFLSMQDEIETEEIIKD... | MARSRLTATSVSQVQVIAHSEYQKSKRISIFLSMQDEIETEEIIKD... |
| 22 | A0A0A6YYL6 | [S5, Y139, S141, S142] | MVRYSLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... | MVRYsLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... |
| 23 | A0A0B4J1R7 | [T6, S43, S45, S46] | MMATGTPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGSRSS... | MMATGtPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGsRss... |
Extract site seq
extract_site_seq
def extract_site_seq(
df:DataFrame, # dataframe that contains protein sequence
seq_col:str, # column name of protein sequence
site_col:str, # column name of site information (e.g., S10)
n:int=7, # length of surrounding sequence (default -7 to +7)
):
Extract -n to +n site sequence from protein sequence
As some datasets only contain protein information and the position of phosphorylation sites, but not the phosphorylation site sequence, we can retrieve the protein sequence and use this function to get the -7 to +7 phosphorylation site sequence (as a numpy array).
Remember to validate the phospho-acceptor at position 0 before extracting the site sequence, as there could be a mismatch due to protein sequence database updates.
df.head()| substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR |
| 1 | A0A075B6Q4 | NaN | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG |
| 2 | A0A075B6Q4 | NaN | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF |
| 3 | A0A075B6Q4 | NaN | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN |
| 4 | A0A075B6Q4 | NaN | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE |
extract_site_seq(df.head(),
seq_col='substrate_sequence',
site_col='site',
n=30
)100%|██████████| 5/5 [00:00<00:00, 10140.97it/s]
array(['___________MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSHTPRR',
'_______MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHL',
'KSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFT',
'DYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERF',
'DCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERFEKFYEQYDDDE'],
dtype='<U61')
Alignment
get_fasta
def get_fasta(
df, seq_col:str='kd_seq', id_col:str='kd_ID', path:str='out.fasta'
):
Generate fasta file from sequences.
get_fasta(kd,seq_col='kd_seq',id_col='kd_ID',path='raw/kinase_domains.fasta')
To run a Clustal Omega (clustalo) alignment, you can use either the terminal or the function below:
sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu
run_clustalo
def run_clustalo(
input_fasta, # .fasta fname
output_aln, # .aln output fname
outfmt:str='clu'
):
Run Clustal Omega to perform multiple sequence alignment.
run_clustalo("kinase_domains.fasta", "raw/kinase_domains.aln")
aln2df
def aln2df(
fname
):
Load an alignment (.aln) file, such as the one written by run_clustalo, into a dataframe.
df = aln2df("raw/kinase_domains.aln")
get_aln_freq
def get_aln_freq(
df
):
Get frequency of each amino acid across each position from the aln2df output.
freq_df = get_aln_freq(df)