Utils

Functions to preprocess sequences and prepare kinase substrate datasets

Setup

Utils


prepare_path


def prepare_path(
    path
):

Ensure the parent directory exists and return the full file path.

# Creates the parent directory if it does not exist, then returns the full path, so you can use it directly to save a file without worrying about whether the directory exists
# prepare_path('fig/results/my_plot.svg')
info = Data.kinase_info()
info.modi_group.value_counts().index
Index(['TK', 'CAMK', 'Other', 'CMGC', 'AGC', 'STE', 'TKL', 'Atypical', 'CK1',
       'NEK', 'RGC'],
      dtype='str', name='modi_group')

get_subfamily_color


def get_subfamily_color(
    
):

Return a dictionary keyed by kinase subfamily name (presumably mapping each subfamily to its assigned color).

get_subfamily_color().keys()
dict_keys(['NAK', 'Lmr', 'Abl', 'STKR1', 'STKR2', 'ABC1-B', 'ABC1-C', 'ABC1-D', 'Akt', 'ALK', 'Alpha', 'AMPK', 'RIPK', 'RAF', 'STE11', 'ATM', 'ATR', 'Aur', 'Axl', 'PDHK', 'Src', 'Tec', 'BRD', 'BRSK', 'BUB', 'CAMK1', 'CAMK2', 'CAMKK-Meta', 'CAMK-Unique', 'MLCK', 'CASK', 'GEK', 'CDC7', 'CDC2', 'CDK10', 'CDK11', 'CRK7', 'PFTAIRE', 'PCTAIRE', 'CDK8', 'CDK2', 'CDK', 'CDK4', 'CDK5', 'CDK7', 'CDK9', 'CDKL', 'ChaK', 'CHK1', 'RAD53', 'CK1', 'CK2', 'CLK', 'ABC1-A', 'STE-Unique', 'DMPK', 'PDGFR', 'Csk', 'DAPK', 'DCAMKL', 'DDR', 'LZK', 'DNAPK', 'Dusty', 'DYRK1', 'DYRK2', 'eEF2K', 'EGFR', 'Eph', 'ERK1', 'ERK5', 'ERK7', 'FAM20C', 'Fer', 'FGFR', 'VEGFR', 'KHS', 'GCN2', 'GRK', 'BARK', 'GSK', 'RGC', 'Haspin', 'MSN', 'HIPK', 'PEK', 'HUNK', 'RCK', 'InsR', 'IKK', 'ILK', 'IRAK', 'IRE', 'Jak', 'JakB', 'JNK', 'Trio', 'KIS', 'KSR', 'NDR', 'LIMK', 'LKB', 'SLK', 'LRRK', 'STE7', 'ERK3', 'MK2', 'MK5', 'MARK', 'MAST', 'MELK', 'Met', 'MLK', 'TKL-Unique', 'MNK', 'MOS', 'MSK', 'MSKb', 'MST', 'YSK', 'FRAP', 'Musk', 'NinaC', 'NEK', 'NIM1', 'nmo', 'NRBP', 'Trk', 'NuaK', 'FRAY', 'p38', 'RSKp70', 'RSKp90', 'RSKb', 'PAKA', 'PAKB', 'PASK', 'TOPK', 'NKF4', 'PDK1', 'NKF3', 'PHK', 'VPS15', 'PIM', 'NKF2', 'PKA', 'PKCa', 'PKCd', 'PKCh', 'PKCi', 'SgK493', 'PKG', 'WEE', 'PKN', 'PLK', 'Other-Unique', 'PKD', 'PRP4', 'Bud32', 'PSK', 'FAK', 'CCK4', 'Slob', 'QIK', 'Ret', 'RIO1', 'RIO2', 'RIO3', 'ROCK', 'Ror', 'Sev', 'RSKL', 'RSKR', 'Ryk', 'NKF1', 'SCY1', 'SGK', 'SMG1', 'SNRK', 'SRPK', 'TSSK', 'YANK', 'STK33', 'ULK', 'SgK495', 'SgK071', 'STLK', 'TK-Unique', 'Syk', 'TAK1', 'TAO', 'TBCK', 'Tie', 'TESK', 'NKF5', 'TLK', 'Ack', 'HH498', 'Trbl', 'TIF1', 'TRRAP', 'TTBK', 'TTK', 'VRK', 'WNK', 'ZAK'])
info.pspa_category_big.str.split('_').str[0].value_counts()[:20].index
Index(['Basophilic', 'Pro-directed', 'Acidophilic', 'Map3k', 'Map4k',
       'Alpha/mlk', 'Assorted', 'Fgf and vegf receptors', 'Ripk/wnk', 'Pkc',
       'Ephrin receptors', 'Nek/ask', 'Eif2ak/tlk', 'Pdgf receptors', 'Src',
       'Jak', 'Tam receptors', 'Tec', 'Cmgc', 'Ulk/ttbk'],
      dtype='object', name='pspa_category_big')
pspa_category_color.keys()
dict_keys(['Basophilic', 'Pro-directed', 'Acidophilic', 'Map3k', 'Map4k', 'Alpha/mlk', 'Fgf and vegf receptors', 'Assorted', 'Ripk/wnk', 'Pkc', 'Ephrin receptors', 'Eif2ak/tlk', 'Nek/ask', 'Pdgf receptors', 'Src', 'Jak', 'Ulk/ttbk', 'Cmgc', 'Tec', 'Tam receptors'])

remove_hi_corr


def remove_hi_corr(
    df:DataFrame, thr:float=0.98, # threshold
):

Remove highly correlated features in a dataframe given a pearson threshold

remove_hi_corr is a function to remove highly correlated features based on threshold of Pearson correlation between features.

# Load data
df = Data.aa_rdkit()
df.shape
(25, 106)
remove_hi_corr(df,thr=0.9).shape
(25, 78)

clean_feat


def clean_feat(
    df:DataFrame, thr:float=0.98
):

Remove features with no variance, and highly correlated features based on threshold

This function is similar to remove_hi_corr, but can additionally remove features with zero variance (e.g., 1 across all samples)

clean_feat(df,thr=0.9).shape
removing columns: {'Chi3v', 'fr_SH', 'Chi2v', 'Chi3n', 'Chi0v', 'Kappa1', 'VSA_EState6', 'Chi1v', 'fr_Ar_N', 'NumAromaticRings', 'fr_NH2', 'Chi2n', 'VSA_EState10', 'NumAromaticCarbocycles', 'Chi4n', 'SlogP_VSA5', 'RingCount', 'Ipc', 'NOCount', 'Chi0n', 'Chi4v', 'NumRotatableBonds', 'VSA_EState2', 'SMR_VSA9', 'SMR_VSA1', 'NumHDonors', 'Chi1', 'NumHeteroatoms'}
(25, 78)

standardize


def standardize(
    df
):

Standardize features from a df

Checker

In many phosphorylation datasets, there are amino acids in the site sequence that are in lower case but do not belong to s/t/y. Also, there are uncommon amino acids such as U or O that appear in the sequence. Therefore, it is essential to convert the sequence string before using it for kinase ranking.

Convert between uppercase phospho-acceptors, lowercase center residues, and pS/pT/pY labels.


STY2sty


def STY2sty(
    input_string:str
):

Replace all uppercase S/T/Y with lowercase s/t/y in a sequence.


pSTY2sty


def pSTY2sty(
    string
):

Convert pS/pT/pY to s/t/y in a string.


sty2pSTY


def sty2pSTY(
    string
):

Convert s/t/y to pS/pT/pY in a string.


sty2pSTY_df


def sty2pSTY_df(
    df
):

Apply sty to pSTY conversion to a dataframe index.


check_seq


def check_seq(
    seq
):

Convert non-s/t/y characters to uppercase and replace disallowed characters with underscores.

try:
    check_seq('aaadaaa')
except Exception as e:
    print(e)
Center must be s/t/y; got d in 'aaadaaa'
check_seq('AAkUuPSFstTH') # an error is raised if the center amino acid does not belong to sty/STY
'AAK__PSFstTH'

check_seqs


def check_seqs(
    data, col:NoneType=None
):

Convert non-s/t/y characters to upper case and replace characters that are not in the allowed set with underscores, for each sequence in a dataframe column or a series

df=Data.human_site()
check_seqs(df.head(),'site_seq')
0    _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1    QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2    EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3    EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4    RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: str
check_seqs(df['site_seq'].head())
0    _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1    QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2    EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3    EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4    RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
Name: site_seq, dtype: str

validate_site


def validate_site(
    site_info, seq
):

Validate that the residue at the site position in the sequence matches the residue given in the site info (e.g., S610).

site='S610'
seq = 'MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVGPKDLRSAMGEGGGPEPGPANAKWLKEGQNQLRRAATAHRDQNRNVTLTLAEEASQEPEMAPLGPKGLIHLYSELELSAHNAANRGLRGPGLIISTQEQGPDEGEEKAAGEAEEEEEDDDDEEEEEDLSSPPGLPEPLESVEAPPRPQALTDGPREHSKSASLLFGMRNSAASDEDSSWATLSQGSPSYGSPEDTDSFWNPNAFETDSDLPAGWMRVQDTSGTYYWHIPTGTTQWEPPGRASPSQGSSPQEESQLTWTGFAHGEGFEDGEFWKDEPSDEAPMELGLKEPEEGTLTFPAQSLSPEPLPQEEEKLPPRNTNPGIKCFAVRSLGWVEMTEEELAPGRSSVAVNNCIRQLSYHKNNLHDPMSGGWGEGKDLLLQLEDETLKLVEPQSQALLHAQPIISIRVWGVGRDSGRERDFAYVARDKLTQMLKCHVFRCEAPAKNIATSLHEICSKIMAERRNARCLVNGLSLDHSKLVDVPFQVEFPAPKNELVQKFQVYYLGNVPVAKPVGVDVINGALESVLSSSSREQWTPSHVSVAPATLTILHQQTEAVLGECRVRFLSFLAVGRDVHTFAFIMAAGPASFCCHMFWCEPNAASLSEAVQAACMLRYQKCLDARSQASTSCLPAPPAESVARRVGWTVRRGVQSLWGSLKPKRLGAHTP'
validate_site(site,seq)
1

validate_site_df


def validate_site_df(
    df, site_info_col, protein_seq_col
):

Validate, for each row of a dataframe, that the residue at the site position in the sequence matches the residue given in the site info.

validate_site_df(df.head(),'site','substrate_sequence')
0    1
1    1
2    1
3    1
4    1
dtype: int64

Phosphorylate protein seq


phosphorylate_seq


def phosphorylate_seq(
    seq, # full protein sequence
    sites:VAR_POSITIONAL, # site info, e.g., S140
):

Phosphorylate protein sequence based on phosphosites (e.g.,S140).

seq = 'MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDSQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDSVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'
phosphorylate_seq(seq,*['S95', 'S22', 'T25', 'S6', 'S158'])
'MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVMRDPNTKRSRGFGFVTYATVEEVDAAMNARPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKEDTEEHHLRDYFEQYGKIEVIEIMTDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVRKALSKQEMASASSSQRGRSGSGNFGGGRGGGFGGNDNFGRGGNFSGRGGFGGSRGGGGYGGSGDGYNGFGNDGSNFGGGGSYNDFGNYNNQSSNFGPMKGGNFEGRSSGPHGGGGQYFAKPRNQGGYGGSSSSSSYGSGRRF'

phosphorylate_seq_df


def phosphorylate_seq_df(
    df, id_col:str='substrate_uniprot', # column of sequence ID
    seq_col:str='substrate_sequence', # column that contains protein sequence
    site_col:str='site', # column that contains site info, e.g., S140
):

Phosphorylate whole sequence based on phosphosites in a dataframe

df=Data.human_site()
df.head()
substrate_uniprot substrate_genes site source AM_pathogenicity substrate_sequence substrate_species sub_site substrate_phosphoseq position site_seq
0 A0A024R4G9 C19orf48 MGC13170 hCG_2008493 S20 psp NaN MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... Homo sapiens (Human) A0A024R4G9_S20 MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... 20 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 A0A075B6Q4 NaN S24 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S24 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 24 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 A0A075B6Q4 NaN S35 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S35 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 35 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 A0A075B6Q4 NaN S57 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S57 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 57 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 A0A075B6Q4 NaN S68 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S68 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 68 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
phosphorylate_seq_df(df.head(100),'substrate_uniprot','substrate_sequence','site')
substrate_uniprot site substrate_sequence phosphoseq
0 A0A024R4G9 [S20] MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...
1 A0A075B6Q4 [S24, S35, S57, S68, S71, S72] MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...
2 A0A075B6T3 [S24, S26] XLKRAYRGLEEVQWCLEQLLTSPSPS XLKRAYRGLEEVQWCLEQLLTSPsPs
3 A0A075B759 [T68] MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF... MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
4 A0A087WTJ2 [T8, Y14, Y213, T215, S221, S421, S424] MGGRKMATDEENVYGLEENAQSRQESTRRLILVGRTGAGKSATGNS... MGGRKMAtDEENVyGLEENAQSRQESTRRLILVGRTGAGKSATGNS...
5 A0A087WUL8 [S364] MVVSAGPWSSEKAEMNILEINETLRPQLAEKKQQFRNLKEKCFLTQ... MVVSAGPWSSEKAEMNILEINETLRPQLAEKKQQFRNLKEKCFLTQ...
6 A0A087WUV0 [T244] MEPEGRGSLFEDSDLLHAGNPKENDVTAVLLTPGSQELMIRDMAEA... MEPEGRGSLFEDSDLLHAGNPKENDVTAVLLTPGSQELMIRDMAEA...
7 A0A087WV48 [S9] MGVQVETISPGDGRTFPKRGQTCVVHYTGCGQFWPSCSRISSTFLF... MGVQVETIsPGDGRTFPKRGQTCVVHYTGCGQFWPSCSRISSTFLF...
8 A0A087WV96 [T138, S139] MDLIPNLAVETWLLLAVSLILLYLYGTRTHGLFKKLGIPGPTPLPF... MDLIPNLAVETWLLLAVSLILLYLYGTRTHGLFKKLGIPGPTPLPF...
9 A0A087WZ62 [S246, S249, S250, T253] MAPRRLLLVGEGNFSFAAALSETLDQSTQLTATCLQRPAELARDPL... MAPRRLLLVGEGNFSFAAALSETLDQSTQLTATCLQRPAELARDPL...
10 A0A087X0R7 [S99, T106, S111, S118, S142, S147, T159, S162... DASASEEEEEEEEEEDEDEEEEVAAWRLPPRWSQLGTSQRPRPSRP... DASASEEEEEEEEEEDEDEEEEVAAWRLPPRWSQLGTSQRPRPSRP...
11 A0A087X1C1 [S2] MSDKPDLSEVEKFDRSKLKKTNTEEKNTLPSKEKGVSLCRQAGVQR... MsDKPDLSEVEKFDRSKLKKTNTEEKNTLPSKEKGVSLCRQAGVQR...
12 A0A088AWK7 [T213] MDAKSLTAWSRTLVTFKDVFVDFTREEWKLLDTAQQIVYRNVMLEN... MDAKSLTAWSRTLVTFKDVFVDFTREEWKLLDTAQQIVYRNVMLEN...
13 A0A096LP49 [S519, S592, S775, S777, S779, S1027] MPTLVVGTPPTCLGDTPQPCHKNSQRQGPFSHGAPGRAADWKAVAK... MPTLVVGTPPTCLGDTPQPCHKNSQRQGPFSHGAPGRAADWKAVAK...
14 A0A096LP55 [T11, S13] MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA... MGLEDEQKMLtEsGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...
15 A0A0A0MRY4 [T180, S184, S252, S254, T255, S270, T271, S29... MTQAAVRPWAPCLENMTTAPNGLGPGPAAPCAGSDLKDAKMVTSLA... MTQAAVRPWAPCLENMTTAPNGLGPGPAAPCAGSDLKDAKMVTSLA...
16 A0A0A6YYC7 [S82, S83, S85, S101, S103, S142, S146, S149, ... MPGETEEPRPPEQQDQEGGEAAKAAPEEPQQRPPEAVAAAPAGTTS... MPGETEEPRPPEQQDQEGGEAAKAAPEEPQQRPPEAVAAAPAGTTS...
17 A0A0A6YYG9 [T2, T4, S42, S43] MTATLRPYLSAVRATLQAALCLENFSSQVVERHNKPEVEVRSSKEL... MtAtLRPYLSAVRATLQAALCLENFSSQVVERHNKPEVEVRssKEL...
18 A0A0A6YYH1 [S2, Y5, S98] MSRIYHDGALRNKAVQSVRLPGAWDPAAHQGGNGVLLEGELIDVSR... MsRIyHDGALRNKAVQSVRLPGAWDPAAHQGGNGVLLEGELIDVSR...
19 A0A0A6YYK5 [S23, S141, S189, S191, S218] MSMLAERRRKQKWAVDPQNTAWSNDDSKFGQRMLEKMGWSKGKGLG... MSMLAERRRKQKWAVDPQNTAWsNDDSKFGQRMLEKMGWSKGKGLG...
20 A0A0A6YYL0 [S2, S4] MSDSDLGEDEGLLSLAGKRKRRGNLPKESVKILRDWLYLHRYNAYP... MsDsDLGEDEGLLSLAGKRKRRGNLPKESVKILRDWLYLHRYNAYP...
21 A0A0A6YYL1 [Y59, S73] MARSRLTATSVSQVQVIAHSEYQKSKRISIFLSMQDEIETEEIIKD... MARSRLTATSVSQVQVIAHSEYQKSKRISIFLSMQDEIETEEIIKD...
22 A0A0A6YYL6 [S5, Y139, S141, S142] MVRYSLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK... MVRYsLDPENPTKSCKSRGSNLRVHFKNTRETAQAIKGMHIRKATK...
23 A0A0B4J1R7 [T6, S43, S45, S46] MMATGTPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGSRSS... MMATGtPESQARFGQSVKGLLTEKVTTCGTDVIALTKQVLKGsRss...

Extract site seq


extract_site_seq


def extract_site_seq(
    df:DataFrame, # dataframe that contains protein sequence
    seq_col:str, # column name of protein sequence
    site_col:str, # column name of site information (e.g., S10)
    n:int=7, # length of surrounding sequence (default -7 to +7)
):

Extract -n to +n site sequence from protein sequence

As some datasets only contain protein information and the position of phosphorylation sites, but not the phosphorylation site sequence, we can retrieve the protein sequence and use this function to get the -7 to +7 phosphorylation site sequence (as a numpy array).

Remember to validate the phospho-acceptor at position 0 before extracting the site sequence, as there could be mismatches due to protein sequence database updates.

df.head()
substrate_uniprot substrate_genes site source AM_pathogenicity substrate_sequence substrate_species sub_site substrate_phosphoseq position site_seq
0 A0A024R4G9 C19orf48 MGC13170 hCG_2008493 S20 psp NaN MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... Homo sapiens (Human) A0A024R4G9_S20 MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... 20 _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1 A0A075B6Q4 NaN S24 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S24 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 24 QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
2 A0A075B6Q4 NaN S35 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S35 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 35 EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF
3 A0A075B6Q4 NaN S57 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S57 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 57 EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN
4 A0A075B6Q4 NaN S68 ochoa NaN MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... Homo sapiens (Human) A0A075B6Q4_S68 MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... 68 RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE
extract_site_seq(df.head(),
                 seq_col='substrate_sequence',
                 site_col='site',
                 n=30
                 )
100%|██████████| 5/5 [00:00<00:00, 10140.97it/s]
array(['___________MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSHTPRR',
       '_______MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHL',
       'KSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFT',
       'DYDSAGLLSDEDCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERF',
       'DCMSVPGKTHRAIADHLFWSEETKSRFTEYSMTSSVMRRNEQLTLHDERFEKFYEQYDDDE'],
      dtype='<U61')

Alignment


get_fasta


def get_fasta(
    df, seq_col:str='kd_seq', id_col:str='kd_ID', path:str='out.fasta'
):

Generate fasta file from sequences.

get_fasta(kd,seq_col='kd_seq',id_col='kd_ID',path='raw/kinase_domains.fasta')

To run a clustalo alignment, you can either use the terminal commands below or call the run_clustalo function.

sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu

run_clustalo


def run_clustalo(
    input_fasta, # .fasta fname
    output_aln, # .aln output fname
    outfmt:str='clu'
):

Run Clustal Omega to perform multiple sequence alignment.

run_clustalo("kinase_domains.fasta", "raw/kinase_domains.aln")

aln2df


def aln2df(
    fname
):

Read an alignment (.aln) file into a dataframe.

df = aln2df("raw/kinase_domains.aln")

get_aln_freq


def get_aln_freq(
    df
):

Get frequency of each amino acid across each position from the aln2df output.

freq_df = get_aln_freq(df)