from katlas.core import *
import pandas as pd
Background frequencies
# Load the kinase-substrate table via katlas' Data helper and preview it.
df = Data.get_ks_unique()
df.head()
sub_site | num_kin | bin | sub_genes | site_seq | source_combine | acceptor | O00141_SGK1 | O00238_BMPR1B | O00311_CDC7 | ... | Q9Y2K2_SIK3 | Q9Y2U5_MAP3K2 | Q9Y3S1_WNK2 | Q9Y463_DYRK1B | Q9Y4K4_MAP4K5 | Q9Y572_RIPK3 | Q9Y5S2_CDC42BPB | Q9Y6E0_STK24 | Q9Y6M4_CSNK1G3 | Q9Y6R4_MAP3K4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A2R8Y4L2_S158 | 1 | 1 | HNRNPA1L3 HNRNPA1P48 | TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | A0A2R8Y4L2_S22 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | SKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTD | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | A0A2R8Y4L2_S6 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | _______________MSKSEsPKEPEQLRKLFIGGLsFEtT | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | A0A2R8Y4L2_S95 | 65 | 11~100 | HNRNPA1L3 HNRNPA1P48 | RPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKED | Sugiyama | S | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | A0A2R8Y4L2_T25 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | EsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVV | Sugiyama | T | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 462 columns
PSSMs - ks dataset
def get_bg_dict(df, acceptor, seq_col='site_seq'):
    """Build a flattened background PSSM for the selected phospho-acceptors.

    Args:
        df: DataFrame with an ``acceptor`` column and a sequence column.
        acceptor: Acceptor residues to keep. It is expanded with
            ``list(acceptor)``, so a string such as ``'ST'`` selects rows whose
            ``acceptor`` is ``'S'`` or ``'T'``.
        seq_col: Name of the column holding the site sequences.

    Returns:
        Whatever ``flatten_pssm(site_pssm, True)`` returns for the filtered
        rows (presumably a flat position+residue -> frequency mapping; confirm
        against katlas).
    """
    # Work on a copy so the helper below cannot modify the caller's frame.
    site = df[df.acceptor.isin(list(acceptor))].copy()
    site_pssm = get_prob(site, seq_col)
    return flatten_pssm(site_pssm, True)
# Acceptor groups: single residues plus the combined S/T and S/T/Y sets.
acceptors = ['S', 'T', 'Y', 'ST', 'STY']
# One row label per acceptor group (element-wise string concatenation).
index_names = 'KS Dataset Background: ' + pd.Series(acceptors)
bg_pssms = pd.DataFrame(
    [get_bg_dict(df, acceptor) for acceptor in acceptors],
    index=index_names,
)
def get_site_cnt(df, acceptor, seq_col='site_seq') -> int:
    """Count the rows of ``df`` whose ``acceptor`` is one of the given residues.

    Args:
        df: DataFrame with an ``acceptor`` column.
        acceptor: Acceptor residues to count. It is expanded with
            ``list(acceptor)``, so ``'ST'`` counts rows labelled ``'S'`` or
            ``'T'``.
        seq_col: Unused; kept so the signature mirrors ``get_bg_dict``.

    Returns:
        Number of matching rows as a plain ``int``.
    """
    # Summing the boolean mask avoids copying the filtered frame just to
    # measure its length.
    mask = df.acceptor.isin(list(acceptor))
    return int(mask.sum())
# Site count for each background row, keyed by the row label.
cnt = {name: get_site_cnt(df, acceptor) for acceptor, name in zip(acceptors, index_names)}
plot_logos(bg_pssms, cnt, prefix='')
save_pdf('fig/background_ks.pdf')
PSSMs - human phosphoproteome
human = Data.get_human_site()
# The acceptor residue is the first character of the site label (e.g. 'S20' -> 'S').
human['acceptor'] = human.site.str[0]
human.head(1)
substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | site_seq | acceptor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR | S |
# One row label per acceptor group (element-wise string concatenation).
index_names2 = 'Human Phosphoproteome Background: ' + pd.Series(acceptors)
bg_pssms_human = pd.DataFrame(
    [get_bg_dict(human, acceptor) for acceptor in acceptors],
    index=index_names2,
)
bg_pssms_human
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20s | 20t | 20y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Human Phosphoproteome Background: S | 0.08364 | 0.06732 | 0.07243 | 0.01298 | 0.06843 | 0.04076 | 0.04860 | 0.03262 | 0.08046 | 0.01853 | ... | 0.02240 | 0.06643 | 0.06452 | 0.04737 | 0.03316 | 0.05179 | 0.07982 | 0.04460 | 0.01317 | 0.00367 |
Human Phosphoproteome Background: T | 0.07986 | 0.06436 | 0.07476 | 0.01194 | 0.06270 | 0.04093 | 0.05299 | 0.03346 | 0.07854 | 0.02181 | ... | 0.02007 | 0.07009 | 0.06264 | 0.04805 | 0.03220 | 0.05181 | 0.07866 | 0.04366 | 0.01702 | 0.00485 |
Human Phosphoproteome Background: Y | 0.07390 | 0.06714 | 0.07310 | 0.01602 | 0.05840 | 0.03946 | 0.05178 | 0.04145 | 0.08741 | 0.02026 | ... | 0.02177 | 0.06572 | 0.05960 | 0.04994 | 0.03130 | 0.05538 | 0.07811 | 0.03470 | 0.01619 | 0.01660 |
Human Phosphoproteome Background: ST | 0.08279 | 0.06666 | 0.07295 | 0.01275 | 0.06714 | 0.04079 | 0.04959 | 0.03281 | 0.08003 | 0.01927 | ... | 0.02188 | 0.06725 | 0.06410 | 0.04752 | 0.03295 | 0.05180 | 0.07956 | 0.04439 | 0.01403 | 0.00394 |
Human Phosphoproteome Background: STY | 0.08221 | 0.06669 | 0.07296 | 0.01296 | 0.06657 | 0.04071 | 0.04973 | 0.03338 | 0.08052 | 0.01933 | ... | 0.02187 | 0.06715 | 0.06381 | 0.04768 | 0.03284 | 0.05203 | 0.07946 | 0.04377 | 0.01417 | 0.00475 |
5 rows × 943 columns
# Stack the KS-dataset rows on top of the human-phosphoproteome rows.
all_pssms = pd.concat([bg_pssms, bg_pssms_human])
# all_pssms.to_parquet('~/katlas/dataset/CDDM/ks_background.parquet')
The saved background PSSMs are accessible through `Data.get_ks_background`.
cnt_human = {name: get_site_cnt(human, acceptor) for acceptor, name in zip(acceptors, index_names2)}
# Merge into the existing counts so one dict covers both sets of row labels.
cnt.update(cnt_human)
cnt
{'KS Dataset Background: S': 15981,
'KS Dataset Background: T': 6762,
'KS Dataset Background: Y': 7068,
'KS Dataset Background: ST': 22743,
'KS Dataset Background: STY': 29811,
'Human Phosphoproteome Background: S': 86995,
'Human Phosphoproteome Background: T': 25091,
'Human Phosphoproteome Background: Y': 7869,
'Human Phosphoproteome Background: ST': 112086,
'Human Phosphoproteome Background: STY': 119955}
plot_logos(bg_pssms_human, cnt, prefix='')
save_pdf('fig/background_human.pdf')