Background frequencies

from katlas.core import *
import pandas as pd

df = Data.get_ks_unique()

PSSMs - ks dataset

def get_bg_dict(df, acceptor, seq_col='site_seq'):
    site = df[df.acceptor.isin(list(acceptor))].copy()
    site_pssm = get_prob(site, seq_col)
    return flatten_pssm(site_pssm, True)

acceptors = ['S','T','Y','ST','STY']

index_names = 'ks_' + pd.Series(acceptors)

bg_pssms = pd.DataFrame([get_bg_dict(df,acceptor) for acceptor in acceptors],index=index_names)

bg_pssms

	-20P	-20G	-20A	-20C	-20S	-20T	-20V	-20I	-20L	-20M	...	20H	20K	20R	20Q	20N	20D	20E	20s	20t	20y
ks_S	0.07430	0.06810	0.07593	0.01466	0.05325	0.03852	0.05129	0.04034	0.07899	0.02027	...	0.02129	0.07033	0.05931	0.04420	0.03627	0.05488	0.07732	0.04346	0.01498	0.00645
ks_T	0.06163	0.06590	0.07427	0.01644	0.04383	0.03439	0.05966	0.04018	0.08948	0.02481	...	0.02239	0.07507	0.06056	0.04526	0.03801	0.05867	0.07664	0.03548	0.01987	0.00899
ks_Y	0.05618	0.07242	0.07067	0.01595	0.04448	0.03877	0.05589	0.04594	0.08632	0.02429	...	0.01950	0.07115	0.05789	0.04250	0.03763	0.05622	0.08318	0.02316	0.01249	0.01737
ks_ST	0.07050	0.06744	0.07543	0.01520	0.05042	0.03728	0.05380	0.04029	0.08214	0.02163	...	0.02162	0.07174	0.05969	0.04452	0.03679	0.05601	0.07712	0.04108	0.01644	0.00721
ks_STY	0.06710	0.06863	0.07430	0.01537	0.04901	0.03763	0.05430	0.04163	0.08313	0.02226	...	0.02112	0.07160	0.05926	0.04404	0.03699	0.05606	0.07855	0.03685	0.01551	0.00961

5 rows × 943 columns

def get_site_cnt(df, acceptor, seq_col='site_seq'):
    site = df[df.acceptor.isin(list(acceptor))].copy()
    return len(site)

cnt = {name:get_site_cnt(df,acceptor) for acceptor,name in zip(acceptors,index_names)}

plot_logos(bg_pssms,cnt,prefix='')

PSSMs - human phosphoproteome

human = Data.get_human_site()

human['acceptor']=human.site.str[0]

human.head(1)

	substrate_uniprot	substrate_genes	site	source	AM_pathogenicity	substrate_sequence	substrate_species	sub_site	substrate_phosphoseq	position	site_seq	acceptor
0	A0A024R4G9	C19orf48 MGC13170 hCG_2008493	S20	psp	NaN	MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...	Homo sapiens (Human)	A0A024R4G9_S20	MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...	20	_MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR	S

index_names2 = 'human_' + pd.Series(acceptors)

bg_pssms_human = pd.DataFrame([get_bg_dict(human,acceptor) for acceptor in acceptors],index=index_names2)

bg_pssms_human

	-20P	-20G	-20A	-20C	-20S	-20T	-20V	-20I	-20L	-20M	...	20H	20K	20R	20Q	20N	20D	20E	20s	20t	20y
human_S	0.08364	0.06732	0.07243	0.01298	0.06843	0.04076	0.04860	0.03262	0.08046	0.01853	...	0.02240	0.06643	0.06452	0.04737	0.03316	0.05179	0.07982	0.04460	0.01317	0.00367
human_T	0.07986	0.06436	0.07476	0.01194	0.06270	0.04093	0.05299	0.03346	0.07854	0.02181	...	0.02007	0.07009	0.06264	0.04805	0.03220	0.05181	0.07866	0.04366	0.01702	0.00485
human_Y	0.07390	0.06714	0.07310	0.01602	0.05840	0.03946	0.05178	0.04145	0.08741	0.02026	...	0.02177	0.06572	0.05960	0.04994	0.03130	0.05538	0.07811	0.03470	0.01619	0.01660
human_ST	0.08279	0.06666	0.07295	0.01275	0.06714	0.04079	0.04959	0.03281	0.08003	0.01927	...	0.02188	0.06725	0.06410	0.04752	0.03295	0.05180	0.07956	0.04439	0.01403	0.00394
human_STY	0.08221	0.06669	0.07296	0.01296	0.06657	0.04071	0.04973	0.03338	0.08052	0.01933	...	0.02187	0.06715	0.06381	0.04768	0.03284	0.05203	0.07946	0.04377	0.01417	0.00475

5 rows × 943 columns

all_pssms = pd.concat([bg_pssms,bg_pssms_human])

# all_pssms.to_parquet('~/katlas/dataset/CDDM/ks_background.parquet')

Accessible through Data.get_ks_background

cnt_human = {name:get_site_cnt(human,acceptor) for acceptor,name in zip(acceptors,index_names2)}

cnt.update(cnt_human)

cnt

{'ks_S': 15981,
 'ks_T': 6762,
 'ks_Y': 7068,
 'ks_ST': 22743,
 'ks_STY': 29811,
 'human_S': 86995,
 'human_T': 25091,
 'human_Y': 7869,
 'human_ST': 112086,
 'human_STY': 119955}

plot_logos(bg_pssms_human,cnt,prefix='')