from katlas.core import *
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from katlas.plot import *
import matplotlib.ticker as mticker
Site promiscuity
# Load the kinase-substrate dataset and preview its first rows.
df = Data.get_ks_dataset()
df.head()
kin_sub_site | kinase_uniprot | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | kinase_on_tree | kinase_genes | kinase_group | kinase_family | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O00141_A4FU28_S140 | O00141 | A4FU28 | S140 | Sugiyama | CTAGE9 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 140 | AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH | A4FU28_S140 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 |
1 | O00141_O00141_S252 | O00141 | O00141 | S252 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 252 | SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ | O00141_S252 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 |
2 | O00141_O00141_S255 | O00141 | O00141 | S255 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 255 | HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD | O00141_S255 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 |
3 | O00141_O00141_S397 | O00141 | O00141 | S397 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 397 | sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA | O00141_S397 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 |
4 | O00141_O00141_S404 | O00141 | O00141 | S404 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 404 | HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA | O00141_S404 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 |
Total kinase count per site
# Cross-tabulate substrate sites against substrate proteins; every row of `df`
# adds one count to the row of its `sub_site`.
pivot = pd.crosstab(df['sub_site'], df['substrate_uniprot'])

# Row sums give the number of `df` rows per sub_site.
# NOTE(review): this is the number of distinct kinases only if each
# kinase/sub_site pair occurs once in `df` - confirm.
hist_data = pivot.sum(1)  # total kinase count per sub_site

# Define bins and labels.
# With right=True the intervals are (0, 1], (1, 10], (10, 100], (100, 300];
# counts above 300 would fall outside every bin and become NaN.
bins = [0, 1, 10, 100, 300]
labels = ['1', '2~10', '11~100', '101~300']

# Cut into categories
binned = pd.cut(hist_data, bins=bins, labels=labels, right=True, include_lowest=True)

# Count how many sub_sites fall into each bin
binned_counts = binned.value_counts().sort_index()

# One row per sub_site: the kinase count and the bin it falls into.
sites = pd.concat([hist_data, binned], axis=1)
sites.columns = ['num_kin', 'bin']
sites = sites.reset_index()
Plot the distribution of kinase counts per site
# pip install brokenaxes
import matplotlib.pyplot as plt
from brokenaxes import brokenaxes
import pandas as pd
import numpy as np
set_sns()

# Histogram of kinases per site on a y-axis broken between 5,000 and 20,000.
fig = plt.figure(figsize=(5, 5))
bax = brokenaxes(ylims=((0, 5000), (20_000, 22_000)), hspace=0.2)

# Plot histogram
bax.hist(sites.num_kin, bins=100, edgecolor='black');

# bax.set_xlabel('# Kinases')
# bax.set_ylabel('Frequency') # overlap, does not work very well
plt.title('Histogram of # Kinases per Substrate Site');

# Number of sites in each kinase-count bin; reused by the pie and bar plots.
cnt = sites.bin.value_counts()
def plot_pie(value_counts, # value counts
             hue_order=None, # list of strings
             labeldistance=0.8,
             fontsize=12,
             fontcolor='black',
             palette='tab20',
             figsize=(4,3)
            ):
    "Draw a pie chart of `value_counts` with percentage labels; the title shows the total count."
    # Optionally reorder the slices to follow `hue_order`.
    if hue_order is not None:
        value_counts = value_counts.reindex(hue_order)

    # One color per slice, taken from the seaborn palette.
    colors = sns.color_palette(palette, n_colors=len(value_counts))

    value_counts.plot.pie(
        autopct='%1.1f%%',  # Show percentage inside slices
        labeldistance=labeldistance,  # Move labels closer to center
        textprops={'fontsize': fontsize, 'color': fontcolor},
        colors=colors,
        figsize=figsize,
    )
    plt.ylabel('')  # blank out the y-axis label
    plt.title(f'n={value_counts.sum():,}')
# Pie chart of the share of sites in each kinase-count bin.
plot_pie(cnt, fontsize=9, labeldistance=1, palette='Pastel1')
def plot_cnt(cnt, xlabel=None, ylabel='Count', figsize=(6, 3)):
    "Bar plot of the counts in `cnt`, with each value written above its bar."
    fig, ax = plt.subplots(figsize=figsize)
    cnt.plot.bar(ax=ax)

    # Add text on top of each bar
    for idx, value in enumerate(cnt):
        ax.text(idx, value + 0.5, f"{value:,}", ha='center', va='bottom', fontsize=10)

    # Hide the top and right frame lines.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    plt.xticks(rotation=0)
    plt.tight_layout()
# Bar chart of the number of sites in each kinase-count bin.
plot_cnt(cnt,
         xlabel='# Kinases per Substrate Site',
         ylabel='# Substrate Sites',
         figsize=(6, 2)
         )
# sites.num_kin.sort_values(ascending=False).to_csv('test.csv')
Characterize promiscuous motif
# NOTE(review): `seqs` is only assigned further down (Data.get_ks_unique());
# running the notebook top to bottom needs that cell to run first.
seqs.site_source_all

# Length of the `sub_site` string per row, sorted ascending.
seqs.sub_site.str.len().sort_values()
29155 9
28948 9
28949 9
28950 9
28951 9
...
3778 77
10625 83
19363 87
16772 120
1150 120
Name: sub_site, Length: 29156, dtype: int64
# Inspect row 1150, one of the rows with the longest `sub_site` string.
seqs.iloc[1150]
site_seq AMGIMNsFVNDIFERIAGEAsRLAHyNKRStItsREIQTAV
site_source_all EPSD
substrate_gene H2BC12,H2BC11,H2BC17,H2BC3,H2BC5,H2BC4,H2BC21,...
sub_site O60814_S79,P06899_S79,P23527_S79,P33778_S79,P5...
O00141_SGK1 0
...
Q9Y572_RIPK3 0
Q9Y5S2_CDC42BPB 0
Q9Y6E0_STK24 0
Q9Y6M4_CSNK1G3 0
Q9Y6R4_MAP3K4 0
Name: 1150, Length: 459, dtype: object
# Load the unique-site table and display it.
seqs = Data.get_ks_unique()
seqs
site_seq | site_source_all | substrate_gene | sub_site | O00141_SGK1 | O00238_BMPR1B | O00311_CDC7 | O00329_PIK3CD | O00418_EEF2K | O00443_PIK3C2A | ... | Q9Y2K2_SIK3 | Q9Y2U5_MAP3K2 | Q9Y3S1_WNK2 | Q9Y463_DYRK1B | Q9Y4K4_MAP4K5 | Q9Y572_RIPK3 | Q9Y5S2_CDC42BPB | Q9Y6E0_STK24 | Q9Y6M4_CSNK1G3 | Q9Y6R4_MAP3K4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ | SIGNOR|human_phosphoproteome|PSP|iPTMNet | MAZ | P56270_S460 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP | PSP|human_phosphoproteome|EPSD|Sugiyama | ARID1A | O14497_S363 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH | human_phosphoproteome|PSP|EPSD | RBM4 | Q9BWF3_S309 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs | human_phosphoproteome|Sugiyama | ZFP91 | Q96JP5_S83 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL | Sugiyama | RCN2 | Q14257_S37 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
29151 | ___________________MstVHEILCKLsLEGDHstPPs | SIGNOR|human_phosphoproteome|EPSD | ANXA2 | P07355_S2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29152 | ___________________MsYRRELEKyRDLDEDEILGAL | human_phosphoproteome|PSP|EPSD | TMOD1 | P28289_S2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29153 | ___________________MtAKMETtFYDDALNASFLPSE | SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 | JUN | P05412_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29154 | ___________________MtSSyGHVLERQPALGGRLDsP | Sugiyama | PRRX1 | P54821_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29155 | ___________________MttsQKHRDFVAEPMGEKPVGS | SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 | BANF1 | O75531_T2 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
29156 rows × 459 columns
# Lookup tables keyed by sub_site, built from the first row of each sub_site in `df`.
seq_map = df.drop_duplicates('sub_site').set_index('sub_site').site_seq
gene_map = df.drop_duplicates('sub_site').set_index('sub_site').substrate_genes

sites['sub_genes'] = sites.sub_site.map(gene_map)
sites['site_seq'] = sites.sub_site.map(seq_map)

# One sequence logo per kinase-count bin.
for b in sites.bin.value_counts().index:
    sites_b = sites[sites.bin == b].copy()
    pssm_df = get_prob(sites_b, 'site_seq')
    plot_logo(pssm_df, b, figsize=(13, 2))
    kin_str = 'kinase' if b == '1' else 'kinases'  # singular only for the '1' bin
    plt.title(f"Sites with {b} {kin_str} (n={len(sites_b)})")
    plt.show()
    plt.close()

# a = sites[(sites.bin=='101~300')|(sites.bin=='11~100')].copy()
# s_sites = a[a.sub_site.str.split('_').str[1].str[0]=='S']
# y_sites = a[a.sub_site.str.split('_').str[1].str[0]=='Y']
# pssm_df_s = get_prob(s_sites,'site_seq')
# pssm_df_y = get_prob(y_sites,'site_seq')
Genes in promiscuous sites
# First gene name of each site's space-separated gene list.
sites['gene'] = sites.sub_genes.str.split(' ').str[0]

# Restrict to the most promiscuous bin.
sites_b = sites[sites.bin == '101~300'].copy()

# Gene -> group annotation table (columns `Gene` and `Group`).
data = pd.read_csv('raw/genes_grouped.csv')

# NOTE(review): `genes` is not defined anywhere in the visible notebook;
# presumably the gene names of `sites_b` - confirm before running.
data.Gene.isin(genes).sum()

# Remove the "(small)" / "(large)" suffix for ribosomal proteins; strip the
# whitespace left in front of the removed parenthesis so group labels match.
data['Group'] = data.Group.str.split('(').str[0].str.strip()

group_map = data.set_index('Gene')['Group'].to_dict()
sites_b['gene_group'] = sites_b.gene.map(group_map)
sites_b.gene_group.value_counts()
gene_group
Glycolysis 16
Actin cytoskeleton 15
Heat shock protein 12
Ribosomal protein 12
RNA‑binding protein 6
POTE family 3
NME family 2
Signaling/regulatory 2
Tubulin cytoskeleton 2
Cytoskeleton‑associated 1
Regulatory/other 1
Name: count, dtype: int64
Stacked by source
def convert_source(x):
    """Classify a source string relative to the 'Sugiyama' dataset.

    Returns 'Sugiyama' when `x` is exactly that name, 'Both' when `x` contains
    'Sugiyama' together with a '|' separator, and 'Non-Sugiyama' when
    'Sugiyama' does not occur in `x`. Any other string (contains 'Sugiyama',
    is not equal to it, and has no '|') yields None.
    """
    if x == "Sugiyama":
        return x
    if 'Sugiyama' not in x:
        return 'Non-Sugiyama'
    if '|' in x:
        return 'Both'
    # No rule matches this input; keep the historical None result explicit.
    return None
# df=Data.get_ks_dataset()
# Classify the source of every kinase-substrate row.
df['source2'] = df.source.apply(convert_source)

# Collect the distinct source classes seen for each sub_site and attach them to `sites`.
out = df.groupby('sub_site')['source2'].unique()
sites['source'] = sites.sub_site.map(out)
def combine_source(sources):
    """Collapse a collection of source classes into a single label.

    Returns 'Sugiyama' or 'Non-Sugiyama' when that is the only distinct value
    in `sources`; every other combination (including an empty collection)
    returns 'Both'.
    """
    distinct = set(sources)  # remove duplicates
    if distinct == {'Sugiyama'}:
        return 'Sugiyama'
    if distinct == {'Non-Sugiyama'}:
        return 'Non-Sugiyama'
    return 'Both'
# Number of distinct source classes per site, and the single combined label.
sites['source_num'] = sites.source.str.len()
sites['source_combine'] = sites.source.apply(combine_source)
def get_pct(df, bin_col, hue_col):
    """Return the percentage of each `hue_col` value within every `bin_col` group.

    The result has one row per `bin_col` value and one column per `hue_col`
    value; each row is divided by its own total and scaled to 100.
    """
    # Count rows per (bin, hue) pair; pairs that never occur are filled with 0.
    count_df = df.groupby([bin_col, hue_col], observed=False).size().unstack(fill_value=0)
    # Normalize each row by its total.
    pct_df = count_df.div(count_df.sum(axis=1), axis=0) * 100
    return pct_df
# Percentage of each combined source label within every kinase-count bin.
pct_df = get_pct(sites, 'bin', 'source_combine')
pct_df
source_combine | Both | Non-Sugiyama | Sugiyama |
---|---|---|---|
bin | |||
1 | 1.126993 | 47.471138 | 51.401869 |
2~10 | 7.478214 | 30.535301 | 61.986484 |
11~100 | 7.072793 | 1.058338 | 91.868869 |
101~300 | 12.500000 | 0.000000 | 87.500000 |
# Grouped bar plot of the source percentages per bin.
pct_df.plot.bar()
plt.legend(title='Source')
def get_plt_color(palette, # dict, list, or set name (tab10)
                  columns, # columns in the df for plot
                 ):
    "Given a dict, list or set name, return the list of colors; if dict, need to provide column names of the df."
    if isinstance(palette, dict):
        # Match colors to column order; fallback color if a column is missing
        return [palette.get(col, '#cccccc') for col in columns]
    if isinstance(palette, str):
        # Named seaborn palette: one color per column
        return sns.color_palette(palette, n_colors=len(columns))
    if isinstance(palette, list):
        # Already a list of colors: hand it back unchanged
        return palette
    if isinstance(palette, tuple):
        return list(palette)
    # Previously an unsupported type left the result unassigned and failed
    # with an UnboundLocalError; report the actual problem instead.
    raise TypeError(
        f"palette must be a dict, str, list or tuple, got {type(palette).__name__}"
    )
# Example: two colors from the 'Set2' palette.
get_plt_color('Set2', ['a', 'b'])
import matplotlib.pyplot as plt
def plot_composition(df, bin_col, hue_col, palette='tab20', legend_title=None, rotate=45, xlabel=None, ylabel='Percentage', figsize=(5,3)):
    "Stacked bar plot of the percentage of each `hue_col` value within every `bin_col` group."
    pct_df = get_pct(df, bin_col, hue_col)

    # Resolve the palette into one color per hue column.
    colors = get_plt_color(palette, pct_df.columns)

    pct_df.plot(kind='bar', figsize=figsize, stacked=True, color=colors)

    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.xticks(rotation=rotate)

    # Default the legend title to the hue column name.
    if legend_title is None:
        legend_title = hue_col
    # Anchor the legend just outside the upper-right corner of the axes.
    plt.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc='upper left')
# Stacked source composition per kinase-count bin.
plot_composition(sites, 'bin',
                 'source_combine',
                 palette='Set2',
                 legend_title='Source',
                 figsize=(4, 3)
                 )
Stacked plot by site type
set_sns()

# First character after the '_' in sub_site, e.g. 'S' for 'A4FU28_S140'.
sites['acceptor'] = sites.sub_site.str.split('_').str[1].str[0]

plot_composition(sites, 'bin',
                 'acceptor',
                 palette=sty_color,
                 figsize=(4, 3),
                 legend_title='Acceptor'
                 )

# Percentage of each acceptor within every kinase-count bin.
pct_df = get_pct(sites, 'bin', 'acceptor')
pct_df
pct_df
acceptor | S | T | Y |
---|---|---|---|
bin | |||
1 | 57.744640 | 24.422760 | 17.832600 |
2~10 | 55.779833 | 22.736973 | 21.483194 |
11~100 | 31.440372 | 16.339701 | 52.219928 |
101~300 | 58.333333 | 4.166667 | 37.500000 |
# One bar plot per acceptor (S, T, Y), stacked vertically with a shared x-axis.
fig, axes = plt.subplots(3, 1, figsize=(5, 4.6), sharex=True)

for i, (ax, acc) in enumerate(zip(axes, ['S', 'T', 'Y'])):
    pct_df[acc].plot(kind='bar', ax=ax, color=sty_color[acc])
    ax.set_title(f'{acc} Sites')
    ax.grid(axis='y')
    ax.tick_params(axis='x', labelrotation=0)

    # Set ylabel only on the middle plot
    if i == 1:
        ax.set_ylabel('% of Total Substrate Sites Per Bin')
    else:
        ax.set_ylabel('')  # remove label from top/bottom

axes[-1].set_xlabel('Number of Kinases Per Substrate Site')
plt.subplots_adjust(hspace=0.3)
Save & Add num_kin to the ks_dataset
# Drop the per-site array of source classes before saving.
sites = sites.drop(columns='source')
Let’s add kinase binding info as one-hot
# Kinase label "<uniprot>_<first gene name>", used as the indicator column name.
df['kinase_uniprot_gene'] = df['kinase_uniprot'] + '_' + df['kinase_genes'].str.split(' ').str[0]

# sub_site x kinase count table; sub_site is moved back to a column for merging.
pivot = pd.crosstab(df['sub_site'], df['kinase_uniprot_gene'])
pivot = pivot.reset_index()

# Join on the site key explicitly instead of relying on whichever column
# names the two frames happen to share.
sites = sites.merge(pivot, on='sub_site')
sites.head()
sub_site | num_kin | bin | sub_genes | site_seq | source_combine | acceptor | O00141_SGK1 | O00238_BMPR1B | O00311_CDC7 | ... | Q9Y2K2_SIK3 | Q9Y2U5_MAP3K2 | Q9Y3S1_WNK2 | Q9Y463_DYRK1B | Q9Y4K4_MAP4K5 | Q9Y572_RIPK3 | Q9Y5S2_CDC42BPB | Q9Y6E0_STK24 | Q9Y6M4_CSNK1G3 | Q9Y6R4_MAP3K4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A2R8Y4L2_S158 | 1 | 1 | HNRNPA1L3 HNRNPA1P48 | TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | A0A2R8Y4L2_S22 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | SKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTD | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | A0A2R8Y4L2_S6 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | _______________MSKSEsPKEPEQLRKLFIGGLsFEtT | Sugiyama | S | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | A0A2R8Y4L2_S95 | 65 | 11~100 | HNRNPA1L3 HNRNPA1P48 | RPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKED | Sugiyama | S | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | A0A2R8Y4L2_T25 | 3 | 2~10 | HNRNPA1L3 HNRNPA1P48 | EsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVV | Sugiyama | T | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 462 columns
# sites.to_parquet('out/unique_ks_sites.parquet')
The sites data is available via `Data.get_ks_unique()`.
The num_kin info is added to ks_dataset in Data