Site promiscuity

from katlas.core import *
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from katlas.plot import *
import matplotlib.ticker as mticker
df=Data.get_ks_dataset()
df.head()
kin_sub_site kinase_uniprot substrate_uniprot site source substrate_genes substrate_phosphoseq position site_seq sub_site substrate_sequence kinase_on_tree kinase_genes kinase_group kinase_family kinase_pspa_big kinase_pspa_small kinase_coral_ID
0 O00141_A4FU28_S140 O00141 A4FU28 S140 Sugiyama CTAGE9 MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... 140 AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH A4FU28_S140 MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... 1 SGK1 SGK AGC SGK Basophilic Akt/rock SGK1
1 O00141_O00141_S252 O00141 O00141 S252 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 252 SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ O00141_S252 MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 1 SGK1 SGK AGC SGK Basophilic Akt/rock SGK1
2 O00141_O00141_S255 O00141 O00141 S255 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 255 HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD O00141_S255 MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 1 SGK1 SGK AGC SGK Basophilic Akt/rock SGK1
3 O00141_O00141_S397 O00141 O00141 S397 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 397 sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA O00141_S397 MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 1 SGK1 SGK AGC SGK Basophilic Akt/rock SGK1
4 O00141_O00141_S404 O00141 O00141 S404 Sugiyama SGK1 SGK MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 404 HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA O00141_S404 MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... 1 SGK1 SGK AGC SGK Basophilic Akt/rock SGK1

Total kinase count per site

pivot = pd.crosstab(df['sub_site'], df['substrate_uniprot'])

hist_data = pivot.sum(1)  # total kinase count per sub_site

# Define bins and labels
bins = [0, 1, 10, 100, 300]
labels = ['1', '2~10', '11~100', '101~300']

# Cut into categories
binned = pd.cut(hist_data, bins=bins, labels=labels, right=True, include_lowest=True)

# Count how many kin_sub_sites fall into each bin
binned_counts = binned.value_counts().sort_index()

sites = pd.concat([hist_data,binned],axis=1)

sites.columns=['num_kin','bin']

sites = sites.reset_index()

Plot sites kin distribution

# pip install brokenaxes
import matplotlib.pyplot as plt
from brokenaxes import brokenaxes

import pandas as pd
import numpy as np
set_sns()
fig = plt.figure(figsize=(5, 5))
bax = brokenaxes(ylims=((0, 5000), (20_000, 22_000)), hspace=0.2)

# Plot histogram
bax.hist(sites.num_kin, bins=100, edgecolor='black');

# bax.set_xlabel('# Kinases')
# bax.set_ylabel('Frequency') # overlap, does not work very well
plt.title('Histogram of # Kinases per Substrate Site');

cnt=sites.bin.value_counts()
def plot_pie(value_counts, # value counts
             hue_order=None, # list of strings
             labeldistance=0.8,
             fontsize=12,
             fontcolor='black',
             palette='tab20' ,
             figsize=(4,3)
            ):
    if hue_order is not None: value_counts = value_counts.reindex(hue_order)
    colors = sns.color_palette(palette, n_colors=len(value_counts))
    value_counts.plot.pie(
        autopct='%1.1f%%',    # Show percentage inside slices
        labeldistance=labeldistance,    # Move labels closer to center
        textprops={'fontsize': fontsize, 'color': fontcolor} ,
        colors=colors,
        figsize=figsize,
    )
    plt.ylabel('')
    plt.title(f'n={value_counts.sum():,}')
plot_pie(cnt,
         fontsize=9,labeldistance=1,palette='Pastel1')

def plot_cnt(cnt, xlabel=None,ylabel='Count',figsize=(6, 3)):
    fig, ax = plt.subplots(figsize=figsize)
    cnt.plot.bar(ax=ax)
    # Add text on top of each bar
    for idx, value in enumerate(cnt):
        ax.text(idx, value + 0.5, f"{value:,}", ha='center', va='bottom', fontsize=10)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    plt.xticks(rotation=0)
    plt.tight_layout()
plot_cnt(cnt,
         xlabel='# Kinases per Substrate Site',
         ylabel='# Substrate Sites',
         figsize=(6,2)
        )

# sites.num_kin.sort_values(ascending=False).to_csv('test.csv')

Characterize promicuous motif

seqs.site_source_all
seqs.sub_site.str.len().sort_values()
29155      9
28948      9
28949      9
28950      9
28951      9
        ... 
3778      77
10625     83
19363     87
16772    120
1150     120
Name: sub_site, Length: 29156, dtype: int64
seqs.iloc[1150]
site_seq                   AMGIMNsFVNDIFERIAGEAsRLAHyNKRStItsREIQTAV
site_source_all                                                 EPSD
substrate_gene     H2BC12,H2BC11,H2BC17,H2BC3,H2BC5,H2BC4,H2BC21,...
sub_site           O60814_S79,P06899_S79,P23527_S79,P33778_S79,P5...
O00141_SGK1                                                        0
                                         ...                        
Q9Y572_RIPK3                                                       0
Q9Y5S2_CDC42BPB                                                    0
Q9Y6E0_STK24                                                       0
Q9Y6M4_CSNK1G3                                                     0
Q9Y6R4_MAP3K4                                                      0
Name: 1150, Length: 459, dtype: object
seqs = Data.get_ks_unique()
seqs
site_seq site_source_all substrate_gene sub_site O00141_SGK1 O00238_BMPR1B O00311_CDC7 O00329_PIK3CD O00418_EEF2K O00443_PIK3C2A ... Q9Y2K2_SIK3 Q9Y2U5_MAP3K2 Q9Y3S1_WNK2 Q9Y463_DYRK1B Q9Y4K4_MAP4K5 Q9Y572_RIPK3 Q9Y5S2_CDC42BPB Q9Y6E0_STK24 Q9Y6M4_CSNK1G3 Q9Y6R4_MAP3K4
0 AAAAAAAAAVAAPPTAVGSLsGAEGVPVSsQPLPSQPW___ SIGNOR|human_phosphoproteome|PSP|iPTMNet MAZ P56270_S460 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 AAAAAAASGGAQQRsHHAPMsPGssGGGGQPLARtPQPssP PSP|human_phosphoproteome|EPSD|Sugiyama ARID1A O14497_S363 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 AAAAAAAVtAAstsYYGRDRsPLRRATAPVPTVGEGYGYGH human_phosphoproteome|PSP|EPSD RBM4 Q9BWF3_S309 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 AAAAAVSRRRKAEYPRRRRssPsARPPDVPGQQPQAAKsPs human_phosphoproteome|Sugiyama ZFP91 Q96JP5_S83 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 AAAAGAGKAEELHyPLGERRsDyDREALLGVQEDVDEyVKL Sugiyama RCN2 Q14257_S37 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29151 ___________________MstVHEILCKLsLEGDHstPPs SIGNOR|human_phosphoproteome|EPSD ANXA2 P07355_S2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29152 ___________________MsYRRELEKyRDLDEDEILGAL human_phosphoproteome|PSP|EPSD TMOD1 P28289_S2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29153 ___________________MtAKMETtFYDDALNASFLPSE SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 JUN P05412_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29154 ___________________MtSSyGHVLERQPALGGRLDsP Sugiyama PRRX1 P54821_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29155 ___________________MttsQKHRDFVAEPMGEKPVGS SIGNOR|human_phosphoproteome|EPSD|PSP|GPS6 BANF1 O75531_T2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

29156 rows × 459 columns

seq_map=df.drop_duplicates('sub_site').set_index('sub_site').site_seq

gene_map = df.drop_duplicates('sub_site').set_index('sub_site').substrate_genes
sites['sub_genes'] = sites.sub_site.map(gene_map)
sites['site_seq'] = sites.sub_site.map(seq_map)
for b in sites.bin.value_counts().index:
    sites_b=sites[sites.bin==b].copy()
    pssm_df= get_prob(sites_b,'site_seq')
    plot_logo(pssm_df,b,figsize=(13,2))
    kin_str='kinase' if b=='1' else 'kinases'
    plt.title(f"Sites with {b} {kin_str} (n={len(sites_b)})")
    plt.show()
    plt.close()

# a = sites[(sites.bin=='101~300')|(sites.bin=='11~100')].copy()

# s_sites = a[a.sub_site.str.split('_').str[1].str[0]=='S']

# y_sites = a[a.sub_site.str.split('_').str[1].str[0]=='Y']

# pssm_df_s = get_prob(s_sites,'site_seq')
# pssm_df_y = get_prob(y_sites,'site_seq')

Genes in promicuous site

sites['gene'] = sites.sub_genes.str.split(' ').str[0]
sites_b=sites[sites.bin=='101~300'].copy()
data = pd.read_csv('raw/genes_grouped.csv')
data.Gene.isin(genes).sum()
72
# remove (small) and (large) for rebosomal protein
data.Group = data.Group.str.split('(').str[0]
group_map = data.set_index('Gene')['Group'].to_dict()
sites_b['gene_group'] = sites_b.gene.map(group_map)
sites_b.gene_group.value_counts()
gene_group
Glycolysis                 16
Actin cytoskeleton         15
Heat shock protein         12
Ribosomal protein          12
RNA‑binding protein         6
POTE family                 3
NME family                  2
Signaling/regulatory        2
Tubulin cytoskeleton        2
Cytoskeleton‑associated     1
Regulatory/other            1
Name: count, dtype: int64

Stacked by source

def convert_source(x):
    if x == "Sugiyama":
        return x
    elif 'Sugiyama' in x and '|' in x:
        return 'Both'
    elif 'Sugiyama' not in x:
        return 'Non-Sugiyama'
# df=Data.get_ks_dataset()
df['source2'] = df.source.apply(convert_source)
out = df.groupby('sub_site')['source2'].unique()
sites['source'] = sites.sub_site.map(out)
def combine_source(sources):
    sources = set(sources)  # remove duplicates
    if sources == {'Sugiyama'}:
        return 'Sugiyama'
    elif sources == {'Non-Sugiyama'}:
        return 'Non-Sugiyama'
    else:
        return 'Both'
sites['source_num'] = sites.source.str.len()
sites['source_combine'] = sites.source.apply(combine_source)
def get_pct(df,bin_col, hue_col):
    count_df = df.groupby([bin_col, hue_col], observed=False).size().unstack(fill_value=0)
    pct_df = count_df.div(count_df.sum(axis=1), axis=0) * 100
    return pct_df
pct_df = get_pct(sites,'bin','source_combine')
pct_df
source_combine Both Non-Sugiyama Sugiyama
bin
1 1.126993 47.471138 51.401869
2~10 7.478214 30.535301 61.986484
11~100 7.072793 1.058338 91.868869
101~300 12.500000 0.000000 87.500000
pct_df.plot.bar()
plt.legend(title='Source')

def get_plt_color(palette, # dict, list, or set name (tab10)
                  columns, # columns in the df for plot
                 ):
    "Given a dict, list or set name, return the list of names; if dict, need to provide column names of the df."
    if isinstance(palette, dict):
        # Match colors to column order in pct_df
        colors = [palette.get(col, '#cccccc') for col in columns]  # fallback color if missing
    elif isinstance(palette, str):
        colors = sns.color_palette(palette, n_colors=len(columns))
    elif isinstance(palette, list):
        colors = palette
    return colors
get_plt_color('Set2',['a','b'])
import matplotlib.pyplot as plt

def plot_composition(df, bin_col, hue_col,palette='tab20',legend_title=None,rotate=45,xlabel=None,ylabel='Percentage',figsize=(5,3)):
    pct_df = get_pct(df,bin_col,hue_col)

    colors = get_plt_color(palette,pct_df.columns)
    
    pct_df.plot(kind='bar', figsize=figsize,stacked=True,color=colors)
    
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.xticks(rotation=rotate)
    if legend_title is None: legend_title = hue_col 
    plt.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc='upper left')
plot_composition(sites,
                 'bin',
                 'source_combine',
                 palette='Set2',
                 legend_title='Source',
                 figsize=(4,3)
                )

Stacked plot by site type

set_sns()
sites['acceptor'] = sites.sub_site.str.split('_').str[1].str[0]
plot_composition(sites,'bin',
                 'acceptor',
                 palette=sty_color,
                 figsize=(4,3),
                 legend_title='Acceptor'
                )

pct_df=get_pct(sites,'bin','acceptor')
pct_df
acceptor S T Y
bin
1 57.744640 24.422760 17.832600
2~10 55.779833 22.736973 21.483194
11~100 31.440372 16.339701 52.219928
101~300 58.333333 4.166667 37.500000
fig, axes = plt.subplots(3, 1, figsize=(5, 4.6), sharex=True)
for i, (ax, acc) in enumerate(zip(axes, ['S', 'T', 'Y'])):
    pct_df[acc].plot(kind='bar', ax=ax, color=sty_color[acc])
    ax.set_title(f'{acc} Sites')
    ax.grid(axis='y')
    ax.tick_params(axis='x', labelrotation=0)
    
    # Set ylabel only on the middle plot
    if i == 1:
        ax.set_ylabel('% of Total Substrate Sites Per Bin')
    else:
        ax.set_ylabel('')  # remove label from top/bottom

axes[-1].set_xlabel('Number of Kinases Per Substrate Site')
plt.subplots_adjust(hspace=0.3)

Save & Add num_kin to the ks_dataset

sites = sites.drop(columns='source')

Let’s add kinase binding info as one-hot

df['kinase_uniprot_gene']=df['kinase_uniprot']+'_'+df['kinase_genes'].str.split(' ').str[0]
pivot = pd.crosstab(df['sub_site'],df['kinase_uniprot_gene'])
pivot = pivot.reset_index()
sites = sites.merge(pivot)
sites.head()
sub_site num_kin bin sub_genes site_seq source_combine acceptor O00141_SGK1 O00238_BMPR1B O00311_CDC7 ... Q9Y2K2_SIK3 Q9Y2U5_MAP3K2 Q9Y3S1_WNK2 Q9Y463_DYRK1B Q9Y4K4_MAP4K5 Q9Y572_RIPK3 Q9Y5S2_CDC42BPB Q9Y6E0_STK24 Q9Y6M4_CSNK1G3 Q9Y6R4_MAP3K4
0 A0A2R8Y4L2_S158 1 1 HNRNPA1L3 HNRNPA1P48 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR Sugiyama S 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 A0A2R8Y4L2_S22 3 2~10 HNRNPA1L3 HNRNPA1P48 SKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTD Sugiyama S 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 A0A2R8Y4L2_S6 3 2~10 HNRNPA1L3 HNRNPA1P48 _______________MSKSEsPKEPEQLRKLFIGGLsFEtT Sugiyama S 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 A0A2R8Y4L2_S95 65 11~100 HNRNPA1L3 HNRNPA1P48 RPHKVDGRVVEPKRAVSREDsQRPDAHLTVKKIFVGGIKED Sugiyama S 0 1 0 ... 0 1 0 0 1 0 0 0 0 0
4 A0A2R8Y4L2_T25 3 2~10 HNRNPA1L3 HNRNPA1P48 EsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVV Sugiyama T 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 462 columns

# sites.to_parquet('out/unique_ks_sites.parquet')

Sites data is available upon ‘Data.get_ks_unique’

The num_kin info is added to ks_dataset in Data