Motif extraction

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.colors import LogNorm
from katlas.core import *
from katlas.plot import *
from tqdm import tqdm
from pathlib import Path
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")
df=pd.read_parquet('raw/combine_source_grouped.parquet')
cluster_map=pd.read_parquet('raw/kmeans_site_long_new_cluster.parquet')
cluster_map = cluster_map[['sub_site','cluster_new']].drop_duplicates()
cluster_map = cluster_map[cluster_map.sub_site.isin(df.sub_site.unique())].reset_index(drop=True)
df = df.merge(cluster_map,on='sub_site',how='outer')
df.to_parquet('raw/combine_source_grouped_cluster.parquet',index=False)

Note that the df above is in long form, with duplicates in sub_site due to cluster overlap

df
kin_sub_site kinase_uniprot substrate_uniprot site source substrate_genes substrate_sequence substrate_phosphoseq position site_seq sub_site cluster_new
0 P48730_A0A2R8Y4L2_S158 P48730 A0A2R8Y4L2 S158 Sugiyama HNRNPA1L3 HNRNPA1P48 MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM... MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVM... 158 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR A0A2R8Y4L2_S158 391
1 P48730_A0A2R8Y4L2_S158 P48730 A0A2R8Y4L2 S158 Sugiyama HNRNPA1L3 HNRNPA1P48 MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM... MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVM... 158 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR A0A2R8Y4L2_S158 634
2 P48730_A0A2R8Y4L2_S158 P48730 A0A2R8Y4L2 S158 Sugiyama HNRNPA1L3 HNRNPA1P48 MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM... MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVM... 158 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR A0A2R8Y4L2_S158 392
3 P48730_A0A2R8Y4L2_S158 P48730 A0A2R8Y4L2 S158 Sugiyama HNRNPA1L3 HNRNPA1P48 MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM... MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVM... 158 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR A0A2R8Y4L2_S158 542
4 P48730_A0A2R8Y4L2_S158 P48730 A0A2R8Y4L2 S158 Sugiyama HNRNPA1L3 HNRNPA1P48 MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM... MSKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTDCVVM... 158 TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR A0A2R8Y4L2_S158 388
... ... ... ... ... ... ... ... ... ... ... ... ...
813555 P52333_Q9Y6Y8_Y935 P52333 Q9Y6Y8 Y935 Sugiyama SEC23IP MSTP053 MAERKPNGGSGGASTSSSGTNLLFSSSATEFSFNVPFIPVTQASAS... MAERKPNGGSGGASTSSSGTNLLFSSSATEFSFNVPFIPVTQASAS... 935 KQVVEAEKVVEsPDFsKDEDyLGKVGMLNGGRRIDYVLQEK Q9Y6Y8_Y935 672
813556 P07948_Q9Y6Y9_Y131 P07948 Q9Y6Y9 Y131 EPSD|PSP LY96 ESOP1 MD2 MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI... MLPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQYPISI... 131 ETVNTTISFSFKGIKFSKGKyKCVVEAISGSPEEMLFCLEF Q9Y6Y9_Y131 676
813557 P07948_Q9Y6Y9_Y131 P07948 Q9Y6Y9 Y131 EPSD|PSP LY96 ESOP1 MD2 MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI... MLPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQYPISI... 131 ETVNTTISFSFKGIKFSKGKyKCVVEAISGSPEEMLFCLEF Q9Y6Y9_Y131 672
813558 P07948_Q9Y6Y9_Y22 P07948 Q9Y6Y9 Y22 EPSD|PSP LY96 ESOP1 MD2 MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI... MLPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQYPISI... 22 LPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQY Q9Y6Y9_Y22 676
813559 P07948_Q9Y6Y9_Y22 P07948 Q9Y6Y9 Y22 EPSD|PSP LY96 ESOP1 MD2 MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI... MLPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQYPISI... 22 LPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQY Q9Y6Y9_Y22 672

813560 rows × 12 columns

Motifs

df.drop_duplicates('sub_site')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[47], line 1
----> 1 df['kinase_uniprot'].drop_duplicates(subset='sub_site')

TypeError: drop_duplicates() got an unexpected keyword argument 'subset'
def extract_motifs2(df, cluster_col, subcluster_col=None, seq_col='site_seq',count_thr=10, valid_thr=None,plot=False):
    "Extract motifs from clusters in a dataframe"
    pssms = []
    ids = []
    value_counts = df[cluster_col].value_counts()
    for cluster_id, counts in tqdm(value_counts.items(),total=len(value_counts)):
        if count_thr is not None and counts <= count_thr:
            continue

        # Modify this line 
        # df_cluster = df[df[cluster_col] == cluster_id]
        df_cluster = df[df[cluster_col] == cluster_id].drop_duplicates('sub_site')
        n= len(df_cluster)
        pssm = get_prob(df_cluster, seq_col)
        valid_score = (pssm != 0).sum().sum() / (pssm.shape[0] * pssm.shape[1])

        if valid_thr is not None and valid_score <= valid_thr:
            continue

        pssms.append(flatten_pssm(pssm))
        ids.append(cluster_id)

        if plot:
            plot_logo(pssm, title=f'{cluster_id} (n={n})', figsize=(14, 1))
            plt.show()
            plt.close()
            if subcluster_col is not None:
                subcluster_counts = df_cluster[subcluster_col].value_counts()
                print(subcluster_counts)
                for idx, counts in subcluster_counts.items():
                    i=0
                    if count_thr is not None and counts <= count_thr:
                        continue
                    df_subcluster= df_cluster[df_cluster[subcluster_col] == idx].drop_duplicates('sub_site')
                    pssm_sub = get_prob(df_subcluster, seq_col)
                    valid_score_sub = (pssm_sub != 0).sum().sum() / (pssm_sub.shape[0] * pssm_sub.shape[1])

                    if valid_thr is not None and valid_score_sub <= valid_thr:
                        continue
                    n_sub = len(df_subcluster)
                    plot_logo(pssm_sub, title=f'Cluster {idx} (n={n_sub})', figsize=(14, 1))
                    plt.show()
                    plt.close()
                    i += 1
                    if i== 3:
                        break

    pssm_df = pd.DataFrame(pssms, index=ids)
    return pssm_df
_ = extract_motifs2(df,'kinase_uniprot',subcluster_col='cluster_new',count_thr=10,valid_thr=0.5,plot=True)
_ = extract_motifs2(df,'cluster_new','site_seq',count_thr=10,valid_thr=0.3,plot=True)
  0%|          | 0/747 [00:00<?, ?it/s]

  0%|          | 1/747 [00:09<1:57:48,  9.48s/it]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[27], line 1
----> 1 _ = extract_motifs2(df,'cluster_new','site_seq',count_thr=10,valid_thr=0.3,plot=True)

Cell In[26], line 24, in extract_motifs2(df, cluster_col, seq_col, count_thr, valid_thr, plot)
     21 ids.append(cluster_id)
     23 if plot:
---> 24     plot_logo(pssm, title=f'Cluster {cluster_id} (n={n})', figsize=(14, 1))
     25     plt.show()
     26     plt.close()

File f:\git\katlas\katlas\plot.py:132, in plot_logo(prob_df, title, scale_zero, ax, figsize)
    130 if ax is None:
    131     fig, ax = plt.subplots(figsize=figsize)
--> 132 logo = logomaker.Logo(logo_df.T, color_scheme='kinase_protein', flip_below=False, ax=ax)
    133 logo.ax.set_ylabel("Info (bits)")
    134 logo.style_xticks(fmt='%d')

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\error_handling.py:106, in handle_errors.<locals>.wrapped_func(*args, **kwargs)
    102 mistake = None
    104 try:
    105     # Execute function
--> 106     result = func(*args, **kwargs)
    108     # If running functional test and expect to fail
    109     if should_fail is True:

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\Logo.py:186, in Logo.__init__(self, df, color_scheme, font_name, stack_order, center_values, baseline_width, flip_below, shade_below, fade_below, fade_probabilities, vpad, vsep, alpha, show_spines, ax, zorder, figsize, **kwargs)
    183 self.fig = ax.figure
    185 # compute characters
--> 186 self._compute_glyphs()
    188 # style glyphs below x-axis
    189 self.style_glyphs_below(shade=self.shade_below,
    190                         fade=self.fade_below,
    191                         flip=self.flip_below)

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\Logo.py:1058, in Logo._compute_glyphs(self)
   1055 flip = (v < 0 and self.flip_below)
   1057 # Create glyph if height is finite
-> 1058 glyph = Glyph(p, c,
   1059               ax=self.ax,
   1060               floor=floor,
   1061               ceiling=ceiling,
   1062               color=this_color,
   1063               flip=flip,
   1064               zorder=self.zorder,
   1065               font_name=self.font_name,
   1066               alpha=self.alpha,
   1067               vpad=self.vpad,
   1068               **self.glyph_kwargs)
   1070 # Add glyph to glyph_df
   1071 glyph_df.loc[p, c] = glyph

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\error_handling.py:106, in handle_errors.<locals>.wrapped_func(*args, **kwargs)
    102 mistake = None
    104 try:
    105     # Execute function
--> 106     result = func(*args, **kwargs)
    108     # If running functional test and expect to fail
    109     if should_fail is True:

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\Glyph.py:180, in Glyph.__init__(self, p, c, floor, ceiling, ax, width, vpad, font_name, font_weight, color, edgecolor, edgewidth, dont_stretch_more_than, flip, mirror, zorder, alpha, figsize)
    177     self.ax = ax
    179 # Make patch
--> 180 self._make_patch()

File f:\git\katlas\.venv\lib\site-packages\logomaker\src\Glyph.py:320, in Glyph._make_patch(self)
    312 self.patch = PathPatch(char_path,
    313                        facecolor=self.color,
    314                        zorder=self.zorder,
    315                        alpha=self.alpha,
    316                        edgecolor=self.edgecolor,
    317                        linewidth=self.edgewidth)
    319 # add patch to axes
--> 320 self.ax.add_patch(self.patch)

File f:\git\katlas\.venv\lib\site-packages\matplotlib\axes\_base.py:2414, in _AxesBase.add_patch(self, p)
   2412 if p.get_clip_path() is None:
   2413     p.set_clip_path(self.patch)
-> 2414 self._update_patch_limits(p)
   2415 self._children.append(p)
   2416 p._remove_method = self._children.remove

File f:\git\katlas\.venv\lib\site-packages\matplotlib\axes\_base.py:2436, in _AxesBase._update_patch_limits(self, patch)
   2433 # Get all vertices on the path
   2434 # Loop through each segment to get extrema for Bezier curve sections
   2435 vertices = []
-> 2436 for curve, code in p.iter_bezier(simplify=False):
   2437     # Get distance along the curve of any extrema
   2438     _, dzeros = curve.axis_aligned_extrema()
   2439     # Calculate vertices of start, end and any extrema in between

File f:\git\katlas\.venv\lib\site-packages\matplotlib\path.py:447, in Path.iter_bezier(self, **kwargs)
    445     yield BezierSegment(np.array([prev_vert, verts])), code
    446 elif code == Path.CURVE3:
--> 447     yield BezierSegment(np.array([prev_vert, verts[:2],
    448                                   verts[2:]])), code
    449 elif code == Path.CURVE4:
    450     yield BezierSegment(np.array([prev_vert, verts[:2],
    451                                   verts[2:4], verts[4:]])), code

File f:\git\katlas\.venv\lib\site-packages\matplotlib\bezier.py:202, in BezierSegment.__init__(self, control_points)
    200 self._cpoints = np.asarray(control_points)
    201 self._N, self._d = self._cpoints.shape
--> 202 self._orders = np.arange(self._N)
    203 coeff = [math.factorial(self._N - 1)
    204          // (math.factorial(i) * math.factorial(self._N - 1 - i))
    205          for i in range(self._N)]
    206 self._px = (self._cpoints.T * coeff).T

KeyboardInterrupt: 

Extract sequence

df['position']=df['site'].str[1:].astype(int)
df['site_seq'] = extract_site_seq(df,
                                  seq_col='substrate_phosphoseq',
                                  position_col='position',
                                  length=20)
100%|██████████| 187066/187066 [00:06<00:00, 29995.39it/s]
df_k = df[df.kinase_uniprot=='P49841']
len(df_k)
691
pssm = get_prob(df_k,'site_seq')
pssm.head()
Position -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
aa
P 0.069940 0.077381 0.062407 0.084695 0.082840 0.072378 0.087149 0.081121 0.077827 0.071848 0.101025 0.074671 0.084672 0.106414 0.112082 0.094340 0.079826 0.133333 0.150507 0.131693 0.0 0.420290 0.141194 0.094891 0.049708 0.187408 0.082111 0.077266 0.064662 0.106928 0.074242 0.077863 0.078462 0.070878 0.095164 0.092476 0.074132 0.090190 0.071542 0.099359 0.078905
G 0.087798 0.087798 0.069837 0.065379 0.076923 0.054653 0.088626 0.095870 0.064611 0.068915 0.070278 0.087848 0.058394 0.065598 0.066958 0.063861 0.068215 0.101449 0.076700 0.105644 0.0 0.075362 0.065502 0.086131 0.046784 0.054173 0.060117 0.083210 0.084211 0.058735 0.063636 0.083969 0.084615 0.075501 0.076443 0.081505 0.067823 0.060127 0.077901 0.088141 0.066023
A 0.101190 0.095238 0.071322 0.075780 0.073964 0.103397 0.073855 0.063422 0.076358 0.085044 0.073206 0.068814 0.084672 0.080175 0.064047 0.065312 0.047896 0.069565 0.085384 0.073806 0.0 0.039130 0.101892 0.061314 0.062865 0.074671 0.099707 0.078752 0.066165 0.060241 0.083333 0.044275 0.067692 0.078582 0.068643 0.068966 0.085174 0.090190 0.073132 0.054487 0.077295
C 0.022321 0.007440 0.011887 0.014859 0.010355 0.008863 0.007386 0.014749 0.019090 0.008798 0.010249 0.011713 0.018978 0.014577 0.020378 0.011611 0.007257 0.005797 0.024602 0.014472 0.0 0.007246 0.030568 0.027737 0.004386 0.005857 0.024927 0.011887 0.012030 0.015060 0.010606 0.015267 0.023077 0.013867 0.018721 0.014107 0.009464 0.011076 0.033386 0.004808 0.008052
S 0.090774 0.071429 0.086181 0.054978 0.079882 0.064993 0.085672 0.063422 0.082232 0.071848 0.077599 0.073206 0.083212 0.069971 0.071325 0.091437 0.074020 0.039130 0.062229 0.078148 0.0 0.027536 0.062591 0.103650 0.090643 0.057101 0.087977 0.098068 0.093233 0.057229 0.095455 0.083969 0.078462 0.070878 0.070203 0.070533 0.078864 0.060127 0.085851 0.086538 0.095008
plot_logo(pssm,'kinase',figsize=(12,1))

plot_logo_heatmap(pssm,'kinase',figsize=(14,8))

Kmeans

from sklearn.cluster import KMeans
onehot = onehot_encode(df['site_seq'])
df_k=df[df.kinase_uniprot=='P00519'].copy()
onehot_k = onehot[df.kinase_uniprot=='P00519'].copy()
def determine_clusters_elbow(encoded_data, max_clusters=40):
            
    wcss = []
    for i in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(encoded_data)
        wcss.append(kmeans.inertia_)
    
    # Plot the Elbow graph
    plt.figure(figsize=(5, 3))
    plt.plot(range(1, max_clusters + 1), wcss)
    plt.title(f'Elbow Method (n={len(encoded_data)})')
    plt.xlabel('# Clusters')
    plt.ylabel('WCSS')
determine_clusters_elbow(onehot_k)

Overall cluster

len(df.kinase_uniprot.unique())
455
kmeans = KMeans(n_clusters=200, random_state=42)
cluster_labels = kmeans.fit_predict(onehot)
df['kmeans_overall'] = cluster_labels
df_k=df[df.kinase_uniprot=='P31749'].copy()
onehot_k = onehot[df.kinase_uniprot=='P31749'].copy()
df_k.kmeans_overall.value_counts()
kmeans_overall
8      252
171    126
7       48
59      27
87      18
      ... 
142      1
61       1
28       1
176      1
89       1
Name: count, Length: 68, dtype: int64
for cluster_id,counts in df_k.kmeans_overall.value_counts().items():

    if counts>15:
        df_k_cluster = df_k[df_k.kmeans_overall==cluster_id]
        pssm = get_prob(df_k_cluster,'site_seq')
        if (pssm==0).sum().sum()/(pssm.shape[0]*pssm.shape[1])<0.5: # filter out low quality pssm
            # plot_logo_heatmap(pssm,title='kinase',figsize=(14,8))
            plot_logo(pssm,title=f'kinase (n={counts})',figsize=(14,1))
            plt.show()
            plt.close()

kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(onehot_k)
df_k['cluster'] = cluster_labels
cluster_counts = df_k.cluster.value_counts()
cluster_counts
cluster
9    523
5    414
1    399
4    113
8     76
7     76
6     44
2     25
0     16
3     16
Name: count, dtype: int64
pssm = get_prob(df_k,'site_seq')
plot_logo_heatmap(pssm,'kinase',figsize=(14,8))

for cluster_id,counts in df_k.cluster.value_counts().items():
    
    df_k_cluster = df_k[df_k.cluster==cluster_id]
    pssm = get_prob(df_k_cluster,'site_seq')
    if (pssm==0).sum().sum()/(pssm.shape[0]*pssm.shape[1])<0.4: # filter out low quality pssm
        # plot_logo_heatmap(pssm,title='kinase',figsize=(14,8))
        plot_logo(pssm,title=f'kinase (n={counts})',figsize=(14,1))
        plt.show()
        plt.close()
    # break