import pandas as pd
import numpy as np
from katlas.data import *
from katlas.plot import *
from katlas.pssm import *
from katlas.feature import *
from matplotlib import pyplot as plt
import seaborn as sns
import math
from sklearn.cluster import KMeans
Plot heatmap and logo of CDDM
Setup
= Data.get_ks_dataset() df
df.head()
kin_sub_site | kinase_uniprot | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | kinase_on_tree | kinase_genes | kinase_group | kinase_family | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O00141_A4FU28_S140 | O00141 | A4FU28 | S140 | Sugiyama | CTAGE9 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 140 | AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH | A4FU28_S140 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 22 |
1 | O00141_O00141_S252 | O00141 | O00141 | S252 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 252 | SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ | O00141_S252 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
2 | O00141_O00141_S255 | O00141 | O00141 | S255 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 255 | HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD | O00141_S255 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
3 | O00141_O00141_S397 | O00141 | O00141 | S397 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 397 | sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA | O00141_S397 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
4 | O00141_O00141_S404 | O00141 | O00141 | S404 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 404 | HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA | O00141_S404 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
df.shape
(187066, 19)
logo heatmap
'kinase_uniprot_gene'] = df['kinase_uniprot']+'_'+df['kinase_genes'].str.split(' ').str[0] df[
= df.kinase_uniprot_gene.value_counts() cnt
= cnt[cnt>20] cnt20
cnt20
kinase_uniprot_gene
P12931_SRC 2429
P29320_EPHA3 1935
P07332_FES 1903
Q16288_NTRK3 1824
Q9UM73_ALK 1794
...
Q9BUB5_MKNK1 24
Q9BWU1_CDK19 23
Q12852_MAP3K12 22
P37173_TGFBR2 21
Q09013_DMPK 21
Name: count, Length: 352, dtype: int64
len(cnt20)
352
def convert_source(x):
if x == "Sugiyama":
return x
elif 'Sugiyama' in x and '|' in x:
return 'Both'
elif 'Sugiyama' not in x:
return 'Non-Sugiyama'
'source_combine'] = df.source.apply(convert_source) df[
def plot_hist_num_kin(df_k):
"Plot histogram of num kin grouped by source_combine."
= sns.displot(
g
df_k,="num_kin",
x="source_combine",
col=100,
bins=1,
col_wrap=2.0,
height=4,
aspect={'sharex': False, 'sharey': False}
facet_kws
)
"Number of Kinases per Site", "Count")
g.set_axis_labels(
# Customize titles
for ax, source in zip(g.axes.flatten(), g.col_names):
= df_k[df_k['source_combine'] == source].shape[0]
count f"{source} (n={count:,})")
ax.set_title(
"Histogram of # Kinases per Substrate Site")
g.figure.suptitle(
# Adjust layout to make room for suptitle
plt.tight_layout()
def save_show(path=None, # image path, e.g., img.svg, if not None, will save, else plt.show()
):"Show plot or save path"
='tight', pad_inches=0.05) if path is not None else plt.show()
plt.savefig(path,bbox_inches plt.close()
def plot_hist_cddm(df_k,path=None):
plot_hist_num_kin(df_k)
save_show(path)
# plot_hist_cddm(df_k)
def plot_cnt_cddm(df_k,path=None):
"Plot source combine counts via bar graph."
= df_k.source_combine.value_counts()
source_cnt
plot_cnt(source_cnt)'# Kinase-Substrate Pairs per Source',pad=20)
plt.title( save_show(path)
# plot_cnt_cddm(df_k)
# set_sns()
# onehot = onehot_encode(df_k.site_seq)
def kmeans(onehot,n=2,seed=42):
= KMeans(n_clusters=n, random_state=seed,n_init='auto')
kmeans return kmeans.fit_predict(onehot)
def filter_range_columns(df,low=-10,high=10):
= df.columns.str[:-1].astype(int)
positions = (positions >= low) & (positions <= high)
mask return df.loc[:,mask]
# positions = onehot.columns.str[:-1].astype(int)
# mask = (positions >= -10) & (positions <= 10)
# onehot = onehot.loc[:,mask]
# df_k['cluster'] = kmeans(onehot,n=10,seed=42)
# pssms = get_cluster_pssms(df_k,'cluster',valid_thr=0.5)
# kmeans_cnt = df_k.cluster.value_counts()
def plot_logos(pssms_df, count_dict=None, path=None):
"""
Plot all logos from a dataframe of flattened PSSMs as subplots in a single figure.
Save the full figure as an SVG.
"""
= len(pssms_df)
n =0.7
hspace# 14 is width, 1 is height for each logo
= plt.subplots(nrows=n, figsize=(14, n * (1+hspace)),gridspec_kw={'hspace': hspace+0.1})
fig, axes
if n == 1:
= [axes] # ensure axes is iterable
axes
for ax, idx in zip(axes, pssms_df.index):
= recover_pssm(pssms_df.loc[idx])
pssm if count_dict is not None:
=f'Motif {idx} (n={count_dict[idx]:,})',ax=ax)
plot_logo(pssm, titleelse:
=f'Motif {idx}',ax=ax)
plot_logo(pssm, title=path) save_show(path
# plot_logos(pssms,kmeans_cnt)
def get_onehot_add_cluster(df_k,n=10):
= df_k.copy()
df_k = onehot_encode(df_k.site_seq)
onehot = filter_range_columns(onehot)
onehot_10 'cluster'] = kmeans(onehot_10,n=n,seed=42)
df_k[= df_k.reset_index(drop=True)
df_k return df_k,onehot_10
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
def get_kmeans_logos(df_k,path=None):
= get_cluster_pssms(df_k,'cluster',valid_thr=0.5)
pssms = df_k.cluster.value_counts()
kmeans_cnt =path) plot_logos(pssms,kmeans_cnt,path
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
# get_kmeans_logos(df_k)
def plot_onehot(onehot_10,hue,path=None):
'pca',seed=42,complexity=30,hue=hue,legend=True,s=8)
plot_cluster(onehot_10,'PCA of One-Hot Encoded Site Sequences (Position -10 to +10)')
plt.title( save_show(path)
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
# plot_onehot(onehot_10,df_k.cluster)
# get_kmeans_logos(df_k)
# plot_cluster(onehot_10,'pca',seed=42,complexity=30,hue=df_k.cluster,legend=True,s=8)
# plt.title('PCA of One-Hot Encoded Sequences (-10 to +10')
# plot_logo_heatmap(pssm_df,title=f'{k} (n={len(df_k):,})',figsize=(17,10))
from katlas.utils import *
'acceptor']=df.site.str[0] df[
# for site_type in ['S','T','Y']:
# df_sty = df[df.kinase_uniprot_gene.upper()==site_type].copy()
# pssm_sty = get_prob(df_sty,'site_seq')
# plot_logo_heatmap(pssm_sty,title=f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
# save_show()
pssm_sty
def get_pssm_LO(pssm_df,
# S, T or Y
site_type,
):"Get log2 (freq pssm/background pssm) --> log odds."
= Data.get_ks_background()
bg_pssms = bg_pssms.loc[f'ks_{site_type.upper()}']
flat_bg = recover_pssm(flat_bg)
pssm_bg = ((pssm_df+1e-5)/(pssm_bg+1e-5)).dropna(axis=0,how='all').dropna(axis=1, how='all')
pssm_odds # return pssm_odds
# make sure all columns and index matched
assert pssm_odds.shape == pssm_df.shape
return np.log2(pssm_odds).replace([np.inf, -np.inf], 0).fillna(0)
= get_pssm_LO(pssm_sty,'S') pssm_LO
="Log-Odds Score (bits)") plot_logo_raw(pssm_LO,ytitle
def plot_logo_heatmap_enrich(pssm_df, # column is position, index is aa
='Motif',
title=(17,10),
figsize=False
include_zero
):
"""Plot logo and heatmap vertically"""
= plt.figure(figsize=figsize)
fig = fig.add_gridspec(2, 2, height_ratios=[1, 5], width_ratios=[4, 1], hspace=0.11, wspace=0)
gs
= fig.add_subplot(gs[0, 0])
ax_logo
="Log-Odds (bits)",ax=ax_logo,title=title)
plot_logo_raw(pssm_df,ytitle
= fig.add_subplot(gs[1, :])
ax_heatmap =ax_heatmap,position_label=False,include_zero=include_zero) plot_heatmap(pssm_df,ax
plot_logo_heatmap_enrich(pssm_LO)
import logomaker
for site_type in ['S','T','Y']:
= df_k[df_k.acceptor.str.upper()==site_type].copy()
df_sty = get_prob(df_sty,'site_seq')
pssm_sty =f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
plot_logo_heatmap(pssm_sty,title
save_show()break
for k in cnt20[40:].index:
= df[df.kinase_uniprot_gene==k].copy()
df_k # df_k = df_k[df_k.num_kin>10]
= get_prob(df_k,'site_seq')
pssm_df
# logo heatmap
=f'{k} (n={len(df_k):,})',figsize=(17,10))
plot_logo_heatmap(pssm_df,title=get_path('~/img/cddm/logo_heatmap',f'{k}.svg')
path_logo_heatmap# save_show(path_logo_heatmap)
save_show()
# plot S, T and Y motif
=df.acceptor.value_counts()
sty_cnt for site_type in sty_cnt.index:
= df_k[df_k.acceptor.str.upper()==site_type].copy()
df_sty
# freq map
= get_prob(df_sty,'site_seq')
pssm_sty =f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
plot_logo_heatmap(pssm_sty,title
save_show()
# for log-odds
= get_pssm_LO(pssm_sty,site_type)
pssm_LO =f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
plot_logo_heatmap_enrich(pssm_LO,title
save_show()
# count of source
=get_path('~/img/cddm/cnt',f'{k}.svg')
path_cnt# plot_cnt_cddm(df_k,path_cnt)
plot_cnt_cddm(df_k)
# histogram of num kin
=get_path('~/img/cddm/hist',f'{k}.svg')
path_hist# plot_hist_cddm(df_k,path_hist)
plot_hist_cddm(df_k)
# # onehot of sequences
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
# path_pca= get_path('~/img/cddm/pca',f'{k}.svg')
# # plot_onehot(onehot_10,df_k.cluster,path_pca)
# plot_onehot(onehot_10,df_k.cluster)
# path_klogo= get_path('~/img/cddm/klogo',f'{k}.svg')
# # get_kmeans_logos(df_k,path_klogo)
# get_kmeans_logos(df_k)
break
# df_k.source.value_counts()
= np.array([a.mean() for a in arrays])
means = 1 - (means - means.min()) / (means.max() - means.min()) im_sw
inverse mean
def get_im(values): return
import matplotlib.pyplot as plt
import seaborn as sns
from katlas.core import *
from katlas.plot import *
from scipy.stats import spearmanr, pearsonr
import os
from PIL import Image
from tqdm import tqdm
def plot_count(df_k,title):
# Get value counts
= df_k.source.replace({'pplus':'PP','large_scale':'LS'}).value_counts()
source_counts =(7,1))
plt.figure(figsize
='barh', stacked=True, color=['darkred', 'darkblue'])
source_counts.plot(kind# Annotate with the actual values
for index, value in enumerate(source_counts):
str(value),fontsize=10,rotation=-90, va='center')
plt.text(value, index,
'Count')
plt.xlabel( plt.title(title)
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks") sns.set_style(
Load data
= Data.get_ks_dataset()
df 'SUB'] = df.substrate.str.upper() df[
= Data.get_kinase_info().query('pseudo=="0"') info
# It only contains kinase on the tree
= df.kinase_paper.value_counts() cnt
= info[info.group!="TK"].kinase ST
10:20] df[df.kinase_paper.isin(ST)].kinase_paper.value_counts()[
NEK6 950
PLK1 943
CK2A1 919
P38D 907
DYRK2 907
HGK 902
TTBK1 896
MST3 890
MST1 884
IKKE 880
Name: kinase_paper, dtype: int64
= cnt[cnt>100] cnt
Generate example figures
def plot_heatmap2(matrix, title, figsize=(6,10), label_size=20):
=figsize)
plt.figure(figsize='binary', annot=False,cbar=False)
sns.heatmap(matrix, cmap=label_size)
plt.title(title,fontsize# Set the font size for the tick labels
=label_size)
plt.xticks(fontsize=label_size)
plt.yticks(fontsize'')
plt.xlabel('') plt.ylabel(
= ['SRC','ABL1','ERK2','PKACA'] kinase_list
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks")
sns.set_style(
for k in kinase_list:
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
=[0]),f'{k}',figsize=(6,10))
plot_heatmap2(full.drop(columns
plt.show()
plt.close()
break
# if you want to generate and save all of figures, uncomment below
# plt.savefig(f'fig/{k}.png',bbox_inches='tight', pad_inches=0.3)
# plt.close()
Generate all figures
Uncomment plt.savefig to save figures
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks")
sns.set_style(
for k in tqdm(cnt.index,total=len(cnt)):
= df.query(f'kinase=="{k}"')
df_k
plot_count(df_k,k)# plt.savefig(f'fig/count/{k}.png',bbox_inches='tight', pad_inches=0.1)
# if visualize in jupyter notebook, uncheck the savefig
plt.show()
plt.close()
= df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
get_logo2(full, k)# plt.savefig(f'fig/logo/{k}.png',bbox_inches='tight', pad_inches=0.3)
plt.show()
plt.close()
=[0]),f'{k} (n={len(df_k)})',figsize=(7.5,10))
plot_heatmap(full.drop(columns# plt.savefig(f'fig/heatmap/{k}.png',bbox_inches='tight', pad_inches=0)
plt.show()
plt.close()# break
break
0%| | 0/289 [00:00<?, ?it/s]
0%| | 0/289 [00:02<?, ?it/s]
Combine figures for pdf
def combine_images_vertically(image_paths, output_path):
= [Image.open(image_path).convert('RGBA') for image_path in image_paths]
images
= max(image.width for image in images)
total_width = sum(image.height for image in images)
total_height
= Image.new('RGBA', (total_width, total_height))
combined_image
= 0
y_offset for image in images:
0, y_offset), image)
combined_image.paste(image, (+= image.height
y_offset
combined_image.save(output_path)
Uncomment below to run
# folders = ["fig/count", "fig/logo", "fig/heatmap"]
# for k in tqdm(cnt.index,total=len(cnt)):
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"fig/combine/{k}.png"
# combine_images_vertically(image_paths, output_path)
# # break
Get PSSM data of CDDM
for i,k in enumerate(cnt.index):
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
= full.drop(columns = [0]).reset_index().melt(id_vars=['aa'], value_name=k, var_name='Position')
melt 'substrate']=melt['Position'].astype(str)+ melt['aa']
melt[
= full[0][['s','t','y']].reset_index().rename(columns={0:k})
position_0 'substrate'] = '0'+position_0['aa']
position_0[
if i ==0:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
first else:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
k = pd.concat([first,k],axis=1)
data = data.copy()
first
# break
= data.T
data
= data.index.rename('kinase') data.index
To save
# data.to_csv('supp/CDDM.csv')
# data.to_parquet('ks_main.parquet')
Get specialized CDDM data for all-capital substrates
combine s,t,y to S,T,Y
# List of suffixes
= ['S', 'T', 'Y']
suffixes
for suffix in suffixes:
for i in range(-7, 8): # looping from -7 to 7
if i == 0: # Skip 0
continue
= f"{i}{suffix}" # e.g., -7S
upper_col = f"{i}{suffix.lower()}" # e.g., -7s
lower_col = data[upper_col] + data[lower_col]
data[upper_col] =1,inplace=True) # Drop the lowercase column after combining data.drop(lower_col, axis
str.contains('S')] data.columns[data.columns.
Index(['-7S', '-6S', '-5S', '-4S', '-3S', '-2S', '-1S', '1S', '2S', '3S', '4S',
'5S', '6S', '7S'],
dtype='object', name='substrate')
# make sure the "s" in positions other than 0 is deleted from the columns
str.contains('s')] data.columns[data.columns.
Index(['0s'], dtype='object', name='substrate')
# Make sure very position's sum is 1
str.contains('-7')].sum(1).sort_values() data.loc[:,data.columns.
kinase
DDR2 1.0
NEK11 1.0
MSK1 1.0
TEK 1.0
NIM1 1.0
...
CAMK2G 1.0
PKG2 1.0
MELK 1.0
NEK1 1.0
TLK2 1.0
Length: 289, dtype: float64
= data.rename(columns={'0s':'0S','0t':'0T','0y':'0Y'}) data
= data.index.rename('kinase') data.index
To save
# data.to_parquet('ks_main_upper.parquet')
# data.to_csv('supp/CDDM_upper.csv')
Plot other kinases (mutated, lipid kinase, isoforms)
kinases not on kinome tree
= df.query('on_tree==0').kinase.value_counts()
cnt_other
= cnt_other[cnt_other>100] cnt_other
= cnt_other.index.tolist()+['LYN','ABL1','RET','FGFR3','PDGFRA','ALK',
others 'EGFR','KIT','MET','PKCB','BRAF','PKG1'] # BRAF is less than 100
Uncheck savefig to save figures
for k in tqdm(others,total=len(others)):
= df.query(f'kinase=="{k}"')
df_k
plot_count(df_k,k)# plt.savefig(f'fig_others/count/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0.1)
# if visualize in jupyter notebook, uncheck the savefig
plt.show()
plt.close()
= df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
get_logo2(full,k)# plt.savefig(f'fig_others/logo/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0.3)
plt.show()
plt.close()
=[0]),f'{k} (n={len(df_k)})',figsize=(7.5,10))
plot_heatmap(full.drop(columns# plt.savefig(f'fig_others/heatmap/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0)
plt.show()
plt.close()
break
0%| | 0/36 [00:00<?, ?it/s]
0%| | 0/36 [00:01<?, ?it/s]
Combine the figures for pdf
Uncomment below to run
# folders = ["fig_others/count", "fig_others/logo", "fig_others/heatmap"]
# for k in tqdm(others,total = len(others)):
# k = k.replace("/", "_")
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"fig_others/combine/{k}.png"
# combine_images_vertically(image_paths, output_path)
# # break
Get the PSSMs of other kinases
for i,k in enumerate(others):
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
= full.drop(columns = [0]).reset_index().melt(id_vars=['aa'], value_name=k, var_name='Position')
melt 'substrate']=melt['Position'].astype(str)+ melt['aa']
melt[
= full[0][['s','t','y']].reset_index().rename(columns={0:k})
position_0 'substrate'] = '0'+position_0['aa']
position_0[
if i ==0:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
first else:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
k = pd.concat([first,k],axis=1)
data = data.copy() first
= data.T
data
= data.index.rename('kinase') data.index
To save:
# data.to_csv('supp/CDDM_others.csv')
# data.to_parquet('ks_others.parquet')