import pandas as pd
import numpy as np
from katlas.data import *
from katlas.pssm import *
from katlas.utils import *
from katlas.plot import *
from katlas.feature import *
from matplotlib import pyplot as plt
import seaborn as sns
import math
from sklearn.cluster import KMeans
Plot heatmap and logo of CDDM
Setup
= Data.get_ks_dataset() df
df.head()
kin_sub_site | kinase_uniprot | substrate_uniprot | site | source | substrate_genes | substrate_phosphoseq | position | site_seq | sub_site | substrate_sequence | kinase_on_tree | kinase_genes | kinase_group | kinase_family | kinase_pspa_big | kinase_pspa_small | kinase_coral_ID | num_kin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O00141_A4FU28_S140 | O00141 | A4FU28 | S140 | Sugiyama | CTAGE9 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 140 | AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH | A4FU28_S140 | MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 22 |
1 | O00141_O00141_S252 | O00141 | O00141 | S252 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 252 | SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ | O00141_S252 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
2 | O00141_O00141_S255 | O00141 | O00141 | S255 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 255 | HIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQPYD | O00141_S255 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
3 | O00141_O00141_S397 | O00141 | O00141 | S397 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 397 | sGPNDLRHFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEA | O00141_S397 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
4 | O00141_O00141_S404 | O00141 | O00141 | S404 | Sugiyama | SGK1 SGK | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 404 | HFDPEFTEEPVPNsIGKsPDsVLVTAsVKEAAEAFLGFsYA | O00141_S404 | MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS... | 1 | SGK1 SGK | AGC | SGK | Basophilic | Akt/rock | SGK1 | 1 |
df.shape
(187066, 19)
logo heatmap
'kinase_uniprot_gene'] = df['kinase_uniprot']+'_'+df['kinase_genes'].str.split(' ').str[0]
df[
= df.kinase_uniprot_gene.value_counts() cnt
# cnt = df.kinase_coral_ID.value_counts()
def convert_source(x):
if x == "Sugiyama":
return x
elif 'Sugiyama' in x and '|' in x:
return 'Both'
elif 'Sugiyama' not in x:
return 'Non-Sugiyama'
'source_combine'] = df.source.apply(convert_source) df[
def plot_hist_num_kin(df_k):
"Plot histogram of num kin grouped by source_combine."
= sns.displot(
g
df_k,="num_kin",
x="source_combine",
col=100,
bins=1,
col_wrap=2.0,
height=4,
aspect={'sharex': False, 'sharey': False}
facet_kws
)
"Number of Kinases per Site", "Count")
g.set_axis_labels(
# Customize titles
for ax, source in zip(g.axes.flatten(), g.col_names):
= df_k[df_k['source_combine'] == source].shape[0]
count f"{source} (n={count:,})")
ax.set_title(
"Histogram of # Kinases per Substrate Site")
g.figure.suptitle(
# Adjust layout to make room for suptitle
plt.tight_layout()
def plot_cnt_cddm(df_k):
"Plot source combine counts via bar graph."
= df_k.source_combine.value_counts()
source_cnt
plot_cnt(source_cnt)'# Substrate Sites per Source',pad=20) plt.title(
# plot_cnt_cddm(df_k)
def plot_cnt_acceptor(df_k):
"Plot site type via bar graph."
= df_k.acceptor.value_counts()
acceptor_cnt
plot_cnt(acceptor_cnt)'# Substrate Sites per Phospho-Acceptor Type',pad=20) plt.title(
# plot_cnt_acceptor(df_k)
# set_sns()
# onehot = onehot_encode(df_k.site_seq)
def kmeans(onehot,n=2,seed=42):
= KMeans(n_clusters=n, random_state=seed,n_init='auto')
kmeans return kmeans.fit_predict(onehot)
def filter_range_columns(df,low=-10,high=10):
= df.columns.str[:-1].astype(int)
positions = (positions >= low) & (positions <= high)
mask return df.loc[:,mask]
# positions = onehot.columns.str[:-1].astype(int)
# mask = (positions >= -10) & (positions <= 10)
# onehot = onehot.loc[:,mask]
# df_k['cluster'] = kmeans(onehot,n=10,seed=42)
# pssms = get_cluster_pssms(df_k,'cluster',valid_thr=0.5)
# kmeans_cnt = df_k.cluster.value_counts()
# plot_logos(pssms,kmeans_cnt)
def get_onehot_add_cluster(df_k,n=10):
= df_k.copy()
df_k = onehot_encode(df_k.site_seq)
onehot = filter_range_columns(onehot)
onehot_10 'Cluster'] = kmeans(onehot_10,n=n,seed=42)
df_k[= df_k.reset_index(drop=True)
df_k return df_k,onehot_10
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
def get_kmeans_logos(df_k,cnt_thr=10):
= get_cluster_pssms(df_k,'Cluster',valid_thr=0) # remove valid thr
pssms = df_k['Cluster'].value_counts()
kmeans_cnt
# filter cluster with >=10 counts
= kmeans_cnt[kmeans_cnt >= cnt_thr].index
valid_clusters = pssms.loc[valid_clusters]
filtered_pssms
if not filtered_pssms.empty: plot_logos(filtered_pssms,kmeans_cnt)
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
# get_kmeans_logos(df_k)
def plot_onehot(onehot_10,hue):
'pca',seed=42,complexity=30,hue=hue,legend=True,s=8)
plot_cluster(onehot_10,f'PCA of One-Hot Encoded\n Substrate Site Sequences (n={len(onehot_10):,})') plt.title(
# df_k,onehot_10 = get_onehot_add_cluster(df_k,n=10)
# plot_onehot(onehot_10,df_k['Cluster'])
# get_kmeans_logos(df_k)
# plot_cluster(onehot_10,'pca',seed=42,complexity=30,hue=df_k.cluster,legend=True,s=8)
# plt.title('PCA of One-Hot Encoded Sequences (-10 to +10')
# plot_logo_heatmap(pssm_df,title=f'{k} (n={len(df_k):,})',figsize=(17,10))
# for site_type in ['S','T','Y']:
# df_sty = df[df.kinase_uniprot_gene.upper()==site_type].copy()
# pssm_sty = get_prob(df_sty,'site_seq')
# plot_logo_heatmap(pssm_sty,title=f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
# save_show()
# pssm_LO = get_pssm_LO(pssm_sty,'S')
# plot_logo_raw(pssm_LO,ytitle="Log-Odds Score (bits)")
# plot_logo_heatmap_enrich(pssm_LO)
# import logomaker
# for site_type in ['S','T','Y']:
# df_sty = df_k[df_k.acceptor.str.upper()==site_type].copy()
# pssm_sty = get_prob(df_sty,'site_seq')
# plot_logo_heatmap(pssm_sty,title=f'{site_type} sites (n={len(df_sty):,})',figsize=(17,10))
# save_show()
# break
Iterate
'acceptor']=df.site.str[0] df[
=False SHOW
= cnt[cnt>=40] filter_cnt
list(filter_cnt.index).index('O43353_RIPK2')
301
for k in filter_cnt.index[301:]: break
k
'O43353_RIPK2'
= cnt[cnt>=40]
filter_cnt for k in filter_cnt.index[301:]:
# for k in ['P00519_ABL1']:
= df[df.kinase_uniprot_gene==k].copy()
df_k # df_k = df_k[df_k.num_kin>10]
# Freq PSSM
= get_prob(df_k,'site_seq')
pssm_df =f'{k} (n={len(df_k):,})',figsize=(17,10))
plot_logo_heatmap(pssm_df,title=get_path('~/img/cddm/pssm_freq',f'{k}.svg')
path1=SHOW)
save_show(path1,show_only
# Log odds PSSM
= get_pssm_LO(pssm_df,'STY')
pssm_LO =f'{k} (n={len(df_k):,})',figsize=(17,10))
plot_logo_heatmap_LO(pssm_LO,title=get_path('~/img/cddm/pssm_LO',f'{k}.svg')
path2=SHOW)
save_show(path2,show_only
# plot S, T and Y motif
=df_k.acceptor.value_counts()
sty_cnt = sty_cnt[(sty_cnt/len(df_k) > 0.08) & (sty_cnt>=10)].index # Skip this site_type if it has less than 8% or less than 10 count
acceptors for site_type in acceptors:
= df_k[df_k.acceptor.str.upper()==site_type].copy()
df_sty
# freq map
= get_prob(df_sty,'site_seq')
pssm_sty =f'{k}: {site_type} sites (n={len(df_sty):,})',figsize=(17,10))
plot_logo_heatmap(pssm_sty,title=get_path('~/img/cddm/pssm_freq_acceptor',f'{k}_{site_type}.svg')
path_3=SHOW)
save_show(path_3,show_only
# for log-odds
= get_pssm_LO(pssm_sty,site_type)
pssm_LO =site_type,title=f'{k}: {site_type} sites (n={len(df_sty):,})',figsize=(17,10))
plot_logo_heatmap_LO(pssm_LO,acceptor=get_path('~/img/cddm/pssm_LO_acceptor',f'{k}_{site_type}.svg')
path_4=SHOW)
save_show(path_4,show_only
=get_path('~/img/cddm/bar_acceptor',f'{k}.svg')
path5
plot_cnt_acceptor(df_k)=SHOW)
save_show(path5,show_only
# count of source
=get_path('~/img/cddm/bar_source',f'{k}.svg')
path6
plot_cnt_cddm(df_k)=SHOW)
save_show(path6,show_only
# histogram of num kin
=get_path('~/img/cddm/hist_num_kin',f'{k}.svg')
path7
plot_hist_num_kin(df_k)=SHOW)
save_show(path7,show_only
# onehot of sequences
# filter out noise acceptor for stratification
= df_k[df_k.acceptor.isin(acceptors)]
df_k = get_onehot_add_cluster(df_k,n=10)
df_k,onehot_10
= get_path('~/img/cddm/pca_onehot',f'{k}.svg')
path8
plot_onehot(onehot_10,df_k.Cluster)=SHOW)
save_show(path8,show_only
= get_path('~/img/cddm/motif_kmeans',f'{k}.svg')
path9
get_kmeans_logos(df_k)=SHOW)
save_show(path9,show_only
= get_path('~/img/cddm/df_k',f'{k}.parquet')
path00
df_k.to_parquet(path00)
plt.close()
# break
100%|██████████| 10/10 [00:00<00:00, 179.35it/s]
100%|██████████| 10/10 [00:00<00:00, 519.73it/s]
100%|██████████| 10/10 [00:00<00:00, 177.82it/s]
100%|██████████| 10/10 [00:00<00:00, 179.58it/s]
100%|██████████| 10/10 [00:00<00:00, 169.36it/s]
100%|██████████| 10/10 [00:00<00:00, 517.55it/s]
100%|██████████| 10/10 [00:00<00:00, 198.53it/s]
100%|██████████| 10/10 [00:00<00:00, 236.72it/s]
100%|██████████| 10/10 [00:00<00:00, 178.37it/s]
100%|██████████| 10/10 [00:00<00:00, 132.38it/s]
100%|██████████| 10/10 [00:00<00:00, 510.61it/s]
100%|██████████| 10/10 [00:00<00:00, 127.77it/s]
100%|██████████| 10/10 [00:00<00:00, 455.63it/s]
100%|██████████| 10/10 [00:00<00:00, 357.39it/s]
100%|██████████| 10/10 [00:00<00:00, 347.38it/s]
100%|██████████| 10/10 [00:00<00:00, 109798.53it/s]
100%|██████████| 10/10 [00:00<00:00, 339.81it/s]
100%|██████████| 10/10 [00:00<00:00, 460.57it/s]
100%|██████████| 10/10 [00:00<00:00, 513.30it/s]
100%|██████████| 10/10 [00:00<00:00, 348.73it/s]
100%|██████████| 10/10 [00:00<00:00, 448.85it/s]
100%|██████████| 10/10 [00:00<00:00, 137518.16it/s]
100%|██████████| 10/10 [00:00<00:00, 87930.90it/s]
100%|██████████| 10/10 [00:00<00:00, 155922.08it/s]
100%|██████████| 10/10 [00:00<00:00, 174037.51it/s]
100%|██████████| 10/10 [00:00<00:00, 420.22it/s]
100%|██████████| 10/10 [00:00<00:00, 63937.56it/s]
100%|██████████| 10/10 [00:00<00:00, 162569.92it/s]
100%|██████████| 10/10 [00:00<00:00, 197.13it/s]
100%|██████████| 10/10 [00:00<00:00, 145635.56it/s]
100%|██████████| 10/10 [00:00<00:00, 112447.83it/s]
100%|██████████| 10/10 [00:00<00:00, 154202.35it/s]
100%|██████████| 10/10 [00:00<00:00, 149263.49it/s]
100%|██████████| 10/10 [00:00<00:00, 135737.99it/s]
= set(df_k.substrate_genes.str.split(' ').str[0]) genes
= pd.read_excel('raw/idmapping_kinase_info_2025_05_27.xlsx') path_ref
k
'P00519_ABL1'
'uniprot_gene'] = path_ref.uniprot+'_'+path_ref['Gene Names (primary)'] path_ref[
= path_ref[path_ref.uniprot_gene==k].Reactome.str.split(';').iloc[0] idx_path
path_df_raw
stId | dbId | name | llp | inDisease | species.dbId | species.taxId | species.name | entities.resource | entities.total | entities.found | entities.ratio | entities.pValue | entities.fdr | entities.exp | reactions.resource | reactions.total | reactions.found | reactions.ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | R-HSA-72649 | 72649 | Translation initiation complex formation | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 62 | 39 | 0.003861 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 2 | 2 | 0.000129 |
1 | R-HSA-72695 | 72695 | Formation of the ternary complex, and subseque... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 54 | 33 | 0.003362 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 3 | 3 | 0.000193 |
2 | R-HSA-72702 | 72702 | Ribosomal scanning and start codon recognition | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 64 | 39 | 0.003985 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 2 | 2 | 0.000129 |
3 | R-HSA-72662 | 72662 | Activation of the mRNA upon binding of the cap... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 66 | 39 | 0.004110 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 6 | 6 | 0.000386 |
4 | R-HSA-72706 | 72706 | GTP hydrolysis and joining of the 60S ribosoma... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 120 | 66 | 0.007472 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 3 | 3 | 0.000193 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1754 | R-HSA-198933 | 198933 | Immunoregulatory interactions between a Lympho... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 249 | 1 | 0.015504 | 1.000000e+00 | 1.000000e+00 | [] | TOTAL | 44 | 1 | 0.002829 |
1755 | R-HSA-373076 | 373076 | Class A/1 (Rhodopsin-like receptors) | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 415 | 2 | 0.025841 | 1.000000e+00 | 1.000000e+00 | [] | TOTAL | 187 | 3 | 0.012023 |
1756 | R-HSA-425407 | 425407 | SLC-mediated transmembrane transport | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 424 | 2 | 0.026401 | 1.000000e+00 | 1.000000e+00 | [] | TOTAL | 200 | 2 | 0.012858 |
1757 | R-HSA-500792 | 500792 | GPCR ligand binding | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 610 | 4 | 0.037983 | 1.000000e+00 | 1.000000e+00 | [] | TOTAL | 219 | 4 | 0.014080 |
1758 | R-HSA-9709957 | 9709957 | Sensory Perception | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 1258 | 10 | 0.078331 | 1.000000e+00 | 1.000000e+00 | [] | TOTAL | 138 | 2 | 0.008872 |
1759 rows × 19 columns
= path_df_raw[path_df_raw.stId.isin(idx_path)] ref_paths
ref_paths.name
80 RUNX1 regulates transcription of genes involve...
129 RHO GTPases Activate WASPs and WAVEs
157 Recruitment and ATM-mediated phosphorylation o...
306 MLL4 and MLL3 complexes regulate expression of...
423 Factors involved in megakaryocyte development ...
424 Cyclin D associated events in G1
473 Turbulent (oscillatory, disturbed) flow shear ...
485 Regulation of actin dynamics for phagocytic cu...
526 HDR through Single Strand Annealing (SSA)
629 FCGR3A-mediated phagocytosis
662 Role of ABL in ROBO-SLIT signaling
922 Myogenesis
1193 RUNX2 regulates osteoblast differentiation
Name: name, dtype: object
idx_path
['R-HSA-2029482',
'R-HSA-428890',
'R-HSA-525793',
'R-HSA-5663213',
'R-HSA-5685938',
'R-HSA-5693565',
'R-HSA-69231',
'R-HSA-8939236',
'R-HSA-8940973',
'R-HSA-9664422',
'R-HSA-983231',
'R-HSA-9841922',
'R-HSA-9860927',
'']
= get_reactome(genes) path_df
path_df
name | fdr | -log10_fdr | |
---|---|---|---|
0 | Translation initiation complex formation | 4.884981e-15 | 14.311 |
1 | Formation of the ternary complex, and subseque... | 4.884981e-15 | 14.311 |
2 | Ribosomal scanning and start codon recognition | 4.884981e-15 | 14.311 |
3 | Activation of the mRNA upon binding of the cap... | 4.884981e-15 | 14.311 |
4 | GTP hydrolysis and joining of the 60S ribosoma... | 4.884981e-15 | 14.311 |
... | ... | ... | ... |
1754 | Immunoregulatory interactions between a Lympho... | 1.000000e+00 | 0.000 |
1755 | Class A/1 (Rhodopsin-like receptors) | 1.000000e+00 | 0.000 |
1756 | SLC-mediated transmembrane transport | 1.000000e+00 | 0.000 |
1757 | GPCR ligand binding | 1.000000e+00 | 0.000 |
1758 | Sensory Perception | 1.000000e+00 | -0.000 |
1759 rows × 3 columns
ref_paths.name.shape
(13,)
=50,path_list=ref_paths.name) plot_path(path_df,top_n
path_df_raw[path_df_raw.stId.isin(idx_path)]
def get_reactome(gene_list,
='entities.fdr', # column of p value or fdr (e.g., entities.pValue)
col=None, # list of reactome idx
ref_list
):"Reactome pathway analysis for a given gene set; returns formated output in dataframe with additional -log10(p)"
= get_reactome_raw(gene_list).copy()
out = col.split('.')[1]
col_rename = out[['stId','name',col]].rename(columns={col:col_rename,'stId':'ID'})
out 'significant']=(out[col_rename]<=0.05).astype(int)
out[f'-log10({col_rename})'] = -np.log10(out[col_rename]).round(3)
out[f'rank']=out[col_rename].rank().astype(int)
out[if ref_list: out['in_ref']=out.ID.isin(ref_list).astype(int)
return out
= get_reactome(genes,ref_list=idx_path) out
==1] out[out.in_ref
ID | name | fdr | significant | -log10(fdr) | rank | in_ref | |
---|---|---|---|---|---|---|---|
80 | R-HSA-8939236 | RUNX1 regulates transcription of genes involve... | 1.253766e-08 | 1 | 7.902 | 81 | 1 |
129 | R-HSA-5663213 | RHO GTPases Activate WASPs and WAVEs | 3.404711e-06 | 1 | 5.468 | 130 | 1 |
157 | R-HSA-5693565 | Recruitment and ATM-mediated phosphorylation o... | 1.482829e-05 | 1 | 4.829 | 156 | 1 |
306 | R-HSA-9841922 | MLL4 and MLL3 complexes regulate expression of... | 6.713774e-04 | 1 | 3.173 | 304 | 1 |
423 | R-HSA-983231 | Factors involved in megakaryocyte development ... | 4.163072e-03 | 1 | 2.381 | 424 | 1 |
424 | R-HSA-69231 | Cyclin D associated events in G1 | 4.191536e-03 | 1 | 2.378 | 426 | 1 |
473 | R-HSA-9860927 | Turbulent (oscillatory, disturbed) flow shear ... | 7.981719e-03 | 1 | 2.098 | 474 | 1 |
485 | R-HSA-2029482 | Regulation of actin dynamics for phagocytic cu... | 9.889877e-03 | 1 | 2.005 | 486 | 1 |
526 | R-HSA-5685938 | HDR through Single Strand Annealing (SSA) | 1.734879e-02 | 1 | 1.761 | 527 | 1 |
629 | R-HSA-9664422 | FCGR3A-mediated phagocytosis | 4.756334e-02 | 1 | 1.323 | 630 | 1 |
662 | R-HSA-428890 | Role of ABL in ROBO-SLIT signaling | 6.281753e-02 | 0 | 1.202 | 663 | 1 |
922 | R-HSA-525793 | Myogenesis | 1.742057e-01 | 0 | 0.759 | 923 | 1 |
1193 | R-HSA-8940973 | RUNX2 regulates osteoblast differentiation | 4.049378e-01 | 0 | 0.393 | 1194 | 1 |
out.fdr.rank()
0 22.0
1 22.0
2 22.0
3 22.0
4 22.0
...
1754 1755.0
1755 1756.0
1756 1757.0
1757 1758.0
1758 1759.0
Name: fdr, Length: 1759, dtype: float64
def plot_path(df,col='-log10(fdr)', top_n=10,max_label_length=80):
"Plot the bar graph of pathways from get_reactome function."
# Extract the data and reverse it
= df.head(top_n).set_index('name')[col].iloc[::-1]
data
# Truncate labels if they are too long
= [label[:max_label_length] + '...' if len(label) > max_label_length else label for label in data.index]
truncated_labels = truncated_labels
data.index
# Calculate the required width: base width + additional width for the longest label
= 2
base_width = max(data.index, key=len)
max_label_length = len(max_label_length) * 0.1 # Adjust scaling factor as needed
additional_width
= (base_width + additional_width, 3*top_n/10) # Adjust height as necessary
figsize
=figsize)
data.plot.barh(figsize'')
plt.ylabel(
plt.xlabel(col) plt.tight_layout()
import matplotlib.pyplot as plt
def plot_path(df, col='-log10(fdr)', top_n=10, max_label_length=80, path_list=None):
"""
Plot the bar graph of pathways from get_reactome function.
Highlights pathways in path_list with a different color (dark red).
"""
# Extract and reverse data
= df.head(top_n).set_index('name')[col].iloc[::-1]
data
# Save original full names to match against path_list
= data.index.tolist()
full_names
# Truncate labels if too long
= [label[:max_label_length] + '...' if len(label) > max_label_length else label for label in full_names]
truncated_labels = truncated_labels
data.index
# Determine colors
if path_list is not None:
= set(path_list)
path_set = ['darkred' if name in path_set else 'darkblue' for name in full_names]
colors else:
= 'darkblue'
colors
# Calculate dynamic figure width
= 2
base_width = max(data.index, key=len)
max_label = len(max_label) * 0.1
additional_width = (base_width + additional_width, 3 * top_n / 10)
figsize
# Plot
= data.plot.barh(figsize=figsize, color=colors)
ax '')
plt.ylabel(
plt.xlabel(col)
plt.tight_layout()return ax
= path_df_raw[path_df_raw['entities.fdr']<0.05] path_fdr
path_fdr
stId | dbId | name | llp | inDisease | species.dbId | species.taxId | species.name | entities.resource | entities.total | entities.found | entities.ratio | entities.pValue | entities.fdr | entities.exp | reactions.resource | reactions.total | reactions.found | reactions.ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | R-HSA-72649 | 72649 | Translation initiation complex formation | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 62 | 39 | 0.003861 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 2 | 2 | 0.000129 |
1 | R-HSA-72695 | 72695 | Formation of the ternary complex, and subseque... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 54 | 33 | 0.003362 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 3 | 3 | 0.000193 |
2 | R-HSA-72702 | 72702 | Ribosomal scanning and start codon recognition | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 64 | 39 | 0.003985 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 2 | 2 | 0.000129 |
3 | R-HSA-72662 | 72662 | Activation of the mRNA upon binding of the cap... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 66 | 39 | 0.004110 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 6 | 6 | 0.000386 |
4 | R-HSA-72706 | 72706 | GTP hydrolysis and joining of the 60S ribosoma... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 120 | 66 | 0.007472 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 3 | 3 | 0.000193 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
631 | R-HSA-8866427 | 8866427 | VLDLR internalisation and degradation | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 16 | 4 | 0.000996 | 2.450345e-02 | 4.900690e-02 | [] | TOTAL | 4 | 4 | 0.000257 |
632 | R-HSA-8984722 | 8984722 | Interleukin-35 Signalling | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 16 | 4 | 0.000996 | 2.450345e-02 | 4.900690e-02 | [] | TOTAL | 26 | 25 | 0.001672 |
633 | R-HSA-1679131 | 1679131 | Trafficking and processing of endosomal TLR | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 16 | 4 | 0.000996 | 2.450345e-02 | 4.900690e-02 | [] | TOTAL | 7 | 4 | 0.000450 |
634 | R-HSA-879415 | 879415 | Advanced glycosylation endproduct receptor sig... | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 16 | 4 | 0.000996 | 2.450345e-02 | 4.900690e-02 | [] | TOTAL | 4 | 2 | 0.000257 |
635 | R-HSA-2173789 | 2173789 | TGF-beta receptor signaling activates SMADs | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 51 | 8 | 0.003176 | 2.483726e-02 | 4.967452e-02 | [] | TOTAL | 44 | 26 | 0.002829 |
636 rows × 19 columns
plot_path??
Signature: plot_path(react_out, top_n=10, max_label_length=80) Source: def plot_path(react_out, top_n=10,max_label_length=80): "Plot the bar graph of pathways from get_reactome function." # Extract the data and reverse it data = react_out.head(top_n).set_index('name')['-log10_pValue'].iloc[::-1] # Truncate labels if they are too long truncated_labels = [label[:max_label_length] + '...' if len(label) > max_label_length else label for label in data.index] data.index = truncated_labels # Calculate the required width: base width + additional width for the longest label base_width = 2 max_label_length = max(data.index, key=len) additional_width = len(max_label_length) * 0.1 # Adjust scaling factor as needed figsize = (base_width + additional_width, 3*top_n/10) # Adjust height as necessary data.plot.barh(figsize=figsize) plt.ylabel('') plt.xlabel('-log10(p)') plt.tight_layout() File: ~/katlas/katlas/utils.py Type: function
'entities.ratio'].sort_values(ascending=False).index] path_fdr.loc[path_fdr[
stId | dbId | name | llp | inDisease | species.dbId | species.taxId | species.name | entities.resource | entities.total | entities.found | entities.ratio | entities.pValue | entities.fdr | entities.exp | reactions.resource | reactions.total | reactions.found | reactions.ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
180 | R-HSA-162582 | 162582 | Signal Transduction | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 3049 | 267 | 0.189851 | 3.562203e-06 | 3.562203e-05 | [] | TOTAL | 2584 | 1258 | 0.166131 |
48 | R-HSA-1643685 | 1643685 | Disease | False | True | 48887 | 9606 | Homo sapiens | TOTAL | 2851 | 299 | 0.177522 | 1.554312e-15 | 6.061818e-14 | [] | TOTAL | 2011 | 716 | 0.129292 |
94 | R-HSA-168256 | 168256 | Immune System | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 2664 | 256 | 0.165878 | 2.664359e-09 | 5.328718e-08 | [] | TOTAL | 1733 | 791 | 0.111418 |
32 | R-HSA-392499 | 392499 | Metabolism of proteins | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 2417 | 268 | 0.150498 | 1.110223e-16 | 4.884981e-15 | [] | TOTAL | 909 | 428 | 0.058442 |
188 | R-HSA-74160 | 74160 | Gene expression (Transcription) | False | False | 48887 | 9606 | Homo sapiens | TOTAL | 1990 | 186 | 0.123910 | 4.133596e-06 | 3.895872e-05 | [] | TOTAL | 1140 | 486 | 0.073293 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
443 | R-HSA-9818025 | 9818025 | NFE2L2 regulating TCA cycle genes | True | False | 48887 | 9606 | Homo sapiens | TOTAL | 7 | 4 | 0.000436 | 1.440323e-03 | 5.761294e-03 | [] | TOTAL | 4 | 4 | 0.000257 |
474 | R-HSA-9649913 | 9649913 | RAS GTPase cycle mutants | False | True | 48887 | 9606 | Homo sapiens | TOTAL | 4 | 3 | 0.000249 | 2.704293e-03 | 8.112878e-03 | [] | TOTAL | 2 | 2 | 0.000129 |
475 | R-HSA-9753510 | 9753510 | Signaling by RAS GAP mutants | True | True | 48887 | 9606 | Homo sapiens | TOTAL | 4 | 3 | 0.000249 | 2.704293e-03 | 8.112878e-03 | [] | TOTAL | 1 | 1 | 0.000064 |
476 | R-HSA-9753512 | 9753512 | Signaling by RAS GTPase mutants | True | True | 48887 | 9606 | Homo sapiens | TOTAL | 4 | 3 | 0.000249 | 2.704293e-03 | 8.112878e-03 | [] | TOTAL | 1 | 1 | 0.000064 |
610 | R-HSA-9636667 | 9636667 | Manipulation of host energy metabolism | True | True | 48887 | 9606 | Homo sapiens | TOTAL | 3 | 2 | 0.000187 | 1.804436e-02 | 4.155485e-02 | [] | TOTAL | 2 | 2 | 0.000129 |
636 rows × 19 columns
= get_reactome(genes) path_df
plot_path??
Signature: plot_path(react_out, top_n=10, max_label_length=80) Source: def plot_path(react_out, top_n=10,max_label_length=80): "Plot the bar graph of pathways from get_reactome function." # Extract the data and reverse it data = react_out.head(top_n).set_index('name')['-log10_pValue'].iloc[::-1] # Truncate labels if they are too long truncated_labels = [label[:max_label_length] + '...' if len(label) > max_label_length else label for label in data.index] data.index = truncated_labels # Calculate the required width: base width + additional width for the longest label base_width = 2 max_label_length = max(data.index, key=len) additional_width = len(max_label_length) * 0.1 # Adjust scaling factor as needed figsize = (base_width + additional_width, 3*top_n/10) # Adjust height as necessary data.plot.barh(figsize=figsize) plt.ylabel('') plt.xlabel('-log10(p)') plt.tight_layout() File: ~/katlas/katlas/utils.py Type: function
path_df
name | pValue | -log10_pValue | |
---|---|---|---|
0 | Translation initiation complex formation | 1.110223e-16 | 15.955 |
1 | Formation of the ternary complex, and subseque... | 1.110223e-16 | 15.955 |
2 | Ribosomal scanning and start codon recognition | 1.110223e-16 | 15.955 |
3 | Activation of the mRNA upon binding of the cap... | 1.110223e-16 | 15.955 |
4 | GTP hydrolysis and joining of the 60S ribosoma... | 1.110223e-16 | 15.955 |
... | ... | ... | ... |
1754 | Immunoregulatory interactions between a Lympho... | 1.000000e+00 | 0.000 |
1755 | Class A/1 (Rhodopsin-like receptors) | 1.000000e+00 | 0.000 |
1756 | SLC-mediated transmembrane transport | 1.000000e+00 | 0.000 |
1757 | GPCR ligand binding | 1.000000e+00 | 0.000 |
1758 | Sensory Perception | 1.000000e+00 | -0.000 |
1759 rows × 3 columns
100)
plot_path(path_df,'AKT1') plt.title(
Text(0.5, 1.0, 'AKT1')
genes
import matplotlib.pyplot as plt
import seaborn as sns
from katlas.core import *
from katlas.plot import *
from scipy.stats import spearmanr, pearsonr
import os
from PIL import Image
from tqdm import tqdm
def plot_count(df_k,title):
# Get value counts
= df_k.source.replace({'pplus':'PP','large_scale':'LS'}).value_counts()
source_counts =(7,1))
plt.figure(figsize
='barh', stacked=True, color=['darkred', 'darkblue'])
source_counts.plot(kind# Annotate with the actual values
for index, value in enumerate(source_counts):
str(value),fontsize=10,rotation=-90, va='center')
plt.text(value, index,
'Count')
plt.xlabel( plt.title(title)
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks") sns.set_style(
Load data
= Data.get_ks_dataset()
df 'SUB'] = df.substrate.str.upper() df[
= Data.get_kinase_info().query('pseudo=="0"') info
# It only contains kinase on the tree
= df.kinase_paper.value_counts() cnt
= info[info.group!="TK"].kinase ST
10:20] df[df.kinase_paper.isin(ST)].kinase_paper.value_counts()[
NEK6 950
PLK1 943
CK2A1 919
P38D 907
DYRK2 907
HGK 902
TTBK1 896
MST3 890
MST1 884
IKKE 880
Name: kinase_paper, dtype: int64
= cnt[cnt>100] cnt
Generate example figures
def plot_heatmap2(matrix, title, figsize=(6,10), label_size=20):
=figsize)
plt.figure(figsize='binary', annot=False,cbar=False)
sns.heatmap(matrix, cmap=label_size)
plt.title(title,fontsize# Set the font size for the tick labels
=label_size)
plt.xticks(fontsize=label_size)
plt.yticks(fontsize'')
plt.xlabel('') plt.ylabel(
= ['SRC','ABL1','ERK2','PKACA'] kinase_list
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks")
sns.set_style(
for k in kinase_list:
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
=[0]),f'{k}',figsize=(6,10))
plot_heatmap2(full.drop(columns
plt.show()
plt.close()
break
# if you want to generate and save all of figures, uncomment below
# plt.savefig(f'fig/{k}.png',bbox_inches='tight', pad_inches=0.3)
# plt.close()
Generate all figures
Uncomment plt.savefig to save figures
set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.'notebook')
sns.set_context("ticks")
sns.set_style(
for k in tqdm(cnt.index,total=len(cnt)):
= df.query(f'kinase=="{k}"')
df_k
plot_count(df_k,k)# plt.savefig(f'fig/count/{k}.png',bbox_inches='tight', pad_inches=0.1)
# if visualize in jupyter notebook, uncheck the savefig
plt.show()
plt.close()
= df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
get_logo2(full, k)# plt.savefig(f'fig/logo/{k}.png',bbox_inches='tight', pad_inches=0.3)
plt.show()
plt.close()
=[0]),f'{k} (n={len(df_k)})',figsize=(7.5,10))
plot_heatmap(full.drop(columns# plt.savefig(f'fig/heatmap/{k}.png',bbox_inches='tight', pad_inches=0)
plt.show()
plt.close()# break
break
0%| | 0/289 [00:00<?, ?it/s]
0%| | 0/289 [00:02<?, ?it/s]
Combine figures for pdf
def combine_images_vertically(image_paths, output_path):
= [Image.open(image_path).convert('RGBA') for image_path in image_paths]
images
= max(image.width for image in images)
total_width = sum(image.height for image in images)
total_height
= Image.new('RGBA', (total_width, total_height))
combined_image
= 0
y_offset for image in images:
0, y_offset), image)
combined_image.paste(image, (+= image.height
y_offset
combined_image.save(output_path)
Uncomment below to run
# folders = ["fig/count", "fig/logo", "fig/heatmap"]
# for k in tqdm(cnt.index,total=len(cnt)):
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"fig/combine/{k}.png"
# combine_images_vertically(image_paths, output_path)
# # break
Get PSSM data of CDDM
for i,k in enumerate(cnt.index):
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
= full.drop(columns = [0]).reset_index().melt(id_vars=['aa'], value_name=k, var_name='Position')
melt 'substrate']=melt['Position'].astype(str)+ melt['aa']
melt[
= full[0][['s','t','y']].reset_index().rename(columns={0:k})
position_0 'substrate'] = '0'+position_0['aa']
position_0[
if i ==0:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
first else:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
k = pd.concat([first,k],axis=1)
data = data.copy()
first
# break
= data.T
data
= data.index.rename('kinase') data.index
To save
# data.to_csv('supp/CDDM.csv')
# data.to_parquet('ks_main.parquet')
Get specialized CDDM data for all-capital substrates
combine s,t,y to S,T,Y
# List of suffixes
= ['S', 'T', 'Y']
suffixes
for suffix in suffixes:
for i in range(-7, 8): # looping from -7 to 7
if i == 0: # Skip 0
continue
= f"{i}{suffix}" # e.g., -7S
upper_col = f"{i}{suffix.lower()}" # e.g., -7s
lower_col = data[upper_col] + data[lower_col]
data[upper_col] =1,inplace=True) # Drop the lowercase column after combining data.drop(lower_col, axis
str.contains('S')] data.columns[data.columns.
Index(['-7S', '-6S', '-5S', '-4S', '-3S', '-2S', '-1S', '1S', '2S', '3S', '4S',
'5S', '6S', '7S'],
dtype='object', name='substrate')
# make sure the "s" in positions other than 0 is deleted from the columns
str.contains('s')] data.columns[data.columns.
Index(['0s'], dtype='object', name='substrate')
# Make sure very position's sum is 1
str.contains('-7')].sum(1).sort_values() data.loc[:,data.columns.
kinase
DDR2 1.0
NEK11 1.0
MSK1 1.0
TEK 1.0
NIM1 1.0
...
CAMK2G 1.0
PKG2 1.0
MELK 1.0
NEK1 1.0
TLK2 1.0
Length: 289, dtype: float64
= data.rename(columns={'0s':'0S','0t':'0T','0y':'0Y'}) data
= data.index.rename('kinase') data.index
To save
# data.to_parquet('ks_main_upper.parquet')
# data.to_csv('supp/CDDM_upper.csv')
Plot other kinases (mutated, lipid kinase, isoforms)
kinases not on kinome tree
= df.query('on_tree==0').kinase.value_counts()
cnt_other
= cnt_other[cnt_other>100] cnt_other
= cnt_other.index.tolist()+['LYN','ABL1','RET','FGFR3','PDGFRA','ALK',
others 'EGFR','KIT','MET','PKCB','BRAF','PKG1'] # BRAF is less than 100
Uncheck savefig to save figures
for k in tqdm(others,total=len(others)):
= df.query(f'kinase=="{k}"')
df_k
plot_count(df_k,k)# plt.savefig(f'fig_others/count/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0.1)
# if visualize in jupyter notebook, uncheck the savefig
plt.show()
plt.close()
= df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
get_logo2(full,k)# plt.savefig(f'fig_others/logo/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0.3)
plt.show()
plt.close()
=[0]),f'{k} (n={len(df_k)})',figsize=(7.5,10))
plot_heatmap(full.drop(columns# plt.savefig(f'fig_others/heatmap/{k.replace("/", "_")}.png',bbox_inches='tight', pad_inches=0)
plt.show()
plt.close()
break
0%| | 0/36 [00:00<?, ?it/s]
0%| | 0/36 [00:01<?, ?it/s]
Combine the figures for pdf
Uncomment below to run
# folders = ["fig_others/count", "fig_others/logo", "fig_others/heatmap"]
# for k in tqdm(others,total = len(others)):
# k = k.replace("/", "_")
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"fig_others/combine/{k}.png"
# combine_images_vertically(image_paths, output_path)
# # break
Get the PSSMs of other kinases
for i,k in enumerate(others):
= df.query(f'kinase=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
df_k
= get_freq(df_k)
paper,full
= full.drop(columns = [0]).reset_index().melt(id_vars=['aa'], value_name=k, var_name='Position')
melt 'substrate']=melt['Position'].astype(str)+ melt['aa']
melt[
= full[0][['s','t','y']].reset_index().rename(columns={0:k})
position_0 'substrate'] = '0'+position_0['aa']
position_0[
if i ==0:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
first else:
= pd.concat([melt,position_0])[['substrate',k]].set_index('substrate')
k = pd.concat([first,k],axis=1)
data = data.copy() first
= data.T
data
= data.index.rename('kinase') data.index
To save:
# data.to_csv('supp/CDDM_others.csv')
# data.to_parquet('ks_others.parquet')