from katlas.imports import *
import os, seaborn as sns
from fastbook import *
from scipy.stats import spearmanr, pearsonr
from PIL import Image
from tqdm import tqdm
# Apply the project's plotting defaults.
# NOTE(review): set_sns is not defined in this file; presumably it comes from
# one of the star imports above — confirm.
set_sns()
Compare CDDM and PSPA in Ser/Thr kinases
Setup
Load data
# Load the kinase-substrate dataset and add an upper-cased copy of each
# substrate sequence in a 'SUB' column; later cells de-duplicate on 'SUB'.
df = Data.get_ks_dataset()
df['SUB'] = df.substrate.str.upper()
Get overlap kinase
# normalized PSPA data
norm = pd.read_csv('raw/pspa_st_norm.csv')
raw = pd.read_csv('raw/pspa_st_raw.csv')

# get overlap and count: number of dataset rows per kinase, restricted to
# kinases that also appear in the raw PSPA table
overlap_cnt = df[df.kinase_paper.isin(raw.kinase)].kinase_paper.value_counts()
overlap_cnt

# keep only kinases with more than 100 rows in the dataset
overlap_cnt = overlap_cnt[overlap_cnt>100]
overlap_cnt

# PSPA data: index both tables by kinase so a single kinase can be looked up
raw = raw.set_index('kinase')
norm = norm.set_index('kinase')
Plot
# Figure defaults: 200 dpi for displayed and saved figures, seaborn
# "notebook" context and "ticks" style.
sns.set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.set_context('notebook')
sns.set_style("ticks")

# Row order (residue symbols) used when reindexing the heatmap matrices.
# aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
aa_order_paper = list('PGACSTVILMFYWHKRQNDEsty')

# Column order (positions) used when reindexing; position 0 is not included.
# position = [i for i in range(-7,8)]
position_paper = [-5,-4,-3,-2,-1,1,2,3,4]
Dataset-driven vs. normalized
To generate and save the figures for all kinases, uncomment the plt.savefig lines, and comment out plt.show() and break
set_sns()

# For each overlapping kinase: draw the dataset-driven heatmap, the PSPA
# heatmap, and a correlation plot between the two flattened matrices.
for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
    # Rows for this kinase, one row per unique upper-cased substrate.
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()

    paper,full = get_freq(df_k)

    # NOTE: despite its name, raw_k is taken from the *normalized* PSPA table
    # here. It is reindexed to the same residue rows / position columns used
    # for the dataset-driven matrix and rounded to 3 decimals.
    raw_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)

    plot_heatmap(paper,f'{k} from CDDM (n={overlap_cnt[k]})')
    # plt.savefig(f'corr/KS/{k}.png',bbox_inches='tight', pad_inches=0.05)
    plt.show()
    plt.close()

    plot_heatmap(raw_k,f'{k} from PSPA')
    # plt.savefig(f'corr/PSPA/{k}.png',bbox_inches='tight', pad_inches=0.05)
    plt.show()
    plt.close()

    plot_corr(y = paper.unstack().values, #dataset driven
              x = raw_k.unstack().values, # PSPA
              ylabel='CDDM',
              xlabel='PSPA')
    plt.title(k)
    # plt.savefig(f'corr/pear/{k}.png',bbox_inches='tight', pad_inches=0.2)
    plt.show()
    plt.close()

    # Only the first kinase is plotted; remove this to process all of them.
    break
Combine the figures: correlation on top, and two heatmaps on the bottom
def combine_images_custom_layout(image_paths, output_path):
    """Stack three images into one: the first on top, the other two below.

    The first image is centred horizontally in the top row. The second image
    is pasted at the bottom-left and the third immediately to its right; both
    start directly under the first image. Areas not covered by any image stay
    transparent.

    Args:
        image_paths: Iterable of at least three image paths, in the order
            top, bottom-left, bottom-right. Only the first three are used.
        output_path: Where the combined RGBA image is written.

    Raises:
        ValueError: If fewer than three paths are supplied.
    """
    paths = list(image_paths)
    if len(paths) < 3:
        raise ValueError(
            f"combine_images_custom_layout needs 3 image paths, got {len(paths)}"
        )

    def _load_rgba(path):
        # convert() returns an independent in-memory copy, so the source file
        # can be closed as soon as the conversion is done.
        with Image.open(path) as src:
            return src.convert('RGBA')

    top, bottom_left, bottom_right = (_load_rgba(p) for p in paths[:3])

    # Canvas is wide enough for the wider of the two rows and tall enough for
    # the top image plus the taller of the two bottom images.
    total_width = max(top.width, bottom_left.width + bottom_right.width)
    total_height = top.height + max(bottom_left.height, bottom_right.height)

    combined_image = Image.new('RGBA', (total_width, total_height))

    # Each image is also passed as its own mask so its alpha channel is
    # respected when pasting.
    x_offset = (total_width - top.width) // 2
    combined_image.paste(top, (x_offset, 0), top)
    combined_image.paste(bottom_left, (0, top.height), bottom_left)
    combined_image.paste(bottom_right, (bottom_left.width, top.height), bottom_right)

    combined_image.save(output_path)
Uncomment the code below to save the combined figures
# folders = ["corr/pear",'corr/KS','corr/PSPA']
# for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"corr/combine/{k}.png"
# combine_images_custom_layout(image_paths, output_path)
# # break
Plot comparison
Correlation with raw PSPA
# Pearson correlation between the dataset-driven matrix and the raw PSPA
# matrix for every overlapping kinase, plus the unique-substrate count.
data = []
for k in tqdm(overlap_cnt.index):
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()

    # number of unique substrates for this kinase
    cnt = df_k.shape[0]

    paper,full = get_freq(df_k)

    # Align raw PSPA to the same residue rows / position columns.
    raw_k = get_one_kinase(raw,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)

    # pearsonr returns (statistic, p-value); only the statistic is kept.
    full_corr,_ = pearsonr(raw_k.unstack().values,paper.unstack().values)

    data.append([k,full_corr,cnt])

corr_raw = pd.DataFrame(data,columns=['kinase','corr_with_raw','count_unique'])
Correlation with normalized PSPA
# Pearson correlation between the dataset-driven matrix and the normalized
# PSPA matrix for every overlapping kinase.
data = []
for k in overlap_cnt.index:
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()

    paper,full = get_freq(df_k)

    # Align normalized PSPA to the same residue rows / position columns.
    norm_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)

    # pearsonr returns (statistic, p-value); only the statistic is kept.
    full_corr,_ = pearsonr(norm_k.unstack().values,paper.unstack().values)

    data.append([k,full_corr])

corr_norm = pd.DataFrame(data,columns=['kinase','corr_with_norm'])
Merge with specificity
# Combine the raw and normalized correlations into one table (merge uses the
# columns the two frames share).
corr = corr_raw.merge(corr_norm)

# Add the PSPA specificity table; its 'max' column is renamed 'specificity'.
m = pd.read_csv('raw/specificity_pspa.csv')
corr = corr.merge(m).rename(columns={'max':'specificity'})
corr

corr.query('kinase == "CK1A"')
Pearson vs. Specificity
# Correlation plot of kinase specificity (x) against Pearson score (y).
# NOTE(review): the y values are corr_with_norm but the label says "raw PSPA";
# the scatter plot further down labels the same column "norm PSPA" — confirm
# which one is intended.
plot_corr(x=corr.specificity.values,
          y=corr.corr_with_norm.values)
plt.ylabel('Pearson\n dataset-driven vs. raw PSPA')
plt.xlabel('Kinase specificity');

# Attach kinase annotations, keeping only rows where pseudo == "0".
info = Data.get_kinase_info().query('pseudo=="0"')

corr2 = corr.merge(info)

# Bar plot of the Pearson score by kinase group, using the stored palette.
# NOTE(review): load_pickle reads a local pickle file; only load files you trust.
color = load_pickle('raw/kinase_color.pkl')
plot_bar(corr2,'corr_with_norm','group',palette=color)
plt.ylabel('Pearson');

# Histogram of the Pearson scores across kinases.
plt.figure(figsize=(7,3))
corr.corr_with_norm.hist(bins=20)
plt.xlabel('Pearson')
plt.ylabel('# Kinase')
plt.title('Distribution of Pearson score')

# Plain scatter of specificity against Pearson score.
corr.plot.scatter(y='corr_with_norm',x='specificity',c='darkblue')
plt.ylabel('Pearson\n Dataset-driven vs. norm PSPA')
plt.xlabel('Kinase specificity');
# plt.title('Dataset-driven vs. raw PSPA')
Examples of outliers
# Examples of data: compare the raw PSPA heatmap with the dataset-driven
# heatmap for a single kinase.
k = 'CK1A'
df_k = df.query(f'kinase_paper == "{k}"')
df_k = df_k.drop_duplicates(subset='SUB')

paper, full = get_freq(df_k)

# Only the residue rows are reordered here; columns keep their source order.
raw_k = get_one_kinase(raw,k,drop_s=False).T
raw_k = raw_k.reindex(index=aa_order_paper)

plot_heatmap(raw_k,f'{k} from raw PSPA')
plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')
To check all of the outliers, uncomment the code below
# # Examples of data
# for k in corr.query('corr_with_norm<0.4 & specificity>0.55').kinase:
# df_k=df.query(f'kinase_paper == "{k}"')
# df_k=df_k.drop_duplicates(subset='SUB')
# paper, full = get_freq(df_k)
# raw_k = get_one_kinase(raw,k,drop_s=False).T
# raw_k = raw_k.reindex(index=aa_order_paper)
# plot_heatmap(raw_k,f'{k} from raw PSPA')
# plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')
Pearson with raw vs. Pearson with norm
# Reshape the two correlation columns to long form ('variable', 'value') and
# give the variable levels short display names.
melt = pd.melt(corr[['corr_with_raw','corr_with_norm']])
melt['variable'] = melt.variable.replace({'corr_with_raw':'raw','corr_with_norm':'normalized'})
def plot_box(data,x,y,dots=True):
    """Draw a seaborn box plot of `y` grouped by `x`, optionally with points.

    Args:
        data: Table passed to seaborn as `data`.
        x: Column name used for the grouping axis.
        y: Column name used for the value axis.
        dots: When true, a strip plot of the individual observations is drawn
            before the box plot on the same axes.
    """
    if dots:
        sns.stripplot(data=data,x=x,y=y)
    sns.boxplot(data=data, x=x, y=y, palette='pastel')
# Box plot of Pearson scores for raw vs. normalized PSPA.
plot_box(melt,'variable','value')
plt.ylabel('Pearson \nDataset-driven vs. PSPA')
plt.xlabel('PSPA');
Find out the factor that causes the biggest change in correlation
Plot the kinases with the biggest change in Pearson correlation after normalization; comment out ‘break’ to run all
# Heatmaps (raw PSPA, normalized PSPA, dataset-driven) for the five kinases
# with the largest 'change_corr'.
# NOTE(review): no visible cell creates the 'change_corr' column; it must
# exist on `corr` before this loop runs — confirm where it is computed.
for k in corr.sort_values('change_corr',ascending=False).kinase[:5]:
    df_k = df.query(f'kinase_paper == "{k}"')
    df_k = df_k.drop_duplicates(subset="SUB")

    paper, full = get_freq(df_k)

    # Only the residue rows are reordered; columns keep their source order.
    raw_k = get_one_kinase(raw,k,drop_s=False).T
    raw_k = raw_k.reindex(index=aa_order_paper)

    norm_k = get_one_kinase(norm,k,drop_s=False).T
    norm_k = norm_k.reindex(index=aa_order_paper)

    plot_heatmap(raw_k,f'{k} from raw PSPA')
    plot_heatmap(norm_k,f'{k} from normalized PSPA')
    plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')

    # Only the top kinase is plotted; remove this to plot all five.
    break
Calculate S and T ratio
# For each overlapping kinase, compute the share of the raw PSPA signal that
# falls on the 'S' row and on the 'T' row, summarised across columns.
data = []
for k in overlap_cnt.index:
    # paper, full = get_freq(df,k)
    raw_k = get_one_kinase(raw,k,drop_s=False).T
    raw_k = raw_k.reindex(index=aa_order_paper)

    # Column totals are shared by both ratios, so compute them once.
    col_total = raw_k.sum()

    # use median because it can better reflect the distribution of the data than the average
    s_ratio = (raw_k.loc['S']/col_total).median()
    t_ratio = (raw_k.loc['T']/col_total).median()
    data.append([k,s_ratio,t_ratio])

ST_ratio = pd.DataFrame(data,columns=['kinase','S_ratio','T_ratio'])

corr = corr.merge(ST_ratio)
corr

# Pairwise correlations between the numeric columns of the merged table.
corr.corr(numeric_only=True)
Check the change_corr column in the correlation output above; the T ratio appears to be highly correlated with change_corr
# Correlation plot of the T ratio (x) against the change in Pearson (y).
# NOTE(review): 'change_corr' is not created in any visible cell — confirm it
# exists on `corr` before this runs.
plot_corr(y=corr.change_corr,x=corr.T_ratio)
plt.xlabel('T ratio in raw PSPA')
plt.ylabel('Δ Pearson');