from katlas.imports import *
import os, seaborn as sns
from fastbook import *
from scipy.stats import spearmanr, pearsonr
from PIL import Image
from tqdm import tqdm
set_sns()Compare CDDM and PSPA in Ser/Thr kinases
Setup
Load data
df = Data.get_ks_dataset()
df['SUB'] = df.substrate.str.upper()Get overlap kinase
# normalized PSPA data
norm = pd.read_csv('raw/pspa_st_norm.csv')
raw = pd.read_csv('raw/pspa_st_raw.csv')#get overlap and count
overlap_cnt = df[df.kinase_paper.isin(raw.kinase)].kinase_paper.value_counts()overlap_cntoverlap_cnt = overlap_cnt[overlap_cnt>100]overlap_cnt# PSPA data
raw = raw.set_index('kinase')
norm = norm.set_index('kinase')Plot
sns.set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.set_context('notebook')
sns.set_style("ticks")# aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
# position = [i for i in range(-7,8)]
position_paper = [-5,-4,-3,-2,-1,1,2,3,4]Dataset-driven vs. normalized
To generate all of other figures and save them, uncheck plt.savefig, and comment out plt.show() and break
set_sns()for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
df_k = df.query(f'kinase_paper=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
paper,full = get_freq(df_k)
raw_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
plot_heatmap(paper,f'{k} from CDDM (n={overlap_cnt[k]})')
# plt.savefig(f'corr/KS/{k}.png',bbox_inches='tight', pad_inches=0.05)
plt.show()
plt.close()
plot_heatmap(raw_k,f'{k} from PSPA')
# plt.savefig(f'corr/PSPA/{k}.png',bbox_inches='tight', pad_inches=0.05)
plt.show()
plt.close()
plot_corr(y = paper.unstack().values, #dataset driven
x = raw_k.unstack().values, # PSPA
ylabel='CDDM',
xlabel='PSPA')
plt.title(k)
# plt.savefig(f'corr/pear/{k}.png',bbox_inches='tight', pad_inches=0.2)
plt.show()
plt.close()
breakCombine the figures: correlation on top, and two heatmaps on the bottom
def combine_images_custom_layout(image_paths, output_path):
images = [Image.open(image_path).convert('RGBA') for image_path in image_paths]
# Calculate total width and height for the new image
total_width = max(images[0].width, images[1].width + images[2].width)
total_height = images[0].height + max(images[1].height, images[2].height)
# Create a new image with calculated dimensions
combined_image = Image.new('RGBA', (total_width, total_height))
# Paste the first image at the top-center
x_offset = (total_width - images[0].width) // 2
combined_image.paste(images[0], (x_offset, 0), images[0])
# Paste the second image at the bottom-left
combined_image.paste(images[1], (0, images[0].height), images[1])
# Paste the third image at the bottom-right
combined_image.paste(images[2], (images[1].width, images[0].height), images[2])
# Save the result
combined_image.save(output_path)Uncheck below to save combined figures
# folders = ["corr/pear",'corr/KS','corr/PSPA']
# for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
# filename = f"{k}.png"
# image_paths = [os.path.join(folder, filename) for folder in folders]
# output_path = f"corr/combine/{k}.png"
# combine_images_custom_layout(image_paths, output_path)
# # breakPlot comparison
Correlation with raw PSPA
data = []
for k in tqdm(overlap_cnt.index):
df_k = df.query(f'kinase_paper=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
cnt = df_k.shape[0]
paper,full = get_freq(df_k)
raw_k = get_one_kinase(raw,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
full_corr,_ = pearsonr(raw_k.unstack().values,paper.unstack().values)
data.append([k,full_corr,cnt])corr_raw = pd.DataFrame(data,columns= ['kinase','corr_with_raw','count_unique'])Correlation with normalized PSPA
data = []
for k in overlap_cnt.index:
df_k = df.query(f'kinase_paper=="{k}"')
df_k = df_k.drop_duplicates(subset='SUB').reset_index()
cnt = df_k.shape[0]
paper,full = get_freq(df_k)
norm_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
full_corr,_ = pearsonr(norm_k.unstack().values,paper.unstack().values)
data.append([k,full_corr])corr_norm = pd.DataFrame(data,columns= ['kinase','corr_with_norm',
])Merge with specificity
corr = corr_raw.merge(corr_norm)m = pd.read_csv('raw/specificity_pspa.csv')corr= corr.merge(m).rename(columns={'max':'specificity'})corrcorr.query('kinase == "CK1A"')Pearson vs. Specificity
plot_corr(x=corr.specificity.values,
y=corr.corr_with_norm.values)
plt.ylabel('Pearson\n dataset-driven vs. raw PSPA')
plt.xlabel('Kinase specificity');info = Data.get_kinase_info().query('pseudo=="0"')
corr2 = corr.merge(info)
color = load_pickle('raw/kinase_color.pkl')plot_bar(corr2,'corr_with_norm','group',palette=color)
plt.ylabel('Pearson');plt.figure(figsize=(7,3))
corr.corr_with_norm.hist(bins=20)
plt.xlabel('Pearson')
plt.ylabel('# Kinase')
plt.title('Distribution of Pearson score')corr.plot.scatter(y='corr_with_norm',x='specificity',c='darkblue')
plt.ylabel('Pearson\n Dataset-driven vs. norm PSPA')
plt.xlabel('Kinase specificity');
# plt.title('Dataset-driven vs. raw PSPA')Examples of outliers
# Examples of data
k = 'CK1A'
df_k=df.query(f'kinase_paper == "{k}"')
df_k=df_k.drop_duplicates(subset='SUB')
paper, full = get_freq(df_k)
raw_k = get_one_kinase(raw,k,drop_s=False).T
raw_k = raw_k.reindex(index=aa_order_paper)
plot_heatmap(raw_k,f'{k} from raw PSPA')
plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')To check all of the outliers, uncheck below
# # Examples of data
# for k in corr.query('corr_with_norm<0.4 & specificity>0.55').kinase:
# df_k=df.query(f'kinase_paper == "{k}"')
# df_k=df_k.drop_duplicates(subset='SUB')
# paper, full = get_freq(df_k)
# raw_k = get_one_kinase(raw,k,drop_s=False).T
# raw_k = raw_k.reindex(index=aa_order_paper)
# plot_heatmap(raw_k,f'{k} from raw PSPA')
# plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')Pearson with raw vs. Pearson with norm
melt = pd.melt(corr[['corr_with_raw','corr_with_norm']])melt['variable'] = melt.variable.replace({'corr_with_raw':'raw','corr_with_norm':'normalized'})def plot_box(data,x,y,dots=True):
if dots:
sns.stripplot(data=data,x=x,y=y)
sns.boxplot(data=data, x=x, y=y, palette='pastel')plot_box(melt,'variable','value')
plt.ylabel('Pearson \nDataset-driven vs. PSPA')
plt.xlabel('PSPA');Find out the factor that cause the biggest change in correlation
Plot the kinase with biggest change in pearson after normalization; uncheck ‘break’ to run all
for k in corr.sort_values('change_corr',ascending=False).kinase[:5]:
df_k=df.query(f'kinase_paper == "{k}"')
df_k = df_k.drop_duplicates(subset="SUB")
paper, full = get_freq(df_k)
raw_k = get_one_kinase(raw,k,drop_s=False).T
raw_k = raw_k.reindex(index=aa_order_paper)
norm_k = get_one_kinase(norm,k,drop_s=False).T
norm_k = norm_k.reindex(index=aa_order_paper)
plot_heatmap(raw_k,f'{k} from raw PSPA')
plot_heatmap(norm_k,f'{k} from normalized PSPA')
plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')
breakCalculate S and T ratio
data = []
for k in overlap_cnt.index:
# paper, full = get_freq(df,k)
raw_k = get_one_kinase(raw,k,drop_s=False).T
raw_k = raw_k.reindex(index=aa_order_paper)
s_ratio = (raw_k.loc['S']/raw_k.sum()).median() #use median because it can better reflect the distribution of the data than the average
t_ratio = (raw_k.loc['T']/raw_k.sum()).median()
data.append([k,s_ratio,t_ratio])ST_ratio = pd.DataFrame(data,columns=['kinase','S_ratio','T_ratio'])corr = corr.merge(ST_ratio)corrcorr.corr(numeric_only=True)Check change_corr column in the correlation plot, it seems T ratio is highly correlated with change_corr
plot_corr(y=corr.change_corr,x=corr.T_ratio)
plt.xlabel('T ratio in raw PSPA')
plt.ylabel('Δ Pearson');