Compare CDDM and PSPA in Ser/Thr kinases

Setup

from katlas.imports import *
import os, seaborn as sns

from fastbook import *
from scipy.stats import spearmanr, pearsonr

from PIL import Image
from tqdm import tqdm

set_sns()

Load data

df = Data.get_ks_dataset()
df['SUB'] = df.substrate.str.upper()

Get overlap kinase

# normalized PSPA data
norm = pd.read_csv('raw/pspa_st_norm.csv')
raw = pd.read_csv('raw/pspa_st_raw.csv')
#get overlap and count
overlap_cnt = df[df.kinase_paper.isin(raw.kinase)].kinase_paper.value_counts()
overlap_cnt
overlap_cnt = overlap_cnt[overlap_cnt>100]
overlap_cnt
# PSPA data
raw = raw.set_index('kinase')
norm = norm.set_index('kinase')

Plot

sns.set(rc={"figure.dpi":200, 'savefig.dpi':200})
sns.set_context('notebook')
sns.set_style("ticks")
# aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
# position = [i for i in range(-7,8)]
position_paper = [-5,-4,-3,-2,-1,1,2,3,4]

Dataset-driven vs. normalized

To generate all of other figures and save them, uncheck plt.savefig, and comment out plt.show() and break

set_sns()
for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()
    
    paper,full = get_freq(df_k)
    raw_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
    
    plot_heatmap(paper,f'{k} from CDDM (n={overlap_cnt[k]})')
    # plt.savefig(f'corr/KS/{k}.png',bbox_inches='tight', pad_inches=0.05)
    plt.show()
    plt.close()

    plot_heatmap(raw_k,f'{k} from PSPA')
    # plt.savefig(f'corr/PSPA/{k}.png',bbox_inches='tight', pad_inches=0.05)
    plt.show()
    plt.close()

    plot_corr(y = paper.unstack().values, #dataset driven
              x = raw_k.unstack().values, # PSPA
              ylabel='CDDM',
              xlabel='PSPA')
    plt.title(k)
    # plt.savefig(f'corr/pear/{k}.png',bbox_inches='tight', pad_inches=0.2)
    plt.show()
    plt.close()
    
    break

Combine the figures: correlation on top, and two heatmaps on the bottom

def combine_images_custom_layout(image_paths, output_path):
    images = [Image.open(image_path).convert('RGBA') for image_path in image_paths]
    
    # Calculate total width and height for the new image
    total_width = max(images[0].width, images[1].width + images[2].width)
    total_height = images[0].height + max(images[1].height, images[2].height)
    
    # Create a new image with calculated dimensions
    combined_image = Image.new('RGBA', (total_width, total_height))
    
    # Paste the first image at the top-center
    x_offset = (total_width - images[0].width) // 2
    combined_image.paste(images[0], (x_offset, 0), images[0])
    
    # Paste the second image at the bottom-left
    combined_image.paste(images[1], (0, images[0].height), images[1])
    
    # Paste the third image at the bottom-right
    combined_image.paste(images[2], (images[1].width, images[0].height), images[2])
    
    # Save the result
    combined_image.save(output_path)

Uncheck below to save combined figures

# folders = ["corr/pear",'corr/KS','corr/PSPA']

# for k in tqdm(overlap_cnt.index,total=len(overlap_cnt)):
#     filename = f"{k}.png"
#     image_paths = [os.path.join(folder, filename) for folder in folders]
#     output_path = f"corr/combine/{k}.png"
#     combine_images_custom_layout(image_paths, output_path)
#     # break

Plot comparison

Correlation with raw PSPA

data = []
for k in tqdm(overlap_cnt.index):
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()
    
    cnt = df_k.shape[0]
    
    paper,full = get_freq(df_k)
    raw_k = get_one_kinase(raw,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
    
    full_corr,_ = pearsonr(raw_k.unstack().values,paper.unstack().values)
    
    data.append([k,full_corr,cnt])
corr_raw = pd.DataFrame(data,columns= ['kinase','corr_with_raw','count_unique'])

Correlation with normalized PSPA

data = []
for k in overlap_cnt.index:
    df_k = df.query(f'kinase_paper=="{k}"')
    df_k = df_k.drop_duplicates(subset='SUB').reset_index()
    
    cnt = df_k.shape[0]
    
    paper,full = get_freq(df_k)
    norm_k = get_one_kinase(norm,k,drop_s=False).T.reindex(index=aa_order_paper,columns=position_paper).round(3)
    
    full_corr,_ = pearsonr(norm_k.unstack().values,paper.unstack().values)
    
    data.append([k,full_corr])
corr_norm = pd.DataFrame(data,columns= ['kinase','corr_with_norm',
                                ])

Merge with specificity

corr = corr_raw.merge(corr_norm)
m = pd.read_csv('raw/specificity_pspa.csv')
corr= corr.merge(m).rename(columns={'max':'specificity'})
corr
corr.query('kinase == "CK1A"')

Pearson vs. Specificity

plot_corr(x=corr.specificity.values,
          y=corr.corr_with_norm.values)
plt.ylabel('Pearson\n dataset-driven vs. raw PSPA')
plt.xlabel('Kinase specificity');
info = Data.get_kinase_info().query('pseudo=="0"')

corr2 = corr.merge(info)

color = load_pickle('raw/kinase_color.pkl')
plot_bar(corr2,'corr_with_norm','group',palette=color)
plt.ylabel('Pearson');
plt.figure(figsize=(7,3))
corr.corr_with_norm.hist(bins=20)
plt.xlabel('Pearson')
plt.ylabel('# Kinase')
plt.title('Distribution of Pearson score')
corr.plot.scatter(y='corr_with_norm',x='specificity',c='darkblue')
plt.ylabel('Pearson\n Dataset-driven vs. norm PSPA')
plt.xlabel('Kinase specificity');
# plt.title('Dataset-driven vs. raw PSPA')

Examples of outliers

# Examples of data
k = 'CK1A'
df_k=df.query(f'kinase_paper == "{k}"')
df_k=df_k.drop_duplicates(subset='SUB')

paper, full = get_freq(df_k)

raw_k = get_one_kinase(raw,k,drop_s=False).T
raw_k = raw_k.reindex(index=aa_order_paper)

plot_heatmap(raw_k,f'{k} from raw PSPA')
plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')

To check all of the outliers, uncheck below

# # Examples of data
# for k in corr.query('corr_with_norm<0.4 & specificity>0.55').kinase:
    
#     df_k=df.query(f'kinase_paper == "{k}"')
#     df_k=df_k.drop_duplicates(subset='SUB')

#     paper, full = get_freq(df_k)

#     raw_k = get_one_kinase(raw,k,drop_s=False).T
#     raw_k = raw_k.reindex(index=aa_order_paper)
    
#     plot_heatmap(raw_k,f'{k} from raw PSPA')
#     plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')

Pearson with raw vs. Pearson with norm

melt = pd.melt(corr[['corr_with_raw','corr_with_norm']])
melt['variable'] = melt.variable.replace({'corr_with_raw':'raw','corr_with_norm':'normalized'})
def plot_box(data,x,y,dots=True):
    if dots:
        sns.stripplot(data=data,x=x,y=y)
    sns.boxplot(data=data, x=x, y=y, palette='pastel')
plot_box(melt,'variable','value')
plt.ylabel('Pearson \nDataset-driven vs. PSPA')
plt.xlabel('PSPA');

Check if changed pearson is correlated with specificity

corr['change_corr'] = corr['corr_with_norm'] - corr['corr_with_raw']
plot_corr(y=corr.change_corr,x=corr.specificity,text_location=[0.8,0.85])
plt.xlabel('Kinase specificity based on PSPA')
plt.ylabel('Δ Pearson');
corr.plot.scatter(y='change_corr',x='specificity',c='darkblue')
pear,pvalue = pearsonr(corr.change_corr,corr.specificity)
plt.text(s=f'Pearson = {round(pear,2)}\n   p = {"{:.2e}".format(pvalue)}',x=0.6,y=0.2)
plt.ylabel('Δ Pearson')
plt.xlabel('Kinase specificity');
# corr.to_csv('PSSM_vs_dataset_summary.csv',index=False)

Find out the factor that cause the biggest change in correlation

Plot the kinase with biggest change in pearson after normalization; uncheck ‘break’ to run all

for k in corr.sort_values('change_corr',ascending=False).kinase[:5]:
    
    df_k=df.query(f'kinase_paper == "{k}"')
    df_k = df_k.drop_duplicates(subset="SUB")
    
    paper, full = get_freq(df_k)

    raw_k = get_one_kinase(raw,k,drop_s=False).T
    raw_k = raw_k.reindex(index=aa_order_paper)
    
    norm_k = get_one_kinase(norm,k,drop_s=False).T
    norm_k = norm_k.reindex(index=aa_order_paper)
    
    plot_heatmap(raw_k,f'{k} from raw PSPA')
    plot_heatmap(norm_k,f'{k} from normalized PSPA')
    plot_heatmap(paper,f'{k} from KS datasets (n={len(df_k)})')

    break

Calculate S and T ratio

data = []
for k in overlap_cnt.index:

    # paper, full = get_freq(df,k)

    raw_k = get_one_kinase(raw,k,drop_s=False).T
    raw_k = raw_k.reindex(index=aa_order_paper)
    
    s_ratio = (raw_k.loc['S']/raw_k.sum()).median() #use median because it can better reflect the distribution of the data than the average
    t_ratio = (raw_k.loc['T']/raw_k.sum()).median()
    data.append([k,s_ratio,t_ratio])
ST_ratio = pd.DataFrame(data,columns=['kinase','S_ratio','T_ratio'])
corr = corr.merge(ST_ratio)
corr
corr.corr(numeric_only=True)

Check change_corr column in the correlation plot, it seems T ratio is highly correlated with change_corr

plot_corr(y=corr.change_corr,x=corr.T_ratio)
plt.xlabel('T ratio in raw PSPA')
plt.ylabel('Δ Pearson');