Compare PhosphoSitePlus and Large-scale datasets

Setup

from katlas.core import *
from katlas.plot import *

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import spearmanr, pearsonr
# Global plot styling: seaborn "notebook" context with the "ticks" style,
# rendering and saving figures at 200 dpi.
sns.set(context='notebook', style='ticks',
        rc={'figure.dpi': 200, 'savefig.dpi': 200})

Load data

# Load the kinase-substrate table and add an upper-cased copy of the
# substrate column (SUB), then split the rows by their `source` label.
df = Data.get_ks_dataset()
df['SUB'] = df['substrate'].str.upper()

# Rows labelled "pplus" in the `source` column.
PP = df[df['source'] == 'pplus'].reset_index(drop=True)

# Rows labelled "large_scale" in the `source` column.
LS = df[df['source'] == 'large_scale'].reset_index(drop=True)

Get overlap

# Count PP rows per kinase, restricted to kinases that also occur in LS,
# and keep only the kinases with more than 50 PP rows.
in_both = PP['kinase_paper'].isin(LS['kinase_paper'])
cnt = PP.loc[in_both, 'kinase_paper'].value_counts()
overlap_PP = cnt.loc[cnt > 50]

Calculate Pearson

# Per-kinase agreement between the two sources. Each row appended to `data`
# is [kinase, Pearson r of the full frequency matrices, number of unique PP
# substrates, number of unique LS substrates].
data = []
for k in overlap_PP.index:
    # Select by boolean mask instead of interpolating the kinase name into a
    # query string, so a name containing quotes or backslashes cannot break
    # the expression. Duplicates are judged on the upper-cased substrate.
    PP_k = PP[PP['kinase_paper'] == k].drop_duplicates(subset='SUB')
    LS_k = LS[LS['kinase_paper'] == k].drop_duplicates(subset='SUB')

    PP_cnt = len(PP_k)
    LS_cnt = len(LS_k)

    PP_paper, PP_full = get_freq(PP_k)
    LS_paper, LS_full = get_freq(LS_k)

    # For a visual check of a single kinase, call
    # plot_heatmap(PP_paper, f'{k}_PP') and plot_heatmap(LS_paper, f'{k}_LS').

    # Flatten both full matrices and correlate them element by element.
    # NOTE(review): values are paired by position, which assumes get_freq
    # returns identically labelled and ordered matrices for both inputs —
    # confirm against get_freq.
    corr_full, _ = pearsonr(PP_full.unstack().values,
                            LS_full.unstack().values)

    data.append([k, corr_full, PP_cnt, LS_cnt])
# Collect the per-kinase rows into a table.
PP_LS = pd.DataFrame(
    data,
    columns=['kinase', 'pearson', 'PP_cnt', 'LS_cnt'],
)

# Displays the table ordered by correlation when run as the last expression
# of a notebook cell; the sorted copy is not stored.
PP_LS.sort_values(by='pearson')

# Smaller of the two per-source substrate counts for each kinase.
PP_LS['min_cnt'] = PP_LS[['PP_cnt', 'LS_cnt']].min(axis='columns')
# Re-apply the plot styling ("notebook" context, "ticks" style, 200 dpi).
sns.set(context='notebook', style='ticks',
        rc={'figure.dpi': 200, 'savefig.dpi': 200})

# Histogram of the per-kinase Pearson correlations on a 5x3 inch figure.
plt.figure(figsize=(5, 3))
PP_LS['pearson'].hist(bins=15);
# Scatter of correlation against the smaller per-source substrate count;
# labels and title are set directly on the Axes returned by pandas.
PP_LS.plot.scatter(x='min_cnt', y='pearson', c='DarkBlue').set(
    xlabel='min count',
    ylabel='Pearson',
    title='Agreement between two datasets',
);

Examples

# Example kinases: draw the PP and LS heatmaps for each one in turn.
# A single loop keeps the filtering, de-duplication and plotting steps
# identical for every example instead of repeating them per kinase.
for k in ('CDK1', 'CK2A1'):
    # Rows for this kinase in each source, de-duplicated on the upper-cased
    # substrate.
    PP_k = PP[PP['kinase_paper'] == k].drop_duplicates(subset='SUB')
    LS_k = LS[LS['kinase_paper'] == k].drop_duplicates(subset='SUB')

    PP_paper, PP_full = get_freq(PP_k)
    LS_paper, LS_full = get_freq(LS_k)

    # Only the first matrix returned by get_freq is plotted here.
    plot_heatmap(PP_paper, f'{k}_PP')
    plot_heatmap(LS_paper, f'{k}_LS')