from katlas.core import *
from katlas.plot import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, pearsonr

# Compare PhosphoSitePlus and Large-scale datasets

# Setup
# Global plotting defaults: render and save figures at 200 dpi, using
# seaborn's "notebook" context and "ticks" style.
sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")

# Load data
df = Data.get_ks_dataset()

# Upper-cased copy of the substrate string; the per-kinase analysis further
# down uses this column as its de-duplication key.
df['SUB'] = df.substrate.str.upper()

# Split the dataset by source: "pplus" rows vs. "large_scale" rows.
PP = df.query('source == "pplus"').reset_index(drop=True)
LS = df.query('source == "large_scale"').reset_index(drop=True)

# Get overlap
# Count the PP rows of every kinase that also appears in LS ...
cnt = PP[PP.kinase_paper.isin(LS.kinase_paper)].kinase_paper.value_counts()

# ... and keep only kinases with more than 50 PP rows.  The threshold is
# applied to the PP counts only, before any de-duplication.
overlap_PP = cnt[cnt > 50]

# Calculate Pearson
# For every kinase shared by both sources, correlate the PP-derived and
# LS-derived tables returned by get_freq and record the substrate counts.
data = []
for k in overlap_PP.index:
    PP_k = PP.query(f'kinase_paper=="{k}"')
    LS_k = LS.query(f'kinase_paper=="{k}"')

    # drop duplicates: each upper-cased substrate counts once per source
    PP_k = PP_k.drop_duplicates(subset='SUB')
    LS_k = LS_k.drop_duplicates(subset='SUB')

    PP_cnt = PP_k.shape[0]
    LS_cnt = LS_k.shape[0]

    # get_freq returns a pair of tables; the "full" one is used for the
    # correlation.  (To inspect a kinase visually, pass PP_paper / LS_paper
    # to plot_heatmap here.)
    PP_paper, PP_full = get_freq(PP_k)
    LS_paper, LS_full = get_freq(LS_k)

    # Pearson correlation over the flattened full tables.
    # NOTE(review): assumes both tables have the same shape and ordering so
    # the flattened values line up position by position - confirm in get_freq.
    corr_full, _ = pearsonr(PP_full.unstack().values, LS_full.unstack().values)

    data.append([k, corr_full, PP_cnt, LS_cnt])

PP_LS = pd.DataFrame(data, columns=['kinase', 'pearson', 'PP_cnt', 'LS_cnt'])

# Notebook display only: the sorted frame is not assigned back.
PP_LS.sort_values('pearson')

# Smaller of the two de-duplicated substrate counts for each kinase.
PP_LS['min_cnt'] = PP_LS[['PP_cnt', 'LS_cnt']].min(axis=1)

sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")

# Histogram of the per-kinase Pearson correlations.
plt.figure(figsize=(5, 3))
PP_LS.pearson.hist(bins=15);

# Pearson correlation against the smaller of the two substrate counts.
PP_LS.plot.scatter(y='pearson', x='min_cnt', c='DarkBlue')
plt.ylabel('Pearson')
plt.xlabel('min count')
plt.title('Agreement between two datasets');

# Examples
# Plot the PP and LS heatmaps for two example kinases.  Both examples run the
# same steps, so they share one loop; after the loop every variable holds the
# values for the last kinase ('CK2A1').
for k in ('CDK1', 'CK2A1'):
    PP_k = PP.query(f'kinase_paper=="{k}"')
    LS_k = LS.query(f'kinase_paper=="{k}"')

    # drop duplicates: each upper-cased substrate counts once per source
    PP_k = PP_k.drop_duplicates(subset='SUB')
    LS_k = LS_k.drop_duplicates(subset='SUB')

    # get_freq returns a pair of tables; the first is the one plotted here.
    PP_paper, PP_full = get_freq(PP_k)
    LS_paper, LS_full = get_freq(LS_k)

    plot_heatmap(PP_paper, f'{k}_PP')
    plot_heatmap(LS_paper, f'{k}_LS')