from katlas.core import *
from katlas.plot import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
# Compare PhosphoSitePlus and Large-scale datasets
## Setup
# Global plotting defaults: 200 dpi for both on-screen and saved figures,
# seaborn's "notebook" context and "ticks" style.
sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")
## Load data
# Load the dataset via katlas' Data.get_ks_dataset()
# (presumably the kinase-substrate table -- confirm against katlas docs).
df = Data.get_ks_dataset()

# Upper-cased copy of the substrate column; used below as the key for
# dropping duplicate substrates.
df['SUB'] = df.substrate.str.upper()

# Split by the `source` column: PP = rows tagged "pplus",
# LS = rows tagged "large_scale". Indices are reset to 0..n-1.
PP = df.query('source == "pplus"').reset_index(drop=True)

LS = df.query('source == "large_scale"').reset_index(drop=True)
## Get overlap
# Number of PP rows per kinase, restricted to kinases that also occur in LS.
cnt = PP[PP.kinase_paper.isin(LS.kinase_paper)].kinase_paper.value_counts()

# Keep only kinases with more than 50 PP rows. The threshold is applied to
# the PP count only, and before duplicate substrates are dropped.
overlap_PP = cnt[cnt > 50]
## Calculate Pearson
# For every kinase shared by both sources, compare the frequency matrices
# derived from the PP rows and from the LS rows with a Pearson correlation.
data = []
for k in overlap_PP.index:
    PP_k = PP.query(f'kinase_paper=="{k}"')
    LS_k = LS.query(f'kinase_paper=="{k}"')

    # drop duplicates (one row per upper-cased substrate)
    PP_k = PP_k.drop_duplicates(subset='SUB')
    LS_k = LS_k.drop_duplicates(subset='SUB')

    # Number of unique substrates per source for this kinase.
    PP_cnt = PP_k.shape[0]
    LS_cnt = LS_k.shape[0]

    # get_freq returns a pair, unpacked here as (paper, full).
    # NOTE(review): presumably a reduced and a full frequency matrix --
    # confirm against katlas.core.get_freq.
    PP_paper, PP_full = get_freq(PP_k)
    LS_paper, LS_full = get_freq(LS_k)

    # Optional: uncomment to inspect the per-kinase heatmaps.
    # plot_heatmap(PP_paper,f'{k}_PP')
    # plt.show()
    # plt.close()
    # plot_heatmap(LS_paper,f'{k}_LS')
    # plt.show()
    # plt.close()

    # Pearson correlation between the two flattened "full" matrices
    # (the p-value is discarded).
    corr_full, _ = pearsonr(PP_full.unstack().values, LS_full.unstack().values)

    data.append([k, corr_full, PP_cnt, LS_cnt])

# One row per kinase: correlation plus the substrate counts of each source.
PP_LS = pd.DataFrame(data, columns=['kinase', 'pearson',
                                    'PP_cnt', 'LS_cnt'])

# Displays a copy sorted by correlation when run as a notebook cell;
# PP_LS itself is not reordered.
PP_LS.sort_values('pearson')

# Smaller of the two substrate counts for each kinase.
PP_LS['min_cnt'] = PP_LS[['PP_cnt', 'LS_cnt']].min(axis=1)
# Re-apply the plotting defaults (200 dpi, "notebook" context, "ticks" style).
sns.set(rc={"figure.dpi": 200, 'savefig.dpi': 200})
sns.set_context('notebook')
sns.set_style("ticks")

# Histogram of the per-kinase Pearson correlations (15 bins).
# Trailing semicolons suppress the notebook's echo of the return value.
plt.figure(figsize=(5, 3))
PP_LS.pearson.hist(bins=15);

# Scatter of correlation against the smaller of the two substrate counts.
PP_LS.plot.scatter(y='pearson', x='min_cnt', c='DarkBlue')
plt.ylabel('Pearson')
plt.xlabel('min count')
plt.title('Agreement between two datasets');
## Examples
# Get one example: heatmaps for a single kinase from each source.
k = 'CDK1'

PP_k = PP.query(f'kinase_paper=="{k}"')
LS_k = LS.query(f'kinase_paper=="{k}"')

# drop duplicates (one row per upper-cased substrate)
PP_k = PP_k.drop_duplicates(subset='SUB')
LS_k = LS_k.drop_duplicates(subset='SUB')

# get_freq returns a pair, unpacked here as (paper, full); only the
# "paper" matrices are plotted below.
PP_paper, PP_full = get_freq(PP_k)
LS_paper, LS_full = get_freq(LS_k)

# The second argument is the label passed to plot_heatmap.
plot_heatmap(PP_paper, f'{k}_PP')
plot_heatmap(LS_paper, f'{k}_LS')
# Second example: heatmaps for CK2A1 from each source.
k = 'CK2A1'

PP_k = PP.query(f'kinase_paper=="{k}"')
LS_k = LS.query(f'kinase_paper=="{k}"')

# drop duplicates (one row per upper-cased substrate)
PP_k = PP_k.drop_duplicates(subset='SUB')
LS_k = LS_k.drop_duplicates(subset='SUB')

# get_freq returns a pair, unpacked here as (paper, full); only the
# "paper" matrices are plotted below.
PP_paper, PP_full = get_freq(PP_k)
LS_paper, LS_full = get_freq(LS_k)

# The second argument is the label passed to plot_heatmap.
plot_heatmap(PP_paper, f'{k}_PP')
plot_heatmap(LS_paper, f'{k}_LS')