Plot AUCDF

Overall evaluation of PSSMs in the PSP dataset

In this notebook, we will evaluate PSSMs derived from two methods, PSPA and CDDM, using kinase-substrate datasets from PhosphoSitePlus (PSP). We will use AUCDF (Area Under the Cumulative Distribution Function) as the evaluation metric. AUCDF was previously introduced to evaluate PSSMs in the paper "An atlas of substrate specificities for the human serine/threonine kinome".

Setup

from katlas.imports import *
import pickle,pandas as pd, seaborn as sns
from matplotlib import pyplot as plt

Get kinase idx map

info = Data.get_kinase_info().query('pseudo=="0"')

info = info[['kinase','ID_coral','ID_HGNC']].map(lambda x: x.upper())

# Lookup table: every known alias of a kinase -> its common name
# (the 'kinase' column of `info`).
kinase_map = {}
for idx, row in info.iterrows():
    # Register the coral ID, the HGNC ID and the common name itself as keys.
    # If the same alias appears in a later row, the later row wins.
    kinase_map.update(
        dict.fromkeys((row['ID_coral'], row['ID_HGNC'], row['kinase']), row['kinase'])
    )

# Manually added aliases; these overwrite any same-named key produced above.
kinase_map.update({
    'ABL': 'ABL1',
    'HER2': 'ERBB2',
    'ETK': 'BMX',
    'MKK6': 'MAP2K6',
    'MKK4': 'MAP2K4',
    'MKK3': 'MAP2K3',
    'MKK7': 'MAP2K7',
    'ARG': 'ABL2',
})

Uncomment the code below to save and load kinase_map.pkl

# import pickle
# with open('raw/kinase_map.pkl', 'wb') as file:
#     pickle.dump(kinase_map, file)
    
# with open('raw/kinase_map.pkl', 'rb') as file:
#     loaded_dict = pickle.load(file)

Load kinase-substrate data from PSP

# load kinase-substrate pairs from PSP
psp = pd.read_csv('raw/PSP_Kinase_Substrate_Dataset.csv')

psp.head()

	GENE	KINASE	KIN_ACC_ID	KIN_ORGANISM	SUBSTRATE	SUB_GENE_ID	SUB_ACC_ID	SUB_GENE	SUB_ORGANISM	SUB_MOD_RSD	SITE_GRP_ID	site_seq	DOMAIN	IN_VIVO_RXN	IN_VITRO_RXN	CST_CAT#
0	Dyrk2	DYRK2	Q5U4C9	mouse	NDEL1	83431.0	Q9ERR1	Ndel1	mouse	S336	1869686801	LGSsRPSsAPGMLPL	NaN		X	NaN
1	Pak2	PAK2	Q64303	rat	MEK1	170851.0	Q01986	Map2k1	rat	S298	448284	RtPGRPLsSYGMDSR	Pkinase		X	9128; 98195
2	Pak2	PAK2	Q64303	rat	PRKD1	85421.0	Q9WTQ1	Prkd1	rat	S203	449896	GVRRRRLsNVsLTGL	NaN	X		NaN
3	Pak2	PAK2	Q64303	rat	prolactin	24683.0	P01237	Prl	rat	S206	451732	IRCLRRDsHKVDNYL	Hormone_1		X	NaN
4	Pak2	PAK2	Q64303	rat	prolactin	5617.0	P01236	PRL	human	S207	451732	LHCLRRDsHKIDNYL	Hormone_1		X	NaN

As there are some sequences in ‘site_seq’ column that do not have s/t/y at the center position, we need to remove them.

# Keep only rows whose centre residue (0-based position 7 of site_seq) is
# S, T or Y in either case.
# The trailing .copy() makes `psp` an independent frame, so the column
# assignments performed on it afterwards cannot trigger pandas'
# SettingWithCopyWarning.
psp = psp.loc[psp.site_seq.str[7].isin(list('stySTY'))].copy()

We also notice that the kinase name in ‘KINASE’ column is not always consistent (e.g., gene name and protein name are mixed in some cases), so we need to convert the kinase name to a consistent name.

# Isoforms are assumed to share a recognition pattern: keep only the text
# before the first space (dropping the isoform label) and upper-case it.
psp['KINASE'] = psp['KINASE'].str.split(' ').str[0].str.upper()

# Fusion forms contain a '-'; keep the second '-'-separated item.
psp['KINASE'] = [name.split('-')[1] if '-' in name else name for name in psp['KINASE']]

# Translate the various kinase names (coral ID, gene name) to a common name;
# names that are not keys of kinase_map become NaN.
psp['kinase'] = psp['KINASE'].map(kinase_map)

Check kinase that is not mapped:

# kinase not mapped
psp[psp.kinase.isna()].KINASE.value_counts()[:10]

KINASE
CK2B      20
VEGFR2    15
AMPKB1    15
UL97      14
ILK       13
PKM       12
PIK3CA    11
AMPKG2    10
CSFR      10
VEGFR1     7
Name: count, dtype: int64

# drop kinase not mapped
psp = psp.dropna(subset='kinase')

# drop duplicates
psp = psp[['site_seq','kinase']].drop_duplicates()

psp.site_seq.str[7].value_counts()

site_seq
s    13543
t     4349
y     3049
Name: count, dtype: int64

Filter sites and kinase for PSPA scoring

import pandas as pd, numpy as np
from tqdm import tqdm

ref_y = Data.get_pspa_tyr_norm()

ref_st = Data.get_pspa_st_norm()

TK = ref_y.index.str.split('_').str[0].tolist()
ST = ref_st.index.str.split('_').str[0].tolist()

We will use two kinds of inputs for PSPA evaluation:

All capital (the official method from the Nature paper.)
With lowercase indicating phosphorylation status

# Filter samples in two steps:
#   1. keep only kinases that are present in the PSPA reference index;
#   2. keep only sites whose centre residue (position 7) matches the class
#      (S/T for the Ser/Thr reference, Y for the Tyr reference).
# .copy() is taken after the LAST filter so that df_st / df_y are independent
# frames; new columns are added to them later, and copying before the final
# filter would leave them as filtered slices (SettingWithCopyWarning).

# Ser/Thr kinases, ST sites
df_st = psp[psp.kinase.isin(ref_st.index)]
df_st = df_st[df_st.site_seq.str[7].isin(list('stST'))].copy()

# Tyr kinases, Y sites
df_y = psp[psp.kinase.isin(ref_y.index)]
df_y = df_y[df_y.site_seq.str[7].isin(list('yY'))].copy()

df_st.site_seq.str[7].value_counts()

site_seq
s    13398
t     4287
Name: count, dtype: int64

df_y.site_seq.str[7].value_counts()

site_seq
y    2904
Name: count, dtype: int64

# convert site sequence to capital, for percentile calculation
df_st['site_seq_upper'] = df_st['site_seq'].str.upper()
df_y['site_seq_upper'] = df_y['site_seq'].str.upper()

df_st.head()

	site_seq	kinase	site_seq_upper
0	LGSsRPSsAPGMLPL	DYRK2	LGSSRPSSAPGMLPL
1	RtPGRPLsSYGMDSR	PAK2	RTPGRPLSSYGMDSR
2	GVRRRRLsNVsLTGL	PAK2	GVRRRRLSNVSLTGL
3	IRCLRRDsHKVDNYL	PAK2	IRCLRRDSHKVDNYL
4	LHCLRRDsHKIDNYL	PAK2	LHCLRRDSHKIDNYL

Multiply score

y_param_multiply = param_PSPA_y
st_param_multiply = param_PSPA_st

# multiply score on all capital
st_mul_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_multiply)

# multiply score on phosphorylated substrates
st_mul_lo = predict_kinase_df(df_st,'site_seq',**st_param_multiply)

input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 303/303 [00:02<00:00, 135.63it/s]

input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 303/303 [00:02<00:00, 131.82it/s]

# NOTE(review): this cell repeats, with identical arguments, the Ser/Thr
# scoring cell directly above it; st_mul_up and st_mul_lo are simply
# recomputed here. Presumably a leftover duplicate — confirm before removing.

# multiply score on all capital
st_mul_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_multiply)

# multiply score on phosphorylated substrates
st_mul_lo = predict_kinase_df(df_st,'site_seq',**st_param_multiply)

input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 303/303 [00:02<00:00, 137.92it/s]

input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 303/303 [00:02<00:00, 132.47it/s]

# multiply score on all capital
y_mul_up = predict_kinase_df(df_y,'site_seq_upper',**y_param_multiply)

# multiply score on phosphorylated substrates
y_mul_lo = predict_kinase_df(df_y,'site_seq',**y_param_multiply)

input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 93/93 [00:00<00:00, 319.46it/s]

input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging

100%|██████████| 93/93 [00:00<00:00, 349.33it/s]

df_st

	site_seq	kinase	site_seq_upper
0	LGSsRPSsAPGMLPL	DYRK2	LGSSRPSSAPGMLPL
1	RtPGRPLsSYGMDSR	PAK2	RTPGRPLSSYGMDSR
2	GVRRRRLsNVsLTGL	PAK2	GVRRRRLSNVSLTGL
3	IRCLRRDsHKVDNYL	PAK2	IRCLRRDSHKVDNYL
4	LHCLRRDsHKIDNYL	PAK2	LHCLRRDSHKIDNYL
...	...	...	...
23276	QRVLDtssLtQsAPA	ULK2	QRVLDTSSLTQSAPA
23277	DtssLtQsAPAsPtN	ULK2	DTSSLTQSAPASPTN
23278	LAQPINFsVSLSNSH	ULK2	LAQPINFSVSLSNSH
23279	ESsPILTsFELVKVP	ULK2	ESSPILTSFELVKVP
23280	THRRMVVsMPNLQDI	ULK2	THRRMVVSMPNLQDI

17685 rows × 3 columns

Sum score

y_param_sum = {'ref':Data.get_pspa_tyr_norm(), 'func': sumup}
st_param_sum = {'ref':Data.get_pspa_st_norm(), 'func': sumup}

# sum score on all capital
st_sum_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_sum)

# sum score on phosphorylated substrates
st_sum_lo = predict_kinase_df(df_st,'site_seq',**st_param_sum)

input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging

# sum score on all capital
y_sum_up = predict_kinase_df(df_y,'site_seq_upper',**y_param_sum)

# sum score on phosphorylated substrates
y_sum_lo = predict_kinase_df(df_y,'site_seq',**y_param_sum)

input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging
input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging

Plot differences of all-capital and phosphorylated

df_y['acceptor'] = df_y.site_seq_upper.str[7]

palette = get_color_dict(['S','T','Y'],'tab20')

sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")

k = 'SYK'
xmin,xmax = y_mul_lo[k].min()-0.1, y_mul_lo[k].max()+0.1

plot_hist(y_mul_up,k,hue = df_y.acceptor,palette=palette)
plt.title(f'{k}, without considering phospho-priming')
plt.xlim(xmin,xmax);

plot_hist(y_mul_lo,k,hue = df_y.acceptor,palette=palette)
plt.title(f'{k}, with considering phospho-priming')
plt.xlim(xmin,xmax);

Get score rank

# Rank the kinases within each row (axis=1) of every score table.
# ascending=False gives rank 1 to the highest score in the row.

# multiply score, all-capital input
y_rnk_mul_up, st_rnk_mul_up = (
    scores.rank(axis=1, ascending=False) for scores in (y_mul_up, st_mul_up)
)

# multiply score, phosphorylated (lowercase) input
y_rnk_mul_lo, st_rnk_mul_lo = (
    scores.rank(axis=1, ascending=False) for scores in (y_mul_lo, st_mul_lo)
)

# sum score, all-capital input
y_rnk_sum_up, st_rnk_sum_up = (
    scores.rank(axis=1, ascending=False) for scores in (y_sum_up, st_sum_up)
)

# sum score, phosphorylated (lowercase) input
y_rnk_sum_lo, st_rnk_sum_lo = (
    scores.rank(axis=1, ascending=False) for scores in (y_sum_lo, st_sum_lo)
)

As the reference for percentile calculation is calculated based on all-capital sequences, we will calculate percentile score and its rank only for the uppercase one.

For the lowercase, it should be also noted that the phosphorylation status from PSP might not be accurate, as it includes all high-throughput phosphorylation and low-throughput phosphorylation sites.

Percentile

Percentile calculation, for all capital only:

# get percentile based on percentile_reference
y_pct_ref = Data.get_pspa_tyr_pct()
st_pct_ref = Data.get_pspa_st_pct()

y_pct = get_pct_df(y_mul_up,y_pct_ref)
st_pct = get_pct_df(st_mul_up,st_pct_ref)

# get percentile rank across kinases
y_pct_rnk = y_pct.rank(axis=1,ascending=False)
st_pct_rnk = st_pct.rank(axis=1,ascending=False)

100%|██████████| 93/93 [00:00<00:00, 2401.17it/s]
100%|██████████| 303/303 [00:00<00:00, 310.19it/s]

Match values

def match_values(df,rnk):
    """Pick, for every row of `df`, the entry of `rnk` that belongs to that row's kinase.

    For each index label of `df`, the value at (label, df.kinase[label]) is
    read from `rnk`. The result is a Series aligned to `df.index`.
    """
    picked = [rnk.at[row_label, kinase_name]
              for row_label, kinase_name in zip(df.index, df.kinase)]
    return pd.Series(picked, index=df.index)

# merge rank values to df
df_y['y_rnk_mul_up'] = match_values(df_y,y_rnk_mul_up)
df_y['y_rnk_mul_lo'] = match_values(df_y,y_rnk_mul_lo)

df_y['y_rnk_sum_up'] = match_values(df_y,y_rnk_sum_up)
df_y['y_rnk_sum_lo'] = match_values(df_y,y_rnk_sum_lo)

# merge rank values to df
df_st['st_rnk_mul_up'] = match_values(df_st,st_rnk_mul_up)
df_st['st_rnk_mul_lo'] = match_values(df_st,st_rnk_mul_lo)

df_st['st_rnk_sum_up'] = match_values(df_st,st_rnk_sum_up)
df_st['st_rnk_sum_lo'] = match_values(df_st,st_rnk_sum_lo)

# for uppercase only
df_y['pct'] = match_values(df_y,y_pct)
df_y['pct_rnk'] = match_values(df_y,y_pct_rnk)

df_st['pct'] = match_values(df_st,st_pct)
df_st['pct_rnk'] = match_values(df_st,st_pct_rnk)

df_y.head()

	site_seq	kinase	site_seq_upper	acceptor	y_rnk_mul_up	y_rnk_mul_lo	y_rnk_sum_up	y_rnk_sum_lo	pct	pct_rnk
1516	KETEGQFyNYFPN__	ITK	KETEGQFYNYFPN__	Y	8.0	8.0	11.0	11.0	83.472	17.5
1517	ETLVIALyDYQTNDP	ITK	ETLVIALYDYQTNDP	Y	25.0	25.0	31.0	31.0	39.262	26.0
1518	PNEGDNDyIIPLPDP	PDGFRB	PNEGDNDYIIPLPDP	Y	26.0	26.0	43.5	43.5	98.237	15.0
1519	ERKEVsKysDIQRsL	PDGFRB	ERKEVSKYSDIQRSL	Y	44.0	63.0	60.0	70.0	58.742	36.0
1520	LDTSSVLyTAVQPNE	PDGFRB	LDTSSVLYTAVQPNE	Y	58.0	58.0	71.0	71.0	58.524	59.0

Percentile

sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")

get_AUCDF(df_y,'pct',reverse=True,xlabel='Percentile of reported kinase')

0.613949614139531

get_AUCDF(df_st,'pct',reverse=True,xlabel='Percentile of reported kinase')

0.7996590541217666

Percentile rank

get_AUCDF(df_y,'pct_rnk')

0.6021502629886102

get_AUCDF(df_st,'pct_rnk')

0.7903312239631667

Multiply score on all capital

get_AUCDF(df_y,'y_rnk_mul_up')

0.6240460933796205

get_AUCDF(df_st,'st_rnk_mul_up')

0.8225713446901245

Multiply score on phosphorylated

get_AUCDF(df_y,'y_rnk_mul_lo')

0.6373005243034091

get_AUCDF(df_st,'st_rnk_mul_lo')

0.8232876176843243

Sum score on all capital

get_AUCDF(df_y,'y_rnk_sum_up')

0.6115071915765649

get_AUCDF(df_st,'st_rnk_sum_up')

0.8151544558555669

Sum score on phosphorylated

get_AUCDF(df_y,'y_rnk_sum_lo')

0.6251077799990844

get_AUCDF(df_st,'st_rnk_sum_lo')

0.8235627655503237

Plot average rank for each kinase

The bar plot will reflect how accurate the prediction is for each kinase. The lower the y axis value is, the better.

cnt_y = df_y.kinase.value_counts()
cnt_st = df_st.kinase.value_counts()

df_y['count'] = df_y.kinase.map(cnt_y)
df_st['count'] = df_st.kinase.map(cnt_st)

dd_y = df_y.query('count>=20')
dd_st = df_st.query('count>=20')

bar_param = {'dots':False, 'fontsize':10, 'figsize':(17,3),'ascending':True}

plot_bar(dd_y,'y_rnk_mul_up','kinase',**bar_param)
plt.ylabel('Rank with all capital');

plot_bar(dd_y,'y_rnk_mul_lo','kinase',**bar_param)
plt.ylabel('Rank with phosphorylation status');

plot_bar(dd_st,'st_rnk_mul_up','kinase',**bar_param)
plt.ylabel('Rank with all capital');

plot_bar(dd_st,'st_rnk_mul_lo','kinase',**bar_param)
plt.ylabel('Rank with phosphorylation status');

Statistical analysis

dd_y = dd_y.rename(columns={'y_rnk_mul_lo':'phosphorylated','y_rnk_mul_up':'all-capital'})
dd_st = dd_st.rename(columns={'st_rnk_mul_lo':'phosphorylated','st_rnk_mul_up':'all-capital'})

import scipy.stats as stats

delta_y = dd_y.groupby('kinase')[['all-capital', 'phosphorylated']].mean()
delta_st = dd_st.groupby('kinase')[['all-capital', 'phosphorylated']].mean()

delta_y['diff'] = (delta_y['all-capital'] - delta_y['phosphorylated'])/delta_y['all-capital']
delta_st['diff'] = (delta_st['all-capital'] - delta_st['phosphorylated'])/delta_st['all-capital']

delta_y.sort_values('diff',ascending=False).head()

	all-capital	phosphorylated	diff
kinase
PTK2	30.382979	15.170213	0.500700
SYK	20.233333	12.011111	0.406370
ZAP70	25.238095	16.119048	0.361321
EGFR	32.346457	24.929134	0.229309
BMX	41.250000	31.800000	0.229091

def t_test(group):
    """Paired t-test between the 'all-capital' and 'phosphorylated' columns of `group`.

    Returns a Series with two entries, 't-statistic' and 'p-value', taken from
    scipy.stats.ttest_rel.
    """
    paired = stats.ttest_rel(group['all-capital'], group['phosphorylated'])
    return pd.Series([paired[0], paired[1]], index=['t-statistic', 'p-value'])

# Apply the paired t-test to each kinase separately.
# The two rank columns are selected explicitly before .apply so that only the
# data t_test actually reads is passed to it; applying on the full grouped
# frame also hands over the grouping column, which newer pandas versions
# deprecate. The result has one row per kinase with the columns
# 't-statistic' and 'p-value'.
ttest_y = dd_y.groupby('kinase')[['all-capital', 'phosphorylated']].apply(t_test)

ttest_st = dd_st.groupby('kinase')[['all-capital', 'phosphorylated']].apply(t_test)

y_rnk_statistics = pd.concat([delta_y,ttest_y],axis=1)
st_rnk_statistics = pd.concat([delta_st,ttest_st],axis=1)

y_rnk_statistics.head()

	all-capital	phosphorylated	diff	t-statistic	p-value
kinase
ABL1	27.249110	30.587189	-0.122502	-4.800813	2.575967e-06
ABL2	26.205882	28.352941	-0.081930	-1.498866	1.434148e-01
BMX	41.250000	31.800000	0.229091	2.516893	2.097901e-02
BTK	50.185185	50.629630	-0.008856	-0.676868	5.044680e-01
EGFR	32.346457	24.929134	0.229309	6.082135	1.314753e-08

Plot all-capital vs. phosphorylated

def plot_rnk(df,value_cols,group,figsize,order=None,fontsize=18,rotation=90,title=None,**kwargs):
    """Draw a grouped bar plot comparing several value columns per group.

    df         : wide dataframe containing `group` and every column in `value_cols`
    value_cols : columns to compare; each becomes one hue level ('Ranking')
    group      : column shown on the x axis
    figsize    : size passed to plt.figure
    order      : optional order of the x-axis categories
    fontsize   : font size for tick labels, axis labels, title and legend
    rotation   : rotation of the x tick labels
    title      : optional plot title
    **kwargs   : forwarded to sns.barplot
    """
    # Wide -> long: one row per (group, Ranking) pair with its Value
    long_df = df.melt(id_vars=group, value_vars=value_cols, var_name='Ranking', value_name='Value')

    plt.figure(figsize=figsize)

    # Error-bar appearance.
    # NOTE(review): seaborn >= 0.13 deprecates errwidth/errcolor in favour of
    # err_kws — confirm against the installed version before changing.
    errorbar_style = dict(capsize=0.1, errwidth=1.5, errcolor='gray')
    sns.barplot(data=long_df, x=group, y='Value', hue='Ranking', order=order,
                **errorbar_style, **kwargs)

    # Same tick-label size on both axes
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, labelsize=fontsize)

    # Blank x label, fixed y label
    plt.xlabel('', fontsize=fontsize)
    plt.ylabel('Rank of kinases (count≥20)', fontsize=fontsize)

    plt.xticks(rotation=rotation)

    if title is not None:
        plt.title(title, fontsize=fontsize)

    # Hide the right and top borders of the axes
    ax = plt.gca()
    ax.spines[['right', 'top']].set_visible(False)

    plt.legend(fontsize=fontsize)

y_order = y_rnk_statistics.sort_values('diff',ascending=False).index
st_order = st_rnk_statistics.sort_values('diff',ascending=False).index

plot_rnk(dd_y,['all-capital','phosphorylated'],'kinase',figsize=(24,5),order=y_order)

From the graph, the ranks of PTK2, SYK, ZAP70, EGFR, and BMX improved substantially (i.e., their rank values decreased) when phosphorylation status was considered in the calculation. These kinases are known to prefer phosphopriming.

plot_rnk(dd_st,['all-capital','phosphorylated'],'kinase',figsize=(24,5),order=st_order,fontsize=14)

From the graph, the ranks of GRK, CK2, CK1s, and GSK3s improved substantially (i.e., their rank values decreased) when phosphorylation status was considered in the calculation. These kinases are known to prefer phosphopriming.

CDDM scoring

As PSSMs from CDDM contain both tyrosine kinases and Ser/Thr kinases, we need to calculate AUCDF separately for each type.

set_sns()

param_CDDM['ref']['0Y'].hist(bins=50)
plt.title('Distribution of 0Y ratio');

# Get TK and ST kinase lists from the '0Y' column of the CDDM reference:
#   TK: kinases with '0Y' > 0.3
#   ST: kinases with '0Y' < 0.7
# NOTE(review): the two cut-offs overlap, so a kinase with 0.3 < '0Y' < 0.7
# ends up in BOTH lists — confirm that this is intended.
TK = param_CDDM['ref']['0Y'].loc[lambda ratio: ratio > 0.3].index.tolist()
ST = param_CDDM['ref']['0Y'].loc[lambda ratio: ratio < 0.7].index.tolist()

CDDM Scoring (contains lowercase STY indicating phosphorylation status)

# include only available kinase from the reference for scoring
TK_df = psp[psp.kinase.isin(TK)].copy()
ST_df = psp[psp.kinase.isin(ST)].copy()


# get log2(score)
ST_out = predict_kinase_df(ST_df,'site_seq',**param_CDDM)
TK_out = predict_kinase_df(TK_df,'site_seq',**param_CDDM)

# to rank, need to split TK and ST kinase columns
ST_out = ST_out[ST]
TK_out = TK_out[TK]

# get rank of score
TK_rnk = TK_out.rank(axis=1,ascending=False)
ST_rnk = ST_out.rank(axis=1,ascending=False)

TK_df['rnk']=match_values(TK_df,TK_rnk)
ST_df['rnk']=match_values(ST_df,ST_rnk)

input dataframe has a length 16799
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]

100%|██████████| 289/289 [00:51<00:00,  5.62it/s]

input dataframe has a length 3009
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]

100%|██████████| 289/289 [00:10<00:00, 27.98it/s]

get_AUCDF(ST_df,'rnk')
get_AUCDF(TK_df,'rnk')

0.7136624421990068

CDDM scoring (all capital)

# convert to capital
TK_df['site_seq_upper']=TK_df['site_seq'].str.upper()
ST_df['site_seq_upper']=ST_df['site_seq'].str.upper()

# get log2(score)
ST_out_upper = predict_kinase_df(ST_df,'site_seq_upper',**param_CDDM_upper)
TK_out_upper = predict_kinase_df(TK_df,'site_seq_upper',**param_CDDM_upper)

# to rank, need to split TK and ST kinase columns
ST_out_upper = ST_out_upper[ST]
TK_out_upper = TK_out_upper[TK]

# get rank of score
ST_rnk_upper = ST_out_upper.rank(axis=1,ascending=False)
TK_rnk_upper = TK_out_upper.rank(axis=1,ascending=False)

ST_df['rnk_upper']=match_values(ST_df,ST_rnk_upper)
TK_df['rnk_upper']=match_values(TK_df,TK_rnk_upper)

input dataframe has a length 16799
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]

100%|██████████| 289/289 [01:06<00:00,  4.36it/s]

input dataframe has a length 3009
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]

100%|██████████| 289/289 [00:06<00:00, 42.56it/s]

get_AUCDF(ST_df,'rnk_upper')
get_AUCDF(TK_df,'rnk_upper')

0.7216848757497858

Plot rank

Find the corresponding rank and map them in the kinase-substrate dataset

ST_cnt = ST_df.kinase.value_counts()
TK_cnt = TK_df.kinase.value_counts()

ST_df['count'] = ST_df.kinase.map(ST_cnt)
TK_df['count'] = TK_df.kinase.map(TK_cnt)

# remove kinases that have substrate pairs less than 20
st_v = ST_df.query('count>=20')
tk_v = TK_df.query('count>=20')

For the rank value, the lower the better.

plot_bar(st_v,'rnk','kinase',**bar_param)
plt.ylabel('Rank of kinases')

Text(0, 0.5, 'Rank of kinases')

plot_bar(tk_v,'rnk','kinase',**bar_param)
plt.ylabel('Rank of kinases')

Text(0, 0.5, 'Rank of kinases')