from katlas.imports import *
import pickle,pandas as pd, seaborn as sns
from matplotlib import pyplot as plt
Plot AUCDF
In this notebook, we will evaluate PSSMs derived from two methods, PSPA and CDDM, using kinase-substrate datasets from PhosphoSitePlus (PSP). We will use AUCDF (Area Under the Cumulative Distribution Function) as the evaluation metric. AUCDF was previously introduced to evaluate PSSMs in the paper "An atlas of substrate specificities for the human serine/threonine kinome".
Setup
Get kinase idx map
# Kinase annotation table, restricted to rows whose `pseudo` column equals "0"
# (presumably the non-pseudokinases -- confirm against Data.get_kinase_info).
info = Data.get_kinase_info().query('pseudo=="0"')

# Upper-case the three name columns so that lookups ignore case.
# NOTE: element-wise DataFrame.map requires pandas >= 2.1.
info = info[['kinase','ID_coral','ID_HGNC']].map(lambda x: x.upper())

# Map every known spelling of a kinase to the value of its `kinase` column.
kinase_map = {}
for _, row in info.iterrows():
    # Add each of the different kinase name formats to the map
    kinase_map[row['ID_coral']] = row['kinase']
    kinase_map[row['ID_HGNC']] = row['kinase']
    # Ensure the kinase name itself is also in the map
    kinase_map[row['kinase']] = row['kinase']

# Hand-curated aliases; these override any entry created by the loop above.
kinase_map.update({
    'ABL': 'ABL1',
    'HER2': 'ERBB2',
    'ETK': 'BMX',
    'MKK6': 'MAP2K6',
    'MKK4': 'MAP2K4',
    'MKK3': 'MAP2K3',
    'MKK7': 'MAP2K7',
    'ARG': 'ABL2',
})
Uncomment the lines below to save and load kinase_map.pkl
# import pickle
# with open('raw/kinase_map.pkl', 'wb') as file:
# pickle.dump(kinase_map, file)
# with open('raw/kinase_map.pkl', 'rb') as file:
# loaded_dict = pickle.load(file)
Load kinase-substrate data from PSP
# load kinase-substrate pairs from PSP
psp = pd.read_csv('raw/PSP_Kinase_Substrate_Dataset.csv')
psp.head()
GENE | KINASE | KIN_ACC_ID | KIN_ORGANISM | SUBSTRATE | SUB_GENE_ID | SUB_ACC_ID | SUB_GENE | SUB_ORGANISM | SUB_MOD_RSD | SITE_GRP_ID | site_seq | DOMAIN | IN_VIVO_RXN | IN_VITRO_RXN | CST_CAT# | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Dyrk2 | DYRK2 | Q5U4C9 | mouse | NDEL1 | 83431.0 | Q9ERR1 | Ndel1 | mouse | S336 | 1869686801 | LGSsRPSsAPGMLPL | NaN | X | NaN | |
1 | Pak2 | PAK2 | Q64303 | rat | MEK1 | 170851.0 | Q01986 | Map2k1 | rat | S298 | 448284 | RtPGRPLsSYGMDSR | Pkinase | X | 9128; 98195 | |
2 | Pak2 | PAK2 | Q64303 | rat | PRKD1 | 85421.0 | Q9WTQ1 | Prkd1 | rat | S203 | 449896 | GVRRRRLsNVsLTGL | NaN | X | NaN | |
3 | Pak2 | PAK2 | Q64303 | rat | prolactin | 24683.0 | P01237 | Prl | rat | S206 | 451732 | IRCLRRDsHKVDNYL | Hormone_1 | X | NaN | |
4 | Pak2 | PAK2 | Q64303 | rat | prolactin | 5617.0 | P01236 | PRL | human | S207 | 451732 | LHCLRRDsHKIDNYL | Hormone_1 | X | NaN |
As there are some sequences in ‘site_seq’ column that do not have s/t/y at the center position, we need to remove them.
# For site sequence: keep rows whose character at index 7 is s/t/y (either case)
psp = psp.loc[psp.site_seq.str[7].isin(list('stySTY'))]
We also notice that the kinase name in ‘KINASE’ column is not always consistent (e.g., gene name and protein name are mixed in some cases), so we need to convert the kinase name to a consistent name.
# for isoform, suppose they have similar recognition pattern; drop the isoform # and take the kinase name
kinase_name = psp.KINASE.str.split(' ').str[0].str.upper()

# for fusion form of kinase, get the second item
kinase_name = kinase_name.apply(lambda x: x.split('-')[1] if '-' in x else x)

# map various kinase name (coral ID, gene name) to a common name;
# names that are absent from kinase_map become NaN in `kinase`.
# assign() returns a new frame, so the filtered `psp` is never written in place.
psp = psp.assign(KINASE=kinase_name, kinase=kinase_name.map(kinase_map))
Check kinase that is not mapped:
# kinase not mapped: the ten most frequent raw names without a kinase_map entry
psp[psp.kinase.isna()].KINASE.value_counts()[:10]
KINASE
CK2B 20
VEGFR2 15
AMPKB1 15
UL97 14
ILK 13
PKM 12
PIK3CA 11
AMPKG2 10
CSFR 10
VEGFR1 7
Name: count, dtype: int64
# drop kinase not mapped
psp = psp.dropna(subset='kinase')
# drop duplicates (only the site sequence and the mapped kinase name are kept)
psp = psp[['site_seq','kinase']].drop_duplicates()
# count the phospho-acceptor residue at index 7
psp.site_seq.str[7].value_counts()
site_seq
s 13543
t 4349
y 3049
Name: count, dtype: int64
Filter sites and kinase for PSPA scoring
import pandas as pd, numpy as np
from tqdm import tqdm
# PSPA reference matrices for tyrosine and Ser/Thr kinases
ref_y = Data.get_pspa_tyr_norm()

ref_st = Data.get_pspa_st_norm()
# kinase names: the part of each reference index label before the first '_'
TK = ref_y.index.str.split('_').str[0].tolist()
ST = ref_st.index.str.split('_').str[0].tolist()
We will use two kinds of inputs for PSPA evaluation:
- All capital (the official method from the Nature paper.)
- With lowercase indicating phosphorylation status
# filter samples, include only available kinase from the reference for scoring
df_st = psp[psp.kinase.isin(ref_st.index)]
df_y = psp[psp.kinase.isin(ref_y.index)]

# keep ST sites; copy AFTER the last filter so that the column assignments
# made later on df_st do not write into a slice of psp
df_st = df_st[df_st.site_seq.str[7].isin(list('stST'))].copy()

# keep Y sites
df_y = df_y[df_y.site_seq.str[7].isin(list('yY'))].copy()
df_st.site_seq.str[7].value_counts()
site_seq
s 13398
t 4287
Name: count, dtype: int64
df_y.site_seq.str[7].value_counts()
site_seq
y 2904
Name: count, dtype: int64
# convert site sequence to capital, for percentile calculation
df_st['site_seq_upper'] = df_st['site_seq'].str.upper()
df_y['site_seq_upper'] = df_y['site_seq'].str.upper()
df_st.head()
site_seq | kinase | site_seq_upper | |
---|---|---|---|
0 | LGSsRPSsAPGMLPL | DYRK2 | LGSSRPSSAPGMLPL |
1 | RtPGRPLsSYGMDSR | PAK2 | RTPGRPLSSYGMDSR |
2 | GVRRRRLsNVsLTGL | PAK2 | GVRRRRLSNVSLTGL |
3 | IRCLRRDsHKVDNYL | PAK2 | IRCLRRDSHKVDNYL |
4 | LHCLRRDsHKIDNYL | PAK2 | LHCLRRDSHKIDNYL |
Multiply score
# keyword-argument bundles for the multiply-based PSPA scoring
y_param_multiply = param_PSPA_y
st_param_multiply = param_PSPA_st
# multiply score on all capital
st_mul_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_multiply)

# multiply score on phosphorylated substrates
st_mul_lo = predict_kinase_df(df_st,'site_seq',**st_param_multiply)
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 303/303 [00:02<00:00, 135.63it/s]
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 303/303 [00:02<00:00, 131.82it/s]
# NOTE(review): this cell repeats the preceding Ser/Thr multiply-score cell verbatim.
# multiply score on all capital
st_mul_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_multiply)

# multiply score on phosphorylated substrates
st_mul_lo = predict_kinase_df(df_st,'site_seq',**st_param_multiply)
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 303/303 [00:02<00:00, 137.92it/s]
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 303/303 [00:02<00:00, 132.47it/s]
# multiply score on all capital
y_mul_up = predict_kinase_df(df_y,'site_seq_upper',**y_param_multiply)

# multiply score on phosphorylated substrates
y_mul_lo = predict_kinase_df(df_y,'site_seq',**y_param_multiply)
input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 93/93 [00:00<00:00, 319.46it/s]
input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 93/93 [00:00<00:00, 349.33it/s]
# show df_st as the cell output
df_st
site_seq | kinase | site_seq_upper | |
---|---|---|---|
0 | LGSsRPSsAPGMLPL | DYRK2 | LGSSRPSSAPGMLPL |
1 | RtPGRPLsSYGMDSR | PAK2 | RTPGRPLSSYGMDSR |
2 | GVRRRRLsNVsLTGL | PAK2 | GVRRRRLSNVSLTGL |
3 | IRCLRRDsHKVDNYL | PAK2 | IRCLRRDSHKVDNYL |
4 | LHCLRRDsHKIDNYL | PAK2 | LHCLRRDSHKIDNYL |
... | ... | ... | ... |
23276 | QRVLDtssLtQsAPA | ULK2 | QRVLDTSSLTQSAPA |
23277 | DtssLtQsAPAsPtN | ULK2 | DTSSLTQSAPASPTN |
23278 | LAQPINFsVSLSNSH | ULK2 | LAQPINFSVSLSNSH |
23279 | ESsPILTsFELVKVP | ULK2 | ESSPILTSFELVKVP |
23280 | THRRMVVsMPNLQDI | ULK2 | THRRMVVSMPNLQDI |
17685 rows × 3 columns
Sum score
# keyword-argument bundles for the sum-based scoring: PSPA reference + `sumup`
y_param_sum = {'ref':Data.get_pspa_tyr_norm(), 'func': sumup}
st_param_sum = {'ref':Data.get_pspa_st_norm(), 'func': sumup}
# sum score on all capital
st_sum_up = predict_kinase_df(df_st,'site_seq_upper',**st_param_sum)

# sum score on phosphorylated substrates
st_sum_lo = predict_kinase_df(df_st,'site_seq',**st_param_sum)
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
input dataframe has a length 17685
Preprocessing
Finish preprocessing
Merging reference
Finish merging
# sum score on all capital
y_sum_up = predict_kinase_df(df_y,'site_seq_upper',**y_param_sum)

# sum score on phosphorylated substrates
y_sum_lo = predict_kinase_df(df_y,'site_seq',**y_param_sum)
input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging
input dataframe has a length 2904
Preprocessing
Finish preprocessing
Merging reference
Finish merging
Plot differences of all-capital and phosphorylated
# phospho-acceptor residue (index 7 of the upper-cased site sequence)
df_y['acceptor'] = df_y.site_seq_upper.str[7]
palette = get_color_dict(['S','T','Y'],'tab20')
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")
k = 'SYK'
# shared x-range, taken from the phosphorylated scores, so both plots are comparable
xmin,xmax = y_mul_lo[k].min()-0.1, y_mul_lo[k].max()+0.1

plot_hist(y_mul_up,k,hue = df_y.acceptor,palette=palette)
plt.title(f'{k}, without considering phospho-priming')
plt.xlim(xmin,xmax);

plot_hist(y_mul_lo,k,hue = df_y.acceptor,palette=palette)
plt.title(f'{k}, with considering phospho-priming')
plt.xlim(xmin,xmax);
Get score rank
# Row-wise ranks across kinase columns; ascending=False gives rank 1 to the highest score.
# get rank of multiply score for all capital
y_rnk_mul_up = y_mul_up.rank(axis=1,ascending=False)
st_rnk_mul_up = st_mul_up.rank(axis=1,ascending=False)

# get rank of multiply score for phosphorylated
y_rnk_mul_lo = y_mul_lo.rank(axis=1,ascending=False)
st_rnk_mul_lo = st_mul_lo.rank(axis=1,ascending=False)
# get rank of sum score for all capital
y_rnk_sum_up = y_sum_up.rank(axis=1,ascending=False)
st_rnk_sum_up = st_sum_up.rank(axis=1,ascending=False)

# get rank of sum score for phosphorylated
y_rnk_sum_lo = y_sum_lo.rank(axis=1,ascending=False)
st_rnk_sum_lo = st_sum_lo.rank(axis=1,ascending=False)
As the reference for percentile calculation is calculated based on all-capital sequences, we will calculate percentile score and its rank only for the uppercase one.
For the lowercase, it should be also noted that the phosphorylation status from PSP might not be accurate, as it includes all high-throughput phosphorylation and low-throughput phosphorylation sites.
Percentile
Percentile calculation, for all capital only:
# get percentile based on percentile_reference
y_pct_ref = Data.get_pspa_tyr_pct()
st_pct_ref = Data.get_pspa_st_pct()

y_pct = get_pct_df(y_mul_up,y_pct_ref)
st_pct = get_pct_df(st_mul_up,st_pct_ref)

# get percentile rank across kinases
y_pct_rnk = y_pct.rank(axis=1,ascending=False)
st_pct_rnk = st_pct.rank(axis=1,ascending=False)
100%|██████████| 93/93 [00:00<00:00, 2401.17it/s]
100%|██████████| 303/303 [00:00<00:00, 310.19it/s]
Match values
def match_values(df,rnk):
    """Pick, for every row of `df`, the value of `rnk` in that row's kinase column.

    Parameters
    ----------
    df : DataFrame with a `kinase` column; its index labels must exist in `rnk.index`.
    rnk : DataFrame whose columns include every name found in `df.kinase`.

    Returns
    -------
    Series indexed like `df`, holding `rnk.at[row_label, kinase]` for each row.

    Raises
    ------
    KeyError if a row label or kinase name is missing from `rnk`.
    """
    return pd.Series([rnk.at[k,v] for k,v in df.kinase.items()],index=df.index)
# merge rank values to df: for each site, the rank of its reported kinase
df_y['y_rnk_mul_up'] = match_values(df_y,y_rnk_mul_up)
df_y['y_rnk_mul_lo'] = match_values(df_y,y_rnk_mul_lo)

df_y['y_rnk_sum_up'] = match_values(df_y,y_rnk_sum_up)
df_y['y_rnk_sum_lo'] = match_values(df_y,y_rnk_sum_lo)
# merge rank values to df
df_st['st_rnk_mul_up'] = match_values(df_st,st_rnk_mul_up)
df_st['st_rnk_mul_lo'] = match_values(df_st,st_rnk_mul_lo)

df_st['st_rnk_sum_up'] = match_values(df_st,st_rnk_sum_up)
df_st['st_rnk_sum_lo'] = match_values(df_st,st_rnk_sum_lo)
# for uppercase only
df_y['pct'] = match_values(df_y,y_pct)
df_y['pct_rnk'] = match_values(df_y,y_pct_rnk)

df_st['pct'] = match_values(df_st,st_pct)
df_st['pct_rnk'] = match_values(df_st,st_pct_rnk)
df_y.head()
site_seq | kinase | site_seq_upper | acceptor | y_rnk_mul_up | y_rnk_mul_lo | y_rnk_sum_up | y_rnk_sum_lo | pct | pct_rnk | |
---|---|---|---|---|---|---|---|---|---|---|
1516 | KETEGQFyNYFPN__ | ITK | KETEGQFYNYFPN__ | Y | 8.0 | 8.0 | 11.0 | 11.0 | 83.472 | 17.5 |
1517 | ETLVIALyDYQTNDP | ITK | ETLVIALYDYQTNDP | Y | 25.0 | 25.0 | 31.0 | 31.0 | 39.262 | 26.0 |
1518 | PNEGDNDyIIPLPDP | PDGFRB | PNEGDNDYIIPLPDP | Y | 26.0 | 26.0 | 43.5 | 43.5 | 98.237 | 15.0 |
1519 | ERKEVsKysDIQRsL | PDGFRB | ERKEVSKYSDIQRSL | Y | 44.0 | 63.0 | 60.0 | 70.0 | 58.742 | 36.0 |
1520 | LDTSSVLyTAVQPNE | PDGFRB | LDTSSVLYTAVQPNE | Y | 58.0 | 58.0 | 71.0 | 71.0 | 58.524 | 59.0 |
Percentile
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")

# AUCDF over the percentile of the reported kinase, Tyr sites
get_AUCDF(df_y,'pct',reverse=True,xlabel='Percentile of reported kinase')
0.613949614139531
get_AUCDF(df_st,'pct',reverse=True,xlabel='Percentile of reported kinase')
0.7996590541217666
Percentile rank
get_AUCDF(df_y,'pct_rnk')
0.6021502629886102
get_AUCDF(df_st,'pct_rnk')
0.7903312239631667
Multiply score on all capital
get_AUCDF(df_y,'y_rnk_mul_up')
0.6240460933796205
get_AUCDF(df_st,'st_rnk_mul_up')
0.8225713446901245
Multiply score on phosphorylated
get_AUCDF(df_y,'y_rnk_mul_lo')
0.6373005243034091
get_AUCDF(df_st,'st_rnk_mul_lo')
0.8232876176843243
Sum score on all capital
get_AUCDF(df_y,'y_rnk_sum_up')
0.6115071915765649
get_AUCDF(df_st,'st_rnk_sum_up')
0.8151544558555669
Sum score on phosphorylated
get_AUCDF(df_y,'y_rnk_sum_lo')
0.6251077799990844
get_AUCDF(df_st,'st_rnk_sum_lo')
0.8235627655503237
Plot average rank for each kinase
The bar plot will reflect how accurate the prediction is for each kinase. The lower the y axis value is, the better.
# number of substrate sites per kinase
cnt_y = df_y.kinase.value_counts()
cnt_st = df_st.kinase.value_counts()
df_y['count'] = df_y.kinase.map(cnt_y)
df_st['count'] = df_st.kinase.map(cnt_st)
# keep kinases with at least 20 sites
dd_y = df_y.query('count>=20')
dd_st = df_st.query('count>=20')
bar_param = {'dots':False, 'fontsize':10, 'figsize':(17,3),'ascending':True}
plot_bar(dd_y,'y_rnk_mul_up','kinase',**bar_param)
plt.ylabel('Rank with all capital');

plot_bar(dd_y,'y_rnk_mul_lo','kinase',**bar_param)
plt.ylabel('Rank with phosphorylation status');
plot_bar(dd_st,'st_rnk_mul_up','kinase',**bar_param)
plt.ylabel('Rank with all capital');

plot_bar(dd_st,'st_rnk_mul_lo','kinase',**bar_param)
plt.ylabel('Rank with phosphorylation status');
Statistical analysis
# give the two multiply-score rank columns common names in both tables
dd_y = dd_y.rename(columns={'y_rnk_mul_lo':'phosphorylated','y_rnk_mul_up':'all-capital'})
dd_st = dd_st.rename(columns={'st_rnk_mul_lo':'phosphorylated','st_rnk_mul_up':'all-capital'})
import scipy.stats as stats
# mean rank per kinase under each input type
delta_y = dd_y.groupby('kinase')[['all-capital', 'phosphorylated']].mean()
delta_st = dd_st.groupby('kinase')[['all-capital', 'phosphorylated']].mean()
# relative change in mean rank; positive means the phosphorylated mean rank is lower
delta_y['diff'] = (delta_y['all-capital'] - delta_y['phosphorylated'])/delta_y['all-capital']
delta_st['diff'] = (delta_st['all-capital'] - delta_st['phosphorylated'])/delta_st['all-capital']
delta_y.sort_values('diff',ascending=False).head()
all-capital | phosphorylated | diff | |
---|---|---|---|
kinase | |||
PTK2 | 30.382979 | 15.170213 | 0.500700 |
SYK | 20.233333 | 12.011111 | 0.406370 |
ZAP70 | 25.238095 | 16.119048 | 0.361321 |
EGFR | 32.346457 | 24.929134 | 0.229309 |
BMX | 41.250000 | 31.800000 | 0.229091 |
def t_test(group):
    """Paired t-test between the 'all-capital' and 'phosphorylated' columns of `group`.

    Parameters
    ----------
    group : DataFrame with numeric 'all-capital' and 'phosphorylated' columns;
        the rows are treated as paired observations.

    Returns
    -------
    Series with entries 't-statistic' and 'p-value' taken from `stats.ttest_rel`.
    """
    t_stat, p_val = stats.ttest_rel(group['all-capital'], group['phosphorylated'])
    return pd.Series({'t-statistic': t_stat, 'p-value': p_val})
# Apply the t-test function to each group.
# Only the two rank columns are passed on, so apply() does not operate on the
# grouping column (recent pandas versions warn when it does).
ttest_y = dd_y.groupby('kinase')[['all-capital', 'phosphorylated']].apply(t_test)

ttest_st = dd_st.groupby('kinase')[['all-capital', 'phosphorylated']].apply(t_test)
# per-kinase mean ranks, relative difference and t-test results side by side
y_rnk_statistics = pd.concat([delta_y,ttest_y],axis=1)
st_rnk_statistics = pd.concat([delta_st,ttest_st],axis=1)
y_rnk_statistics.head()
all-capital | phosphorylated | diff | t-statistic | p-value | |
---|---|---|---|---|---|
kinase | |||||
ABL1 | 27.249110 | 30.587189 | -0.122502 | -4.800813 | 2.575967e-06 |
ABL2 | 26.205882 | 28.352941 | -0.081930 | -1.498866 | 1.434148e-01 |
BMX | 41.250000 | 31.800000 | 0.229091 | 2.516893 | 2.097901e-02 |
BTK | 50.185185 | 50.629630 | -0.008856 | -0.676868 | 5.044680e-01 |
EGFR | 32.346457 | 24.929134 | 0.229309 | 6.082135 | 1.314753e-08 |
Plot all-capital vs. phosphorylated
def plot_rnk(df,value_cols,group,figsize,order=None,fontsize=18,rotation=90,title=None,**kwargs):
    """Draw a grouped bar plot comparing several rank columns per group.

    Parameters
    ----------
    df : wide-format DataFrame containing `group` and every column in `value_cols`.
    value_cols : list of column names to compare; each becomes one hue level.
    group : column name placed on the x axis.
    figsize : figure size passed to `plt.figure`.
    order : optional order of the x-axis categories.
    fontsize : font size for tick labels, axis labels, title and legend.
    rotation : rotation of the x tick labels, in degrees.
    title : optional plot title.
    **kwargs : forwarded to `sns.barplot`.
    """
    # Prepare the dataframe for plotting
    # Melt the dataframe to go from wide to long format
    df_melted = df.melt(id_vars=group, value_vars=value_cols, var_name='Ranking', value_name='Value')

    plt.figure(figsize=figsize)

    # Create the bar plot
    # NOTE(review): errwidth/errcolor are deprecated in seaborn >= 0.13 in favour
    # of err_kws; kept so the call still works on older seaborn versions.
    sns.barplot(data=df_melted, x=group, y='Value', hue='Ranking',order=order,
                capsize=0.1,errwidth=1.5,errcolor='gray', # adjust the error bar settings
                **kwargs)
    # Increase font size for the x-axis and y-axis tick labels
    plt.tick_params(axis='x', labelsize=fontsize)  # Increase x-axis label size
    plt.tick_params(axis='y', labelsize=fontsize)  # Increase y-axis label size

    # Modify x and y label and increase font size
    plt.xlabel('', fontsize=fontsize)
    plt.ylabel('Rank of kinases (count≥20)', fontsize=fontsize)

    # Rotate X labels
    plt.xticks(rotation=rotation)

    # Plot titles
    if title is not None:
        plt.title(title, fontsize=fontsize)

    plt.gca().spines[['right', 'top']].set_visible(False)
    # plt.legend(title='Substrate', fontsize=fontsize-1, title_fontsize=fontsize-1)
    plt.legend(fontsize=fontsize)
# order kinases by decreasing relative difference in mean rank
y_order = y_rnk_statistics.sort_values('diff',ascending=False).index
st_order = st_rnk_statistics.sort_values('diff',ascending=False).index
plot_rnk(dd_y,['all-capital','phosphorylated'],'kinase',figsize=(24,5),order=y_order)
From the graph, the ranks of PTK2, SYK, ZAP70, EGFR, and BMX improve considerably (i.e., the rank value decreases) when phosphorylation status is considered in the calculation. These kinases are known to prefer phosphopriming.
plot_rnk(dd_st,['all-capital','phosphorylated'],'kinase',figsize=(24,5),order=st_order,fontsize=14)
From the graph, the ranks of the GRK, CK2, CK1, and GSK3 kinases improve considerably (i.e., the rank value decreases) when phosphorylation status is considered in the calculation. These kinases are known to prefer phosphopriming.
CDDM scoring
As the PSSMs from CDDM contain both tyrosine kinases and Ser/Thr kinases, we need to calculate AUCDF separately for each type.
set_sns()
param_CDDM['ref']['0Y'].hist(bins=50)
plt.title('Distribution of 0Y ratio');
# Get TK and ST kinase list
# The two thresholds overlap: kinases with 0.3 < 0Y < 0.7 land in both lists.
TK = param_CDDM['ref']['0Y']>0.3
ST = param_CDDM['ref']['0Y']<0.7

# boolean masks -> lists of the index labels where the mask is True
TK = TK[TK].index.tolist()
ST = ST[ST].index.tolist()
CDDM Scoring (contains lowercase STY indicating phosphorylation status)
# include only available kinase from the reference for scoring
TK_df = psp[psp.kinase.isin(TK)].copy()
ST_df = psp[psp.kinase.isin(ST)].copy()

# get log2(score)
ST_out = predict_kinase_df(ST_df,'site_seq',**param_CDDM)
TK_out = predict_kinase_df(TK_df,'site_seq',**param_CDDM)

# to rank, need to split TK and ST kinase columns
ST_out = ST_out[ST]
TK_out = TK_out[TK]

# get rank of score
TK_rnk = TK_out.rank(axis=1,ascending=False)
ST_rnk = ST_out.rank(axis=1,ascending=False)

TK_df['rnk']=match_values(TK_df,TK_rnk)
ST_df['rnk']=match_values(ST_df,ST_rnk)
input dataframe has a length 16799
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [00:51<00:00, 5.62it/s]
input dataframe has a length 3009
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [00:10<00:00, 27.98it/s]
get_AUCDF(ST_df,'rnk')
get_AUCDF(TK_df,'rnk')
0.7136624421990068
CDDM scoring (all capital)
# convert to capital
TK_df['site_seq_upper']=TK_df['site_seq'].str.upper()
ST_df['site_seq_upper']=ST_df['site_seq'].str.upper()

# get log2(score)
ST_out_upper = predict_kinase_df(ST_df,'site_seq_upper',**param_CDDM_upper)
TK_out_upper = predict_kinase_df(TK_df,'site_seq_upper',**param_CDDM_upper)

# to rank, need to split TK and ST kinase columns
ST_out_upper = ST_out_upper[ST]
TK_out_upper = TK_out_upper[TK]

# get rank of score
ST_rnk_upper = ST_out_upper.rank(axis=1,ascending=False)
TK_rnk_upper = TK_out_upper.rank(axis=1,ascending=False)

ST_df['rnk_upper']=match_values(ST_df,ST_rnk_upper)
TK_df['rnk_upper']=match_values(TK_df,TK_rnk_upper)
input dataframe has a length 16799
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [01:06<00:00, 4.36it/s]
input dataframe has a length 3009
Preprocessing
Finish preprocessing
Calculating position: [-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
100%|██████████| 289/289 [00:06<00:00, 42.56it/s]
get_AUCDF(ST_df,'rnk_upper')
get_AUCDF(TK_df,'rnk_upper')
0.7216848757497858
Plot rank
Find the corresponding rank and map them in the kinase-substrate dataset
# number of substrate sites per kinase
ST_cnt = ST_df.kinase.value_counts()
TK_cnt = TK_df.kinase.value_counts()

ST_df['count'] = ST_df.kinase.map(ST_cnt)
TK_df['count'] = TK_df.kinase.map(TK_cnt)
# remove kinases that have fewer than 20 substrate pairs
st_v = ST_df.query('count>=20')
tk_v = TK_df.query('count>=20')
For the rank value, the lower the better.
plot_bar(st_v,'rnk','kinase',**bar_param)
plt.ylabel('Rank of kinases')
Text(0, 0.5, 'Rank of kinases')
plot_bar(tk_v,'rnk','kinase',**bar_param)
plt.ylabel('Rank of kinases')
Text(0, 0.5, 'Rank of kinases')