!pip install python-katlas -Uq
Kinase enrichment analysis - AKTi
In this session, we will analyze the differential change of phosphorylation sites in phosphoproteomics dataset.
Setup
# katlas
from katlas.core import *
# from katlas.plot import plot_rank, set_sns
# utils
import pandas as pd, numpy as np, seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce, partial
from tqdm import tqdm
# statistics
from scipy.stats import ttest_rel,ttest_ind
from statsmodels.stats.multitest import multipletests
# disable warning
import warnings
"ignore", message="converting a masked element to nan")
warnings.filterwarnings(
'display.precision', 15) pd.set_option(
Data
The phosphoproteomics dataset is from paper Chemical Phosphoproteomics Sheds New Light on the Targets and Modes of Action of AKT Inhibitors; five clinical AKT inhibitors are evaluated in this study.
= pd.read_csv('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/AKT_inhibitor.csv') df
df.head()
gene_site | site_seq | Control_R1 | Control_R2 | Control_R3 | Control_R4 | AZD5363_R1 | AZD5363_R2 | AZD5363_R3 | AZD5363_R4 | ... | GSK690693_R3 | GSK690693_R4 | Ipatasertib_R1 | Ipatasertib_R2 | Ipatasertib_R3 | Ipatasertib_R4 | MK-2206_R1 | MK-2206_R2 | MK-2206_R3 | MK-2206_R4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AAK1_S637 | AGHRRILsDVtHsAV | 14.167260000000001 | 13.791940000000000 | 13.888430000000000 | 13.991310000000000 | 14.581049999999999 | 14.802630000000001 | 14.723060000000000 | 14.857880000000000 | ... | 14.47591 | 14.810269999999999 | 14.541550000000001 | 14.598560000000001 | 14.669480000000000 | 14.267690000000000 | 14.500560000000000 | 14.700110000000000 | 14.642170000000000 | 14.394790000000000 |
1 | ABCF1_S105 | MERLKKLsVPtsDEE | 14.184260000000000 | 14.422390000000000 | 14.320520000000000 | 14.310689999999999 | 14.905970000000000 | 15.157950000000000 | 14.848050000000001 | 14.968950000000000 | ... | 14.82788 | 14.924620000000001 | 14.888140000000000 | 15.019850000000000 | 14.857740000000000 | 14.809049999999999 | 14.915050000000001 | 14.815270000000000 | 15.067920000000001 | 14.526149999999999 |
2 | ACIN1_S208 | HSPRKSSsIsEEKGD | 15.956730000000000 | 15.791060000000000 | NaN | 15.961280000000000 | 14.763690000000000 | 14.926340000000000 | NaN | 14.933990000000000 | ... | NaN | 14.728800000000000 | 15.004510000000000 | 14.969420000000000 | NaN | 14.797020000000000 | 14.923180000000000 | 15.012650000000001 | NaN | 15.046220000000000 |
3 | ACIN1_S216 | IsEEKGDsDDEKPRK | 17.457689999999999 | 16.997420000000002 | 17.373349999999999 | 17.434950000000001 | 16.252500000000001 | 16.341049999999999 | 16.316970000000001 | 16.411560000000001 | ... | 16.38372 | 16.193570000000001 | 16.486070000000002 | 16.445869999999999 | 16.275169999999999 | 16.242329999999999 | 16.405529999999999 | 16.500750000000000 | 16.558160000000001 | 16.533030000000000 |
4 | ACIN1_S240 | QARAAkLsEGsQPAE | 13.211040000000001 | 13.129730000000000 | 13.565530000000001 | 13.572440000000000 | 12.127129999999999 | 12.011320000000000 | 12.125540000000001 | 11.806080000000000 | ... | 12.09775 | 12.322410000000000 | 11.838360000000000 | 12.090270000000000 | 12.027799999999999 | 11.561370000000000 | 12.093400000000001 | 12.152469999999999 | 12.159110000000000 | 11.888489999999999 |
5 rows × 26 columns
df.columns
Index(['gene_site', 'site_seq', 'Control_R1', 'Control_R2', 'Control_R3',
'Control_R4', 'AZD5363_R1', 'AZD5363_R2', 'AZD5363_R3', 'AZD5363_R4',
'GSK2110183_R1', 'GSK2110183_R2', 'GSK2110183_R3', 'GSK2110183_R4',
'GSK690693_R1', 'GSK690693_R2', 'GSK690693_R3', 'GSK690693_R4',
'Ipatasertib_R1', 'Ipatasertib_R2', 'Ipatasertib_R3', 'Ipatasertib_R4',
'MK-2206_R1', 'MK-2206_R2', 'MK-2206_R3', 'MK-2206_R4'],
dtype='object')
= df.columns[df.columns.str.contains('Control')]
ctrl
# below are five AKT inhibitors
= df.columns[df.columns.str.contains('AZD5363')]
AZD = df.columns[df.columns.str.contains('GSK2110183')]
GSK2 = df.columns[df.columns.str.contains('GSK690693')]
GSK6 = df.columns[df.columns.str.contains('Ipatasertib')]
ipa = df.columns[df.columns.str.contains('MK')] MK
Statistical test
Let’s use AZD for example:
= get_pvalue(df,ctrl, AZD,'mann_whitney') azd
Computing mann_whitney tests: 100%|██████████| 10900/10900 [00:10<00:00, 1061.00it/s]
azd.head()
log2FC | p_value | p_adj | signed_logP | signed_logPadj | |
---|---|---|---|---|---|
0 | 0.822975000000001 | 0.028571428571429 | 0.186595908585124 | 1.544068044350276 | 0.729097883088899 |
1 | 0.621855000000000 | 0.028571428571429 | 0.186595908585124 | 1.544068044350276 | 0.729097883088899 |
2 | -1.030390000000001 | 0.100000000000000 | 0.314892387693197 | -1.000000000000000 | -0.501837838006831 |
3 | -1.075140000000001 | 0.028571428571429 | 0.186595908585124 | -1.544068044350276 | -0.729097883088899 |
4 | -1.319855000000000 | 0.028571428571429 | 0.186595908585124 | -1.544068044350276 | -0.729097883088899 |
azd.p_value.sort_values()
0 0.028571428571429
10584 0.028571428571429
2011 0.028571428571429
10585 0.028571428571429
10603 0.028571428571429
...
7584 1.000000000000000
3648 1.000000000000000
9407 1.000000000000000
3654 1.000000000000000
10899 1.000000000000000
Name: p_value, Length: 10900, dtype: float64
As the sample size for calculating p value is small in this case, we will use uncorrected p_value to allow more data to be considered; it is more important to look at the effect size
= df.iloc[:,:2] info
= pd.concat([info,azd],axis=1) azd
Cut through threshold
# set a threshold of 0.05 for p_value
= azd[azd.p_value<0.05].reset_index(drop=True) azd2
azd2.shape
(1669, 7)
str[7].value_counts().plot.bar(); azd2.site_seq.
Visualize fold change values
def plot_value(df,col):
=(10, 5))
plt.figure(figsize
# sort values
=df.copy().sort_values(col).reset_index(drop=True)
df
=(df[col] >= 0), color='blue', alpha=0.3, label='Positive')
plt.fill_between(df.index, df[col], where=(df[col] <= 0), color='red', alpha=0.3, label='Negative')
plt.fill_between(df.index, df[col], where'Area Plot of Values')
plt.title('Ranked Index')
plt.xlabel('Values') plt.ylabel(
'log2FC') plot_value(azd2,
= azd2[azd2.site_seq.str[7].str.upper()!='Y']
st_sites = azd2[azd2.site_seq.str[7].str.upper()=='Y'] y_sites
'log2FC')
plot_value(st_sites,'S/T sites')
plt.title('log2FC')
plot_value(y_sites,'Y sites') plt.title(
Text(0.5, 1.0, 'Y sites')
Dataset processing
There are NaNs in site_seq, we will drop them
=azd2.dropna(subset='site_seq') azd2
Some site_seq contains multiple sequences splitted by “;”, we will take the first one.
'site_seq2'] = azd2.site_seq.str.split(';').str[0] azd2[
Make sure site sequence length are all consistent
'len'] = azd2.site_seq2.str.len() azd2[
len.value_counts() azd2.
len
15 1647
Name: count, dtype: int64
'acceptor'] = azd2.site_seq2.str[7] azd2[
Substrate scoring
= predict_kinase_df(azd2,seq_col='site_seq2', **param_PSPA)
pspa_out = predict_kinase_df(azd2,seq_col='site_seq2', **param_CDDM) cddm_out
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 396/396 [00:03<00:00, 114.38it/s]
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging
Kinase enrichment
def get_enrichment(score_df, # output df of predict_kinase_df
# df that contains site sequence
site_df, # column that indicates direction (sign) and weights for the site
weight_col, =False, # whether or not it is Y site or S/T site
is_Y= 5, # top n kinase to consider for each site
top_n
):
"Calculate kinase enrichment score for sites with signed log10(p) that pass a threshold(1.3) "
if is_Y:
= site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]>0)]
idx_up = site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]<0)]
idx_dn else:
= site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]>0)]
idx_up = site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]<0)]
idx_dn
= site_df.loc[idx_up]
up_site = site_df.loc[idx_dn]
dn_site
= score_df.loc[idx_up]
up_score = score_df.loc[idx_dn]
dn_score
def top_kinases(site_row,top_n=5):
# Sort the row in descending order and get the top n kinases
= site_row.sort_values(ascending=False).head(top_n)
top_kinases
# Get the counts of the top kinases
= top_kinases.index.value_counts()
kinase_counts
return kinase_counts
= partial(top_kinases,top_n=top_n)
func
= up_score.apply(func,axis=1)
up_cnt = dn_score.apply(func,axis=1)
dn_cnt
= up_cnt.sum().reset_index(name = 'up_cnt')
df_up_cnt = dn_cnt.sum().reset_index(name = 'dn_cnt')
df_dn_cnt
= up_cnt.multiply(abs(up_site[weight_col]),axis=0).sum().reset_index(name = 'up_weighted_cnt')
df_w_up = dn_cnt.multiply(abs(dn_site[weight_col]),axis=0).sum().reset_index(name = 'dn_weighted_cnt')
df_w_dn
= [df_up_cnt,df_dn_cnt,df_w_up,df_w_dn]
dfs
= reduce(lambda left, right: pd.merge(left, right, how='outer'), dfs)
result
= result.fillna(0)
result 'max'] = result.apply(lambda r: r.up_weighted_cnt if r.up_weighted_cnt >= r.dn_weighted_cnt else -r.dn_weighted_cnt,axis=1)
result['delta'] = result.up_weighted_cnt - result.dn_weighted_cnt
result[= result.rename(columns={'index':'kinase'})
result
return result
As we only have 6 y sites, we will do S/T sites kinase enrichment; setis_Y
to False
= get_enrichment(cddm_out,azd2,'log2FC',is_Y=False,top_n=10)
st_cddm = get_enrichment(pspa_out,azd2,'log2FC',is_Y=False,top_n=10) st_pspa
st_cddm.head()
kinase | up_cnt | dn_cnt | up_weighted_cnt | dn_weighted_cnt | max | delta | |
---|---|---|---|---|---|---|---|
0 | ACVR2A | 19.0 | 3.0 | 5.507104999999999 | 0.896350000000004 | 5.507104999999999 | 4.610754999999996 |
1 | AKT1 | 116.0 | 111.0 | 38.424563000000049 | 95.306570000000008 | -95.306570000000008 | -56.882006999999959 |
2 | AKT2 | 14.0 | 66.0 | 4.007335000000003 | 64.575879999999998 | -64.575879999999998 | -60.568544999999993 |
3 | AKT3 | 78.0 | 92.0 | 24.802710000000030 | 84.765155000000007 | -84.765155000000007 | -59.962444999999974 |
4 | ALK2 | 94.0 | 14.0 | 22.789808500000021 | 6.202690000000000 | 22.789808500000021 | 16.587118500000020 |
Delta is the difference between up_weighted_cnt and dn_weighted_cnt vaue, max is the maximum value among the two.
'delta') st_cddm.sort_values(
kinase | up_cnt | dn_cnt | up_weighted_cnt | dn_weighted_cnt | max | delta | |
---|---|---|---|---|---|---|---|
2 | AKT2 | 14.0 | 66.0 | 4.007335000000003 | 64.575879999999998 | -64.575879999999998 | -60.568544999999993 |
3 | AKT3 | 78.0 | 92.0 | 24.802710000000030 | 84.765155000000007 | -84.765155000000007 | -59.962444999999974 |
182 | SGK1 | 72.0 | 95.0 | 23.715893000000033 | 83.583534999999998 | -83.583534999999998 | -59.867641999999961 |
1 | AKT1 | 116.0 | 111.0 | 38.424563000000049 | 95.306570000000008 | -95.306570000000008 | -56.882006999999959 |
140 | P90RSK | 162.0 | 132.0 | 57.145470500000023 | 105.883935000000008 | -105.883935000000008 | -48.738464499999985 |
... | ... | ... | ... | ... | ... | ... | ... |
29 | CDK2 | 589.0 | 72.0 | 178.796763500000026 | 41.434269500000013 | 178.796763500000026 | 137.362494000000027 |
28 | CDK1 | 645.0 | 87.0 | 197.268107000000043 | 49.980519500000014 | 197.268107000000043 | 147.287587500000029 |
60 | ERK1 | 650.0 | 79.0 | 199.277800000000013 | 42.192084500000021 | 199.277800000000013 | 157.085715499999992 |
31 | CDK4 | 666.0 | 81.0 | 202.695893499999983 | 43.129879500000015 | 202.695893499999983 | 159.566013999999967 |
61 | ERK2 | 691.0 | 86.0 | 210.536440000000027 | 45.306319500000008 | 210.536440000000027 | 165.230120500000027 |
203 rows × 7 columns
'max') st_cddm.sort_values(
kinase | up_cnt | dn_cnt | up_weighted_cnt | dn_weighted_cnt | max | delta | |
---|---|---|---|---|---|---|---|
174 | PRKX | 285.0 | 136.0 | 104.613877000000002 | 109.507792000000023 | -109.507792000000023 | -4.893915000000021 |
140 | P90RSK | 162.0 | 132.0 | 57.145470500000023 | 105.883935000000008 | -105.883935000000008 | -48.738464499999985 |
1 | AKT1 | 116.0 | 111.0 | 38.424563000000049 | 95.306570000000008 | -95.306570000000008 | -56.882006999999959 |
3 | AKT3 | 78.0 | 92.0 | 24.802710000000030 | 84.765155000000007 | -84.765155000000007 | -59.962444999999974 |
182 | SGK1 | 72.0 | 95.0 | 23.715893000000033 | 83.583534999999998 | -83.583534999999998 | -59.867641999999961 |
... | ... | ... | ... | ... | ... | ... | ... |
29 | CDK2 | 589.0 | 72.0 | 178.796763500000026 | 41.434269500000013 | 178.796763500000026 | 137.362494000000027 |
28 | CDK1 | 645.0 | 87.0 | 197.268107000000043 | 49.980519500000014 | 197.268107000000043 | 147.287587500000029 |
60 | ERK1 | 650.0 | 79.0 | 199.277800000000013 | 42.192084500000021 | 199.277800000000013 | 157.085715499999992 |
31 | CDK4 | 666.0 | 81.0 | 202.695893499999983 | 43.129879500000015 | 202.695893499999983 | 159.566013999999967 |
61 | ERK2 | 691.0 | 86.0 | 210.536440000000027 | 45.306319500000008 | 210.536440000000027 | 165.230120500000027 |
203 rows × 7 columns
Visualize in bar graph
def get_bar_data(df, score_col, top_n = 10, plot=True):
= df.sort_values(score_col)[['kinase',score_col]]
data
= ['Kinase','Score']
data.columns
= pd.concat([data.head(top_n),data.tail(top_n)]).set_index('Kinase')
data
if plot:
=False)
data.plot.barh(legend'Kinase Enrichment')
plt.title(f'{score_col} score')
plt.xlabel('')
plt.ylabel(
return data
= get_bar_data(st_cddm,'delta')
bar_cddm
= get_bar_data(st_pspa,'delta') bar_pspa