Kinase enrichment analysis - AKTi

Open In Colab

In this session, we will analyze the differential change of phosphorylation sites in phosphoproteomics dataset.

Setup

!pip install python-katlas -Uq
# katlas
from katlas.core import *
# from katlas.plot import plot_rank, set_sns

# utils
import pandas as pd, numpy as np, seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce, partial
from tqdm import tqdm

# statistics
from scipy.stats import ttest_rel,ttest_ind
from statsmodels.stats.multitest import multipletests

# disable warning
import warnings
warnings.filterwarnings("ignore", message="converting a masked element to nan")

pd.set_option('display.precision', 15)

Data

The phosphoproteomics dataset is from paper Chemical Phosphoproteomics Sheds New Light on the Targets and Modes of Action of AKT Inhibitors; five clinical AKT inhibitors are evaluated in this study.

df = pd.read_csv('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/AKT_inhibitor.csv')
df.head()
gene_site site_seq Control_R1 Control_R2 Control_R3 Control_R4 AZD5363_R1 AZD5363_R2 AZD5363_R3 AZD5363_R4 ... GSK690693_R3 GSK690693_R4 Ipatasertib_R1 Ipatasertib_R2 Ipatasertib_R3 Ipatasertib_R4 MK-2206_R1 MK-2206_R2 MK-2206_R3 MK-2206_R4
0 AAK1_S637 AGHRRILsDVtHsAV 14.167260000000001 13.791940000000000 13.888430000000000 13.991310000000000 14.581049999999999 14.802630000000001 14.723060000000000 14.857880000000000 ... 14.47591 14.810269999999999 14.541550000000001 14.598560000000001 14.669480000000000 14.267690000000000 14.500560000000000 14.700110000000000 14.642170000000000 14.394790000000000
1 ABCF1_S105 MERLKKLsVPtsDEE 14.184260000000000 14.422390000000000 14.320520000000000 14.310689999999999 14.905970000000000 15.157950000000000 14.848050000000001 14.968950000000000 ... 14.82788 14.924620000000001 14.888140000000000 15.019850000000000 14.857740000000000 14.809049999999999 14.915050000000001 14.815270000000000 15.067920000000001 14.526149999999999
2 ACIN1_S208 HSPRKSSsIsEEKGD 15.956730000000000 15.791060000000000 NaN 15.961280000000000 14.763690000000000 14.926340000000000 NaN 14.933990000000000 ... NaN 14.728800000000000 15.004510000000000 14.969420000000000 NaN 14.797020000000000 14.923180000000000 15.012650000000001 NaN 15.046220000000000
3 ACIN1_S216 IsEEKGDsDDEKPRK 17.457689999999999 16.997420000000002 17.373349999999999 17.434950000000001 16.252500000000001 16.341049999999999 16.316970000000001 16.411560000000001 ... 16.38372 16.193570000000001 16.486070000000002 16.445869999999999 16.275169999999999 16.242329999999999 16.405529999999999 16.500750000000000 16.558160000000001 16.533030000000000
4 ACIN1_S240 QARAAkLsEGsQPAE 13.211040000000001 13.129730000000000 13.565530000000001 13.572440000000000 12.127129999999999 12.011320000000000 12.125540000000001 11.806080000000000 ... 12.09775 12.322410000000000 11.838360000000000 12.090270000000000 12.027799999999999 11.561370000000000 12.093400000000001 12.152469999999999 12.159110000000000 11.888489999999999

5 rows × 26 columns

df.columns
Index(['gene_site', 'site_seq', 'Control_R1', 'Control_R2', 'Control_R3',
       'Control_R4', 'AZD5363_R1', 'AZD5363_R2', 'AZD5363_R3', 'AZD5363_R4',
       'GSK2110183_R1', 'GSK2110183_R2', 'GSK2110183_R3', 'GSK2110183_R4',
       'GSK690693_R1', 'GSK690693_R2', 'GSK690693_R3', 'GSK690693_R4',
       'Ipatasertib_R1', 'Ipatasertib_R2', 'Ipatasertib_R3', 'Ipatasertib_R4',
       'MK-2206_R1', 'MK-2206_R2', 'MK-2206_R3', 'MK-2206_R4'],
      dtype='object')
ctrl = df.columns[df.columns.str.contains('Control')]

# below are five AKT inhibitors
AZD = df.columns[df.columns.str.contains('AZD5363')]
GSK2 = df.columns[df.columns.str.contains('GSK2110183')]
GSK6 = df.columns[df.columns.str.contains('GSK690693')] 
ipa = df.columns[df.columns.str.contains('Ipatasertib')]
MK = df.columns[df.columns.str.contains('MK')]

Statistical test

Let’s use AZD for example:

azd = get_pvalue(df,ctrl, AZD,'mann_whitney')
Computing mann_whitney tests: 100%|██████████| 10900/10900 [00:10<00:00, 1061.00it/s]
azd.head()
log2FC p_value p_adj signed_logP signed_logPadj
0 0.822975000000001 0.028571428571429 0.186595908585124 1.544068044350276 0.729097883088899
1 0.621855000000000 0.028571428571429 0.186595908585124 1.544068044350276 0.729097883088899
2 -1.030390000000001 0.100000000000000 0.314892387693197 -1.000000000000000 -0.501837838006831
3 -1.075140000000001 0.028571428571429 0.186595908585124 -1.544068044350276 -0.729097883088899
4 -1.319855000000000 0.028571428571429 0.186595908585124 -1.544068044350276 -0.729097883088899
azd.p_value.sort_values()
0        0.028571428571429
10584    0.028571428571429
2011     0.028571428571429
10585    0.028571428571429
10603    0.028571428571429
               ...        
7584     1.000000000000000
3648     1.000000000000000
9407     1.000000000000000
3654     1.000000000000000
10899    1.000000000000000
Name: p_value, Length: 10900, dtype: float64

As the sample size for calculating p value is small in this case, we will use uncorrected p_value to allow more data to be considered; it is more important to look at the effect size

info = df.iloc[:,:2]
azd = pd.concat([info,azd],axis=1)

Cut through threshold

# set a threshold of 0.05 for p_value
azd2 = azd[azd.p_value<0.05].reset_index(drop=True)
azd2.shape
(1669, 7)
azd2.site_seq.str[7].value_counts().plot.bar();

Visualize fold change values

def plot_value(df,col):
    plt.figure(figsize=(10, 5))
    
    # sort values
    df=df.copy().sort_values(col).reset_index(drop=True) 
    
    plt.fill_between(df.index, df[col], where=(df[col] >= 0), color='blue', alpha=0.3, label='Positive')
    plt.fill_between(df.index, df[col], where=(df[col] <= 0), color='red', alpha=0.3, label='Negative')
    plt.title('Area Plot of Values')
    plt.xlabel('Ranked Index')
    plt.ylabel('Values')
plot_value(azd2,'log2FC')

st_sites = azd2[azd2.site_seq.str[7].str.upper()!='Y']
y_sites = azd2[azd2.site_seq.str[7].str.upper()=='Y']
plot_value(st_sites,'log2FC')
plt.title('S/T sites')
plot_value(y_sites,'log2FC')
plt.title('Y sites')
Text(0.5, 1.0, 'Y sites')

Dataset processing

There are NaNs in site_seq, we will drop them

azd2=azd2.dropna(subset='site_seq')

Some site_seq contains multiple sequences splitted by “;”, we will take the first one.

azd2['site_seq2'] = azd2.site_seq.str.split(';').str[0]

Make sure site sequence length are all consistent

azd2['len'] = azd2.site_seq2.str.len()
azd2.len.value_counts()
len
15    1647
Name: count, dtype: int64
azd2['acceptor'] = azd2.site_seq2.str[7]

Substrate scoring

pspa_out = predict_kinase_df(azd2,seq_col='site_seq2', **param_PSPA)
cddm_out = predict_kinase_df(azd2,seq_col='site_seq2', **param_CDDM)
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 396/396 [00:03<00:00, 114.38it/s]
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging

Kinase enrichment

def get_enrichment(score_df, # output df of predict_kinase_df
                    site_df, # df that contains site sequence
                    weight_col, # column that indicates direction (sign) and weights for the site
                    is_Y=False, # whether or not it is Y site or S/T site
                    top_n = 5, # top n kinase to consider for each site
                                  ):

    "Calculate kinase enrichment score for sites with signed log10(p) that pass a threshold(1.3) "

    if is_Y:
        idx_up = site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]>0)]
        idx_dn = site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]<0)]
    else:
        idx_up = site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]>0)]
        idx_dn = site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]<0)]

    up_site = site_df.loc[idx_up]
    dn_site = site_df.loc[idx_dn]

    up_score = score_df.loc[idx_up]
    dn_score = score_df.loc[idx_dn]

    def top_kinases(site_row,top_n=5):
        # Sort the row in descending order and get the top n kinases
        top_kinases = site_row.sort_values(ascending=False).head(top_n)

        # Get the counts of the top kinases
        kinase_counts = top_kinases.index.value_counts()

        return kinase_counts

    func = partial(top_kinases,top_n=top_n)

    up_cnt = up_score.apply(func,axis=1)
    dn_cnt = dn_score.apply(func,axis=1)

    df_up_cnt = up_cnt.sum().reset_index(name = 'up_cnt')
    df_dn_cnt = dn_cnt.sum().reset_index(name = 'dn_cnt')

    df_w_up = up_cnt.multiply(abs(up_site[weight_col]),axis=0).sum().reset_index(name = 'up_weighted_cnt')
    df_w_dn = dn_cnt.multiply(abs(dn_site[weight_col]),axis=0).sum().reset_index(name = 'dn_weighted_cnt')

    dfs = [df_up_cnt,df_dn_cnt,df_w_up,df_w_dn]

    result = reduce(lambda left, right: pd.merge(left, right, how='outer'), dfs)

    result = result.fillna(0)
    result['max'] = result.apply(lambda r: r.up_weighted_cnt if r.up_weighted_cnt >= r.dn_weighted_cnt else -r.dn_weighted_cnt,axis=1)
    result['delta'] = result.up_weighted_cnt - result.dn_weighted_cnt
    result = result.rename(columns={'index':'kinase'})

    return result

As we only have 6 y sites, we will do S/T sites kinase enrichment; setis_Y to False

st_cddm = get_enrichment(cddm_out,azd2,'log2FC',is_Y=False,top_n=10)
st_pspa = get_enrichment(pspa_out,azd2,'log2FC',is_Y=False,top_n=10)
st_cddm.head()
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
0 ACVR2A 19.0 3.0 5.507104999999999 0.896350000000004 5.507104999999999 4.610754999999996
1 AKT1 116.0 111.0 38.424563000000049 95.306570000000008 -95.306570000000008 -56.882006999999959
2 AKT2 14.0 66.0 4.007335000000003 64.575879999999998 -64.575879999999998 -60.568544999999993
3 AKT3 78.0 92.0 24.802710000000030 84.765155000000007 -84.765155000000007 -59.962444999999974
4 ALK2 94.0 14.0 22.789808500000021 6.202690000000000 22.789808500000021 16.587118500000020

Delta is the difference between up_weighted_cnt and dn_weighted_cnt vaue, max is the maximum value among the two.

st_cddm.sort_values('delta')
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
2 AKT2 14.0 66.0 4.007335000000003 64.575879999999998 -64.575879999999998 -60.568544999999993
3 AKT3 78.0 92.0 24.802710000000030 84.765155000000007 -84.765155000000007 -59.962444999999974
182 SGK1 72.0 95.0 23.715893000000033 83.583534999999998 -83.583534999999998 -59.867641999999961
1 AKT1 116.0 111.0 38.424563000000049 95.306570000000008 -95.306570000000008 -56.882006999999959
140 P90RSK 162.0 132.0 57.145470500000023 105.883935000000008 -105.883935000000008 -48.738464499999985
... ... ... ... ... ... ... ...
29 CDK2 589.0 72.0 178.796763500000026 41.434269500000013 178.796763500000026 137.362494000000027
28 CDK1 645.0 87.0 197.268107000000043 49.980519500000014 197.268107000000043 147.287587500000029
60 ERK1 650.0 79.0 199.277800000000013 42.192084500000021 199.277800000000013 157.085715499999992
31 CDK4 666.0 81.0 202.695893499999983 43.129879500000015 202.695893499999983 159.566013999999967
61 ERK2 691.0 86.0 210.536440000000027 45.306319500000008 210.536440000000027 165.230120500000027

203 rows × 7 columns

st_cddm.sort_values('max')
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
174 PRKX 285.0 136.0 104.613877000000002 109.507792000000023 -109.507792000000023 -4.893915000000021
140 P90RSK 162.0 132.0 57.145470500000023 105.883935000000008 -105.883935000000008 -48.738464499999985
1 AKT1 116.0 111.0 38.424563000000049 95.306570000000008 -95.306570000000008 -56.882006999999959
3 AKT3 78.0 92.0 24.802710000000030 84.765155000000007 -84.765155000000007 -59.962444999999974
182 SGK1 72.0 95.0 23.715893000000033 83.583534999999998 -83.583534999999998 -59.867641999999961
... ... ... ... ... ... ... ...
29 CDK2 589.0 72.0 178.796763500000026 41.434269500000013 178.796763500000026 137.362494000000027
28 CDK1 645.0 87.0 197.268107000000043 49.980519500000014 197.268107000000043 147.287587500000029
60 ERK1 650.0 79.0 199.277800000000013 42.192084500000021 199.277800000000013 157.085715499999992
31 CDK4 666.0 81.0 202.695893499999983 43.129879500000015 202.695893499999983 159.566013999999967
61 ERK2 691.0 86.0 210.536440000000027 45.306319500000008 210.536440000000027 165.230120500000027

203 rows × 7 columns

Visualize in bar graph

def get_bar_data(df, score_col, top_n = 10, plot=True):
    
    data = df.sort_values(score_col)[['kinase',score_col]]

    data.columns = ['Kinase','Score']
    
    data = pd.concat([data.head(top_n),data.tail(top_n)]).set_index('Kinase')
    
    if plot:
        data.plot.barh(legend=False)
        plt.title('Kinase Enrichment')
        plt.xlabel(f'{score_col} score')
        plt.ylabel('')
    
    return data
bar_cddm = get_bar_data(st_cddm,'delta')

bar_pspa = get_bar_data(st_pspa,'delta')