Kinase enrichment analysis - AKTi

Open In Colab

In this session, we will analyze the differential change of phosphorylation sites in phosphoproteomics dataset.

Setup

Run below to install:

!pip install python-katlas
# katlas
from katlas.core import *
# from katlas.plot import plot_rank, set_sns

# utils
import pandas as pd, numpy as np, seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce, partial
from tqdm import tqdm

# statistics
from scipy.stats import ttest_rel,ttest_ind
from statsmodels.stats.multitest import multipletests

# disable warning
import warnings
warnings.filterwarnings("ignore", message="converting a masked element to nan")

pd.set_option('display.precision', 15)

Data

The phosphoproteomics dataset is from paper Chemical Phosphoproteomics Sheds New Light on the Targets and Modes of Action of AKT Inhibitors; five clinical AKT inhibitors are evaluated in this study.

df = pd.read_csv('https://github.com/sky1ove/katlas_raw/raw/refs/heads/main/nbs/raw/AKT_inhibitor.csv')
df.head()
gene_site site_seq Control_R1 Control_R2 Control_R3 Control_R4 AZD5363_R1 AZD5363_R2 AZD5363_R3 AZD5363_R4 ... GSK690693_R3 GSK690693_R4 Ipatasertib_R1 Ipatasertib_R2 Ipatasertib_R3 Ipatasertib_R4 MK-2206_R1 MK-2206_R2 MK-2206_R3 MK-2206_R4
0 AAK1_S637 AGHRRILsDVtHsAV 14.167260000000001 13.791940000000000 13.888430000000000 13.991310000000000 14.581049999999999 14.802630000000001 14.723060000000000 14.857880000000000 ... 14.47591 14.810269999999999 14.541550000000001 14.598560000000001 14.669480000000000 14.267690000000000 14.500560000000000 14.700110000000000 14.642170000000000 14.394790000000000
1 ABCF1_S105 MERLKKLsVPtsDEE 14.184260000000000 14.422390000000000 14.320520000000000 14.310689999999999 14.905970000000000 15.157950000000000 14.848050000000001 14.968950000000000 ... 14.82788 14.924620000000001 14.888140000000000 15.019850000000000 14.857740000000000 14.809049999999999 14.915050000000001 14.815270000000000 15.067920000000001 14.526149999999999
2 ACIN1_S208 HSPRKSSsIsEEKGD 15.956730000000000 15.791060000000000 NaN 15.961280000000000 14.763690000000000 14.926340000000000 NaN 14.933990000000000 ... NaN 14.728800000000000 15.004510000000000 14.969420000000000 NaN 14.797020000000000 14.923180000000000 15.012650000000001 NaN 15.046220000000000
3 ACIN1_S216 IsEEKGDsDDEKPRK 17.457689999999999 16.997420000000002 17.373349999999999 17.434950000000001 16.252500000000001 16.341049999999999 16.316970000000001 16.411560000000001 ... 16.38372 16.193570000000001 16.486070000000002 16.445869999999999 16.275169999999999 16.242329999999999 16.405529999999999 16.500750000000000 16.558160000000001 16.533030000000000
4 ACIN1_S240 QARAAkLsEGsQPAE 13.211040000000001 13.129730000000000 13.565530000000001 13.572440000000000 12.127129999999999 12.011320000000000 12.125540000000001 11.806080000000000 ... 12.09775 12.322410000000000 11.838360000000000 12.090270000000000 12.027799999999999 11.561370000000000 12.093400000000001 12.152469999999999 12.159110000000000 11.888489999999999

5 rows × 26 columns

df.columns
Index(['gene_site', 'site_seq', 'Control_R1', 'Control_R2', 'Control_R3',
       'Control_R4', 'AZD5363_R1', 'AZD5363_R2', 'AZD5363_R3', 'AZD5363_R4',
       'GSK2110183_R1', 'GSK2110183_R2', 'GSK2110183_R3', 'GSK2110183_R4',
       'GSK690693_R1', 'GSK690693_R2', 'GSK690693_R3', 'GSK690693_R4',
       'Ipatasertib_R1', 'Ipatasertib_R2', 'Ipatasertib_R3', 'Ipatasertib_R4',
       'MK-2206_R1', 'MK-2206_R2', 'MK-2206_R3', 'MK-2206_R4'],
      dtype='object')
ctrl = df.columns[df.columns.str.contains('Control')]

# below are five AKT inhibitors
AZD = df.columns[df.columns.str.contains('AZD5363')]
GSK2 = df.columns[df.columns.str.contains('GSK2110183')]
GSK6 = df.columns[df.columns.str.contains('GSK690693')] 
ipa = df.columns[df.columns.str.contains('Ipatasertib')]
MK = df.columns[df.columns.str.contains('MK')]

Statistical test

Let’s use AZD for example:

azd = get_pvalue(df,ctrl, AZD,'mann_whitney')
Computing mann_whitney tests: 100%|██████████| 10900/10900 [00:05<00:00, 1894.69it/s]
azd.head()
log2FC p_value p_adj signed_logP signed_logPadj
0 0.822975000000001 0.028571428571429 0.186595908585124 1.544068044350276 0.729097883088899
1 0.621855000000000 0.028571428571429 0.186595908585124 1.544068044350276 0.729097883088899
2 -1.030390000000001 0.100000000000000 0.314892387693197 -1.000000000000000 -0.501837838006831
3 -1.075140000000001 0.028571428571429 0.186595908585124 -1.544068044350276 -0.729097883088899
4 -1.319855000000000 0.028571428571429 0.186595908585124 -1.544068044350276 -0.729097883088899
azd.p_value.sort_values()
92       0.028571428571429
94       0.028571428571429
34       0.028571428571429
36       0.028571428571429
38       0.028571428571429
               ...        
10899    1.000000000000000
10841    1.000000000000000
297      1.000000000000000
10898    1.000000000000000
10864    1.000000000000000
Name: p_value, Length: 10900, dtype: float64

As the sample size for calculating p value is small in this case, we will use uncorrected p_value to allow more data to be considered; it is more important to look at the effect size

info = df.iloc[:,:2]
azd = pd.concat([info,azd],axis=1)

Cut through threshold

# set a threshold of 0.05 for p_value
azd2 = azd[azd.p_value<0.05].reset_index(drop=True)
azd2.shape
(1669, 7)
azd2.site_seq.str[7].value_counts().plot.bar();

Visualize fold change values

def plot_value(df,col):
    plt.figure(figsize=(10, 5))
    
    # sort values
    df=df.copy().sort_values(col).reset_index(drop=True) 
    
    plt.fill_between(df.index, df[col], where=(df[col] >= 0), color='blue', alpha=0.3, label='Positive')
    plt.fill_between(df.index, df[col], where=(df[col] <= 0), color='red', alpha=0.3, label='Negative')
    plt.title('Area Plot of Values')
    plt.xlabel('Ranked Index')
    plt.ylabel('Values')
plot_value(azd2,'log2FC')

st_sites = azd2[azd2.site_seq.str[7].str.upper()!='Y']
y_sites = azd2[azd2.site_seq.str[7].str.upper()=='Y']
plot_value(st_sites,'log2FC')
plt.title('S/T sites')
plot_value(y_sites,'log2FC')
plt.title('Y sites')
Text(0.5, 1.0, 'Y sites')

Dataset processing

There are NaNs in site_seq, we will drop them

azd2=azd2.dropna(subset='site_seq')

Some site_seq contains multiple sequences splitted by “;”, we will take the first one.

azd2['site_seq2'] = azd2.site_seq.str.split(';').str[0]

Make sure site sequence length are all consistent

azd2['len'] = azd2.site_seq2.str.len()
azd2.len.value_counts()
len
15    1647
Name: count, dtype: int64
azd2['acceptor'] = azd2.site_seq2.str[7]

Substrate scoring

pspa_out = predict_kinase_df(azd2,seq_col='site_seq2', **Params('PSPA'))
cddm_out = predict_kinase_df(azd2,seq_col='site_seq2', **Params('CDDM'))
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging
100%|██████████| 396/396 [00:01<00:00, 327.83it/s]
input dataframe has a length 1647
Preprocessing
Finish preprocessing
Merging reference
Finish merging

Kinase enrichment

def get_enrichment(score_df, # output df of predict_kinase_df
                    site_df, # df that contains site sequence
                    weight_col, # column that indicates direction (sign) and weights for the site
                    is_Y=False, # whether or not it is Y site or S/T site
                    top_n = 5, # top n kinase to consider for each site
                                  ):

    "Calculate kinase enrichment score for sites with signed log10(p) that pass a threshold(1.3) "

    if is_Y:
        idx_up = site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]>0)]
        idx_dn = site_df.index[(site_df.acceptor.str.upper()=="Y") & (site_df[weight_col]<0)]
    else:
        idx_up = site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]>0)]
        idx_dn = site_df.index[(site_df.acceptor.str.upper()!="Y") & (site_df[weight_col]<0)]

    up_site = site_df.loc[idx_up]
    dn_site = site_df.loc[idx_dn]

    up_score = score_df.loc[idx_up]
    dn_score = score_df.loc[idx_dn]

    def top_kinases(site_row,top_n=5):
        # Sort the row in descending order and get the top n kinases
        top_kinases = site_row.sort_values(ascending=False).head(top_n)

        # Get the counts of the top kinases
        kinase_counts = top_kinases.index.value_counts()

        return kinase_counts

    func = partial(top_kinases,top_n=top_n)

    up_cnt = up_score.apply(func,axis=1)
    dn_cnt = dn_score.apply(func,axis=1)

    df_up_cnt = up_cnt.sum().reset_index(name = 'up_cnt')
    df_dn_cnt = dn_cnt.sum().reset_index(name = 'dn_cnt')

    df_w_up = up_cnt.multiply(abs(up_site[weight_col]),axis=0).sum().reset_index(name = 'up_weighted_cnt')
    df_w_dn = dn_cnt.multiply(abs(dn_site[weight_col]),axis=0).sum().reset_index(name = 'dn_weighted_cnt')

    dfs = [df_up_cnt,df_dn_cnt,df_w_up,df_w_dn]

    result = reduce(lambda left, right: pd.merge(left, right, how='outer'), dfs)

    result = result.fillna(0)
    result['max'] = result.apply(lambda r: r.up_weighted_cnt if r.up_weighted_cnt >= r.dn_weighted_cnt else -r.dn_weighted_cnt,axis=1)
    result['delta'] = result.up_weighted_cnt - result.dn_weighted_cnt
    result = result.rename(columns={'index':'kinase'})

    return result

As we only have 6 y sites, we will do S/T sites kinase enrichment; setis_Y to False

st_cddm = get_enrichment(cddm_out,azd2,'log2FC',is_Y=False,top_n=10)
st_pspa = get_enrichment(pspa_out,azd2,'log2FC',is_Y=False,top_n=10)
st_cddm.head()
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
0 ACVR2A 16.0 2.0 4.321139999999996 0.387275000000001 4.321139999999996 3.933864999999996
1 AKT1 115.0 130.0 37.703091500000035 110.405810000000002 -110.405810000000002 -72.702718499999975
2 AKT2 13.0 78.0 3.840655000000002 75.697464999999994 -75.697464999999994 -71.856809999999996
3 AKT3 79.0 110.0 25.234140000000036 100.077135000000027 -100.077135000000027 -74.842994999999988
4 ALK2 90.0 11.0 21.950408500000012 5.124545000000005 21.950408500000012 16.825863500000008

Delta is the difference between up_weighted_cnt and dn_weighted_cnt vaue, max is the maximum value among the two.

st_cddm.sort_values('delta')
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
3 AKT3 79.0 110.0 25.234140000000036 100.077135000000027 -100.077135000000027 -74.842994999999988
1 AKT1 115.0 130.0 37.703091500000035 110.405810000000002 -110.405810000000002 -72.702718499999975
2 AKT2 13.0 78.0 3.840655000000002 75.697464999999994 -75.697464999999994 -71.856809999999996
159 SGK1 74.0 110.0 24.613551500000032 95.519774999999996 -95.519774999999996 -70.906223499999967
122 P90RSK 166.0 149.0 58.454870500000020 118.607010000000017 -118.607010000000017 -60.152139499999997
... ... ... ... ... ... ... ...
27 CDK2 610.0 70.0 185.843078500000047 38.980064500000012 185.843078500000047 146.863014000000021
26 CDK1 662.0 84.0 202.509342000000004 46.487524500000013 202.509342000000004 156.021817499999997
52 ERK1 666.0 76.0 205.446255000000036 40.366904500000004 205.446255000000036 165.079350500000032
29 CDK4 691.0 83.0 210.494168500000001 43.878864500000013 210.494168500000001 166.615303999999981
53 ERK2 713.0 84.0 217.655180000000030 44.031049499999995 217.655180000000030 173.624130500000035

177 rows × 7 columns

st_cddm.sort_values('max')
kinase up_cnt dn_cnt up_weighted_cnt dn_weighted_cnt max delta
152 PRKX 293.0 157.0 107.897737000000006 124.321542000000051 -124.321542000000051 -16.423805000000044
122 P90RSK 166.0 149.0 58.454870500000020 118.607010000000017 -118.607010000000017 -60.152139499999997
1 AKT1 115.0 130.0 37.703091500000035 110.405810000000002 -110.405810000000002 -72.702718499999975
3 AKT3 79.0 110.0 25.234140000000036 100.077135000000027 -100.077135000000027 -74.842994999999988
159 SGK1 74.0 110.0 24.613551500000032 95.519774999999996 -95.519774999999996 -70.906223499999967
... ... ... ... ... ... ... ...
27 CDK2 610.0 70.0 185.843078500000047 38.980064500000012 185.843078500000047 146.863014000000021
26 CDK1 662.0 84.0 202.509342000000004 46.487524500000013 202.509342000000004 156.021817499999997
52 ERK1 666.0 76.0 205.446255000000036 40.366904500000004 205.446255000000036 165.079350500000032
29 CDK4 691.0 83.0 210.494168500000001 43.878864500000013 210.494168500000001 166.615303999999981
53 ERK2 713.0 84.0 217.655180000000030 44.031049499999995 217.655180000000030 173.624130500000035

177 rows × 7 columns

Visualize in bar graph

def get_bar_data(df, score_col, top_n = 10, plot=True):
    
    data = df.sort_values(score_col)[['kinase',score_col]]

    data.columns = ['Kinase','Score']
    
    data = pd.concat([data.head(top_n),data.tail(top_n)]).set_index('Kinase')
    
    if plot:
        data.plot.barh(legend=False)
        plt.title('Kinase Enrichment')
        plt.xlabel(f'{score_col} score')
        plt.ylabel('')
    
    return data
bar_cddm = get_bar_data(st_cddm,'delta')

bar_pspa = get_bar_data(st_pspa,'delta')