Entropy

from katlas.core import *
# Load the normalized PSPA table through the project's Data helper.
# NOTE(review): presumably one row per kinase holding a flattened PSSM — confirm against Data.get_pspa_all_norm.
pspa = Data.get_pspa_all_norm()
# Sanity check: count index labels that occur more than once
# (keep=False marks every member of a duplicated group, not just the repeats).
pspa.index.duplicated(keep=False).sum()
np.int64(0)
# Drop every column that contains at least one missing value.
pspa = pspa.dropna(axis=1)
entropy??
Signature: entropy(pssm_df, return_min=False, exclude_zero=False, contain_sty=True)
Source:   
def entropy(pssm_df,# a dataframe of pssm with index as aa and column as position
            return_min=False, # return min entropy as a single value or return all entropy as a series
            exclude_zero=False, # exclude the column of 0 (center position) in the entropy calculation
            contain_sty=True, # keep only s,t,y values (last three) in center 0 position
            ): 
    "Calculate Shannon entropy (bits) per position of a PSSM surrounding 0"
    # Work on a copy so the caller's dataframe is never modified.
    pssm_df = pssm_df.copy()
    pssm_df.columns= pssm_df.columns.astype(int)
    if 0 in pssm_df.columns:
        if exclude_zero:
            pssm_df = pssm_df.drop(columns=[0])
        elif contain_sty:
            # Only mask the center column when it is kept: assigning to column 0
            # after it has been dropped would silently re-create it (as zeros/NaN)
            # and report a spurious entropy of 0 for the excluded position.
            pssm_df.loc[pssm_df.index[:-3], 0] = 0
    # Normalize each position (column) so that its values sum to 1.
    pssm_df = pssm_df/pssm_df.sum()
    # The 1e-9 offset avoids log2(0); as a consequence a fully specific position
    # evaluates to a tiny negative number (about -1.4e-9) instead of exactly 0.
    per_position = -(pssm_df * np.log2(pssm_df + 1e-9)).sum(axis=0)
    return per_position.min() if return_min else per_position
File:      f:\git\katlas\katlas\core.py
Type:      function
entropy_flat??
Signature:
entropy_flat(
    flat_pssm: pandas.core.series.Series,
    return_min=False,
    exclude_zero=False,
    contain_sty=True,
)
Source:   
def entropy_flat(flat_pssm:pd.Series,return_min=False,exclude_zero=False,contain_sty=True): 
    "Rebuild a PSSM from its flat form with `recover_pssm`, then return what `entropy` computes for it"
    # Forward the options unchanged; this wrapper only adds the flat -> matrix step.
    options = dict(return_min=return_min,
                   exclude_zero=exclude_zero,
                   contain_sty=contain_sty)
    return entropy(recover_pssm(flat_pssm), **options)
File:      f:\git\katlas\katlas\core.py
Type:      function
# Collect, for every row of pspa, the per-position entropy and information content.
entropies, ICs = [], []
for _name, flat_row in pspa.iterrows():
    row_entropy = entropy_flat(flat_row, return_min=False)
    entropies.append(row_entropy.to_dict())
    row_ic = get_IC_flat(flat_row)
    ICs.append(row_ic.to_dict())

# One row per pspa row, one column per position.
entropy_df = pd.DataFrame(entropies, index=pspa.index)
IC_df = pd.DataFrame(ICs, index=pspa.index)
entropy_df
-5 -4 -3 -2 -1 0 1 2 3 4
kinase
AAK1 4.238872 4.477492 4.419067 4.334337 4.285531 4.430518e-01 2.367128 4.484580 4.459749 4.448665
ACVR2A 4.492276 4.483099 4.422408 3.851257 4.450203 9.999489e-01 4.101970 4.509378 4.502156 4.509964
ACVR2B 4.480671 4.478871 4.409857 3.939154 4.426689 9.996887e-01 4.074009 4.491815 4.508044 4.505800
AKT1 4.427160 4.402988 3.143867 3.590452 4.374148 9.659053e-01 4.334536 4.429082 4.442455 4.412808
AKT2 4.427318 4.415247 2.970578 3.821267 4.416441 9.566125e-01 4.467609 4.463490 4.452789 4.435681
... ... ... ... ... ... ... ... ... ... ...
KDR 4.491261 4.472633 4.457427 4.448105 4.381677 -1.442695e-09 4.390681 4.443358 4.152800 4.462793
FLT4 4.511274 4.501896 4.500559 4.504176 4.297943 -1.442695e-09 4.290937 4.344806 4.154417 4.498858
WEE1_TYR 4.507984 4.495537 4.489914 4.470009 4.089527 -1.442695e-09 4.284853 4.403815 4.301392 4.426540
YES1 4.497127 4.491665 4.442265 4.465032 4.274232 -1.442695e-09 4.350331 4.485518 4.275385 4.492019
ZAP70 4.355980 4.260120 4.111361 4.128756 3.473012 -1.442695e-09 3.634941 4.358286 4.286572 4.474739

396 rows × 10 columns

# Flanking positions only: every column whose label does not begin with '0'
cols = pspa.columns[~pspa.columns.str.startswith('0')]
# Largest flanking value per row, in ascending order
pspa.loc[:, cols].max(axis=1).sort_values()
kinase
VRK2         0.0941
ROS1         0.0983
TYK2         0.0995
LIMK1_TYR    0.1027
RET          0.1027
              ...  
YANK2        3.7589
GSK3B        3.9147
YANK3        4.2045
CK1A         5.8890
CK1G3        8.4920
Length: 396, dtype: float64
def plot_dots(df,ylabel='bits',figsize=(5,3)):
    "Jittered scatter of every column of `df`; note that `df`'s column labels are converted to str in place"
    # In-place relabel: the caller's dataframe keeps the string labels afterwards.
    df.columns = df.columns.astype(str)
    plt.figure(figsize=figsize)

    n_points = len(df)
    labels = list(df.columns)
    for slot, label in enumerate(labels):
        # Spread the points horizontally around the column's integer slot.
        xs = np.random.normal(loc=slot, scale=0.1, size=n_points)
        plt.scatter(xs, df[label], alpha=0.7, s=5, edgecolors='none')

    plt.xticks(range(len(labels)), labels)
    plt.xlabel("Position")
    plt.ylabel(ylabel)
def plot_violin(df, ylabel='bits', figsize=(5, 3)):
    "Draw one violin per column of `df` with the individual values overlaid as dots"
    # Long format: one (Position, Value) pair per cell of df.
    long_df = df.melt(var_name='Position', value_name='Value')
    plt.figure(figsize=figsize)

    shared = dict(x='Position', y='Value', data=long_df)
    # Distribution outline first, raw observations on top.
    sns.violinplot(inner=None, density_norm='width', **shared)
    sns.stripplot(color='k', size=2, jitter=True, alpha=0.5, **shared)

    plt.xlabel('Position')
    plt.ylabel(ylabel)
    plt.tight_layout()
# Violin plots first: plot_violin only melts the frame and leaves it unchanged.
plot_violin(entropy_df,ylabel='Entropy (bits)')
plt.title('Entropy per Position');

plot_violin(IC_df,ylabel='Information content (bits)')
plt.title('Information Content per Position');
# Dot plots: plot_dots converts the passed frame's column labels to str in place,
# so IC_df and entropy_df carry string column labels after these calls.
plot_dots(IC_df,ylabel='Information Content (bits)')
plt.title('Information Content per Position');

plot_dots(entropy_df,ylabel='Entropy (bits)')
plt.title('Entropy per Position');
# Inspect the column labels (they are strings at this point, after the plot_dots call on entropy_df).
entropy_df.columns
Index(['-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4'], dtype='object')
# Remove the center position (label '0'), then list the 20 rows with the lowest
# minimum entropy across the remaining positions.
entropy_df2 = entropy_df.drop('0', axis=1).copy()
entropy_df2.min(axis=1).sort_values().head(20)
kinase
CK1G3     1.674890
CK1A      1.825040
YANK3     1.943948
YANK2     2.017911
P38G      2.053687
P38D      2.060065
GSK3B     2.067587
GSK3A     2.108316
CDK17     2.123790
CK1G2     2.148218
CDK3      2.198466
SBK       2.216507
CK1A2     2.221651
ERK7      2.223145
CK1D      2.236495
CDK16     2.255882
AAK1      2.367128
FAM20C    2.400912
CDK18     2.435806
CDK4      2.452885
dtype: float64