from katlas.core import *
Entropy
# Load the normalized PSPA table and count index labels that occur more than once
# (keep=False marks every member of a duplicated group).
pspa = Data.get_pspa_all_norm()
pspa.index.duplicated(keep=False).sum()
np.int64(0)
# Drop every column that contains at least one missing value.
pspa = pspa.dropna(axis=1)
entropy??
Signature: entropy(pssm_df, return_min=False, exclude_zero=False, contain_sty=True)
Source:
def entropy(pssm_df,# a dataframe of pssm with index as aa and column as position
            return_min=False, # return min entropy as a single value or return all entropy as a series
            exclude_zero=False, # exclude the column of 0 (center position) in the entropy calculation
            contain_sty=True, # keep only s,t,y values (last three) in center 0 position
            ):
    """Calculate the Shannon entropy (bits) of every position of a PSSM surrounding 0.

    Each column is renormalized to sum to 1 before the entropy is computed.
    The input frame is not modified. Returns a Series indexed by integer
    position, or its minimum as a scalar when `return_min` is True.
    """
    pssm_df = pssm_df.copy()
    pssm_df.columns = pssm_df.columns.astype(int)

    if 0 in pssm_df.columns:
        if exclude_zero:
            pssm_df = pssm_df.drop(columns=[0])
        elif contain_sty:
            # Only mask when column 0 is kept: assigning to a dropped column
            # would silently re-create it as an all-zero/NaN column.
            # Relies on the last three index rows being s, t, y.
            pssm_df.loc[pssm_df.index[:-3], 0] = 0

    probs = pssm_df / pssm_df.sum()

    # Use the convention 0 * log2(0) = 0 instead of adding an epsilon inside
    # the log, which makes the entropy of a deterministic column slightly
    # negative. Non-positive/NaN cells are swapped for 1.0 so their log is 0.
    safe = probs.where(probs > 0, 1.0)
    # "+ 0.0" turns a possible -0.0 into 0.0.
    per_position = -(probs * np.log2(safe)).sum(axis=0) + 0.0

    return per_position.min() if return_min else per_position
File: f:\git\katlas\katlas\core.py
Type: function
entropy_flat??
Signature:
entropy_flat(
flat_pssm: pandas.core.series.Series,
return_min=False,
exclude_zero=False,
contain_sty=True,
)
Source:
def entropy_flat(flat_pssm:pd.Series,return_min=False,exclude_zero=False,contain_sty=True):
    "Rebuild a PSSM from its flat Series form with `recover_pssm`, then return `entropy` of it"
    # Forward the caller's options unchanged to `entropy`.
    options = dict(
        return_min=return_min,
        exclude_zero=exclude_zero,
        contain_sty=contain_sty,
    )
    return entropy(recover_pssm(flat_pssm), **options)
File: f:\git\katlas\katlas\core.py
Type: function
# Compute per-position entropy and information content for every kinase row.
entropies = []
ICs = []
for i,r in pspa.iterrows():
    entropies.append(entropy_flat(r,return_min=False).to_dict())
    ICs.append(get_IC_flat(r).to_dict())

# One row per kinase (same index as pspa), one column per position.
entropy_df = pd.DataFrame(entropies,index=pspa.index)
IC_df = pd.DataFrame(ICs,index=pspa.index)
entropy_df
-5 | -4 | -3 | -2 | -1 | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|---|---|---|---|
kinase | ||||||||||
AAK1 | 4.238872 | 4.477492 | 4.419067 | 4.334337 | 4.285531 | 4.430518e-01 | 2.367128 | 4.484580 | 4.459749 | 4.448665 |
ACVR2A | 4.492276 | 4.483099 | 4.422408 | 3.851257 | 4.450203 | 9.999489e-01 | 4.101970 | 4.509378 | 4.502156 | 4.509964 |
ACVR2B | 4.480671 | 4.478871 | 4.409857 | 3.939154 | 4.426689 | 9.996887e-01 | 4.074009 | 4.491815 | 4.508044 | 4.505800 |
AKT1 | 4.427160 | 4.402988 | 3.143867 | 3.590452 | 4.374148 | 9.659053e-01 | 4.334536 | 4.429082 | 4.442455 | 4.412808 |
AKT2 | 4.427318 | 4.415247 | 2.970578 | 3.821267 | 4.416441 | 9.566125e-01 | 4.467609 | 4.463490 | 4.452789 | 4.435681 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
KDR | 4.491261 | 4.472633 | 4.457427 | 4.448105 | 4.381677 | -1.442695e-09 | 4.390681 | 4.443358 | 4.152800 | 4.462793 |
FLT4 | 4.511274 | 4.501896 | 4.500559 | 4.504176 | 4.297943 | -1.442695e-09 | 4.290937 | 4.344806 | 4.154417 | 4.498858 |
WEE1_TYR | 4.507984 | 4.495537 | 4.489914 | 4.470009 | 4.089527 | -1.442695e-09 | 4.284853 | 4.403815 | 4.301392 | 4.426540 |
YES1 | 4.497127 | 4.491665 | 4.442265 | 4.465032 | 4.274232 | -1.442695e-09 | 4.350331 | 4.485518 | 4.275385 | 4.492019 |
ZAP70 | 4.355980 | 4.260120 | 4.111361 | 4.128756 | 3.473012 | -1.442695e-09 | 3.634941 | 4.358286 | 4.286572 | 4.474739 |
396 rows × 10 columns
# columns surrounding 0: every column whose label does not start with '0'
cols = pspa.columns[~pspa.columns.str.startswith('0')]
# largest value per kinase across those columns, in ascending order
pspa[cols].max(axis=1).sort_values()
kinase
VRK2 0.0941
ROS1 0.0983
TYK2 0.0995
LIMK1_TYR 0.1027
RET 0.1027
...
YANK2 3.7589
GSK3B 3.9147
YANK3 4.2045
CK1A 5.8890
CK1G3 8.4920
Length: 396, dtype: float64
def plot_dots(df,ylabel='bits',figsize=(5,3)):
    """Jittered scatter plot with one strip of dots per column of `df`.

    Each column is drawn at its integer position on the x axis with Gaussian
    horizontal jitter (sd 0.1) so overlapping points stay visible.

    NOTE: the column labels of `df` are converted to strings in place, so the
    caller's frame has string column labels after this call.
    """
    df.columns = df.columns.astype(str)
    plt.figure(figsize=figsize)

    for i, col in enumerate(df.columns):
        # One x coordinate per row, scattered around the column's slot i.
        x_jitter = np.random.normal(loc=i, scale=0.1, size=len(df))
        plt.scatter(x_jitter, df[col], alpha=0.7, s=5,edgecolors='none')

    plt.xticks(range(len(df.columns)), df.columns)
    plt.xlabel("Position")
    plt.ylabel(ylabel)
def plot_violin(df, ylabel='bits', figsize=(5, 3)):
    """Violin plot per column of `df`, with the individual values overlaid as a strip plot.

    `df` is melted to long form ('Position', 'Value'); the input frame itself
    is left unchanged.
    """
    df_melted = df.melt(var_name='Position', value_name='Value')
    plt.figure(figsize=figsize)

    # Violin outline without inner marks, then the raw points on top.
    sns.violinplot(x='Position', y='Value', data=df_melted, inner=None, density_norm='width')
    sns.stripplot(x='Position', y='Value', data=df_melted, color='k', size=2, jitter=True, alpha=0.5)

    plt.xlabel('Position')
    plt.ylabel(ylabel)
    plt.tight_layout()
plot_violin(entropy_df,ylabel='Entropy (bits)')
plt.title('Entropy per Position');

plot_violin(IC_df,ylabel='Information content (bits)')
plt.title('Information Content per Position');

plot_dots(IC_df,ylabel='Information Content (bits)')
plt.title('Information Content per Position');

plot_dots(entropy_df,ylabel='Entropy (bits)')
plt.title('Entropy per Position');
entropy_df.columns
Index(['-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4'], dtype='object')
# Ignore the centre position, then list the 20 kinases with the lowest
# minimum per-position entropy.
entropy_df2 = entropy_df.drop(columns=['0']).copy()
entropy_df2.min(axis=1).sort_values().head(20)
kinase
CK1G3 1.674890
CK1A 1.825040
YANK3 1.943948
YANK2 2.017911
P38G 2.053687
P38D 2.060065
GSK3B 2.067587
GSK3A 2.108316
CDK17 2.123790
CK1G2 2.148218
CDK3 2.198466
SBK 2.216507
CK1A2 2.221651
ERK7 2.223145
CK1D 2.236495
CDK16 2.255882
AAK1 2.367128
FAM20C 2.400912
CDK18 2.435806
CDK4 2.452885
dtype: float64