from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from katlas.core import *
from katlas.plot import *
import pandas as pd, numpy as np
Kmeans motifs
Kmeans
Data
# human = pd.read_parquet('raw/human_phosphoproteome.parquet')
# df_grouped = pd.read_parquet('raw/combine_source_grouped.parquet')
= Data.get_human_site()
human = Data.get_ks_dataset() df_grouped
= pd.concat([human,df_grouped]) all_site
sum() all_site.sub_site.isna().
0
= all_site.drop_duplicates('sub_site') all_site
all_site.shape
(131843, 19)
# all_site = all_site[['sub_site','site_seq']].drop_duplicates('sub_site')
One-hot encode
= onehot_encode(all_site['site_seq']) onehot
CPU times: user 4.16 s, sys: 768 ms, total: 4.93 s
Wall time: 4.93 s
onehot.head()
-20A | -20C | -20D | -20E | -20F | -20G | -20H | -20I | -20K | -20L | ... | 20R | 20S | 20T | 20V | 20W | 20Y | 20_ | 20s | 20t | 20y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 967 columns
Elbow method
all_site.shape
(131843, 19)
set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.'notebook')
sns.set_context("ticks") sns.set_style(
get_clusters_elbow(onehot)
CPU times: user 23min 20s, sys: 46.3 s, total: 24min 6s
Wall time: 9min 50s
Kmeans
If using RAPIDS
# # pip install --extra-index-url=https://pypi.nvidia.com \"cudf-cu12==25.2.*\" \"cuml-cu12==25.2.*\"
# %load_ext cudf.pandas
# import numpy as np, pandas as pd
# from cuml import KMeans
# import matplotlib.pyplot as plt
# import seaborn as sns
# from tqdm import tqdm
# from katlas.core import *
# from katlas.plot import *
def kmeans(onehot,n=2,seed=42):
= KMeans(n_clusters=n, random_state=seed,n_init='auto')
kmeans return kmeans.fit_predict(onehot)
=[50,150,300]
ncluster=[42,2025,28] seeds
'test_id']=1 all_site[
'test_id') get_cluster_pssms(all_site,
100%|██████████| 1/1 [00:03<00:00, 3.73s/it]
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.07984 | 0.06669 | 0.07291 | 0.01333 | 0.06488 | 0.04065 | 0.0505 | 0.0346 | 0.0812 | 0.01958 | ... | 0.02192 | 0.06782 | 0.06339 | 0.04731 | 0.0334 | 0.05239 | 0.0798 | 0.04184 | 0.01385 | 0.00508 |
1 rows × 943 columns
=[]
pssmsfor seed in seeds:
print('seed',seed)
for n in ncluster:
= f'cluster{n}_seed{seed}'
colname print(colname)
= kmeans(onehot,n=n,seed=seed)
all_site[colname] = get_cluster_pssms(all_site,colname,blank_thr=0.5)
pssm_df =colname+'_'+pssm_df.index.astype(str)
pssm_df.index pssms.append(pssm_df)
seed 42
cluster50_seed42
100%|██████████| 50/50 [00:04<00:00, 11.50it/s]
cluster150_seed42
100%|██████████| 150/150 [00:06<00:00, 23.67it/s]
cluster300_seed42
100%|██████████| 300/300 [00:09<00:00, 32.17it/s]
seed 2025
cluster50_seed2025
100%|██████████| 50/50 [00:04<00:00, 12.48it/s]
cluster150_seed2025
100%|██████████| 150/150 [00:06<00:00, 23.80it/s]
cluster300_seed2025
100%|██████████| 300/300 [00:09<00:00, 32.20it/s]
seed 28
cluster50_seed28
100%|██████████| 50/50 [00:04<00:00, 12.24it/s]
cluster150_seed28
100%|██████████| 150/150 [00:06<00:00, 23.97it/s]
cluster300_seed28
100%|██████████| 300/300 [00:09<00:00, 30.65it/s]
= pd.concat(pssms,axis=0) pssms
Save:
# pssms.to_parquet('raw/kmeans.parquet')
# all_site.to_parquet('raw/kmeans_site.parquet',index=False)
= pd.read_parquet('raw/kmeans.parquet')
pssms
=pd.read_parquet('raw/kmeans_site.parquet') all_site
all_site.head()
substrate_uniprot | substrate_genes | site | source | AM_pathogenicity | substrate_sequence | substrate_species | sub_site | substrate_phosphoseq | position | ... | test_id | cluster50_seed42 | cluster150_seed42 | cluster300_seed42 | cluster50_seed2025 | cluster150_seed2025 | cluster300_seed2025 | cluster50_seed28 | cluster150_seed28 | cluster300_seed28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0A024R4G9 | C19orf48 MGC13170 hCG_2008493 | S20 | psp | NaN | MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH... | Homo sapiens (Human) | A0A024R4G9_S20 | MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH... | 20 | ... | 1 | 33 | 126 | 187 | 48 | 136 | 119 | 3 | 45 | 261 |
1 | A0A075B6Q4 | None | S24 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S24 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 24 | ... | 1 | 41 | 28 | 288 | 42 | 117 | 28 | 2 | 149 | 149 |
2 | A0A075B6Q4 | None | S35 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S35 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 35 | ... | 1 | 45 | 102 | 100 | 30 | 122 | 122 | 6 | 149 | 149 |
3 | A0A075B6Q4 | None | S57 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S57 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 57 | ... | 1 | 30 | 81 | 81 | 30 | 122 | 58 | 40 | 110 | 252 |
4 | A0A075B6Q4 | None | S68 | ochoa | NaN | MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT... | Homo sapiens (Human) | A0A075B6Q4_S68 | MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT... | 68 | ... | 1 | 11 | 75 | 68 | 10 | 48 | 164 | 39 | 97 | 137 |
5 rows × 29 columns
Get JS distance
Get 1D distance
def compute_distance_matrix(df,func):
"Compute 1D distance matrix for each row in a dataframe given a distance function "
= len(df)
n = []
dist for i in tqdm(range(n)):
for j in range(i+1, n):
= func(df.iloc[i], df.iloc[j])
d
dist.append(d)return np.array(dist)
from tqdm.contrib.concurrent import process_map
from functools import partial
import numpy as np
import pandas as pd
def compute_distance(pair, df, func):
= pair
i, j return func(df.iloc[i], df.iloc[j])
def compute_distance_matrix_parallel(df, func, max_workers=4, chunksize=100):
= len(data)
n = [(i, j) for i in range(n) for j in range(i + 1, n)]
index_pairs
# Create a wrapper for partial application
= partial(compute_distance, data=df, func=func)
bound_worker
= process_map(bound_worker, index_pairs, max_workers=max_workers, chunksize=chunksize)
dist return np.array(dist)
from fastcore.meta import delegates
from functools import partial
from tqdm.contrib.concurrent import process_map
def compute_distance_matrix_parallel(df, func, max_workers=4, chunksize=100):
= len(df)
n = [(i, j) for i in range(n) for j in range(i + 1, n)]
index_pairs
= partial(compute_distance, df=df, func=func)
bound_worker
= process_map(bound_worker, index_pairs, max_workers=max_workers, chunksize=chunksize)
dist return np.array(dist)
@delegates(compute_distance_matrix_parallel)
def compute_JS_matrix_parallel(df, func=js_divergence_flat, **kwargs):
"Compute 1D distance matrix using JS divergence."
return compute_distance_matrix_parallel(df, func=func, **kwargs)
str.contains('cluster0')] pssms[pssms.index.
-20P | -20G | -20A | -20C | -20S | -20T | -20V | -20I | -20L | -20M | ... | 20H | 20K | 20R | 20Q | 20N | 20D | 20E | 20pS | 20pT | 20pY |
---|
0 rows × 943 columns
= compute_JS_matrix_parallel(pssms,max_workers=32) distances
distances
array([0.02719099, 0.02964083, 0.03350885, ..., 0.23451136, 0.32064881,
0.29473009])
Hierarchical clustering
from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
= linkage(distances, method='ward') Z
def plot_dendrogram(Z,output='dendrogram.pdf',color_thr=0.03,**kwargs):
=(len(Z)+1)//8
length
=(5,length))
plt.figure(figsize='left', leaf_font_size=7, color_threshold=color_thr,**kwargs)
dendrogram(Z, orientation'Hierarchical Clustering Dendrogram')
plt.title('Distance')
plt.ylabel(='tight')
plt.savefig(output, bbox_inches plt.close()
= [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()] labels
5] labels[:
['cluster50_seed42_28: ....................t*....................',
'cluster50_seed42_2: ....................y*....................',
'cluster50_seed42_31: ....................y*....................',
'cluster50_seed42_10: ..................sPt*.s..................',
'cluster50_seed42_9: ....................s*....................']
# plot_dendrogram(Z,output='raw/dendrogram1.pdf',labels=labels)
Visualize and find out the color threshold that works.
After determine the color threshold, use it to cut the tree.
Visualize some logos
'cluster300_seed28_217','cluster50_seed42_40') plot_logos(pssms,
Cut trees
= fcluster(Z, t=0.03, criterion='distance')
labels # pssm_df['cluster'] = labels
Z.shape
(1460, 4)
len(labels)
1461
10] # always start from 1 np.unique(labels)[:
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=int32)
Expand the cluster into a single column
= ['sub_site', 'site_seq']
id_vars = [col for col in all_site.columns if col.startswith('cluster')] value_vars
= pd.melt(all_site, id_vars=id_vars, value_vars=value_vars, var_name='cluster_info', value_name='cluster') all_site_long
'cluster_id']=all_site_long['cluster_info'] + '_' + all_site_long['cluster'].astype(str) all_site_long[
all_site_long.head()
sub_site | site_seq | cluster_info | cluster | cluster_id | |
---|---|---|---|---|---|
0 | A0A024R4G9_S20 | _MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR | cluster50_seed42 | 33 | cluster50_seed42_33 |
1 | A0A075B6Q4_S24 | QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG | cluster50_seed42 | 41 | cluster50_seed42_41 |
2 | A0A075B6Q4_S35 | EDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKTHRAIADHLF | cluster50_seed42 | 45 | cluster50_seed42_45 |
3 | A0A075B6Q4_S57 | EDCMSVPGKTHRAIADHLFWsEETKSRFTEYsMTssVMRRN | cluster50_seed42 | 30 | cluster50_seed42_30 |
4 | A0A075B6Q4_S68 | RAIADHLFWsEETKSRFTEYsMTssVMRRNEQLTLHDERFE | cluster50_seed42 | 11 | cluster50_seed42_11 |
# all_site_long.to_parquet('raw/kmeans_site_long.parquet',index=False)
Map new cluster
len(labels)
1461
= pd.Series(labels,index=pssms.index) cluster_map
cluster_map.sort_values()
cluster150_seed2025_55 1
cluster300_seed28_117 1
cluster150_seed28_117 1
cluster300_seed2025_293 1
cluster300_seed28_28 2
...
cluster150_seed2025_67 747
cluster300_seed2025_9 747
cluster300_seed28_115 747
cluster300_seed2025_278 748
cluster300_seed42_44 748
Length: 1461, dtype: int32
For those unmapped cluster_ID, we assign them zero value:
# not all cluster_id have a corresponding for new cluster ID, as they could be filtered out
'cluster_new'] = all_site_long.cluster_id.map(lambda x: cluster_map.get(x, 0)) #0 is unmapped all_site_long[
# all_site_long.to_parquet('raw/kmeans_site_long_cluster_new.parquet',index=False)
Get new cluster motifs
get_cluster_pssms?
Signature: get_cluster_pssms( df, cluster_col, seq_col='site_seq', id_col='sub_site', count_thr=10, valid_thr=None, plot=False, ) Docstring: Extract motifs from clusters in a dataframe File: /tmp/ipykernel_2927/3575038003.py Type: function
get_cluster_pssms?
Signature: get_cluster_pssms( df, cluster_col, seq_col='site_seq', id_col='sub_site', count_thr=10, valid_thr=None, plot=False, ) Docstring: Extract motifs from clusters in a dataframe File: /tmp/ipykernel_2927/1868936855.py Type: function
= get_cluster_pssms(all_site_long,
pssms2 'cluster_new',
=10,
count_thr=0.3) blank_thr
100%|██████████| 749/749 [00:22<00:00, 33.07it/s]
pssms2.shape
(749, 943)
= pssms2.drop(index=0) # as 0 represents unmapped pssms2
'out/all_site_pssms.parquet') pssms2.sort_index().to_parquet(
= compute_JS_matrix_parallel(pssms2)
distances = linkage(distances, method='ward') Z
all_site_long.cluster_new
0 447
1 349
2 277
3 396
4 365
...
1186582 664
1186583 541
1186584 660
1186585 679
1186586 676
Name: cluster_new, Length: 1186587, dtype: int64
= all_site_long.drop_duplicates(subset=["cluster_new","sub_site"])["cluster_new"].value_counts() count_map
count_map
cluster_new
548 10755
676 10570
689 9921
577 8877
655 8153
...
4 96
260 91
18 82
724 73
261 67
Name: count, Length: 749, dtype: int64
def pssm_to_seq(pssm_df,
=0.4, # threshold of probability to show in sequence
thr=True, # if true, zero out non-last three values in position 0 (keep only s,t,y values at center)
clean_center
):"Represent PSSM in string sequence of amino acids"
= pssm_df.copy()
pssm_df if clean_center:
-3], 0] = 0 # keep only s,t,y in center 0 position
pssm_df.loc[pssm_df.index[:
= pssm_df.index.map(lambda x: x.replace('pS', 's').replace('pT', 't').replace('pY', 'y'))
pssm_df.index
= []
consensus for i, col in enumerate(pssm_df.columns):
# consider the case where sum for the position is 0
= pssm_df[col]
column_vals if column_vals.sum() == 0:
= '_'
symbol else:
= column_vals.nlargest(3)
top = [aa for aa, prob in zip(top.index, top.values) if prob > thr]
passing
if not passing:
= '.'
symbol elif len(passing) == 1:
= passing[0]
symbol else:
= f"[{'/'.join(passing)}]"
symbol
if col == 0: # center position
if symbol.startswith('['):
= symbol[:-1] + ']*'
symbol else:
+= '*'
symbol
consensus.append(symbol)
return ''.join(consensus)
=[str(i)+f' (n={count_map[i]:,})' + ': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms2.iterrows()] labels
4] labels[:
['548 (n=10,755): ....................s*....................',
'676 (n=10,570): ....................y*....................',
'689 (n=9,921): ....................t*....................',
'577 (n=8,877): ....................s*P...................']
def plot_dendrogram(Z,output='dendrogram.pdf',color_thr=0.03,**kwargs):
=(len(Z)+1)//7
length
=(5,length))
plt.figure(figsize='left', leaf_font_size=7, color_threshold=color_thr,**kwargs)
dendrogram(Z, orientation'Hierarchical Clustering Dendrogram')
plt.title('Distance')
plt.ylabel(='tight')
plt.savefig(output, bbox_inches plt.close()
='out/dendrogram_label2.pdf',labels=labels,color_thr=0.07) plot_dendrogram(Z,output
Onehot of cluster number
'sub_site_seq'] = all_site_long['sub_site']+'_'+all_site_long['site_seq'] all_site_long[
= pd.crosstab(all_site_long['sub_site_seq'], all_site_long['cluster_new']) all_site_onehot
# greater than 0 to be True and convert to int
= all_site_onehot.gt(0).astype(int) all_site_onehot
max() all_site_onehot.
cluster_new
1 1
2 1
3 1
4 1
5 1
..
744 1
745 1
746 1
747 1
748 1
Length: 748, dtype: int64
# remove 0 as it is unassigned for cut tree
= all_site_onehot.drop(columns=0) all_site_onehot
all_site_onehot.columns
Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
...
739, 740, 741, 742, 743, 744, 745, 746, 747, 748],
dtype='int64', name='cluster_new', length=748)
sum().sort_values() all_site_onehot.
cluster_new
261 67
724 73
18 82
260 91
4 96
...
655 8153
577 8877
689 9921
676 10570
548 10755
Length: 748, dtype: int64
# all_site_onehot.to_parquet('out/all_site_cluster_onehot.parquet')
Special motifs
def plot_logos(pssms_df,*idxs):
"Plot logos of a dataframe with flattened PSSMs with index ad IDs."
for idx in idxs:
= recover_pssm(pssms_df.loc[idx])
pssm =f'Motif {idx}',figsize=(14,1))
plot_logo(pssm,title
plt.show() plt.close()
Customized
121) plot_logos(pssms2,
224) plot_logos(pssms2,
Most common
= all_site_onehot.sum().sort_values(ascending=False).head(20).index idxs
*idxs) plot_logos(pssms2,
Entropy per position
= pssms2.apply(lambda r: entropy_flat(r),axis=1) pssms2_entropy
# remove 0 and focus on those neighboring residues
= pssms2_entropy.drop(columns=0) pssms2_entropy
=pssms2_entropy.min(1).sort_values().head(10).index idxs
Motif with lowest entropies:
*idxs) plot_logos(pssms2,
Low median entropy
Motifs with low median entropies:
=pssms2_entropy.median(1).sort_values().head(10).index idxs
The first one is mostly Zinc finger protein
*idxs) plot_logos(pssms2,
=pssms2_entropy.sum(1).sort_values().head(10).index idxs
=(pssms2==0).sum(1).sort_values(ascending=False).index idxs
C-terminal motifs
732,742,729) plot_logos(pssms2,
= pssms2.apply(lambda r: (recover_pssm(r).loc[:,1:].sum()==0).sum() , axis=1) zeros_right
= zeros_right.sort_values(ascending=False).head(10).index idxs
*idxs) plot_logos(pssms2,
N-Terminal motifs:
= pssms2.apply(lambda r: (recover_pssm(r).sum()==0).sum() , axis=1) zeros
= zeros.sort_values(ascending=False).head(10).index idxs
*idxs) plot_logos(pssms2,
8,19,22) plot_logos(pssms2,
Umap
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
= pd.DataFrame(all_site_pivot.index) annot
= annot.merge(all_site[['sub_site','site','site_seq','substrate_genes']]) annot
= pd.get_dummies(annot['site'].str[0]).astype(int) sty
= sty.set_index(all_site_pivot.index) sty
= pd.concat([all_site_pivot,sty],axis=1) all_site_pivot_sty
all_site_pivot_sty.head()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 740 | 741 | 742 | 743 | 744 | 745 | 746 | S | T | Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sub_site | |||||||||||||||||||||
A0A024R4G9_S20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
A0A075B6Q4_S24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
A0A075B6Q4_S35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
A0A075B6Q4_S57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
A0A075B6Q4_S68 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 749 columns
= umap.UMAP(
reducer =15,
n_neighbors=2,
n_components='jaccard',
metric# min_dist=0.1,
# random_state=42,
# verbose=True,
)= reducer.fit_transform(all_site_pivot_sty) embedding
f:\git\katlas\.venv\lib\site-packages\sklearn\utils\deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
warnings.warn(
f:\git\katlas\.venv\lib\site-packages\logomaker/..\umap\umap_.py:1887: UserWarning: gradient function is not yet implemented for jaccard distance metric; inverse_transform will be unavailable
warn(
= pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2']) embedding_df
=embedding_df, x='UMAP1', y='UMAP2', alpha=0.5,
sns.scatterplot(data=0.1,
s='none',hue=annot['site'].str[0]) edgecolor
<Axes: xlabel='UMAP1', ylabel='UMAP2'>
'site_seq'].str.len()//2 annot[
0 20
1 20
2 20
3 20
4 20
..
131838 20
131839 20
131840 20
131841 20
131842 20
Name: site_seq, Length: 131843, dtype: int64
=embedding_df, x='UMAP1', y='UMAP2', alpha=0.5,
sns.scatterplot(data=0.1,
s='none',hue=annot['site_seq'].str[21]) edgecolor
<Axes: xlabel='UMAP1', ylabel='UMAP2'>
Entropy
from katlas.core import *
= Data.get_pspa_all_norm() pspa
=False).sum() pspa.index.duplicated(keep
np.int64(0)
= pspa.dropna(axis=1) pspa
entropy??
Signature: entropy(pssm_df, return_min=False, exclude_zero=False, contain_sty=True)
Source:
def entropy(pssm_df,# a dataframe of pssm wtih index as aa and column as position
return_min=False, # return min entropy as a single value or return all entropy as a series
exclude_zero=False, # exclude the column of 0 (center position) in the entropy calculation
contain_sty=True, # keep only s,t,y values (last three) in center 0 position
):
"Calculate entropy per position (max) of a PSSM surrounding 0"
pssm_df = pssm_df.copy()
pssm_df.columns= pssm_df.columns.astype(int)
if 0 in pssm_df.columns:
if exclude_zero:
pssm_df = pssm_df.drop(columns=[0])
if contain_sty:
pssm_df.loc[pssm_df.index[:-3], 0] = 0
pssm_df = pssm_df/pssm_df.sum()
per_position = -np.sum(pssm_df * np.log2(pssm_df + 1e-9), axis=0)
return per_position.min() if return_min else per_position
File: f:\git\katlas\katlas\core.py
Type: function
entropy_flat??
Signature:
entropy_flat(
flat_pssm: pandas.core.series.Series,
return_min=False,
exclude_zero=False,
contain_sty=True,
)
Source:
def entropy_flat(flat_pssm:pd.Series,return_min=False,exclude_zero=False,contain_sty=True):
"Calculate entropy per position of a flat PSSM surrounding 0"
pssm_df = recover_pssm(flat_pssm)
return entropy(pssm_df,return_min=return_min,exclude_zero=exclude_zero,contain_sty=contain_sty)
File: f:\git\katlas\katlas\core.py
Type: function
= []
entropies = []
ICs for i,r in pspa.iterrows():
=False).to_dict())
entropies.append(entropy_flat(r,return_min ICs.append(get_IC_flat(r).to_dict())
= pd.DataFrame(entropies,index=pspa.index)
entropy_df = pd.DataFrame(ICs,index=pspa.index) IC_df
entropy_df
-5 | -4 | -3 | -2 | -1 | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|---|---|---|---|
kinase | ||||||||||
AAK1 | 4.238872 | 4.477492 | 4.419067 | 4.334337 | 4.285531 | 4.430518e-01 | 2.367128 | 4.484580 | 4.459749 | 4.448665 |
ACVR2A | 4.492276 | 4.483099 | 4.422408 | 3.851257 | 4.450203 | 9.999489e-01 | 4.101970 | 4.509378 | 4.502156 | 4.509964 |
ACVR2B | 4.480671 | 4.478871 | 4.409857 | 3.939154 | 4.426689 | 9.996887e-01 | 4.074009 | 4.491815 | 4.508044 | 4.505800 |
AKT1 | 4.427160 | 4.402988 | 3.143867 | 3.590452 | 4.374148 | 9.659053e-01 | 4.334536 | 4.429082 | 4.442455 | 4.412808 |
AKT2 | 4.427318 | 4.415247 | 2.970578 | 3.821267 | 4.416441 | 9.566125e-01 | 4.467609 | 4.463490 | 4.452789 | 4.435681 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
KDR | 4.491261 | 4.472633 | 4.457427 | 4.448105 | 4.381677 | -1.442695e-09 | 4.390681 | 4.443358 | 4.152800 | 4.462793 |
FLT4 | 4.511274 | 4.501896 | 4.500559 | 4.504176 | 4.297943 | -1.442695e-09 | 4.290937 | 4.344806 | 4.154417 | 4.498858 |
WEE1_TYR | 4.507984 | 4.495537 | 4.489914 | 4.470009 | 4.089527 | -1.442695e-09 | 4.284853 | 4.403815 | 4.301392 | 4.426540 |
YES1 | 4.497127 | 4.491665 | 4.442265 | 4.465032 | 4.274232 | -1.442695e-09 | 4.350331 | 4.485518 | 4.275385 | 4.492019 |
ZAP70 | 4.355980 | 4.260120 | 4.111361 | 4.128756 | 3.473012 | -1.442695e-09 | 3.634941 | 4.358286 | 4.286572 | 4.474739 |
396 rows × 10 columns
# columns surrounding 0
= pspa.columns[~pspa.columns.str.startswith('0')] cols
max(1).sort_values() pspa[cols].
kinase
VRK2 0.0941
ROS1 0.0983
TYK2 0.0995
LIMK1_TYR 0.1027
RET 0.1027
...
YANK2 3.7589
GSK3B 3.9147
YANK3 4.2045
CK1A 5.8890
CK1G3 8.4920
Length: 396, dtype: float64
def plot_dots(df,ylabel='bits',figsize=(5,3)):
= df.columns.astype(str)
df.columns =figsize)
plt.figure(figsizefor i, col in enumerate(df.columns):
= np.random.normal(loc=i, scale=0.1, size=len(df))
x_jitter =0.7, s=5,edgecolors='none')
plt.scatter(x_jitter, df[col], alpha
range(len(df.columns)), df.columns)
plt.xticks("Position")
plt.xlabel( plt.ylabel(ylabel)
def plot_violin(df, ylabel='bits', figsize=(5, 3)):
= df.melt(var_name='Position', value_name='Value')
df_melted =figsize)
plt.figure(figsize
='Position', y='Value', data=df_melted, inner=None, density_norm='width')
sns.violinplot(x='Position', y='Value', data=df_melted, color='k', size=2, jitter=True, alpha=0.5)
sns.stripplot(x
'Position')
plt.xlabel(
plt.ylabel(ylabel) plt.tight_layout()
='Entropy (bits)')
plot_violin(entropy_df,ylabel'Entropy per Position');
plt.title(
='Information content (bits)')
plot_violin(IC_df,ylabel'Information Content per Position'); plt.title(
='Information Content (bits)')
plot_dots(IC_df,ylabel'Information Content per Position');
plt.title(
='Entropy (bits)')
plot_dots(entropy_df,ylabel'Entropy per Position'); plt.title(
entropy_df.columns
Index(['-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4'], dtype='object')
= entropy_df.drop(columns=['0']).copy() entropy_df2
min(1).sort_values().head(20) entropy_df2.
kinase
CK1G3 1.674890
CK1A 1.825040
YANK3 1.943948
YANK2 2.017911
P38G 2.053687
P38D 2.060065
GSK3B 2.067587
GSK3A 2.108316
CDK17 2.123790
CK1G2 2.148218
CDK3 2.198466
SBK 2.216507
CK1A2 2.221651
ERK7 2.223145
CK1D 2.236495
CDK16 2.255882
AAK1 2.367128
FAM20C 2.400912
CDK18 2.435806
CDK4 2.452885
dtype: float64