hierarchical

Plot hierarchical clustering for PSSMs

Setup

from katlas.data import Data
from katlas.pssm import get_pssm_seq_labels,flatten_pssm
from kplot.hierarchical import get_hcluster
# Load the PSSM table once; the cells further down reuse `pssms`.
pssms=Data.get_pspa_scale()
# Run on the first 20 rows only so the example stays fast.
# NOTE(review): `get_1d_js` is not among the names imported in the Setup
# imports above — confirm which module it should be imported from.
distance = get_1d_js(pssms.head(20))
100%|██████████| 20/20 [00:00<00:00, 360.94it/s]
# Same 20-row input, using the `_parallel` variant. Its progress bar counts
# 190 steps (= 20*19/2, presumably one per unordered pair of rows), whereas
# the `get_1d_js` call above reports 20 steps.
# NOTE(review): `get_1d_js_parallel` is not imported in the Setup imports
# above — confirm which module it should be imported from.
distance = get_1d_js_parallel(pssms.head(20))
100%|██████████| 190/190 [00:00<00:00, 5548.41it/s]

Clustering pipeline

# pssms=Data.get_pspa_scale()
# Z = get_Z(pssms,func_flat=js_divergence_flat)
# count_dict = pssms.index.value_counts()
# labels= get_pssm_seq_labels(pssms,count_dict)

# plot_dendrogram(Z,dense=8,labels=labels,thr=0.125)

Or use get_hcluster directly (it both calculates Z and plots the dendrogram)

# Occurrence count of each value in the PSSM index, passed on to the label builder.
count_dict = pssms.index.value_counts()
labels = get_pssm_seq_labels(pssms, count_dict)

# Cluster the first 20 PSSMs, passing the first 20 labels alongside them.
# NOTE(review): `js_divergence_flat` is not imported in the Setup imports
# above — confirm which module it should be imported from.
# The call is kept as the final expression so its result is still displayed.
get_hcluster(
    pssms.head(20),
    func_flat=js_divergence_flat,
    labels=labels[:20],
    thr=0.125,
    dense=5,
)
100%|██████████| 190/190 [00:00<00:00, 5515.69it/s]
kinase
AAK1       2
ACVR2A     2
ACVR2B     2
AKT1       1
AKT2       1
AKT3       1
ALK2       2
ALK4       2
ALPHAK3    2
AMPKA1     1
AMPKA2     1
ANKRD3     2
ASK1       2
ATM        2
ATR        2
AURA       1
AURB       1
AURC       1
BCKDK      2
BIKE       2
dtype: int32