Hierarchical clustering

Setup

Distance

pssms=Data.get_pspa_all_scale()

source

get_1d_distance

 get_1d_distance (df, func_flat)

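Compute the 1D (condensed) vector of pairwise distances between the rows of a dataframe, given a distance function. For n rows the result has n*(n-1)/2 entries (e.g. 5 rows give 10 distances).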

# return 1d distance
get_1d_distance(pssms.head(),js_divergence_flat)
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 481.41it/s]
array([0.08286125, 0.08577978, 0.08798376, 0.08501009, 0.00215832,
       0.07937984, 0.07066437, 0.08348296, 0.07361695, 0.0042525 ])

source

get_1d_js

 get_1d_js (df)

Compute 1D distance using JS divergence.

distance = get_1d_js(pssms.head(20))
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 116.00it/s]

Use parallel computing to speed up the calculation when the dataframe contains many flattened PSSMs:


source

get_1d_distance_parallel

 get_1d_distance_parallel (df, func_flat, max_workers=4, chunksize=100)

Compute the 1D distance between the rows of a dataframe in parallel, given a distance function.

get_1d_distance_parallel(pssms.head(),js_divergence_flat)

source

get_1d_js_parallel

 get_1d_js_parallel (df, func_flat=<function js_divergence_flat>,
                     max_workers=4, chunksize=100)

Compute the 1D (condensed) distance matrix using JS divergence, in parallel.

get_1d_js_parallel(pssms.head())

source

get_Z

 get_Z (pssms, func_flat=<function js_divergence_flat>, parallel=True)

Get the linkage matrix Z from a dataframe of PSSMs.

Z = get_Z(pssms.head(10),parallel=False)
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 229.95it/s]
Z[:5]
array([[1.00000000e+00, 2.00000000e+00, 2.15831816e-03, 2.00000000e+00],
       [3.00000000e+00, 4.00000000e+00, 4.25249792e-03, 2.00000000e+00],
       [5.00000000e+00, 1.10000000e+01, 4.65130779e-03, 3.00000000e+00],
       [6.00000000e+00, 7.00000000e+00, 5.89059764e-03, 2.00000000e+00],
       [1.00000000e+01, 1.30000000e+01, 9.31412253e-03, 4.00000000e+00]])

source

plot_dendrogram

 plot_dendrogram (Z, color_thr=0.07, dense=7, line_width=1,
                  title='Hierarchical Clustering Dendrogram', scale=1,
                  **kwargs)
Type Default Details
Z
color_thr float 0.07
dense int 7 the higher the value, the more densely the rows are packed
line_width int 1
title str Hierarchical Clustering Dendrogram
scale int 1
kwargs VAR_KEYWORD
set_sns(100)
plot_dendrogram(Z,dense=7,line_width=1)

plot_dendrogram(Z,dense=4)


source

pssm_to_seq

 pssm_to_seq (pssm_df, thr=0.2, clean_center=True)

Represent a PSSM as a string sequence of amino acids.

Type Default Details
pssm_df
thr float 0.2 threshold of probability to show in sequence
clean_center bool True if True, zero out all but the last three values at position 0 (keep only the s, t, y values at the center)
pssm_df = recover_pssm(pssms.iloc[0])
pssm_to_seq(pssm_df,thr=0.1)
'I..QKt*G...'

source

get_pssm_seq_labels

 get_pssm_seq_labels (pssms, count_map=None, thr=0.3)

Label each PSSM with its index in `pssms` followed by its sequence representation (from `pssm_to_seq`).

Type Default Details
pssms
count_map NoneType None df index as key, counts as value
thr float 0.3 threshold of probability to show in sequence
get_pssm_seq_labels(pssms.head(10))
['AAK1: .....t*G...',
 'ACVR2A: .....[t/s]*....',
 'ACVR2B: .....[t/s]*....',
 'AKT1: ..RR.[s/t]*....',
 'AKT2: ..R..[s/t]*....',
 'AKT3: ..RR.[s/t]*....',
 'ALK2: .....[t/s]*....',
 'ALK4: .....[t/s]*....',
 'ALPHAK3: .....t*....',
 'AMPKA1: .....[s/t]*....']
import random
# get a dict of index and counts

count_dict = {idx:random.randint(1,100) for idx in pssms.head(10).index}
labels= get_pssm_seq_labels(pssms.head(10),count_dict)
labels
['AAK1 (n=21): .....t*G...',
 'ACVR2A (n=51): .....[t/s]*....',
 'ACVR2B (n=98): .....[t/s]*....',
 'AKT1 (n=45): ..RR.[s/t]*....',
 'AKT2 (n=60): ..R..[s/t]*....',
 'AKT3 (n=4): ..RR.[s/t]*....',
 'ALK2 (n=79): .....[t/s]*....',
 'ALK4 (n=26): .....[t/s]*....',
 'ALPHAK3 (n=13): .....t*....',
 'AMPKA1 (n=64): .....[s/t]*....']
plot_dendrogram(Z,dense=4,labels=labels)

Full pipeline

# keep the first 100 pssms and compute the linkage matrix Z
pssms=pssms.head(100)

Z = get_Z(pssms)

# optional, get counts for each index
# count_dict = pssms.index.value_counts()

# get pssm to seq labels with counts
# labels= get_pssm_seq_labels(pssms,count_dict)

# or get pssm to seq labels only
labels= get_pssm_seq_labels(pssms)
# plot dendrogram
plot_dendrogram(Z,dense=8,labels=labels,truncate_mode='lastp', p=40) # only show 40

# save
# save_pdf('dendrogram.pdf')

End