Hierarchical clustering

Setup

Distance

pssms=Data.get_pspa_scale()

get_1d_distance

 get_1d_distance (df, func_flat)

Compute the condensed 1D vector of pairwise distances between the rows of a dataframe, given a distance function (e.g. 5 rows give 5*4/2 = 10 distances)

# return 1d distance
get_1d_distance(pssms.head(),js_divergence_flat)

100%|██████████| 5/5 [00:00<00:00, 339.72it/s]

array([0.08286125, 0.08577978, 0.08798376, 0.08501009, 0.00215832,
       0.07937984, 0.07066437, 0.08348296, 0.07361695, 0.0042525 ])

get_1d_js

 get_1d_js (df)

Compute 1D distance using JS divergence.

distance = get_1d_js(pssms.head(20))

100%|██████████| 20/20 [00:00<00:00, 111.97it/s]

Use parallel computing to speed up the calculation when the dataframe contains many flattened PSSMs:

get_1d_distance_parallel

 get_1d_distance_parallel (df, func_flat, max_workers=4, chunksize=100)

Compute 1D distance for each row in a dataframe in parallel, given a distance function

get_1d_distance_parallel(pssms.head(),js_divergence_flat)

get_1d_js_parallel

 get_1d_js_parallel (df, func_flat=<function js_divergence_flat>,
                     max_workers=4, chunksize=100)

Compute 1D distance using JS divergence in parallel.

get_1d_js_parallel(pssms.head())

get_Z

 get_Z (pssms, func_flat=<function js_divergence_flat>, parallel=True)

Get the linkage matrix Z from a dataframe of pssms

Z = get_Z(pssms.head(10),parallel=False)

100%|██████████| 10/10 [00:00<00:00, 187.80it/s]

Z[:5]

array([[1.00000000e+00, 2.00000000e+00, 2.15831816e-03, 2.00000000e+00],
       [3.00000000e+00, 4.00000000e+00, 4.25249792e-03, 2.00000000e+00],
       [5.00000000e+00, 1.10000000e+01, 4.65130779e-03, 3.00000000e+00],
       [6.00000000e+00, 7.00000000e+00, 5.89059764e-03, 2.00000000e+00],
       [1.00000000e+01, 1.30000000e+01, 9.31412253e-03, 4.00000000e+00]])

plot_dendrogram

 plot_dendrogram (Z, color_thr=0.07, dense=7, line_width=1, title=None,
                  scale=1, **kwargs)

	Type	Default	Details
Z
color_thr	float	0.07
dense	int	7	the higher the value, the more densely each row is packed
line_width	int	1
title	NoneType	None
scale	int	1
kwargs	VAR_KEYWORD

set_sns(100)

plot_dendrogram(Z,dense=7,line_width=1)

plot_dendrogram(Z,dense=4)

pssm_to_seq

 pssm_to_seq (pssm_df, thr=0.2, clean_center=True)

Represent a PSSM as a string sequence of amino acids

	Type	Default	Details
pssm_df
thr	float	0.2	threshold of probability to show in sequence
clean_center	bool	True	if True, zero out all but the last three values at position 0 (keep only the s, t, y values at the center)

pssm_df = recover_pssm(pssms.iloc[0])

pssm_to_seq(pssm_df,thr=0.1)

'I..QKt*G...'

get_pssm_seq_labels

 get_pssm_seq_labels (pssms, count_map=None, thr=0.3)

Label each PSSM with its index in pssms followed by its pssm_to_seq sequence representation (with the count, when count_map is given).

	Type	Default	Details
pssms
count_map	NoneType	None	df index as key, counts as value
thr	float	0.3	threshold of probability to show in sequence

get_pssm_seq_labels(pssms.head(10))

['AAK1: .....t*G...',
 'ACVR2A: .....[t/s]*....',
 'ACVR2B: .....[t/s]*....',
 'AKT1: ..RR.[s/t]*....',
 'AKT2: ..R..[s/t]*....',
 'AKT3: ..RR.[s/t]*....',
 'ALK2: .....[t/s]*....',
 'ALK4: .....[t/s]*....',
 'ALPHAK3: .....t*....',
 'AMPKA1: .....[s/t]*....']

import random

# get a dict of index and counts

count_dict = {idx:random.randint(1,100) for idx in pssms.head(10).index}

labels= get_pssm_seq_labels(pssms.head(10),count_dict)
labels

['AAK1 (n=28): .....t*G...',
 'ACVR2A (n=69): .....[t/s]*....',
 'ACVR2B (n=50): .....[t/s]*....',
 'AKT1 (n=60): ..RR.[s/t]*....',
 'AKT2 (n=55): ..R..[s/t]*....',
 'AKT3 (n=33): ..RR.[s/t]*....',
 'ALK2 (n=74): .....[t/s]*....',
 'ALK4 (n=6): .....[t/s]*....',
 'ALPHAK3 (n=14): .....t*....',
 'AMPKA1 (n=60): .....[s/t]*....']

plot_dendrogram(Z,dense=4,labels=labels)

Full pipeline

# take the first 100 pssms and get the linkage matrix
pssms=pssms.head(100)

Z = get_Z(pssms)

# optional, get counts for each index
# count_dict = pssms.index.value_counts()

# get pssm to seq labels with counts
# labels= get_pssm_seq_labels(pssms,count_dict)

# or get pssm to seq labels only
labels= get_pssm_seq_labels(pssms)

# plot dendrogram
plot_dendrogram(Z,dense=8,labels=labels,truncate_mode='lastp', p=40) # only show 40

# save
# save_pdf('dendrogram.pdf')

End