Hierarchical clustering

Overview

This module provides tools for computing pairwise distances between PSSMs (Position-Specific Scoring Matrices), performing hierarchical clustering, and visualizing results as dendrograms.

Distance Computation

get_1d_distance(df, func_flat) — Computes pairwise distances between all rows in a dataframe using a specified distance function. Returns a 1D condensed distance array suitable for scipy’s linkage functions.

dist = get_1d_distance(
    df=pssms.head(10),      # dataframe where each row is a flattened PSSM
    func_flat=js_divergence_flat,  # distance function taking two rows, returns scalar
)

get_1d_js(df) — Convenience wrapper that computes pairwise JS (Jensen-Shannon) divergence distances.

dist = get_1d_js(
    df=pssms.head(20),  # dataframe of flattened PSSMs
)

get_1d_distance_parallel(df, func_flat, max_workers, chunksize) — Parallelized version for large dataframes.

dist = get_1d_distance_parallel(
    df=pssms,              # dataframe of flattened PSSMs
    func_flat=js_divergence_flat,  # distance function
    max_workers=4,         # number of parallel workers
    chunksize=100,         # items per worker batch
)

get_1d_js_parallel(df, **kwargs) — Parallelized JS divergence with same interface as above.

Linkage Matrix

get_Z(pssms, func_flat, parallel) — Computes the linkage matrix Z from a PSSM dataframe. This is the main entry point combining distance computation and hierarchical clustering.

Z = get_Z(
    pssms=pssms.head(100),         # dataframe of flattened PSSMs
    func_flat=js_divergence_flat,  # distance function (default: JS divergence)
    parallel=True,                 # use parallel computation for speed
)

Visualization

plot_dendrogram(Z, color_thr, dense, line_width, title, scale, **kwargs) — Plots a horizontal dendrogram from linkage matrix Z. Passes additional kwargs to scipy’s dendrogram().

plot_dendrogram(
    Z=Z,                  # linkage matrix from get_Z()
    color_thr=0.07,       # distance threshold for coloring branches
    dense=7,              # density control (higher = more compact rows)
    line_width=1,         # width of dendrogram lines
    title="My Cluster",   # optional plot title
    scale=1,              # figure size multiplier
    labels=labels,        # custom leaf labels (from get_pssm_seq_labels)
    truncate_mode='lastp',  # scipy kwarg: show only last p merged clusters
    p=40,                 # scipy kwarg: number of leaves to show
)

PSSM Labeling

pssm_to_seq(pssm_df, thr, clean_center) — Converts a 2D PSSM back to a consensus sequence string. Uses brackets for multiple high-probability amino acids and * to mark the phosphorylation center.

seq = pssm_to_seq(
    pssm_df=recover_pssm(pssms.iloc[0]),  # 2D PSSM (rows=AAs, cols=positions)
    thr=0.2,           # probability threshold to include amino acid
    clean_center=True, # keep only s/t/y at center position
)
# Returns e.g.: '..RR.[s/t]*....'

get_pssm_seq_labels(pssms, count_map, thr) — Generates dendrogram labels combining the PSSM index name with its consensus sequence. Optionally includes sample counts.

labels = get_pssm_seq_labels(
    pssms=pssms.head(10),  # dataframe of flattened PSSMs
    count_map=count_dict,  # optional dict {index: count} for sample sizes
    thr=0.3,               # probability threshold for consensus sequence
)
# Returns e.g.: ['AKT1 (n=60): ..RR.[s/t]*....', ...]

Full Pipeline Example

# 1. Load and subset data
pssms = Data.get_pspa_scale().head(100)

# 2. Compute linkage matrix
Z = get_Z(
    pssms=pssms,
    parallel=True,
)

# 3. Generate labels with consensus sequences
labels = get_pssm_seq_labels(
    pssms=pssms,
    thr=0.3,
)

# 4. Plot dendrogram
plot_dendrogram(
    Z=Z,
    dense=8,
    labels=labels,
    truncate_mode='lastp',
    p=40,
)

Setup

Distance

pssms=Data.get_pspa_scale()

get_1d_distance


def get_1d_distance(
    df, func_flat
):

Compute 1D distance for each row in a dataframe given a distance function

# return 1d distance
get_1d_distance(pssms.head(),js_divergence_flat)


  0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 296.35it/s]

array([0.08286125, 0.08577978, 0.08798376, 0.08501009, 0.00215832,
       0.07937984, 0.07066437, 0.08348296, 0.07361695, 0.0042525 ])

get_1d_js


def get_1d_js(
    df
):

Compute 1D distance using JS divergence.

distance = get_1d_js(pssms.head(20))


  0%|          | 0/20 [00:00<?, ?it/s]
 25%|██▌       | 5/20 [00:00<00:00, 45.19it/s]
 65%|██████▌   | 13/20 [00:00<00:00, 62.03it/s]
100%|██████████| 20/20 [00:00<00:00, 78.69it/s]

Parallel computing to accelerate when flattened pssms are too many in a df:

get_1d_distance_parallel


def get_1d_distance_parallel(
    df, func_flat, max_workers:int=4, chunksize:int=100
):

Parallel compute 1D distance for each row in a dataframe given a distance function

get_1d_distance_parallel(pssms.head(),js_divergence_flat)

get_1d_js_parallel


def get_1d_js_parallel(
    df, func_flat:function=js_divergence_flat, max_workers:int=4, chunksize:int=100
):

Compute 1D distance matrix using JS divergence.

get_1d_js_parallel(pssms.head())

get_Z


def get_Z(
    pssms, func_flat:function=js_divergence_flat, parallel:bool=True
):

Get linkage matrix Z from pssms dataframe

Z = get_Z(pssms.head(10),parallel=False)


  0%|          | 0/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<00:00, 165.83it/s]

Z[:5]

array([[1.00000000e+00, 2.00000000e+00, 2.15831816e-03, 2.00000000e+00],
       [3.00000000e+00, 4.00000000e+00, 4.25249792e-03, 2.00000000e+00],
       [5.00000000e+00, 1.10000000e+01, 4.65130779e-03, 3.00000000e+00],
       [6.00000000e+00, 7.00000000e+00, 5.89059764e-03, 2.00000000e+00],
       [1.00000000e+01, 1.30000000e+01, 9.31412253e-03, 4.00000000e+00]])

plot_dendrogram


def plot_dendrogram(
    Z, color_thr:float=0.07, dense:int=7, # the higher the more dense for each row
    line_width:int=1, title:NoneType=None, scale:int=1, kwargs:VAR_KEYWORD
):

set_sns(100)

plot_dendrogram(Z,dense=7,line_width=1)

plot_dendrogram(Z,dense=4)

pssm_to_seq


def pssm_to_seq(
    pssm_df, thr:float=0.2, # threshold of probability to show in sequence
    clean_center:bool=True, # if true, zero out non-last three values in position 0 (keep only s,t,y values at center)
):

Represent PSSM in string sequence of amino acids

pssm_df = recover_pssm(pssms.iloc[0])

pssm_to_seq(pssm_df,thr=0.1)

'I..QKt*G...'

get_pssm_seq_labels


def get_pssm_seq_labels(
    pssms, count_map:NoneType=None, # df index as key, counts as value
    thr:float=0.3, # threshold of probability to show in sequence
):

Use index of pssms and the pssm to seq to represent pssm.

get_pssm_seq_labels(pssms.head(10))

['AAK1: .....t*G...',
 'ACVR2A: .....[t/s]*....',
 'ACVR2B: .....[t/s]*....',
 'AKT1: ..RR.[s/t]*....',
 'AKT2: ..R..[s/t]*....',
 'AKT3: ..RR.[s/t]*....',
 'ALK2: .....[t/s]*....',
 'ALK4: .....[t/s]*....',
 'ALPHAK3: .....t*....',
 'AMPKA1: .....[s/t]*....']

import random

# get a dict of index and counts

count_dict = {idx:random.randint(1,100) for idx in pssms.head(10).index}

labels= get_pssm_seq_labels(pssms.head(10),count_dict)
labels

['AAK1 (n=47): .....t*G...',
 'ACVR2A (n=95): .....[t/s]*....',
 'ACVR2B (n=90): .....[t/s]*....',
 'AKT1 (n=36): ..RR.[s/t]*....',
 'AKT2 (n=88): ..R..[s/t]*....',
 'AKT3 (n=61): ..RR.[s/t]*....',
 'ALK2 (n=60): .....[t/s]*....',
 'ALK4 (n=88): .....[t/s]*....',
 'ALPHAK3 (n=28): .....t*....',
 'AMPKA1 (n=92): .....[s/t]*....']

plot_dendrogram(Z,dense=4,labels=labels)

Full pipeline

# get distance matrix
pssms=pssms.head(100)

Z = get_Z(pssms)

# optional, get counts for each index
# count_dict = pssms.index.value_counts()

# get pssm to seq labels with counts
# labels= get_pssm_seq_labels(pssms,count_dict)

# or get pssm to seq labels only
labels= get_pssm_seq_labels(pssms)

# plot dendrogram
plot_dendrogram(Z,dense=8,labels=labels,truncate_mode='lastp', p=40) # only show 40

# save
# save_pdf('dendrogram.pdf')