Hierarchical clustering


get_Z_custom


def get_Z_custom(
    df, # Rows=items, cols=features
    metric:str='euclidean', # Distance metric for `pdist`
    method:str | None=None, # Linkage method (auto-chosen if None)
    metric_fn:NoneType=None, # Custom distance fn (e.g. JS divergence)
):

Compute hierarchical clustering linkage matrix Z.


get_Z_cache


def get_Z_cache(
    df
):

Return cached linkage matrix for a dataframe.

df=Data.get_pspa_scale()
Z=get_Z_cache(df)
CPU times: user 4.73 ms, sys: 2.69 ms, total: 7.42 ms
Wall time: 13 ms

auto_thr


def auto_thr(
    Z, # Linkage matrix
    percentile:int=95, # Only consider merge-distance jumps above this percentile. 90–95: fewer clusters (earlier cut); 97–99: more clusters (later cut)
):

Pick color threshold at a large jump in linkage distances.

thr=auto_thr(Z)
thr
np.float64(0.5971067774415783)

plot_dendrogram


def plot_dendrogram(
    Z, thr:float=0.07, dense:int=7, # higher values make each row more dense
    line_width:int=1, title:NoneType=None, scale:int=1, kwargs:VAR_KEYWORD
):
# plot_dendrogram(Z,thr=thr)

get_color_groups


def get_color_groups(
    Z, # Linkage matrix
    thr:float=0.07, # Same threshold as `dendrogram(color_threshold=...)`
):

Assign dendrogram color/cluster group IDs.

# get_color_groups(Z,thr)

get_cluster


def get_cluster(
    df, pct:int=95, thr:NoneType=None, plot:bool=False, labels:NoneType=None, kwargs:VAR_KEYWORD
):

Get flat cluster assignments from hierarchical clustering linkage matrix Z.

get_cluster(df,pct=95)
CPU times: user 650 μs, sys: 187 μs, total: 837 μs
Wall time: 718 μs
kinase
AAK1         7
ACVR2A      49
ACVR2B      49
AKT1        35
AKT2        35
            ..
KDR          5
FLT4         5
WEE1_TYR     1
YES1         2
ZAP70        4
Length: 396, dtype: int32

Labels


pssm_to_seq


def pssm_to_seq(
    pssm_df, thr:float=0.2, # threshold of probability to show in sequence
    clean_center:bool=True, # if True, zero out all but the last three values at position 0 (keep only s, t, y values at the center)
):

Represent a PSSM as a string sequence of amino acids.

pssm_to_seq(recover_pssm(df.iloc[0]))
'.....t*G...'

get_pssm_seq_labels


def get_pssm_seq_labels(
    pssms, count_map:NoneType=None, thr:float=0.3
):
get_pssm_seq_labels(df.head(2),count_map={'AAK1':100,'ACVR2A':50})
['AAK1 (n=100): .....t*G...', 'ACVR2A (n=50): .....[t/s]*....']