df=Data.get_pspa_scale()
Hierarchical clustering
get_Z_custom
def get_Z_custom(
df, # Rows=items, cols=features
metric:str='euclidean', # Distance metric for `pdist`
method:str | None=None, # Linkage method (auto-chosen if None)
metric_fn:NoneType=None, # Custom distance fn (e.g. JS divergence)
):
Compute hierarchical clustering linkage matrix Z.
get_Z_cache
def get_Z_cache(
df
):
Return cached linkage matrix for a dataframe.
Z=get_Z_cache(df)
CPU times: user 4.73 ms, sys: 2.69 ms, total: 7.42 ms
Wall time: 13 ms
auto_thr
def auto_thr(
Z, # Linkage matrix
percentile:int=95, # Only consider merge-distance jumps above this percentile.
# - 90–95: fewer clusters (earlier cut)
# - 97–99: more clusters (later cut)
):
Pick color threshold at a large jump in linkage distances.
thr=auto_thr(Z)
thr
np.float64(0.5971067774415783)
plot_dendrogram
def plot_dendrogram(
Z, thr:float=0.07, dense:int=7, # the higher the more dense for each row
line_width:int=1, title:NoneType=None, scale:int=1, kwargs:VAR_KEYWORD
):
# plot_dendrogram(Z,thr=thr)
get_color_groups
def get_color_groups(
Z, # Linkage matrix
thr:float=0.07, # Same threshold as `dendrogram(color_threshold=...)`
):
Assign dendrogram color/cluster group IDs.
# get_color_groups(Z,thr)
get_cluster
def get_cluster(
df, pct:int=95, thr:NoneType=None, plot:bool=False, labels:NoneType=None, kwargs:VAR_KEYWORD
):
Get flat cluster assignments from hierarchical clustering linkage matrix Z.
get_cluster(df,pct=95)
CPU times: user 650 μs, sys: 187 μs, total: 837 μs
Wall time: 718 μs
kinase
AAK1 7
ACVR2A 49
ACVR2B 49
AKT1 35
AKT2 35
..
KDR 5
FLT4 5
WEE1_TYR 1
YES1 2
ZAP70 4
Length: 396, dtype: int32
Labels
pssm_to_seq
def pssm_to_seq(
pssm_df, thr:float=0.2, # threshold of probability to show in sequence
clean_center:bool=True, # if True, zero out all but the last three values at position 0 (keep only the s, t, y values at the center)
):
Represent a PSSM as a string sequence of amino acids.
pssm_to_seq(recover_pssm(df.iloc[0]))
'.....t*G...'
get_pssm_seq_labels
def get_pssm_seq_labels(
pssms, count_map:NoneType=None, thr:float=0.3
):
get_pssm_seq_labels(df.head(2),count_map={'AAK1':100,'ACVR2A':50})
['AAK1 (n=100): .....t*G...', 'ACVR2A (n=50): .....[t/s]*....']