pssms=Data.get_pspa_scale()
Hierarchical clustering
Setup
Distance
get_1d_distance
get_1d_distance (df, func_flat)
Compute 1D distance for each row in a dataframe given a distance function
# return 1d distance
get_1d_distance(pssms.head(),js_divergence_flat)
100%|██████████| 5/5 [00:00<00:00, 339.72it/s]
array([0.08286125, 0.08577978, 0.08798376, 0.08501009, 0.00215832,
0.07937984, 0.07066437, 0.08348296, 0.07361695, 0.0042525 ])
get_1d_js
get_1d_js (df)
Compute 1D distance using JS divergence.
distance = get_1d_js(pssms.head(20))
100%|██████████| 20/20 [00:00<00:00, 111.97it/s]
Use parallel computing to speed up the calculation when the dataframe contains many flattened PSSMs:
get_1d_distance_parallel
get_1d_distance_parallel (df, func_flat, max_workers=4, chunksize=100)
Parallel compute 1D distance for each row in a dataframe given a distance function
get_1d_distance_parallel(pssms.head(),js_divergence_flat)
get_1d_js_parallel
get_1d_js_parallel (df, func_flat=<function js_divergence_flat>, max_workers=4, chunksize=100)
Compute 1D distance matrix using JS divergence.
get_1d_js_parallel(pssms.head())
get_Z
get_Z (pssms, func_flat=<function js_divergence_flat>, parallel=True)
Get linkage matrix Z from pssms dataframe
Z = get_Z(pssms.head(10),parallel=False)
100%|██████████| 10/10 [00:00<00:00, 187.80it/s]
Z[:5]
array([[1.00000000e+00, 2.00000000e+00, 2.15831816e-03, 2.00000000e+00],
[3.00000000e+00, 4.00000000e+00, 4.25249792e-03, 2.00000000e+00],
[5.00000000e+00, 1.10000000e+01, 4.65130779e-03, 3.00000000e+00],
[6.00000000e+00, 7.00000000e+00, 5.89059764e-03, 2.00000000e+00],
[1.00000000e+01, 1.30000000e+01, 9.31412253e-03, 4.00000000e+00]])
plot_dendrogram
plot_dendrogram (Z, color_thr=0.07, dense=7, line_width=1, title=None, scale=1, **kwargs)
| | Type | Default | Details |
|---|---|---|---|
| Z | | | |
| color_thr | float | 0.07 | |
| dense | int | 7 | the higher the value, the denser each row |
| line_width | int | 1 | |
| title | NoneType | None | |
| scale | int | 1 | |
| kwargs | VAR_KEYWORD | | |
set_sns(100)
plot_dendrogram(Z,dense=7,line_width=1)
plot_dendrogram(Z,dense=4)
pssm_to_seq
pssm_to_seq (pssm_df, thr=0.2, clean_center=True)
Represent PSSM in string sequence of amino acids
| | Type | Default | Details |
|---|---|---|---|
| pssm_df | | | |
| thr | float | 0.2 | probability threshold for showing a residue in the sequence |
| clean_center | bool | True | if True, zero out all but the last three values at position 0 (keep only the s, t, y values at the center) |
pssm_df = recover_pssm(pssms.iloc[0])
pssm_to_seq(pssm_df,thr=0.1)
'I..QKt*G...'
get_pssm_seq_labels
get_pssm_seq_labels (pssms, count_map=None, thr=0.3)
Label each PSSM with its index followed by the sequence string produced by pssm_to_seq.
| | Type | Default | Details |
|---|---|---|---|
| pssms | | | |
| count_map | NoneType | None | df index as key, counts as value |
| thr | float | 0.3 | probability threshold for showing a residue in the sequence |
get_pssm_seq_labels(pssms.head(10))
['AAK1: .....t*G...',
'ACVR2A: .....[t/s]*....',
'ACVR2B: .....[t/s]*....',
'AKT1: ..RR.[s/t]*....',
'AKT2: ..R..[s/t]*....',
'AKT3: ..RR.[s/t]*....',
'ALK2: .....[t/s]*....',
'ALK4: .....[t/s]*....',
'ALPHAK3: .....t*....',
'AMPKA1: .....[s/t]*....']
import random
# get a dict of index and counts
count_dict = {idx:random.randint(1,100) for idx in pssms.head(10).index}
labels= get_pssm_seq_labels(pssms.head(10),count_dict)
labels
['AAK1 (n=28): .....t*G...',
'ACVR2A (n=69): .....[t/s]*....',
'ACVR2B (n=50): .....[t/s]*....',
'AKT1 (n=60): ..RR.[s/t]*....',
'AKT2 (n=55): ..R..[s/t]*....',
'AKT3 (n=33): ..RR.[s/t]*....',
'ALK2 (n=74): .....[t/s]*....',
'ALK4 (n=6): .....[t/s]*....',
'ALPHAK3 (n=14): .....t*....',
'AMPKA1 (n=60): .....[s/t]*....']
plot_dendrogram(Z,dense=4,labels=labels)
Full pipeline
# get distance matrix
pssms=pssms.head(100)
Z = get_Z(pssms)
# optional, get counts for each index
# count_dict = pssms.index.value_counts()
# get pssm to seq labels with counts
# labels= get_pssm_seq_labels(pssms,count_dict)
# or get pssm to seq labels only
labels= get_pssm_seq_labels(pssms)
# plot dendrogram
plot_dendrogram(Z,dense=8,labels=labels,truncate_mode='lastp', p=40) # only show 40
# save
# save_pdf('dendrogram.pdf')