Hierarchical clustering

import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
# Main table read from Excel; code further down relies on its 'kd_ID' and 'kd_seq' columns.
df = pd.read_excel('out/uniprot_kd_active_D1_D2.xlsx')
# PSPA table read from CSV; it is re-indexed by its 'kd_ID' column before use.
pspa=pd.read_csv('out/pspa_uniprot_unique_no_TYR_category_remove2kd.csv')

Clustal Omega

# df2 = df.head(4000).copy()

https://www.ebi.ac.uk/jdispatcher/msa/clustalo

# def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
#     "Generate fasta file from sequences."
#     records = [
#         SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
#         for _, row in df.iterrows()
#     ]
#     SeqIO.write(records, path, "fasta")
#     print(len(records))
# get_fasta(df2,path='raw/active_kinase_domains_4k.fasta')

ProT5

# Clustering input table; its index is later used as the dendrogram leaf labels.
# Presumably ProT5 embeddings with one row per kd_ID — TODO confirm against the file that writes this parquet.
t5=pd.read_parquet('out/uniprot_kd_t5.parquet')
from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
import matplotlib.pyplot as plt
# Ward-linkage hierarchical clustering over the rows of t5; Z is the linkage matrix consumed by dendrogram().
Z = linkage(t5, method='ward')
def plot_dendrogram3(Z, output='dendrogram.pdf', color_thr=0.01, **kwargs):
    """Draw a left-oriented, level-truncated dendrogram of `Z` and save it to `output`.

    Parameters
    ----------
    Z : linkage matrix, as returned by `scipy.cluster.hierarchy.linkage`.
    output : path of the figure file to write.
    color_thr : forwarded to `dendrogram` as `color_threshold`.
    **kwargs : extra keyword arguments forwarded unchanged to `dendrogram`
        (for example `labels`).

    Returns
    -------
    None. The figure is written to `output` and then closed.
    """
    with plt.rc_context({'lines.linewidth': 0.3}):  # set default line width
        fig = plt.figure(figsize=(5, 100))
        try:
            dendrogram(
                Z,
                orientation='left',
                color_threshold=color_thr,
                truncate_mode='level',
                p=20,
                leaf_font_size=1,
                show_contracted=True,
                **kwargs
            )
            plt.title('Hierarchical Clustering Dendrogram')
            # With orientation='left' the root is on the left and links grow to
            # the right, so the merge distance runs along the horizontal axis.
            plt.xlabel('Distance')
            plt.savefig(output, bbox_inches='tight')
        finally:
            # Always release the (very tall) figure, even if drawing or saving fails.
            plt.close(fig)
from katlas.core import *
# Index the PSPA table by kd_ID and drop the first five of the remaining columns,
# leaving the flattened PSSM columns ('-5P' ... '0Y') that recover_pssm() is given later.
pspa_df = pspa.set_index('kd_ID').iloc[:,5:]
def get_dendrogram_labels(order_index, # iterable list of the dendrogram indexes
                          pssms, # df of flattened pssms with index as kd name
                          color_thr=0.15
                         ):
    "Build one label per entry of `order_index`; entries present in `pssms` get ': <sequence>' appended."

    def _label_for(key):
        # Entries without a PSSM row keep their bare identifier.
        if key not in pssms.index:
            return key
        # Flattened row -> PSSM -> cleaned/normalised PSSM -> sequence string.
        normalised = clean_zero_normalize(recover_pssm(pssms.loc[key]))
        return key + ': ' + pssm_to_seq(normalised, color_thr)

    return [_label_for(key) for key in order_index]
# One label per row of t5, in t5's row order; entries that have a PSPA row get a sequence suffix.
labels=get_dendrogram_labels(t5.index,pspa_df,0.15)
# Display the PSPA rows whose kd_ID contains 'KC1A'.
pspa_df[pspa_df.index.str.contains('KC1A')]
-5P -5G -5A -5C -5S -5T -5V -5I -5L -5M ... 4E 4s 4t 4y 0s 0t 0y 0S 0T 0Y
kd_ID
P48729_KC1A_HUMAN_KD1 0.0843 0.0590 0.0664 0.0588 0.0590 0.0590 0.0459 0.0488 0.057 0.0530 ... 0.0564 0.1808 0.1808 0.1458 1.0 0.1435 0.0 1.0 0.1435 0.0
Q8N752_KC1AL_HUMAN_KD1 0.0514 0.0528 0.0542 0.0535 0.0546 0.0546 0.0544 0.0645 0.064 0.0639 ... 0.0512 0.0966 0.0966 0.1209 1.0 0.4354 0.0 1.0 0.4354 0.0

2 rows × 213 columns

# labels = [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()]
# Render the dendrogram with the PSPA-derived labels; no output path is given, so the default 'dendrogram.pdf' is written.
plot_dendrogram3(Z,labels =labels )
# Turn the kd_ID index back into a regular column so the table can be merged into df.
pspa_df2 = pspa_df.reset_index()
pspa_df2.shape
(362, 214)
# Display the PSSM column labels; the same labels are reused as the columns to fill after the merge.
pspa_df.columns
Index(['-5P', '-5G', '-5A', '-5C', '-5S', '-5T', '-5V', '-5I', '-5L', '-5M',
       ...
       '4E', '4s', '4t', '4y', '0s', '0t', '0y', '0S', '0T', '0Y'],
      dtype='object', length=213)
# PSSM columns whose missing values are filled in below.
columns_to_fill = pspa_df.columns
# Left merge keeps every row of df. No `on=` is given, so pandas joins on all column
# names the two frames share.
# NOTE(review): confirm that the shared columns are only the intended key(s), e.g. kd_ID.
df = df.merge(pspa_df2,'left')
# Within each group of identical kd_seq, fill forward then backward so that rows
# sharing a sequence also share the PSSM values available on any row of that group.
for col in columns_to_fill:
    df[col] = df.groupby('kd_seq')[col].transform(lambda x: x.ffill().bfill())
len(pspa_df2)
362
# Keep only rows that have a value in column '4E' after the fill.
# NOTE(review): this uses one column as the test for "row has PSSM values" — confirm that is sufficient.
df2 = df.dropna(subset='4E')
# Same layout as pspa_df: kd_ID index, PSSM columns only.
df2 = df2.set_index('kd_ID')[columns_to_fill]
# Rebuild the labels with the sequence-propagated PSSMs and write a second dendrogram file.
labels=get_dendrogram_labels(t5.index,df2,0.15)
plot_dendrogram3(Z,output='dendrogram_similarity_1.pdf',labels =labels )
def get_dup(df):
    "Return one row per `kd_seq` that occurs more than once, with its `kd_ID`s joined by commas."

    def _join_ids(ids):
        return ','.join(ids)

    # Flag every row whose sequence appears at least twice (all occurrences kept).
    is_repeated = df['kd_seq'].duplicated(keep=False)
    repeated = df.loc[is_repeated].sort_values(by='kd_seq')
    grouped = repeated.groupby('kd_seq').agg({'kd_ID': _join_ids})
    return grouped.reset_index()
# Groups of kd_IDs that share an identical kd_seq (one row per repeated sequence).
dup_unique = get_dup(df)
# Save only the groups whose joined ID string contains "HUMAN".
dup_unique[dup_unique.kd_ID.str.contains("HUMAN")].to_csv('duplicate_human_across.csv')