Hierarchical clustering

import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
df = pd.read_excel('out/uniprot_kd_active_D1_D2.xlsx')
pspa=pd.read_csv('out/pspa_uniprot_unique_no_TYR_category_remove2kd.csv')

Clustal Omega

# df2 = df.head(4000).copy()

https://www.ebi.ac.uk/jdispatcher/msa/clustalo

# def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
#     "Generate fasta file from sequences."
#     records = [
#         SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
#         for _, row in df.iterrows()
#     ]
#     SeqIO.write(records, path, "fasta")
#     print(len(records))
# get_fasta(df2,path='raw/active_kinase_domains_4k.fasta')

ProT5

df = pd.read_excel('out/uniprot_kd.xlsx')
t5=pd.read_parquet('out/uniprot_kd_t5.parquet')
from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
import matplotlib.pyplot as plt
t5
T5_0 T5_1 T5_2 T5_3 T5_4 T5_5 T5_6 T5_7 T5_8 T5_9 ... T5_1014 T5_1015 T5_1016 T5_1017 T5_1018 T5_1019 T5_1020 T5_1021 T5_1022 T5_1023
kd_ID
A0A075F7E9_LERK1_ORYSI_KD1 b';#' b'h,' b'\x1f$' b'J\x96' b'v\x95' b'y%' b"\xc1'" b'\xfd\xab' b"H'" b'~\xa4' ... b'\x82\xa7' b'\xa0#' b'^\x9e' b'x\xac' b'\xa4)' b'\xef\xa0' b'\xa3\x9f' b'p%' b'\\\x1d' b'o\xa8'
A0A078BQP2_GCY25_CAEEL_KD1 b'Z\x95' b'\xc3\xa7' b'_%' b'\xd2&' b'\x8c\xa2' b'](' b'j\xa7' b'\xc4\xae' b'\xc9\xa4' b'\xbc\xa0' ... b'\xf4\xa8' b'\xb6\xa8' b'>\xa4' b'\xac\xac' b'\xca+' b'\xb5)' b'\xaa\x19' b'~\xa3' b'\xe5)' b'\x81\x1c'
A0A078CGE6_M3KE1_BRANA_KD1 b'\xfa*' b'\x00.' b'\xa2\x9f' b'\xd4$' b'i\xa9' b'\x85\x9d' b'\x0b\x9d' b'\xfe\xa9' b'\xe8\xa2' b'\xff\xa4' ... b'\x1b\xa4' b')\x12' b'\xea\xa8' b'L\xa9' b't,' b'\xba\xa5' b'\xba\xa6' b'\xa9\x95' b'\xce\x1e' b'j\xa8'
A0A0G2K344_PK3CA_RAT_KD1 b"\xe3'" b'\x110' b'r\x1f' b'\xc5\xa4' b'\x12 ' b'\xe8$' b'`\xa6' b']\xac' b'8\xa5' b'8\xa0' ... b'\xa8\xa4' b'^\xa3' b'\t\xa6' b'\xf1\x1e' b'\xb1*' b'\xad\xaa' b'\xe8\xa5' b'e(' b'\x8f\x19' b'D\x9f'
A0A0H2ZM62_HK06_STRP2_KD1 b'T%' b'\x13\xac' b'\xe4\xa2' b'\xda)' b'\x96\xa8' b'\x00)' b'B&' b'\xb6\xad' b'j\x1d' b'u\xa5' ... b'\x00\xab' b'0&' b'\x15 ' b'|\xae' b'=,' b'\xe4\x1d' b'5\xaa' b'\xba#' b'b&' b'\x0b%'
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
W0LYS5_CAMKI_MACNP_KD1 b'\x86,' b'\x83-' b'#\xa5' b">'" b'\xb3\xa0' b'\x1f\xa1' b'l\xa6' b'\xd6\xa9' b'\x86\x9c' b'\xe6\xa6' ... b'\xc9\xa5' b'=\xa5' b'T\xa8' b'5\x9e' b'C,' b'B\xab' b'\xa8\xa8' b'\x05\xa2' b'\x82\x9e' b'.\x1f'
W0T9X4_ATG1_KLUMD_KD1 b'\xcc(' b'\xe3$' b'\xe1\xaa' b'\xf7\x1f' b'7\xaa' b'Z\x87' b'*\xa7' b'\x1c\xab' b'\xf4)' b'd\xa9' ... b'\xf2\xa7' b'\xb0\xaa' b'\xee\xa4' b'\x97\xaa' b'/(' b'9\xa3' b'n\xaa' b'\x80\xa6' b'\x00$' b'\xa7$'
W7JX98_KGP_PLAFO_KD1 b'\xd8!' b'@,' b'<\xa9' b'.#' b'\xb6\xa8' b'C\xa7' b'\x9c\x97' b'Z\xa9' b'\x1b\x1c' b'\xdf\xab' ... b'\x9d\xa6' b')\xa1' b'\xbb%' b'\xd2\xac' b'k+' b'V\xa5' b'y\xa4' b'3\x1d' b'\xa3&' b'\\\xa7'
X5M5N0_WNK_CAEEL_KD1 b'\xd7)' b'\x86/' b'\xef!' b'\xd5\x18' b'\xb9\xa6' b'\x04&' b'\xee\xa4' b'\x91\xac' b'\xf2\xa5' b'\xc3\xa3' ... b'z\xaa' b'U\x8e' b'\x87\xaa' b'\xfc\xa5' b'\xf6)' b'N\xa8' b']\xa4' b'\xb2\x95' b'\xc6\x1c' b'\x87\xa2'
X5M8U1_GCY17_CAEEL_KD1 b'\x13$' b'\xf6\x14' b'+*' b'\x8b%' b'\x89\x99' b'\xb0$' b'q\x1f' b'\\\xad' b'\x9c\xa3' b"'\xa6" ... b'b\xa8' b'\x91\xa7' b'\x1d\xa8' b'\xa9\xac' b'O)' b'h&' b'f&' b'\xd2\x99' b'\x13&' b'i\xa5'

5536 rows × 1024 columns

Z = linkage(t5, method='ward')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_456969/578039480.py in ?()
----> 1 Z = linkage(t5, method='ward')

~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/scipy/cluster/hierarchy.py in ?(y, method, metric, optimal_ordering)
   1017     >>> dn = dendrogram(Z)
   1018     >>> plt.show()
   1019     """
   1020     xp = array_namespace(y)
-> 1021     y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
   1022     lazy = is_lazy_array(y)
   1023 
   1024     if method not in _LINKAGE_METHODS:

~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/scipy/_lib/_array_api.py in ?(array, dtype, order, copy, xp, check_finite, subok)
    199             array = np.array(array, order=order, dtype=dtype, subok=subok)
    200         elif subok:
    201             array = np.asanyarray(array, order=order, dtype=dtype)
    202         else:
--> 203             array = np.asarray(array, order=order, dtype=dtype)
    204     else:
    205         try:
    206             array = xp.asarray(array, dtype=dtype, copy=copy)

~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/pandas/core/generic.py in ?(self, dtype, copy)
   2164             )
   2165         values = self._values
   2166         if copy is None:
   2167             # Note: branch avoids `copy=None` for NumPy 1.x support
-> 2168             arr = np.asarray(values, dtype=dtype)
   2169         else:
   2170             arr = np.array(values, dtype=dtype, copy=copy)
   2171 

ValueError: could not convert string to float: b';#'
def plot_dendrogram3(Z, output='dendrogram.pdf', color_thr=0.01, **kwargs):
    with plt.rc_context({'lines.linewidth': 0.3}):  # set default line width
        plt.figure(figsize=(5, 100))
        dendrogram(
            Z,
            orientation='left',
            color_threshold=color_thr,
            truncate_mode='level',
            p=20,
            leaf_font_size=1,
            show_contracted=True,
            **kwargs
        )
        plt.title('Hierarchical Clustering Dendrogram')
        plt.ylabel('Distance')
        plt.savefig(output, bbox_inches='tight')
        plt.close()
pspa_df = pspa.set_index('kd_ID').iloc[:,5:]
def get_dendrogram_labels(order_index, # iterable list of the dendrogram indexes
                          pssms, # df of flattened pssms with index as kd name
                          color_thr=0.15
                         ):
    
    labels = []
    for idx in order_index:
        if idx in pssms.index:
            flat_pssm =pssms.loc[idx]
            pssm_df = recover_pssm(flat_pssm)
            norm_pssm_df = clean_zero_normalize(pssm_df)
            seq = pssm_to_seq(norm_pssm_df, color_thr)
            labels.append(idx + ': ' + seq)
        else:
            labels.append(idx)

    return labels
labels=get_dendrogram_labels(t5.index,pspa_df,0.15)
pspa_df[pspa_df.index.str.contains('KC1A')]
-5P -5G -5A -5C -5S -5T -5V -5I -5L -5M ... 4E 4s 4t 4y 0s 0t 0y 0S 0T 0Y
kd_ID
P48729_KC1A_HUMAN_KD1 0.0843 0.0590 0.0664 0.0588 0.0590 0.0590 0.0459 0.0488 0.057 0.0530 ... 0.0564 0.1808 0.1808 0.1458 1.0 0.1435 0.0 1.0 0.1435 0.0
Q8N752_KC1AL_HUMAN_KD1 0.0514 0.0528 0.0542 0.0535 0.0546 0.0546 0.0544 0.0645 0.064 0.0639 ... 0.0512 0.0966 0.0966 0.1209 1.0 0.4354 0.0 1.0 0.4354 0.0

2 rows × 213 columns

# labels = [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()]
plot_dendrogram3(Z,labels =labels )
pspa_df2 = pspa_df.reset_index()
pspa_df2.shape
(362, 214)
pspa_df.columns
Index(['-5P', '-5G', '-5A', '-5C', '-5S', '-5T', '-5V', '-5I', '-5L', '-5M',
       ...
       '4E', '4s', '4t', '4y', '0s', '0t', '0y', '0S', '0T', '0Y'],
      dtype='object', length=213)
columns_to_fill = pspa_df.columns
df = df.merge(pspa_df2,'left')
for col in columns_to_fill:
    df[col] = df.groupby('kd_seq')[col].transform(lambda x: x.ffill().bfill())
len(pspa_df2)
362
df2 = df.dropna(subset='4E')
df2 = df2.set_index('kd_ID')[columns_to_fill]
labels=get_dendrogram_labels(t5.index,df2,0.15)
plot_dendrogram3(Z,output='dendrogram_similarity_1.pdf',labels =labels )
def get_dup(df):
    dup = df[df.kd_seq.duplicated(keep=False)].sort_values('kd_seq')
    return dup.groupby('kd_seq').agg({'kd_ID':lambda x: ','.join(x)}).reset_index()
dup_unique = get_dup(df)
dup_unique[dup_unique.kd_ID.str.contains("HUMAN")].to_csv('duplicate_human_across.csv')