import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
Hierarchical clustering
# Kinase-domain table; later cells read its `kd_ID` and `kd_seq` columns.
df = pd.read_excel('out/uniprot_kd_active_D1_D2.xlsx')
# PSPA table; later cells index it by `kd_ID` and slice off its leading columns.
pspa = pd.read_csv('out/pspa_uniprot_unique_no_TYR_category_remove2kd.csv')
Clustal Omega
# df2 = df.head(4000).copy()
https://www.ebi.ac.uk/jdispatcher/msa/clustalo
# def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
# "Generate fasta file from sequences."
# records = [
# SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
# for _, row in df.iterrows()
# ]
# SeqIO.write(records, path, "fasta")
# print(len(records))
# get_fasta(df2,path='raw/active_kinase_domains_4k.fasta')
ProT5
# Feature table that is clustered below; its index supplies the dendrogram label order.
# NOTE(review): presumably one ProT5 embedding row per kinase domain — confirm against the parquet file.
t5 = pd.read_parquet('out/uniprot_kd_t5.parquet')
from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
import matplotlib.pyplot as plt
# Ward-linkage hierarchical clustering of the rows of `t5`.
Z = linkage(t5, method='ward')
def plot_dendrogram3(Z, output='dendrogram.pdf', color_thr=0.01, **kwargs):
    """Draw a left-oriented, level-truncated dendrogram of `Z` and save it to `output`.

    Z: linkage matrix (e.g. the result of `scipy.cluster.hierarchy.linkage`).
    output: path of the figure file to write.
    color_thr: forwarded to `dendrogram` as `color_threshold`.
    **kwargs: extra keyword arguments forwarded to `dendrogram` (e.g. `labels=`).

    Returns None; the figure is written to disk and then closed.
    """
    with plt.rc_context({'lines.linewidth': 0.3}):  # thin lines for this figure only
        fig = plt.figure(figsize=(5, 100))  # very tall canvas for the leaf labels
        try:
            dendrogram(
                Z,
                orientation='left',
                color_threshold=color_thr,
                truncate_mode='level',
                p=20,
                leaf_font_size=1,
                show_contracted=True,
                **kwargs,
            )
            plt.title('Hierarchical Clustering Dendrogram')
            # With orientation='left' the distance scale runs along the x-axis.
            plt.xlabel('Distance')
            plt.savefig(output, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)
from katlas.core import *
# Index the PSPA table by kinase-domain ID and drop the first five remaining columns.
# NOTE(review): presumably those five are metadata and the rest are flattened PSSM values — confirm.
pspa_df = pspa.set_index('kd_ID').iloc[:, 5:]
def get_dendrogram_labels(order_index,  # iterable list of the dendrogram indexes
                          pssms,  # df of flattened pssms with index as kd name
                          color_thr=0.15,  # threshold passed through to `pssm_to_seq`
                          ):
    """Build one dendrogram label per entry of `order_index`.

    Entries found in `pssms.index` are labelled "<idx>: <seq>", where <seq> is
    obtained by passing the row through `recover_pssm`, `clean_zero_normalize`
    and `pssm_to_seq` (helpers expected from `katlas.core`). Entries without a
    row in `pssms` are returned unchanged.

    Returns a list of labels in the same order as `order_index`.
    """
    labels = []
    for idx in order_index:
        if idx not in pssms.index:
            # No PSSM for this entry: keep the bare identifier.
            labels.append(idx)
            continue
        flat_pssm = pssms.loc[idx]
        pssm_df = recover_pssm(flat_pssm)
        norm_pssm_df = clean_zero_normalize(pssm_df)
        seq = pssm_to_seq(norm_pssm_df, color_thr)
        labels.append(f'{idx}: {seq}')
    return labels
# Labels in the order of the clustered rows, annotated where a PSPA row exists.
labels = get_dendrogram_labels(t5.index, pspa_df, 0.15)
# Inspect the PSPA rows whose ID contains 'KC1A' (notebook display).
pspa_df[pspa_df.index.str.contains('KC1A')]
| kd_ID | -5P | -5G | -5A | -5C | -5S | -5T | -5V | -5I | -5L | -5M | ... | 4E | 4s | 4t | 4y | 0s | 0t | 0y | 0S | 0T | 0Y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| P48729_KC1A_HUMAN_KD1 | 0.0843 | 0.0590 | 0.0664 | 0.0588 | 0.0590 | 0.0590 | 0.0459 | 0.0488 | 0.057 | 0.0530 | ... | 0.0564 | 0.1808 | 0.1808 | 0.1458 | 1.0 | 0.1435 | 0.0 | 1.0 | 0.1435 | 0.0 |
| Q8N752_KC1AL_HUMAN_KD1 | 0.0514 | 0.0528 | 0.0542 | 0.0535 | 0.0546 | 0.0546 | 0.0544 | 0.0645 | 0.064 | 0.0639 | ... | 0.0512 | 0.0966 | 0.0966 | 0.1209 | 1.0 | 0.4354 | 0.0 | 1.0 | 0.4354 | 0.0 |
2 rows × 213 columns
# labels = [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()]
# Write the labelled dendrogram to the default output path.
plot_dendrogram3(Z, labels=labels)
# Move `kd_ID` back into a regular column so it can take part in a merge.
pspa_df2 = pspa_df.reset_index()
pspa_df2.shape
(362, 214)
# Display the column labels of `pspa_df` (notebook cell; its output follows).
pspa_df.columns
Index(['-5P', '-5G', '-5A', '-5C', '-5S', '-5T', '-5V', '-5I', '-5L', '-5M',
...
'4E', '4s', '4t', '4y', '0s', '0t', '0y', '0S', '0T', '0Y'],
dtype='object', length=213)
# Columns brought in from the PSPA table.
columns_to_fill = pspa_df.columns
# Left-join on the columns the two frames share, keeping every row of `df`.
df = df.merge(pspa_df2, how='left')
# Within each group of identical `kd_seq`, copy values into rows that are still NaN
# (forward fill, then backward fill).
for col in columns_to_fill:
    df[col] = df.groupby('kd_seq')[col].transform(lambda x: x.ffill().bfill())
len(pspa_df2)
362
# Keep only rows where column '4E' is not NaN after the fill.
df2 = df.dropna(subset=['4E'])
df2 = df2.set_index('kd_ID')[columns_to_fill]
# Re-label and re-plot using the filled table.
labels = get_dendrogram_labels(t5.index, df2, 0.15)
plot_dendrogram3(Z, output='dendrogram_similarity_1.pdf', labels=labels)
def get_dup(df):
    """Group rows that share a `kd_seq` and list their `kd_ID`s.

    df: DataFrame with string columns `kd_seq` and `kd_ID`.

    Returns a DataFrame with one row per `kd_seq` that occurs more than once,
    with columns `kd_seq` and `kd_ID`, where `kd_ID` is the comma-joined IDs
    of the rows sharing that sequence.
    """
    # keep=False marks every member of a duplicated group, not just the repeats.
    dup = df[df.kd_seq.duplicated(keep=False)].sort_values('kd_seq')
    return dup.groupby('kd_seq').agg({'kd_ID': ','.join}).reset_index()
# Sequences shared by more than one kinase-domain ID.
dup_unique = get_dup(df)
# Export the groups whose joined ID string contains "HUMAN".
dup_unique[dup_unique.kd_ID.str.contains("HUMAN")].to_csv('duplicate_human_across.csv')