import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
Hierarchical clustering
= pd.read_excel('out/uniprot_kd_active_D1_D2.xlsx') df
=pd.read_csv('out/pspa_uniprot_unique_no_TYR_category_remove2kd.csv') pspa
Clustal Omega
# df2 = df.head(4000).copy()
https://www.ebi.ac.uk/jdispatcher/msa/clustalo
# def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
# "Generate fasta file from sequences."
# records = [
# SeqRecord(Seq(row[seq_col]), id=row[id_col], description="")
# for _, row in df.iterrows()
# ]
# SeqIO.write(records, path, "fasta")
# print(len(records))
# get_fasta(df2,path='raw/active_kinase_domains_4k.fasta')
ProT5
= pd.read_excel('out/uniprot_kd.xlsx') df
=pd.read_parquet('out/uniprot_kd_t5.parquet') t5
from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
import matplotlib.pyplot as plt
t5
T5_0 | T5_1 | T5_2 | T5_3 | T5_4 | T5_5 | T5_6 | T5_7 | T5_8 | T5_9 | ... | T5_1014 | T5_1015 | T5_1016 | T5_1017 | T5_1018 | T5_1019 | T5_1020 | T5_1021 | T5_1022 | T5_1023 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
kd_ID | |||||||||||||||||||||
A0A075F7E9_LERK1_ORYSI_KD1 | b';#' | b'h,' | b'\x1f$' | b'J\x96' | b'v\x95' | b'y%' | b"\xc1'" | b'\xfd\xab' | b"H'" | b'~\xa4' | ... | b'\x82\xa7' | b'\xa0#' | b'^\x9e' | b'x\xac' | b'\xa4)' | b'\xef\xa0' | b'\xa3\x9f' | b'p%' | b'\\\x1d' | b'o\xa8' |
A0A078BQP2_GCY25_CAEEL_KD1 | b'Z\x95' | b'\xc3\xa7' | b'_%' | b'\xd2&' | b'\x8c\xa2' | b'](' | b'j\xa7' | b'\xc4\xae' | b'\xc9\xa4' | b'\xbc\xa0' | ... | b'\xf4\xa8' | b'\xb6\xa8' | b'>\xa4' | b'\xac\xac' | b'\xca+' | b'\xb5)' | b'\xaa\x19' | b'~\xa3' | b'\xe5)' | b'\x81\x1c' |
A0A078CGE6_M3KE1_BRANA_KD1 | b'\xfa*' | b'\x00.' | b'\xa2\x9f' | b'\xd4$' | b'i\xa9' | b'\x85\x9d' | b'\x0b\x9d' | b'\xfe\xa9' | b'\xe8\xa2' | b'\xff\xa4' | ... | b'\x1b\xa4' | b')\x12' | b'\xea\xa8' | b'L\xa9' | b't,' | b'\xba\xa5' | b'\xba\xa6' | b'\xa9\x95' | b'\xce\x1e' | b'j\xa8' |
A0A0G2K344_PK3CA_RAT_KD1 | b"\xe3'" | b'\x110' | b'r\x1f' | b'\xc5\xa4' | b'\x12 ' | b'\xe8$' | b'`\xa6' | b']\xac' | b'8\xa5' | b'8\xa0' | ... | b'\xa8\xa4' | b'^\xa3' | b'\t\xa6' | b'\xf1\x1e' | b'\xb1*' | b'\xad\xaa' | b'\xe8\xa5' | b'e(' | b'\x8f\x19' | b'D\x9f' |
A0A0H2ZM62_HK06_STRP2_KD1 | b'T%' | b'\x13\xac' | b'\xe4\xa2' | b'\xda)' | b'\x96\xa8' | b'\x00)' | b'B&' | b'\xb6\xad' | b'j\x1d' | b'u\xa5' | ... | b'\x00\xab' | b'0&' | b'\x15 ' | b'|\xae' | b'=,' | b'\xe4\x1d' | b'5\xaa' | b'\xba#' | b'b&' | b'\x0b%' |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
W0LYS5_CAMKI_MACNP_KD1 | b'\x86,' | b'\x83-' | b'#\xa5' | b">'" | b'\xb3\xa0' | b'\x1f\xa1' | b'l\xa6' | b'\xd6\xa9' | b'\x86\x9c' | b'\xe6\xa6' | ... | b'\xc9\xa5' | b'=\xa5' | b'T\xa8' | b'5\x9e' | b'C,' | b'B\xab' | b'\xa8\xa8' | b'\x05\xa2' | b'\x82\x9e' | b'.\x1f' |
W0T9X4_ATG1_KLUMD_KD1 | b'\xcc(' | b'\xe3$' | b'\xe1\xaa' | b'\xf7\x1f' | b'7\xaa' | b'Z\x87' | b'*\xa7' | b'\x1c\xab' | b'\xf4)' | b'd\xa9' | ... | b'\xf2\xa7' | b'\xb0\xaa' | b'\xee\xa4' | b'\x97\xaa' | b'/(' | b'9\xa3' | b'n\xaa' | b'\x80\xa6' | b'\x00$' | b'\xa7$' |
W7JX98_KGP_PLAFO_KD1 | b'\xd8!' | b'@,' | b'<\xa9' | b'.#' | b'\xb6\xa8' | b'C\xa7' | b'\x9c\x97' | b'Z\xa9' | b'\x1b\x1c' | b'\xdf\xab' | ... | b'\x9d\xa6' | b')\xa1' | b'\xbb%' | b'\xd2\xac' | b'k+' | b'V\xa5' | b'y\xa4' | b'3\x1d' | b'\xa3&' | b'\\\xa7' |
X5M5N0_WNK_CAEEL_KD1 | b'\xd7)' | b'\x86/' | b'\xef!' | b'\xd5\x18' | b'\xb9\xa6' | b'\x04&' | b'\xee\xa4' | b'\x91\xac' | b'\xf2\xa5' | b'\xc3\xa3' | ... | b'z\xaa' | b'U\x8e' | b'\x87\xaa' | b'\xfc\xa5' | b'\xf6)' | b'N\xa8' | b']\xa4' | b'\xb2\x95' | b'\xc6\x1c' | b'\x87\xa2' |
X5M8U1_GCY17_CAEEL_KD1 | b'\x13$' | b'\xf6\x14' | b'+*' | b'\x8b%' | b'\x89\x99' | b'\xb0$' | b'q\x1f' | b'\\\xad' | b'\x9c\xa3' | b"'\xa6" | ... | b'b\xa8' | b'\x91\xa7' | b'\x1d\xa8' | b'\xa9\xac' | b'O)' | b'h&' | b'f&' | b'\xd2\x99' | b'\x13&' | b'i\xa5' |
5536 rows × 1024 columns
= linkage(t5, method='ward') Z
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /tmp/ipykernel_456969/578039480.py in ?() ----> 1 Z = linkage(t5, method='ward') ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/scipy/cluster/hierarchy.py in ?(y, method, metric, optimal_ordering) 1017 >>> dn = dendrogram(Z) 1018 >>> plt.show() 1019 """ 1020 xp = array_namespace(y) -> 1021 y = _asarray(y, order='C', dtype=xp.float64, xp=xp) 1022 lazy = is_lazy_array(y) 1023 1024 if method not in _LINKAGE_METHODS: ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/scipy/_lib/_array_api.py in ?(array, dtype, order, copy, xp, check_finite, subok) 199 array = np.array(array, order=order, dtype=dtype, subok=subok) 200 elif subok: 201 array = np.asanyarray(array, order=order, dtype=dtype) 202 else: --> 203 array = np.asarray(array, order=order, dtype=dtype) 204 else: 205 try: 206 array = xp.asarray(array, dtype=dtype, copy=copy) ~/git/KATLAS/katlas/.venv/lib/python3.12/site-packages/pandas/core/generic.py in ?(self, dtype, copy) 2164 ) 2165 values = self._values 2166 if copy is None: 2167 # Note: branch avoids `copy=None` for NumPy 1.x support -> 2168 arr = np.asarray(values, dtype=dtype) 2169 else: 2170 arr = np.array(values, dtype=dtype, copy=copy) 2171 ValueError: could not convert string to float: b';#'
def plot_dendrogram3(Z, output='dendrogram.pdf', color_thr=0.01, **kwargs):
with plt.rc_context({'lines.linewidth': 0.3}): # set default line width
=(5, 100))
plt.figure(figsize
dendrogram(
Z,='left',
orientation=color_thr,
color_threshold='level',
truncate_mode=20,
p=1,
leaf_font_size=True,
show_contracted**kwargs
)'Hierarchical Clustering Dendrogram')
plt.title('Distance')
plt.ylabel(='tight')
plt.savefig(output, bbox_inches plt.close()
= pspa.set_index('kd_ID').iloc[:,5:] pspa_df
def get_dendrogram_labels(order_index, # iterable list of the dendrogram indexes
# df of flattened pssms with index as kd name
pssms, =0.15
color_thr
):
= []
labels for idx in order_index:
if idx in pssms.index:
=pssms.loc[idx]
flat_pssm = recover_pssm(flat_pssm)
pssm_df = clean_zero_normalize(pssm_df)
norm_pssm_df = pssm_to_seq(norm_pssm_df, color_thr)
seq + ': ' + seq)
labels.append(idx else:
labels.append(idx)
return labels
=get_dendrogram_labels(t5.index,pspa_df,0.15) labels
str.contains('KC1A')] pspa_df[pspa_df.index.
-5P | -5G | -5A | -5C | -5S | -5T | -5V | -5I | -5L | -5M | ... | 4E | 4s | 4t | 4y | 0s | 0t | 0y | 0S | 0T | 0Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
kd_ID | |||||||||||||||||||||
P48729_KC1A_HUMAN_KD1 | 0.0843 | 0.0590 | 0.0664 | 0.0588 | 0.0590 | 0.0590 | 0.0459 | 0.0488 | 0.057 | 0.0530 | ... | 0.0564 | 0.1808 | 0.1808 | 0.1458 | 1.0 | 0.1435 | 0.0 | 1.0 | 0.1435 | 0.0 |
Q8N752_KC1AL_HUMAN_KD1 | 0.0514 | 0.0528 | 0.0542 | 0.0535 | 0.0546 | 0.0546 | 0.0544 | 0.0645 | 0.064 | 0.0639 | ... | 0.0512 | 0.0966 | 0.0966 | 0.1209 | 1.0 | 0.4354 | 0.0 | 1.0 | 0.4354 | 0.0 |
2 rows × 213 columns
# labels = [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()]
=labels ) plot_dendrogram3(Z,labels
= pspa_df.reset_index() pspa_df2
pspa_df2.shape
(362, 214)
pspa_df.columns
Index(['-5P', '-5G', '-5A', '-5C', '-5S', '-5T', '-5V', '-5I', '-5L', '-5M',
...
'4E', '4s', '4t', '4y', '0s', '0t', '0y', '0S', '0T', '0Y'],
dtype='object', length=213)
= pspa_df.columns columns_to_fill
= df.merge(pspa_df2,'left') df
for col in columns_to_fill:
= df.groupby('kd_seq')[col].transform(lambda x: x.ffill().bfill()) df[col]
len(pspa_df2)
362
= df.dropna(subset='4E') df2
= df2.set_index('kd_ID')[columns_to_fill] df2
=get_dendrogram_labels(t5.index,df2,0.15) labels
='dendrogram_similarity_1.pdf',labels =labels ) plot_dendrogram3(Z,output
def get_dup(df):
= df[df.kd_seq.duplicated(keep=False)].sort_values('kd_seq')
dup return dup.groupby('kd_seq').agg({'kd_ID':lambda x: ','.join(x)}).reset_index()
= get_dup(df)
dup_unique str.contains("HUMAN")].to_csv('duplicate_human_across.csv') dup_unique[dup_unique.kd_ID.