python-katlas

ProtT5 embeddings

from katlas.feature import *
import pandas as pd
from katlas.data import *

# pip install git+https://github.com/sky1ove/kdock.git

from kdock.core.protein import *
import time

from tqdm import tqdm
tqdm.pandas()

Extract T5 embeddings

# Fetch the kinase info table via Data.get_kinase_info().
df=Data.get_kinase_info()

# Independent copy of the table ordered by the 'length' column, largest first.
dd = df.sort_values('length',ascending=False).copy()

# Preview the rows with the largest 'length' values.
dd.head()
# Disabled trial run on rows at positions 3-4 of the sorted table:
# t5_feat = get_t5(dd.iloc[3:5],'human_uniprot_sequence')

	kinase	ID_coral	uniprot	gene	modi_group	group	family	subfamily_coral	subfamily	in_pspa_st	...	cytosol	cytoskeleton	plasma membrane	mitochondrion	Golgi apparatus	endoplasmic reticulum	vesicle	centrosome	aggresome	main_location
498	TTN	TTN	Q8WZ42	TTN	CAMK	CAMK	MLCK	NaN	MLCK	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
325	OBSCN	Obscn	Q5VST9	OBSCN	CAMK	CAMK	Trio	NaN	Trio	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
326	OBSCN_b	Obscn_b	Q5VST9	OBSCN	CAMK	CAMK	Trio	NaN	Trio	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
124	DNAPK	DNAPK	P78527	PRKDC	Atypical	Atypical	PIKK	DNAPK	DNAPK	1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
490	TRRAP	TRRAP	Q9Y4A5	TRRAP	Atypical	Atypical	PIKK	TRRAP	TRRAP	0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 36 columns

We skip TTN (length 34350), OBSCN (length 7968), and OBSCN_b (length 7968) because they run out of memory (OOM), and start with DNAPK (length 4128).

# Kinases left out of the embedding run.
skip_kinase = ['TTN','OBSCN','OBSCN_b']

# Drop the skipped kinases, build a 'uniprot_gene' key ("<uniprot>_<gene>"),
# keep the first row for each key, and index the table by that key.
df = (
    df[~df.kinase.isin(skip_kinase)]
    .assign(uniprot_gene=lambda kept: kept.uniprot + '_' + kept.gene)
    .drop_duplicates('uniprot_gene')
    .set_index('uniprot_gene')
)

# Manual tokenizer/model loading, kept for reference (disabled):
# from transformers import T5Tokenizer, T5EncoderModel

# tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

#     # Load the model
# model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to('cuda')

# Run get_t5 on the 'human_uniprot_sequence' column of df.
# NOTE(review): presumably returns a DataFrame of T5 features (it is written with to_parquet below) - confirm.
t5_feat = get_t5(df,'human_uniprot_sequence')

# Disabled: persist the features to parquet.
# t5_feat.to_parquet('out/human_full_protein_t5.parquet')

Hierarchical

from scipy.cluster.hierarchy import linkage, fcluster,dendrogram
import matplotlib.pyplot as plt

# Reload the stored full-protein T5 features from parquet.
t5_feat = pd.read_parquet('out/human_full_protein_t5.parquet')

# Hierarchical clustering of the feature rows using Ward linkage.
Z = linkage(t5_feat, method='ward')

from katlas.core import *

# Plot the dendrogram labelled with the feature index, then save it as SVG.
# NOTE(review): plot_dendrogram and save_svg presumably come from the katlas star imports - confirm.
plot_dendrogram(Z,labels=t5_feat.index,color_thr=1)
save_svg('full_protein_dendrogram.svg')

from katlas.core import *

# Index the PSPA table by 'kd_ID' and keep the columns from position 5 onward.
# NOTE(review): `pspa` is not assigned anywhere visible in this notebook; presumably it is provided by a katlas star import - confirm.
pspa_df = pspa.set_index('kd_ID').iloc[:,5:]

def get_dendrogram_labels(order_index, # iterable list of the dendrogram indexes
                          pssms, # df of flattened pssms with index as kd name
                          color_thr=0.15
                         ):
    """Build one label per entry of `order_index`, in the same order.

    An entry found in `pssms.index` is labelled "<entry>: <seq>", where
    <seq> is the result of `pssm_to_seq` (called with `color_thr`) on the
    recovered and zero-cleaned/normalized PSSM of that row. Any other entry
    is returned unchanged.
    """

    def _label_for(name):
        # Entries without a PSSM row keep their plain name.
        if name not in pssms.index:
            return name
        normalized = clean_zero_normalize(recover_pssm(pssms.loc[name]))
        return name + ': ' + pssm_to_seq(normalized, color_thr)

    return [_label_for(name) for name in order_index]

# Build dendrogram labels, attaching a PSSM-derived sequence where pspa_df has a matching row.
# NOTE(review): `t5` is only assigned further down (in the "Load" section), so this cell relies on
# out-of-order execution - confirm that t5.index matches the leaves of the linkage Z being plotted.
labels=get_dendrogram_labels(t5.index,pspa_df,0.15)

# Inspect the rows whose kd_ID contains 'KC1A'.
pspa_df[pspa_df.index.str.contains('KC1A')]

	-5P	-5G	-5A	-5C	-5S	-5T	-5V	-5I	-5L	-5M	...	4E	4s	4t	4y	0s	0t	0y	0S	0T	0Y
kd_ID
P48729_KC1A_HUMAN_KD1	0.0843	0.0590	0.0664	0.0588	0.0590	0.0590	0.0459	0.0488	0.057	0.0530	...	0.0564	0.1808	0.1808	0.1458	1.0	0.1435	0.0	1.0	0.1435	0.0
Q8N752_KC1AL_HUMAN_KD1	0.0514	0.0528	0.0542	0.0535	0.0546	0.0546	0.0544	0.0645	0.064	0.0639	...	0.0512	0.0966	0.0966	0.1209	1.0	0.4354	0.0	1.0	0.4354	0.0

2 rows × 213 columns

# Disabled alternative that labelled every PSSM row directly:
# labels = [i+': '+pssm_to_seq(recover_pssm(r),0.2) for i,r in pssms.iterrows()]

# NOTE(review): plot_dendrogram3 is not defined in the visible part of this notebook - confirm where it comes from.
plot_dendrogram3(Z,labels =labels )

# Move 'kd_ID' from the index back to a regular column.
pspa_df2 = pspa_df.reset_index()

# Show the dimensions of the reset table.
pspa_df2.shape

(362, 214)

# List the PSSM column names.
pspa_df.columns

Index(['-5P', '-5G', '-5A', '-5C', '-5S', '-5T', '-5V', '-5I', '-5L', '-5M',
       ...
       '4E', '4s', '4t', '4y', '0s', '0t', '0y', '0S', '0T', '0Y'],
      dtype='object', length=213)

# PSSM value columns that get filled after the merge.
columns_to_fill = pspa_df.columns

# Left-merge the PSSM table into df; no `on` is given, so pandas joins on the columns both frames share.
# NOTE(review): df must contain 'kd_seq' and 'kd_ID' at this point; which earlier df this is depends on cell execution order - confirm.
df = df.merge(pspa_df2,'left')

# Within each group of rows sharing the same 'kd_seq', fill missing PSSM values
# from other rows of the group (forward-fill, then back-fill).
for col in columns_to_fill:
    df[col] = df.groupby('kd_seq')[col].transform(lambda x: x.ffill().bfill())

# Show the number of rows in the PSSM table.
len(pspa_df2)

# Keep only the rows that have a value in column '4E'.
df2 = df.dropna(subset='4E')

# Index by 'kd_ID' and keep just the PSSM columns.
df2 = df2.set_index('kd_ID')[columns_to_fill]

# NOTE(review): `t5` is only assigned further down (in the "Load" section) - confirm it is the
# frame whose rows were clustered into Z.
labels=get_dendrogram_labels(t5.index,df2,0.15)

# NOTE(review): plot_dendrogram3 is not defined in the visible part of this notebook - confirm where it comes from.
plot_dendrogram3(Z,output='dendrogram_similarity_1.pdf',labels =labels )

Kinase domain

# Load the kinase-domain table. The path is relative to the working directory;
# the recorded output below shows a FileNotFoundError when the file is absent.
df = pd.read_excel('raw/uniprot_kd.xlsx')

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 df = pd.read_excel('raw/uniprot_kd.xlsx')

File f:\git\kkatlas\.venv\Lib\site-packages\pandas\io\excel\_base.py:495, in read_excel(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)
    493 if not isinstance(io, ExcelFile):
    494     should_close = True
--> 495     io = ExcelFile(
    496         io,
    497         storage_options=storage_options,
    498         engine=engine,
    499         engine_kwargs=engine_kwargs,
    500     )
    501 elif engine and engine != io.engine:
    502     raise ValueError(
    503         "Engine should not be specified when passing "
    504         "an ExcelFile - ExcelFile already has the engine set"
    505     )

File f:\git\kkatlas\.venv\Lib\site-packages\pandas\io\excel\_base.py:1550, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options, engine_kwargs)
   1548     ext = "xls"
   1549 else:
-> 1550     ext = inspect_excel_format(
   1551         content_or_path=path_or_buffer, storage_options=storage_options
   1552     )
   1553     if ext is None:
   1554         raise ValueError(
   1555             "Excel file format cannot be determined, you must specify "
   1556             "an engine manually."
   1557         )

File f:\git\kkatlas\.venv\Lib\site-packages\pandas\io\excel\_base.py:1402, in inspect_excel_format(content_or_path, storage_options)
   1399 if isinstance(content_or_path, bytes):
   1400     content_or_path = BytesIO(content_or_path)
-> 1402 with get_handle(
   1403     content_or_path, "rb", storage_options=storage_options, is_text=False
   1404 ) as handle:
   1405     stream = handle.handle
   1406     stream.seek(0)

File f:\git\kkatlas\.venv\Lib\site-packages\pandas\io\common.py:882, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    873         handle = open(
    874             handle,
    875             ioargs.mode,
   (...)    878             newline="",
    879         )
    880     else:
    881         # Binary mode
--> 882         handle = open(handle, ioargs.mode)
    883     handles.append(handle)
    885 # Convert BytesIO or file objects passed with an encoding

FileNotFoundError: [Errno 2] No such file or directory: 'raw/uniprot_kd.xlsx'

# Show the dimensions of the loaded table.
df.shape

(5536, 27)

# Run get_t5 on the 'kd_seq' column of df.
# NOTE(review): presumably returns one feature row per row of df, in the same order - confirm,
# since the next line attaches df.kd_ID as the index.
t5feature = get_t5(df,'kd_seq')

# Use the kd_ID values as the index of the feature table.
t5feature_df = t5feature.set_index(df.kd_ID)

# Disabled: persist the kinase-domain features to parquet.
# t5feature_df.to_parquet('out/uniprot_kd_t5.parquet')

Load

import pandas as pd

# Reload the stored kinase-domain T5 features from parquet.
t5=pd.read_parquet('out/uniprot_kd_t5.parquet')

# Keep only the rows whose index label contains 'HUMAN'.
human = t5.loc[t5.index.str.contains('HUMAN')]

# Display the filtered table.
human

	T5_0	T5_1	T5_2	T5_3	T5_4	T5_5	T5_6	T5_7	T5_8	T5_9	...	T5_1014	T5_1015	T5_1016	T5_1017	T5_1018	T5_1019	T5_1020	T5_1021	T5_1022	T5_1023
kd_ID
A4D2B8_PM2P1_HUMAN_KD1	0.036774	0.059082	0.017517	0.029770	-0.031342	0.051331	-0.036804	-0.007477	0.003513	0.023254	...	0.005329	-0.007664	-0.023956	-0.061340	0.055481	-0.052155	-0.057739	-0.063416	0.037384	-0.042938
A4QPH2_PI4P2_HUMAN_KD1	0.085022	0.115601	-0.003788	-0.025375	0.002710	0.014046	-0.027451	-0.091187	-0.025314	-0.008125	...	-0.009933	-0.020767	-0.030838	-0.020828	0.036560	-0.056000	-0.019379	0.061218	-0.001641	0.055908
O00141_SGK1_HUMAN_KD1	0.044617	0.130127	-0.013618	0.014923	-0.008316	-0.014488	-0.027954	-0.045044	0.008698	-0.031647	...	-0.034180	-0.009949	-0.016205	-0.006100	0.057068	-0.041412	-0.022461	-0.004608	0.008453	0.003035
O00238_BMR1B_HUMAN_KD1	0.048584	0.143311	0.049774	0.009216	-0.013748	0.034637	-0.025513	-0.067810	0.004253	-0.000382	...	-0.042328	-0.019745	-0.037445	-0.010078	0.058380	-0.034637	-0.032471	-0.003811	-0.033539	-0.025116
O00311_CDC7_HUMAN_KD1	0.037994	0.018890	-0.033600	0.005436	-0.018784	0.082947	-0.035309	-0.002960	0.068237	0.008904	...	-0.050690	-0.050323	-0.017853	-0.051605	0.019638	0.011986	-0.053955	0.011955	0.024643	0.030258
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Q9Y616_IRAK3_HUMAN_KD1	-0.016159	0.030655	0.008446	0.023926	0.003010	0.025528	-0.027893	-0.068665	0.048035	-0.020081	...	0.000594	-0.020996	0.041138	-0.086670	0.060516	0.029602	-0.006458	-0.001279	0.013954	-0.011383
Q9Y6E0_STK24_HUMAN_KD1	0.083130	0.109009	0.007038	0.000793	0.000355	0.006264	-0.031525	-0.044250	-0.047546	-0.010468	...	-0.040649	-0.009003	-0.076111	-0.012901	0.069702	-0.057648	-0.023697	-0.003151	0.001767	-0.020844
Q9Y6M4_KC1G3_HUMAN_KD1	0.081421	0.142700	-0.008904	-0.010002	-0.019638	0.021576	-0.047394	-0.060516	-0.010567	-0.041534	...	-0.013542	0.002172	-0.053345	0.003716	0.061646	-0.072937	-0.019058	-0.049835	-0.028427	-0.005585
Q9Y6R4_M3K4_HUMAN_KD1	0.069580	0.103821	0.000365	0.015762	-0.018250	0.008316	-0.008224	-0.042358	0.005066	-0.021591	...	-0.022095	-0.014961	-0.034424	-0.036560	0.068237	-0.025970	-0.039673	-0.009048	-0.008766	-0.023636
Q9Y6S9_RPKL1_HUMAN_KD1	-0.009552	0.027008	0.041565	0.003145	-0.022430	0.084839	-0.022430	-0.047516	0.069275	0.007629	...	0.039185	-0.053375	-0.039093	-0.053223	-0.015930	0.031982	0.005432	0.007584	0.032257	0.042114

539 rows × 1024 columns