Covalent bond small inhibitor

Reference

GitHub AF3 issue: https://github.com/google-deepmind/alphafold3/issues/159

Setup

from kdock.data.core import *
import pandas as pd

Prepare json file

seq='HHHHHHAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLIMQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGRAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG'
seq[106-1] # residue numbers are 1-based, Python indices are 0-based; always check that this residue is the receptor side of the bonded atom pair
'C'

First run: search the databases to get the MSA and templates:

# Build the protein-only input for the first (search-enabled) run.
protein_json = get_protein_json(
    'proteinA',
    seq,
    'data/proteinA.json',
    seeds=[1],
)

Second run: read the protein JSON directly:

# Reload the protein JSON from disk instead of rebuilding it.
protein_json = read_json('3w2q_test_data.json')
# Show only the first 1000 characters; the dict also holds the long 'unpairedMsa' text.
print(str(protein_json)[:1000])
{'dialect': 'alphafold3', 'version': 3, 'name': '3W2Q_test', 'sequences': [{'ligand': {'id': 'L', 'ccdCodes': ['lig-any']}}, {'protein': {'id': 'A', 'sequence': 'HHHHHHAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLIMQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGRAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG', 'modifications': [], 'unpairedMsa': ">query\nHHHHHHAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLIMQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGRAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG\n>UniRef90_A0A498NET7/528-812 [subseq from] Receptor protein-tyrosine kinase n=2 Tax=Labeo rohita TaxID=84645 RepID=A0A498NET7_

Prepare ligand

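Load the PDB file in Maestro, split the complex, and save the ligand (without the covalent bond) as a separate file. Note that the conversion step below reads an SDF file, so export the ligand as SDF (or convert the saved PDB to SDF first).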

Convert the ligand file (SDF) to CCD format

# sdf2ccd returns the ligand as CCD-style mmCIF text (_chem_comp atom and bond loops).
ccd_text = sdf2ccd('covalent_test/lig-HKI.sdf')
ccd_text
"data_lig-1\n#\n_chem_comp.id lig-1\n_chem_comp.name 'lig-1'\n_chem_comp.type non-polymer\n_chem_comp.formula '?'\n_chem_comp.mon_nstd_parent_comp_id ?\n_chem_comp.pdbx_synonyms ?\n_chem_comp.formula_weight '?'\n#\nloop_\n_chem_comp_atom.comp_id\n_chem_comp_atom.atom_id\n_chem_comp_atom.type_symbol\n_chem_comp_atom.charge\n_chem_comp_atom.pdbx_leaving_atom_flag\n_chem_comp_atom.pdbx_model_Cartn_x_ideal\n_chem_comp_atom.pdbx_model_Cartn_y_ideal\n_chem_comp_atom.pdbx_model_Cartn_z_ideal\nlig-1 C1 C 0 N 1.654 24.013 52.956\nlig-1 C2 C 0 N 1.438 32.804 50.984\nlig-1 C3 C 0 N 0.712 33.151 49.868\nlig-1 C4 C 0 N -0.692 25.377 50.586\nlig-1 C5 C 0 N -0.322 26.674 50.279\nlig-1 C6 C 0 N 1.518 31.470 51.317\nlig-1 C7 C 0 N -0.243 20.239 50.608\nlig-1 C8 C 0 N 1.486 18.601 51.967\nlig-1 C9 C 0 N 1.586 24.734 50.460\nlig-1 C10 C 0 N 0.101 32.158 49.137\nlig-1 C11 C 0 N 2.414 21.749 53.291\nlig-1 C12 C 0 N 1.591 22.631 52.634\nlig-1 C13 C 0 N 0.661 20.790 51.497\nlig-1 C14 C 0 N 1.536 19.963 52.199\nlig-1 C15 C 0 N 0.256 24.378 50.677\nlig-1 C16 C 0 N -0.289 18.874 50.388\nlig-1 C17 C 0 N 0.679 22.164 51.710\nlig-1 C18 C 0 N 1.009 26.999 50.081\nlig-1 C19 C 0 N 0.589 18.055 51.071\nlig-1 C20 C 0 N 1.985 26.032 50.174\nlig-1 C21 C 0 N 0.874 30.542 50.526\nlig-1 C22 C 0 N -2.591 18.634 49.703\nlig-1 C23 C 0 N 1.402 14.519 51.149\nlig-1 C24 C 0 N -7.831 18.607 48.749\nlig-1 C25 C 0 N -7.717 17.604 46.668\nlig-1 C26 C 0 N 0.962 29.094 50.896\nlig-1 C27 C 0 N -3.554 17.670 49.067\nlig-1 C28 C 0 N -5.011 18.123 49.196\nlig-1 C29 C 0 N -5.726 17.713 47.913\nlig-1 C30 C 0 N 1.690 16.001 51.281\nlig-1 N31 N 0 N 1.622 25.129 53.300\nlig-1 N32 N 0 N 2.416 20.432 53.093\nlig-1 N33 N 0 N 0.164 30.856 49.437\nlig-1 N34 N 0 N -0.176 23.061 51.022\nlig-1 N35 N 0 N -1.237 18.341 49.477\nlig-1 N36 N 0 N -6.959 18.439 47.595\nlig-1 O37 O 0 N -3.018 19.579 50.358\nlig-1 O38 O 0 N 1.397 28.299 49.780\nlig-1 O39 O 0 N 0.523 16.689 50.833\nlig-1 Cl40 Cl 0 N 3.657 26.498 
49.921\n#\nloop_\n_chem_comp_bond.atom_id_1\n_chem_comp_bond.atom_id_2\n_chem_comp_bond.value_order\n_chem_comp_bond.pdbx_aromatic_flag\nC1 C12 SING N\nC1 N31 TRIP N\nC2 C3 DOUB N\nC2 C6 SING N\nC3 C10 SING N\nC4 C5 DOUB N\nC4 C15 SING N\nC5 C18 SING N\nC6 C21 DOUB N\nC7 C13 DOUB N\nC7 C16 SING N\nC8 C14 DOUB N\nC8 C19 SING N\nC9 C15 DOUB N\nC9 C20 SING N\nC10 N33 DOUB N\nC11 C12 SING N\nC11 N32 DOUB N\nC12 C17 DOUB N\nC13 C14 SING N\nC13 C17 SING N\nC14 N32 SING N\nC15 N34 SING N\nC16 C19 DOUB N\nC16 N35 SING N\nC17 N34 SING N\nC18 C20 DOUB N\nC18 O38 SING N\nC19 O39 SING N\nC20 Cl40 SING N\nC21 C26 SING N\nC21 N33 SING N\nC22 C27 SING N\nC22 N35 SING N\nC22 O37 DOUB N\nC23 C30 SING N\nC24 N36 SING N\nC25 N36 SING N\nC26 O38 SING N\nC27 C28 SING N\nC28 C29 SING N\nC29 N36 SING N\nC30 O39 SING N\n#"
get_protein_ccd_json?
Signature:
get_protein_ccd_json(
    protein_json,
    rec_residue_num: int,
    rec_atom_id: str,
    lig_sdf_path,
    lig_atom_id: str,
    job_id: str,
    save_path=None,
    seeds=[1],
)
Docstring: Create AlphaFold3 docking JSON with customized CCD ligand and bondedAtomPairs.
File:      ~/af_kit/af_kit/covalent.py
Type:      function
# Build the covalent-docking input: protein JSON + custom CCD ligand + one
# bonded atom pair. Arguments are named to match the signature of
# get_protein_ccd_json so the receptor/ligand atoms are easy to tell apart.
data = get_protein_ccd_json(
    protein_json,
    rec_residue_num=106,  # 1-based residue number on the receptor chain
    rec_atom_id='SG',  # receptor atom that takes part in the bond
    lig_sdf_path='covalent_test/lig-HKI.sdf',
    lig_atom_id='C28',  # ligand atom that takes part in the bond
    job_id='test',
    save_path='3W2Q_3.json',
)
# Preview only the head of the dict; the full content is very long.
print(str(data)[:1000])
{'name': '3W2Q_test', 'modelSeeds': [1], 'sequences': [{'ligand': {'id': 'L', 'ccdCodes': ['lig-any']}}, {'protein': {'id': 'A', 'sequence': 'HHHHHHAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLIMQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGRAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG', 'modifications': [], 'unpairedMsa': ">query\nHHHHHHAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLIMQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGRAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG\n>UniRef90_A0A498NET7/528-812 [subseq from] Receptor protein-tyrosine kinase n=2 Tax=Labeo rohita TaxID=84645 RepID=A0A498NET7_LABRO\n---KHHKKKETRR
str(data)[-1000:]
ig-any O37 O 0 N -3.018 19.579 50.358\nlig-any O38 O 0 N 1.397 28.299 49.780\nlig-any O39 O 0 N 0.523 16.689 50.833\nlig-any Cl40 Cl 0 N 3.657 26.498 49.921\n#\nloop_\n_chem_comp_bond.atom_id_1\n_chem_comp_bond.atom_id_2\n_chem_comp_bond.value_order\n_chem_comp_bond.pdbx_aromatic_flag\nC1 C12 SING N\nC1 N31 TRIP N\nC2 C3 DOUB N\nC2 C6 SING N\nC3 C10 SING N\nC4 C5 DOUB N\nC4 C15 SING N\nC5 C18 SING N\nC6 C21 DOUB N\nC7 C13 DOUB N\nC7 C16 SING N\nC8 C14 DOUB N\nC8 C19 SING N\nC9 C15 DOUB N\nC9 C20 SING N\nC10 N33 DOUB N\nC11 C12 SING N\nC11 N32 DOUB N\nC12 C17 DOUB N\nC13 C14 SING N\nC13 C17 SING N\nC14 N32 SING N\nC15 N34 SING N\nC16 C19 DOUB N\nC16 N35 SING N\nC17 N34 SING N\nC18 C20 DOUB N\nC18 O38 SING N\nC19 O39 SING N\nC20 Cl40 SING N\nC21 C26 SING N\nC21 N33 SING N\nC22 C27 SING N\nC22 N35 SING N\nC22 O37 DOUB N\nC23 C30 SING N\nC24 N36 SING N\nC25 N36 SING N\nC26 O38 SING N\nC27 C28 SING N\nC28 C29 SING N\nC29 N36 SING N\nC30 O39 SING N\n#", 'dialect': 'alphafold3', 'version': 3}
data['bondedAtomPairs']
[[['A', 106, 'SG'], ['L', 1, 'C28']]]

Docker command

Move the generated JSON file (e.g. proteinA.json) to the af_input/<project_name> folder

project_name='common'

First run with search enabled:

# First run, with search enabled (skip_search is not passed).
docker_single_full(
    f"af_input/{project_name}/3W2Q.json",
    output_dir=f"af_output/{project_name}",
)

After the first run, skip the MSA search:

# Later runs pass skip_search=True; the printed command then carries
# --norun_data_pipeline.
docker_single_full(
    json_path=f"af_input/{project_name}/3W2Q_3.json",
    output_dir=f"af_output/{project_name}",
    skip_search=True,
)
docker run --rm \
    --volume "$HOME/af_input:/root/af_input" \
    --volume "$HOME/af_output/common:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_database:/root/public_databases" \
    --gpus "device=0" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --json_path=/root/af_input/common/3W2Q_3.json \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --norun_data_pipeline

Multiple protein–SMILES pairs in a DataFrame

df = pd.DataFrame({'idx':['a','b'],'smi':['CCC','OCO']})
df
idx smi
0 a CCC
1 b OCO
# One call per DataFrame row; each row's idx is also used as the JSON file stem.
for idx, smi in zip(df['idx'], df['smi']):
    out_json = f'af_input/{project_name}/{idx}.json'
    _ = get_protein_smiles_json(idx, smi, protein_json, out_json, seeds=[1, 2, 3])

This will generate one JSON file per row in the directory, which can be many files.

Split the files into multiple subfolders for multi-GPU runs

split_nfolder(f'af_input/{project_name}')
Distributed 2 files into 4 folders.

Docker

docker pull sky1ove/alphafold3

# One docker_multi_infer call per GPU: folder_<n> is paired with GPU <n>.
for gpu_id in range(4):
    docker_multi_infer(
        input_dir=f"af_input/{project_name}/folder_{gpu_id}",
        output_dir=f"af_output/{project_name}",
        gpus=gpu_id,
    )
# --norun_data_pipeline skips the data pipeline (MSA and template search), which was already run in the first step
docker run --rm \
    --volume "$HOME/af_input:/root/af_input" \
    --volume "$HOME/af_output/sdf:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_db:/root/public_databases" \
    --volume "$HOME/af_cache:/root/cache" \
    --gpus "device=0" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/sdf/folder_0 \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --jax_compilation_cache_dir=/root/cache \
    --norun_data_pipeline
docker run --rm \
    --volume "$HOME/af_input:/root/af_input" \
    --volume "$HOME/af_output/sdf:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_db:/root/public_databases" \
    --volume "$HOME/af_cache:/root/cache" \
    --gpus "device=1" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/sdf/folder_1 \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --jax_compilation_cache_dir=/root/cache \
    --norun_data_pipeline
docker run --rm \
    --volume "$HOME/af_input:/root/af_input" \
    --volume "$HOME/af_output/sdf:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_db:/root/public_databases" \
    --volume "$HOME/af_cache:/root/cache" \
    --gpus "device=2" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/sdf/folder_2 \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --jax_compilation_cache_dir=/root/cache \
    --norun_data_pipeline
docker run --rm \
    --volume "$HOME/af_input:/root/af_input" \
    --volume "$HOME/af_output/sdf:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_db:/root/public_databases" \
    --volume "$HOME/af_cache:/root/cache" \
    --gpus "device=3" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/sdf/folder_3 \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --jax_compilation_cache_dir=/root/cache \
    --norun_data_pipeline