Generate json

Single protein sequence (default)

Default pipeline, will run MSA and template search


source

dump_json

 dump_json (data, save_path)

Save json data into a file


source

get_protein_json

 get_protein_json (name, seq, save_path=None, seeds=[1])

Generate json of single protein sequence for input of docker command

Type Default Details
name job name
seq aa sequence
save_path NoneType None .json
seeds list [1]
data = get_protein_json('proteinA','AAA','data/proteinA.json',seeds=[1,2,3])
data
{'name': 'proteinA',
 'modelSeeds': [1, 2, 3],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 3}

Protein-SMILES

  • First run the normal sequence only pipeline for the protein
  • Get the output data.json file, read it, load the ["sequences"][0]["protein"]

source

read_json

 read_json (file_path)
protein_json = read_json('data/seq_only_data.json')

source

get_protein_smiles_json

 get_protein_smiles_json (smi_id:str, SMILES:str, protein_json,
                          save_path=None, seeds=[1])

Get json for protein-ligand docking task

Type Default Details
smi_id str
SMILES str
protein_json json type
save_path NoneType None .json
seeds list [1]
out = get_protein_smiles_json('smi_name','CCC',protein_json,'data/protein_smi.json',seeds=[1,2,3])

Let’s take a look at the json:

str(out)[:100]
"{'name': 'smi_name', 'modelSeeds': [1, 2, 3], 'sequences': [{'ligand': {'id': 'L', 'smiles': 'CCC'}}"
df = pd.DataFrame({'idx':['a','b'],'smi':['CCC','OCO']})
df
idx smi
0 a CCC
1 b OCO
# Name of the sub-folder under af_input/ used in the save path below
project_name='sdf'
# Each row of df unpacks to (idx, smi): the ligand identifier and its SMILES string
for idx, smi in df.values:
    # The returned dict is discarded; each call is given its own save path keyed by idx
    _ = get_protein_smiles_json(idx,smi,protein_json,f'af_input/{project_name}/{idx}.json',seeds=[1,2,3])

Protein-CCDcode


source

get_protein_ccdcode_json

 get_protein_ccdcode_json (protein_json, ccd_code, job_id:str,
                           save_path=None, seeds=[1])

Create AlphaFold3 docking JSON with CCD code(s).

Type Default Details
protein_json dict with protein sequence
ccd_code str or list of str
job_id str job/task ID
save_path NoneType None optional output path
seeds list [1] optional random seeds

Protein-CCD for covalent

sdf2CCD

mol_to_ccd_cif Reference: https://github.com/google-deepmind/alphafold3/issues/178

About hydrogens: https://github.com/google-deepmind/alphafold3/issues/212


source

mol_to_ccd_text

 mol_to_ccd_text (mol, component_id, pdbx_smiles=None,
                  include_hydrogens=False)

source

assign_atom_names_from_graph

 assign_atom_names_from_graph (mol)

source

sdf2ccd

 sdf2ccd (sdf_path, CCD_name='lig-1')

Convert the compound to the AF3 required CCD format

Type Default Details
sdf_path
CCD_name str lig-1 do not use '_'; use as few letters as possible; 'lig-any' leads to extra ligands
sdf2ccd('covalent_test/lig-HKI.sdf')[:100]
"data_lig-1\n#\n_chem_comp.id lig-1\n_chem_comp.name 'lig-1'\n_chem_comp.type non-polymer\n_chem_comp.form"

json


source

get_protein_ccd_json

 get_protein_ccd_json (protein_json, rec_residue_num:int, rec_atom_id:str,
                       lig_sdf_path, lig_atom_id:str, job_id:str,
                       save_path=None, seeds=[1])

Create AlphaFold3 docking JSON with customized CCD ligand and bondedAtomPairs.

Type Default Details
protein_json dict with protein sequence
rec_residue_num int 1-indexed, for bondedAtomPairs, e.g., [“A”, 145, “SG”]
rec_atom_id str for bondedAtomPairs, e.g., [“A”, 145, “SG”]
lig_sdf_path ccd text
lig_atom_id str 0-indexed, for bondedAtomPairs, [“L”, 1, “C04”]
job_id str job/task ID
save_path NoneType None optional output path
seeds list [1] optional random seeds

Version 2, with user ccd and pair as input:

def get_protein_ccd_json2(protein_json,             # dict with protein sequence
                         userCCD,                  # ccd text
                         pair1,                    # protein pair e.g., ["A", 145, "SG"] 1-indexed
                         pair2,                    # ligand pair e.g., ["L", 1, "C04"] 0-indexed
                         job_id,                   # str, job/task ID
                         save_path=None,           # optional output path
                         seeds=[1]):               # optional random seeds
    """Create AlphaFold3 docking JSON with customized CCD ligand and bondedAtomPairs.

    The ligand's CCD code is read from the `_chem_comp.id` entry of `userCCD`,
    and the protein entry is taken from `protein_json["sequences"][0]["protein"]`.
    `pair1` and `pair2` are stored together as the single bonded atom pair.

    Returns the assembled dict. If `save_path` is truthy, its parent directory
    is created when missing and the dict is written there with `dump_json`.

    Raises `ValueError` if `userCCD` contains no `_chem_comp.id` entry.
    """

    # Read the component id from the CCD text supplied by the caller; the dot is
    # escaped so that only the literal `_chem_comp.id` key matches.
    ccd_match = re.search(r"_chem_comp\.id\s+([^\s#]+)", userCCD)
    if ccd_match is None:
        raise ValueError("userCCD has no `_chem_comp.id` entry; cannot determine the ligand CCD code")
    ccd_id = ccd_match.group(1)

    json_data = {
        "name": job_id,
        # Copy so the shared default list (or the caller's list) is never
        # aliased into, and mutated through, the returned dict.
        "modelSeeds": list(seeds),
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "ccdCodes": [ccd_id]
                }
            },
            {
                "protein": protein_json["sequences"][0]["protein"]
            },
        ],
        "bondedAtomPairs": [[pair1, pair2]],
        "userCCD": userCCD,
        "dialect": "alphafold3",
        "version": 3
    }

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)

    return json_data

Split the files to subfolder

for multi-GPUs


source

split_nfolder

 split_nfolder (folder_dir, n=4)

Move json files from a folder into subfolders (folder_0, folder_1, …, folder_N).

split_nfolder(f'af_input/{project_name}')
Distributed 2 files into 4 folders.

End

Reference:

# import collections
# from collections.abc import Mapping, Sequence

# from absl import logging
# from alphafold3.cpp import cif_dict
# import numpy as np
# import rdkit.Chem as rd_chem
# from rdkit.Chem import AllChem as rd_all_chem

# def mol_to_ccd_cif(
#     mol: rd_chem.Mol,
#     component_id: str,
#     pdbx_smiles: str | None = None,
#     include_hydrogens: bool = True,
# ) -> cif_dict.CifDict:
#   """Creates a CCD-like mmcif data block from an rdkit Mol object.

#   Only a subset of associated mmcif fields is populated, but that is
#   sufficient for further usage, e.g. in featurization code.

#   Atom names can be specified via `atom_name` property. For atoms with
#   unspecified value of that property, the name is assigned based on element type
#   and the order in the Mol object.

#   If the Mol object has associated conformers, atom positions from the first of
#   them will be populated in the resulting mmcif file.

#   Args:
#      mol: An rdkit molecule.
#      component_id: Name of the molecule to use in the resulting mmcif. That is
#        equivalent to CCD code.
#      pdbx_smiles: If specified, the value will be used to populate
#        `_chem_comp.pdbx_smiles`.
#      include_hydrogens: Whether to include atom and bond data involving
#        hydrogens.

#   Returns:
#      An mmcif data block corresponding for the given rdkit molecule.

#   Raises:
#     UnsupportedMolBond: When a molecule contains a bond that can't be
#       represented with mmcif.
#   """
#   mol = rd_chem.Mol(mol)
#   if include_hydrogens:
#     mol = rd_chem.AddHs(mol)
#   rd_chem.Kekulize(mol)

#   if mol.GetNumConformers() > 0:
#     ideal_conformer = mol.GetConformer(0).GetPositions()
#     ideal_conformer = np.vectorize(lambda x: f'{x:.3f}')(ideal_conformer)
#   else:
#     # No data will be populated in the resulting mmcif if the molecule doesn't
#     # have any conformers attached to it.
#     ideal_conformer = None

#   mol_cif = collections.defaultdict(list)
#   mol_cif['data_'] = [component_id]
#   mol_cif['_chem_comp.id'] = [component_id]
#   if pdbx_smiles:
#     mol_cif['_chem_comp.pdbx_smiles'] = [pdbx_smiles]

#   mol = assign_atom_names_from_graph(mol, keep_existing_names=True)

#   for atom_idx, atom in enumerate(mol.GetAtoms()):
#     element = atom.GetSymbol()
#     if not include_hydrogens and element in ('H', 'D'):
#       continue

#     mol_cif['_chem_comp_atom.comp_id'].append(component_id)
#     mol_cif['_chem_comp_atom.atom_id'].append(atom.GetProp('atom_name'))
#     mol_cif['_chem_comp_atom.type_symbol'].append(atom.GetSymbol().upper())
#     mol_cif['_chem_comp_atom.charge'].append(str(atom.GetFormalCharge()))
#     if ideal_conformer is not None:
#       coords = ideal_conformer[atom_idx]
#       mol_cif['_chem_comp_atom.pdbx_model_Cartn_x_ideal'].append(coords[0])
#       mol_cif['_chem_comp_atom.pdbx_model_Cartn_y_ideal'].append(coords[1])
#       mol_cif['_chem_comp_atom.pdbx_model_Cartn_z_ideal'].append(coords[2])

#   for bond in mol.GetBonds():
#     atom1 = bond.GetBeginAtom()
#     atom2 = bond.GetEndAtom()
#     if not include_hydrogens and (
#         atom1.GetSymbol() in ('H', 'D') or atom2.GetSymbol() in ('H', 'D')
#     ):
#       continue
#     mol_cif['_chem_comp_bond.comp_id'].append(component_id)
#     mol_cif['_chem_comp_bond.atom_id_1'].append(
#         bond.GetBeginAtom().GetProp('atom_name')
#     )
#     mol_cif['_chem_comp_bond.atom_id_2'].append(
#         bond.GetEndAtom().GetProp('atom_name')
#     )
#     try:
#       bond_type = bond.GetBondType()
#       # Older versions of RDKit did not have a DATIVE bond type. Convert it to
#       # SINGLE to match the AF3 training setup.
#       if bond_type == rd_chem.BondType.DATIVE:
#         bond_type = rd_chem.BondType.SINGLE
#       mol_cif['_chem_comp_bond.value_order'].append(
#           _RDKIT_BOND_TYPE_TO_MMCIF[bond_type]
#       )
#       mol_cif['_chem_comp_bond.pdbx_stereo_config'].append(
#           _RDKIT_BOND_STEREO_TO_MMCIF[bond.GetStereo()]
#       )
#     except KeyError as e:
#       raise UnsupportedMolBondError from e
#     mol_cif['_chem_comp_bond.pdbx_aromatic_flag'].append(
#         'Y' if bond.GetIsAromatic() else 'N'
#     )

#   return cif_dict.CifDict(mol_cif)