RDKit ligand

Setup

plot_drug

 plot_drug (drug_dict, flip_list=None, save_path=None)

drug_smiles = {
    "Capmatinib": "CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC5=C(C=C4)N=CC=C5)F",
    "Tepotinib": "CN1CCC(CC1)COC2=CN=C(N=C2)C3=CC=CC(=C3)CN4C(=O)C=CC(=N4)C5=CC=CC(=C5)C#N",}

flip_list = ['Tepotinib'] # indicate keys to flip horizontally
plot_drug(drug_smiles,
          flip_list=flip_list,
        #   save_path='drug_structures.svg'
          )

Conformer

source

rdkit_conformer

 rdkit_conformer (SMILES, output=None, method='ETKDG', visualize=True,
                  seed=3)

Gemerate 3D conformers from SMILES

	Type	Default	Details
SMILES			SMILES string
output	NoneType	None	file “.sdf” to be saved
method	str	ETKDG	Optimization method, can be ‘UFF’, ‘MMFF’ or ‘ETKDGv3’
visualize	bool	True	whether or not to visualize the compound
seed	int	3	randomness of the 3D conformation

rdkit_conformer('CC1=C(C=CC(=C1)NC2=NC=NC3=CN=C(N=C32)N4CCC(CC4)NC(=O)C=C)OC5=CC6=C(C=C5)N(C=N6)C')

source

tanimoto

 tanimoto (df, smiles_col='SMILES', id_col='ID', target_col=None,
           radius=2)

Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.

	Type	Default	Details
df			df with SMILES and ID columns
smiles_col	str	SMILES	colname of SMILES
id_col	str	ID	colname of compound ID
target_col	NoneType	None	colname of compound values (e.g., IC50)
radius	int	2	radius of the Morgan fingerprint.

df = Kras.get_mirati_g12d_raw()[['ID','SMILES','IC50']]
df = df.dropna(subset= 'IC50').reset_index(drop=True)
df

	ID	SMILES	IC50
0	US_1	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...	124.7
1	US_2	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...	2.7
...	...	...	...
703	paper_37	FC1=C(C2=C(C(Cl)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C...	2.0
704	paper_38	FC1=C(C2=C(C(C#C)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=...	2.0

705 rows × 3 columns

result = tanimoto(df.head(), target_col = 'IC50')
result

	ID1	ID2	SMILES1	SMILES2	SimilarityScore	Target1	Target2
0	US_3	US_4	Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...	Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...	0.779221	9.5	496.2
1	US_1	US_2	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...	0.743590	124.7	2.7
...	...	...	...	...	...	...	...
8	US_1	US_3	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...	Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...	0.505495	124.7	9.5
9	US_1	US_4	CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...	Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...	0.488889	124.7	496.2

10 rows × 7 columns

Rdkit feature

source

get_rdkit

 get_rdkit (SMILES:str)

Extract chemical features from SMILES Reference: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html

source

get_rdkit_3d

 get_rdkit_3d (SMILES:str)

Extract 3d features from SMILES

source

get_rdkit_all

 get_rdkit_all (SMILES:str)

Extract chemical features and 3d features from SMILES

source

remove_hi_corr

 remove_hi_corr (df:pandas.core.frame.DataFrame, thr=0.99)

Remove highly correlated features in a dataframe given a pearson threshold

	Type	Default	Details
df	DataFrame
thr	float	0.99	threshold

source

preprocess

 preprocess (df:pandas.core.frame.DataFrame, thr=0.99)

Remove features with no variance, and highly correlated features based on threshold.

source

get_rdkit_df

 get_rdkit_df (df:pandas.core.frame.DataFrame, include_3d=False,
               col='SMILES', postprocess=False, chunksize=128)

Extract rdkit features (optionally in parallel with 3D) from SMILES in a df

	Type	Default	Details
df	DataFrame
include_3d	bool	False
col	str	SMILES
postprocess	bool	False
chunksize	int	128	for parallel process_map

df=Collins.get_antibiotics_2k().head(300)
df.head()

	name	SMILES	inhibition	activity
0	CEFPIRAMIDE	Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...	0.041572	1
1	GEMIFLOXACIN MESYLATE	CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...	0.041876	1
2	POLYMYXIN B SULFATE	CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...	0.041916	1
3	PRAXADINE HYDROCHLORIDE	Cl.N=C(N)n1cccn1	0.041964	1
4	CHLORHEXIDINE DIHYDROCHLORIDE	Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...	0.042295	1

# %%time
# feat_raw=get_rdkit_df(df)
# feat_raw

CPU times: user 46.6 ms, sys: 65.7 ms, total: 112 ms
Wall time: 2.63 s

	MaxAbsEStateIndex	MaxEStateIndex	MinAbsEStateIndex	MinEStateIndex	qed	SPS	MolWt	HeavyAtomMolWt	ExactMolWt	NumValenceElectrons	NumRadicalElectrons	MaxPartialCharge	MinPartialCharge	MaxAbsPartialCharge	MinAbsPartialCharge	FpDensityMorgan1	FpDensityMorgan2	FpDensityMorgan3	BCUT2D_MWHI	BCUT2D_MWLOW	BCUT2D_CHGHI	BCUT2D_CHGLO	BCUT2D_LOGPHI	BCUT2D_LOGPLOW	BCUT2D_MRHI	BCUT2D_MRLOW	AvgIpc	BalabanJ	BertzCT	Chi0	Chi0n	Chi0v	Chi1	Chi1n	Chi1v	Chi2n	Chi2v	Chi3n	Chi3v	Chi4n	Chi4v	HallKierAlpha	Ipc	Kappa1	Kappa2	Kappa3	LabuteASA	PEOE_VSA1	PEOE_VSA10	PEOE_VSA11	...	fr_benzodiazepine	fr_bicyclic	fr_diazo	fr_dihydropyridine	fr_epoxide	fr_ester	fr_ether	fr_furan	fr_guanido	fr_halogen	fr_hdrzine	fr_hdrzone	fr_imidazole	fr_imide	fr_isocyan	fr_isothiocyan	fr_ketone	fr_ketone_Topliss	fr_lactam	fr_lactone	fr_methoxy	fr_morpholine	fr_nitrile	fr_nitro	fr_nitro_arom	fr_nitro_arom_nonortho	fr_nitroso	fr_oxazole	fr_oxime	fr_para_hydroxylation	fr_phenol	fr_phenol_noOrthoHbond	fr_phos_acid	fr_phos_ester	fr_piperdine	fr_piperzine	fr_priamide	fr_prisulfonamd	fr_pyridine	fr_quatN	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea
0	13.503995	13.503995	0.064423	-1.323430	0.162954	18.619048	612.650	588.458	612.120937	218	0	0.352159	-0.507967	0.507967	0.352159	1.214286	1.952381	2.595238	32.167943	10.004034	2.541608	-2.492828	2.413335	-2.707792	8.005500	-0.150504	3.369454	1.339505e+00	1610.181663	30.128663	22.084778	23.717771	20.044622	12.357347	14.391701	9.367961	11.415452	6.531851	8.968356	4.422333	6.757255	-4.64	2.012862e+09	28.873109	11.690583	5.597490	245.926226	25.953159	34.653617	0.000000	...	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	1	0	2	0	0	0	1	0	0	0	0	0
1	14.874521	14.874521	0.033885	-3.666667	0.398220	19.484848	485.494	461.302	485.138047	180	0	0.340725	-0.477497	0.477497	0.340725	1.424242	2.060606	2.575758	32.239615	10.092419	2.328029	-2.190189	2.370431	-2.256089	7.845502	0.069447	3.471950	5.549999e-07	1265.508949	24.499271	17.933021	18.749518	15.439091	9.902198	11.750510	7.791830	9.257870	5.167537	5.167537	3.595955	3.595955	-3.10	1.783676e+07	24.540619	9.407810	5.638555	187.744947	25.144793	18.320426	11.635084	...	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
298	11.161773	11.161773	0.278802	-0.471329	0.463380	10.423077	354.358	336.214	354.110338	134	0	0.307526	-0.426733	0.426733	0.307526	0.615385	1.038462	1.500000	16.546477	10.114910	2.058887	-2.069790	2.248839	-2.041320	5.751292	-0.132566	2.519358	2.225734e+00	814.855170	19.104084	14.645642	14.645642	12.312571	7.801219	7.801219	5.421051	5.421051	3.035442	3.035442	2.145140	2.145140	-3.41	4.201281e+05	18.921939	8.617804	6.238343	150.543633	14.210589	17.248535	0.000000	...	0	0	0	0	0	3	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
299	6.134942	6.134942	0.000000	0.000000	0.709648	16.421053	412.597	391.429	409.976001	108	0	0.050449	-0.397550	0.397550	0.050449	1.210526	1.842105	2.473684	79.919764	9.939734	2.186581	-2.311907	2.301302	-2.326367	9.108394	0.184585	2.360656	1.804999e-06	420.152146	13.120956	11.163433	15.151922	8.575387	6.513030	8.099027	5.167360	6.937372	3.797772	4.911697	2.688476	4.002186	0.23	1.225652e+04	17.282002	7.866121	4.622779	138.333022	5.733667	0.000000	0.000000	...	0	0	0	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

300 rows × 210 columns

# feat = get_rdkit_df(df,postprocess=True)
# feat

Morgan fingerprints (ECFP)

Reference:

source

get_fp

 get_fp (SMILES, name='ecfp', ELEMENTS_PER_WORKER=1000000)

Super fast method to get molecule fingerprints using scikit-fingerprints

# %%time
# fp = get_fp(df.SMILES, name='ecfp')
# fp

0it [00:00, ?it/s]

CPU times: user 21.2 ms, sys: 78 ms, total: 99.2 ms
Wall time: 2.07 s

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=uint8)

Use np.packbits to save space

Save space when dealing with super big dataset

Packbits will transform binary to decimals, if it is [1, 0, 1, 1, 0, 0, 0, 0, 1, 1], then it will take 8 bits in the sequence, [10110000] and [11000000], for the latter, if not enough for 8 bits length, it will add 0.

To transform binary to decimals, take [10110000] for example, calculate through 12^7 + 02^6 + 12^5 + 12^4 + 02^3 + 02^2 + 02^1 + 02^0=176

source

compress_fp

 compress_fp (array)

Compress binary finterprints using np.packbits

# compress_fp(fp)

array([[100,   0,   0, ...,  16,   0,   0],
       [  0, 128,   0, ...,   0,   0,   0],
       [ 64,   0,   0, ...,   0,  16,   0],
       ...,
       [  0, 128,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,  64,   0],
       [ 32,   0,   0, ...,   0,   0,   0]], dtype=uint8)

Tanimoto

Reference: https://www.kaggle.com/code/towardsentropy/fingerprint-tips-and-tricks/notebook

Slow version:

Method 1

from scipy.spatial import distance

# %%time
# tanimotos_scipy = 1-distance.cdist(fp, fp, metric='jaccard')
# # 30.4s

A little faster with parallel

from sklearn.metrics import pairwise_distances

# %%time
# tanimotos_sklearn = 1 - pairwise_distances(fp, metric='jaccard', n_jobs=-1)
# # 13.1s

Method 2

from rdkit.Chem import DataStructs

# %%time
# tanimotos_rdkit = np.array([DataStructs.BulkTanimotoSimilarity(i, fp) for i in fp])

Method 3

def tanimoto_bitwise(fps_np):
    intersection = (fps_np[:,None] & fps_np[None]).sum(-1)
    union = (fps_np[:,None] | fps_np[None]).sum(-1)
    return intersection / union

# tanimoto_bitwise(fp)

Numba tanimoto

source

tanimoto_numba

 tanimoto_numba (fps)

Get a NxN matrix of tanimoto similarity among N compounds.

# %%time
# tanimoto_numba(fp)

CPU times: user 2.02 s, sys: 319 ms, total: 2.34 s
Wall time: 2.4 s

array([[1.        , 0.11695907, 0.12676056, ..., 0.04918033, 0.06617647,
        0.06206897],
       [0.11695907, 1.        , 0.13020833, ..., 0.09375   , 0.0990991 ,
        0.09166667],
       [0.12676056, 0.13020833, 1.        , ..., 0.06206897, 0.05555556,
        0.07142857],
       ...,
       [0.04918033, 0.09375   , 0.06206897, ..., 1.        , 0.03508772,
        0.0625    ],
       [0.06617647, 0.0990991 , 0.05555556, ..., 0.03508772, 1.        ,
        0.0625    ],
       [0.06206897, 0.09166667, 0.07142857, ..., 0.0625    , 0.0625    ,
        1.        ]], dtype=float32)

Group same compounds

Utilize hash sha256 to encode morgan fp and find same molecule

source

hash_fp

 hash_fp (fp_row)

Hash a binary fingerprint row using SHA256

# hash_fp(fp[0])

'8deb65ba1323d46ca59ebf1206a497761ed2b83a518aa7227a965d5426e25aea'

source

get_same_mol_group

 get_same_mol_group (df, smi_col='SMILES')

Assign a group number to the same compounds by utilizing hash sha256 to encode morgan fp and find same molecule.

# %%time
# df2 = get_same_mol_group(df)
# df2.group.value_counts()

0it [00:00, ?it/s]

CPU times: user 26.4 ms, sys: 124 ms, total: 151 ms
Wall time: 237 ms

group
0      1
206    1
      ..
96     1
299    1
Name: count, Length: 300, dtype: int64

Setup

plot_drug

Conformer

rdkit_conformer

tanimoto

Rdkit feature

get_rdkit

get_rdkit_3d

get_rdkit_all

remove_hi_corr

preprocess

get_rdkit_df

Morgan fingerprints (ECFP)

get_fp

Use np.packbits to save space

compress_fp

Tanimoto

Numba tanimoto

tanimoto_numba

Group same compounds

hash_fp

get_same_mol_group

End