This repository provides implementations of molecular generative models using VAE and Diffusion Models, with built-in pipelines for training, sampling, and optimization.
It also includes scripts for external dataset application and end-to-end pipelines.
We are revising the code, and some parameters can be directly modified via the relevant files.
The datasets used in this project can be found at the following links:
We provide a single executable script (run_pipeline.sh) to run the full process (training → sampling → optimization).
Save as run_pipeline.sh:
#!/usr/bin/env bash
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail
# ========= Default Config =========
# Defaults for one pipeline run; the argument parser further down may
# overwrite any of them.
DATA="./data/drugs.smi" # Path to training dataset (.smi)
TARGET="2RMA" # Target ID for affinity optimization
# Reference molecule for similarity optimization. A default must be assigned
# here: the script runs under `set -u` and reads $REF_MOL even when
# --ref_mol is not given. (This line used to start with "%", which is not a
# shell comment and aborted the script.)
# NOTE(review): the original note said the code is being revised and that
# similar molecules can now be defined directly in sample.py — confirm.
REF_MOL="./data/reference.smi"
NUM_SAMPLES=1000 # Number of molecules to sample
# Timestamped run name so that each run gets its own results directory.
RUN_NAME="$(date +%Y%m%d_%H%M%S)"
OUTDIR="./results/${RUN_NAME}"
LOGDIR="${OUTDIR}/logs"
# =================================
#######################################
# Print the command-line help and terminate the current shell.
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   help text on stdout when the status is 0, on stderr otherwise
# Returns:   never returns; exits with the requested status
#######################################
usage() {
  local status="${1:-0}"
  local fd=1
  # Help printed because of an error is a diagnostic and belongs on stderr.
  if [[ "$status" != 0 ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
Usage: $0 [--data PATH] [--target ID] [--ref_mol PATH] [--num_samples N] [--run_name NAME]
Examples:
$0 --data ./data/drugs.smi --num_samples 1000
$0 --data ./data/my_dataset.smi --target 3AF2 --ref_mol ./data/reference.smi --num_samples 500 --run_name myrun
EOF
  exit "$status"
}
# Parse args
#######################################
# Apply command-line overrides to the configuration variables.
# Globals:   DATA, TARGET, REF_MOL, NUM_SAMPLES, RUN_NAME, OUTDIR, LOGDIR
#            (written)
# Arguments: the script's command-line arguments
# Outputs:   an error message followed by the help text on stderr when an
#            option is unknown or its value is missing
# Returns:   0 on success; exits 2 on invalid input. -h/--help calls usage,
#            which ends the script.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        usage
        ;;
      --data|--target|--ref_mol|--num_samples|--run_name)
        # Check for the value explicitly: under `set -u` a bare "$2" would
        # abort with an unhelpful "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          # usage exits the shell it runs in, so call it in a subshell and
          # choose the failure status here.
          (usage) >&2
          exit 2
        fi
        case "$1" in
          --data) DATA="$2" ;;
          --target) TARGET="$2" ;;
          --ref_mol) REF_MOL="$2" ;;
          --num_samples) NUM_SAMPLES="$2" ;;
          --run_name)
            # The output locations are derived from the run name, so they
            # are recomputed together with it.
            RUN_NAME="$2"
            OUTDIR="./results/${RUN_NAME}"
            LOGDIR="${OUTDIR}/logs"
            ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown arg: $1" >&2
        (usage) >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# Create the result and log directories for this run (parents included).
mkdir -p "$OUTDIR" "$LOGDIR"
# Show the effective configuration before any work starts.
printf '%s\n' "=== CONFIG ==="
printf '%s\n' "DATA = $DATA"
printf '%s\n' "TARGET = $TARGET"
printf '%s\n' "REF_MOL = $REF_MOL"
printf '%s\n' "NUM_SAMPLES = $NUM_SAMPLES"
printf '%s\n' "OUTDIR = $OUTDIR"
printf '%s\n' "=============="
#######################################
# Report whether a Python script advertises an --out option.
# Arguments: $1 - script to probe with --help
# Returns:   0 if `python <script> --help` succeeds and its output contains
#            "--out"; non-zero otherwise
#######################################
supports_out_flag() {
  local script="$1"
  local help_text
  # Capture the help text instead of piping into `grep -q`: under
  # `pipefail`, grep exiting at the first match can make the writer fail on
  # a closed pipe and turn a real match into a false result.
  help_text="$(python "$script" --help 2>/dev/null)" || return 1
  [[ "$help_text" == *"--out"* ]]
}

# Each step sends stdout and stderr through tee, so the output is shown on
# the terminal and also kept in a per-step log file.

# 1) Train VAE
echo "[1/5] Training VAE..."
python train_vae.py --data "$DATA" 2>&1 | tee "${LOGDIR}/01_train_vae.log"

# 2) Train Diffusion
echo "[2/5] Training diffusion model..."
python train_diffusion.py --data "$DATA" 2>&1 | tee "${LOGDIR}/02_train_diffusion.log"

# 3) Sampling
echo "[3/5] Sampling ${NUM_SAMPLES} molecules..."
python sample.py --model diffusion --num_samples "$NUM_SAMPLES" --out "${OUTDIR}/generated.smi" 2>&1 | tee "${LOGDIR}/03_sample.log"

# 4) Optimize for binding affinity
echo "[4/5] Affinity optimization (target=${TARGET})..."
# If optimize_affinity.py supports --out, pass it; otherwise the script saves
# to its default location.
affinity_args=(--model diffusion --target "$TARGET")
if supports_out_flag optimize_affinity.py; then
  affinity_args+=(--out "${OUTDIR}/optimized_affinity.smi")
fi
python optimize_affinity.py "${affinity_args[@]}" 2>&1 | tee "${LOGDIR}/04_opt_affinity.log"

# 5) Optimize for similarity
echo "[5/5] Similarity optimization (ref_mol=${REF_MOL})..."
# Same --out detection as in step 4.
similarity_args=(--model diffusion --ref_mol "$REF_MOL")
if supports_out_flag optimize_similarity.py; then
  similarity_args+=(--out "${OUTDIR}/optimized_similarity.smi")
fi
python optimize_similarity.py "${similarity_args[@]}" 2>&1 | tee "${LOGDIR}/05_opt_similarity.log"
echo "✅ Done. Outputs & logs saved to: ${OUTDIR}"
Run example:
chmod +x run_pipeline.sh
./run_pipeline.sh \
--data ./data/ChEMBL.smi \
--target 2RMA \
--ref_mol ./data/reference.smi \
--num_samples 1000 \
--run_name demo_run
python train_vae.py --data ./data/drugs.smi
python train_diffusion.py --data ./data/drugs.smi
Generate molecules using trained models:
python sample.py --model diffusion --num_samples 1000 --out ./results/generated.smi
These parameters can be modified within this file.
python optimize_affinity.py --model diffusion --target 2RMA
The target can be directly revised in this file.
python optimize_similarity.py --model diffusion --ref_mol ./data/reference.smi
The reference.smi file must be prepared by the user, or the reference can be modified within this file.
To apply the framework on a new dataset, prepare a .smi file and specify it via the --data argument.
python train_diffusion.py --data ./data/my_dataset.smi
We have provided numerous data examples in the data folder, and users can define their own data following this format.
python sample.py --model diffusion --num_samples 500 --out ./results/my_generated.smi
These parameters can be modified within this file.
python train_vae.py --data ./data/drugs.smi
python train_diffusion.py --data ./data/drugs.smi
python sample.py --model diffusion --num_samples 1000 --out ./results/generated.smi
python train_diffusion.py --data ./data/my_dataset.smi
python optimize_affinity.py --model diffusion --target 3AF2
These parameters can be modified within this file.
python sample.py --model diffusion --num_samples 200 --out ./results/candidates.smi
python optimize_similarity.py --model diffusion --ref_mol ./data/reference.smi
These parameters can be modified within this file.
If you find this code useful, please cite our paper.
[XXX]