Module molcrawl.preparation.download_guacamol
GuacaMol Dataset Download Script
Download the GuacaMol benchmark dataset from Figshare. https://figshare.com/projects/GuacaMol/56639
How to use: python src/preparation/download_guacamol.py
Environment variables: LEARNING_SOURCE_DIR: Base directory where the dataset is saved (required)
Functions
def download_file(url, output_path, chunk_size=8192)-
Expand source code
def download_file(url, output_path, chunk_size=8192):
    """
    Download a file from a URL and save it to ``output_path``.

    The payload is streamed into a temporary ``<name>.part`` file next to the
    destination and renamed into place only after the transfer finished and
    produced data. An interrupted run therefore never leaves a truncated file
    at ``output_path`` that a later run would accept as "already exists".

    Falls back to copying from a sibling learning_source_* directory when the
    remote server blocks automated downloads (e.g. Figshare WAF challenge).

    Args:
        url: Download source URL
        output_path: Destination path of the downloaded file
        chunk_size: Chunk size (bytes)

    Returns:
        True if the file is available at ``output_path`` (it already existed,
        was downloaded, or was copied from a local fallback); False if the
        HTTP download failed and no local fallback was found.
    """
    import shutil as _shutil

    output_path = Path(output_path)

    # Skip if already exists and has content (0-byte files are treated as missing)
    if output_path.exists():
        if output_path.stat().st_size > 0:
            print(f"✓ Already exists: {output_path.name}")
            return True
        print(f"⚠ Found 0-byte file, re-downloading: {output_path.name}")
        output_path.unlink()

    print(f"Downloading {output_path.name}...")

    # Stream into a sibling temp file; it is promoted to output_path on success only.
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
        }
        try:
            # The context manager releases the streamed connection even on errors.
            with requests.get(url, stream=True, timeout=60, headers=headers) as response:
                response.raise_for_status()

                # get file size (0 when the server does not announce it)
                total_size = int(response.headers.get("content-length", 0))

                # Download with progress bar
                with open(partial_path, "wb") as f, tqdm(
                    desc=output_path.name,
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))

            # Verify we actually got content (WAF blocks return 0 bytes with 200/202)
            if partial_path.stat().st_size == 0:
                raise requests.exceptions.RequestException(
                    "Server returned 0 bytes — possible WAF/bot-protection block."
                )

            # Promote the completed download to its final name.
            partial_path.replace(output_path)
        finally:
            # Remove leftovers on any failure, including Ctrl-C and disk errors.
            if partial_path.exists():
                partial_path.unlink()

        print(f"✓ Downloaded: {output_path.name}")
        return True

    except requests.exceptions.RequestException as e:
        print(f"✗ HTTP download failed: {e}", file=sys.stderr)

        # Fallback: copy from an existing learning_source_* directory
        existing = _find_existing_smiles(output_path.name)
        if existing:
            print(f" ↪ Copying from existing dataset: {existing}", file=sys.stderr)
            _shutil.copy2(existing, output_path)
            print(f"✓ Copied: {output_path.name} ({output_path.stat().st_size:,} bytes)")
            return True

        print(
            f" No local fallback found for {output_path.name}.\n"
            f" Download manually from {url} and place in {output_path.parent}/",
            file=sys.stderr,
        )
        return False
Falls back to copying from a sibling learning_source_* directory when the remote server blocks automated downloads (e.g. Figshare WAF challenge).
Args
url- Download source URL
output_path- Destination path where the downloaded file is saved
chunk_size- Chunk size (bytes)
def download_guacamol(compounds_dir)-
Expand source code
def download_guacamol(compounds_dir):
    """
    Download GuacaMol dataset

    Every split listed in GUACAMOL_URLS is fetched into
    ``<compounds_dir>/benchmark/GuacaMol`` as ``guacamol_v1_<split>.smiles``.
    If any split could not be obtained, the HuggingFace fallback is tried
    for the whole directory.

    Args:
        compounds_dir: Path to the compounds directory
            (example: learning_source_XXX/compounds)

    Raises:
        RuntimeError: If the download is incomplete and the HuggingFace
            fallback also fails
    """
    import logging

    log = logging.getLogger(__name__)

    target_dir = Path(compounds_dir) / "benchmark" / "GuacaMol"
    target_dir.mkdir(parents=True, exist_ok=True)

    log.info("Downloading GuacaMol benchmark from https://figshare.com/projects/GuacaMol/56639")
    log.info(f"Destination: {target_dir}")

    expected = len(GUACAMOL_URLS)
    # Attempt every split in order and count the ones that succeeded.
    fetched = sum(
        1
        for split, url in GUACAMOL_URLS.items()
        if download_file(url, target_dir / f"guacamol_v1_{split}.smiles")
    )

    if fetched >= expected:
        log.info(f"✓ GuacaMol: All {expected} files downloaded successfully")
        return

    log.warning(
        f"Figshare download incomplete ({fetched}/{expected}). "
        "Trying HuggingFace fallback..."
    )
    if _download_from_huggingface(target_dir):
        log.info("✓ GuacaMol: All files obtained via HuggingFace fallback.")
        return

    raise RuntimeError(
        f"GuacaMol download incomplete: {fetched}/{expected} files downloaded "
        "and HuggingFace fallback also failed."
    )
Args
compounds_dir- Path to the compounds directory (example: learning_source_XXX/compounds)
Raises
RuntimeError- If the Figshare download is incomplete and the HuggingFace fallback also fails
def main()-
Expand source code
def main():
    """
    Download GuacaMol dataset (for standalone execution)

    Reads the base directory from the LEARNING_SOURCE_DIR environment
    variable and downloads into its ``compounds`` subdirectory.

    Returns:
        0 when the download succeeded, 1 when it raised RuntimeError.
        Exits the process with status 1 if LEARNING_SOURCE_DIR is not set.
    """
    base_dir = os.environ.get("LEARNING_SOURCE_DIR")
    if not base_dir:
        # Without the base directory there is nowhere to save the dataset.
        for line in (
            "ERROR: Environment variable 'LEARNING_SOURCE_DIR' is not set.",
            "Please set LEARNING_SOURCE_DIR before running this script:",
            " export LEARNING_SOURCE_DIR='learning_20251104'",
        ):
            print(line, file=sys.stderr)
        sys.exit(1)

    target = Path(base_dir) / "compounds"

    try:
        download_guacamol(str(target))
    except RuntimeError as err:
        print(f"\n✗ {err}", file=sys.stderr)
        return 1
    else:
        print("\nNext steps:")
        print(" 1. Run the GPT-2 preparation script:")
        print(
            f" LEARNING_SOURCE_DIR={base_dir} python src/compounds/dataset/prepare_gpt2.py assets/configs/compounds.yaml"
        )
        return 0