Module molcrawl.compounds.dataset.prepare_chembl
Prepare ChEMBL for GPT-2 / BERT fine-tuning on the compounds domain.
Reads the canonical SMILES file produced by download_chembl.py, tokenises
each SMILES string with the same CompoundsTokenizer used during pretraining,
shuffles the dataset, splits it 80 / 10 / 10 (train / valid / test), and saves
the result in HuggingFace Dataset format to output_dir.
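The shuffle-and-split step follows the standard datasets-library idiom. A minimal sketch of that logic in isolation (toy data, illustrative only; the real pipeline tokenises ChEMBL SMILES first):

    # Shuffle, then take contiguous 80/10/10 slices of the shuffled rows.
    from datasets import Dataset

    ds = Dataset.from_dict({"input_ids": [[1, 2, 3]] * 1000})
    ds = ds.shuffle(seed=42)

    n = len(ds)                # 1000
    n_train = int(n * 0.8)     # 800
    n_valid = int(n * 0.1)     # 100
    train = ds.select(range(n_train))
    valid = ds.select(range(n_train, n_train + n_valid))
    test = ds.select(range(n_train + n_valid, n))  # remaining 100 rows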
The output directory layout is compatible with molcrawl/core/dataset.py
(PreparedDataset) and with the HuggingFace Trainer used in
molcrawl/bert/main.py:
training_ready_hf_dataset/
    dataset_info.json
    train/
    valid/
    test/
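Once written, this directory can be reloaded directly with datasets.load_from_disk. A minimal sketch (the path is illustrative):

    # Reload the prepared splits; the path is illustrative.
    from datasets import load_from_disk

    dataset_dict = load_from_disk("output_dir/training_ready_hf_dataset")
    print({name: len(split) for name, split in dataset_dict.items()})
    print(dataset_dict["train"][0]["input_ids"][:10])  # start of one 1024-token chunk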
Usage
─────
    python -m molcrawl.compounds.dataset.prepare_chembl          # uses CHEMBL_* constants
    python -m molcrawl.compounds.dataset.prepare_chembl --force
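The same preparation can be invoked programmatically. A minimal sketch, with illustrative paths:

    # Programmatic equivalent of the CLI above; paths are illustrative.
    from molcrawl.compounds.dataset.prepare_chembl import prepare_chembl

    ok = prepare_chembl(
        source_dir="data/chembl",           # must contain smiles.txt
        output_dir="data/chembl_prepared",  # receives training_ready_hf_dataset/
        force=True,                         # re-run even if the marker exists
    )
    assert ok, "preparation failed; check the logs"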
Functions
def prepare_chembl(source_dir: str,
output_dir: str,
vocab_file: str = 'assets/molecules/vocab.txt',
max_len: int = 256,
train_ratio: float = 0.8,
valid_ratio: float = 0.1,
random_seed: int = 42,
force: bool = False,
num_proc: int = 4) -> bool
def prepare_chembl(
    source_dir: str,
    output_dir: str,
    vocab_file: str = VOCAB_FILE,
    max_len: int = MAX_LEN,
    train_ratio: float = 0.8,
    valid_ratio: float = 0.1,
    random_seed: int = 42,
    force: bool = False,
    num_proc: int = 4,
) -> bool:
    """Tokenise ChEMBL SMILES and save a HuggingFace DatasetDict.

    Args:
        source_dir: Directory containing ``smiles.txt`` (output of
            ``download_chembl.py``).
        output_dir: Destination for ``training_ready_hf_dataset/``.
        vocab_file: Path to the SMILES WordPiece vocabulary file.
        max_len: Maximum token length (same as pretraining, default 256).
        train_ratio: Fraction of data used for training.
        valid_ratio: Fraction for validation (remainder → test).
        random_seed: Random seed for shuffling.
        force: Re-run even if the marker file already exists.
        num_proc: Worker count for HuggingFace ``.map()`` parallelism.

    Returns:
        ``True`` on success, ``False`` on failure.
    """
    from datasets import Dataset, DatasetDict

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    marker = out / "prepare_complete.marker"
    if not force and marker.exists():
        logger.info("ChEMBL preparation already completed. Skipping. (use force=True to re-run)")
        return True

    smiles_file = Path(source_dir) / "smiles.txt"
    if not smiles_file.exists():
        logger.error(f"smiles.txt not found at {smiles_file}. Run download_chembl.py first.")
        return False

    # ── 1. Read SMILES ──────────────────────────────────────────────────
    logger.info(f"Reading SMILES from {smiles_file} …")
    with smiles_file.open("r") as fh:
        smiles_list = [line.strip() for line in fh if line.strip()]
    logger.info(f"Loaded {len(smiles_list):,} SMILES strings.")

    # ── 2. Tokenise in batches ──────────────────────────────────────────
    from molcrawl.compounds.utils.tokenizer import CompoundsTokenizer

    tokenizer = CompoundsTokenizer(vocab_file, max_len)
    logger.info(f"Tokenising with vocab_size={tokenizer.vocab_size}, max_len={max_len} …")

    all_input_ids: List[List[int]] = []
    skipped = 0
    for start in range(0, len(smiles_list), BATCH_SIZE):
        batch = smiles_list[start : start + BATCH_SIZE]
        tokenised = _tokenize_batch(batch, tokenizer)
        for ids in tokenised:
            if ids is None:
                skipped += 1
            else:
                all_input_ids.append(ids)
        if (start // BATCH_SIZE) % 10 == 0:
            logger.info(
                f"  … {start + len(batch):,} / {len(smiles_list):,} processed "
                f"({skipped:,} skipped so far)"
            )

    logger.info(
        f"Tokenisation complete. "
        f"Kept {len(all_input_ids):,}, skipped {skipped:,} "
        f"({skipped * 100 / max(1, len(smiles_list)):.1f}%)."
    )
    if not all_input_ids:
        logger.error("No valid SMILES tokenised. Aborting.")
        return False

    # ── 3. Build Dataset and split ──────────────────────────────────────
    logger.info("Building HuggingFace Dataset …")
    dataset = Dataset.from_dict({"input_ids": all_input_ids})
    dataset = dataset.shuffle(seed=random_seed)

    n = len(dataset)
    n_train = int(n * train_ratio)
    n_valid = int(n * valid_ratio)
    train_ds = dataset.select(range(n_train))
    valid_ds = dataset.select(range(n_train, n_train + n_valid))
    test_ds = dataset.select(range(n_train + n_valid, n))
    logger.info(f"Split: train={len(train_ds):,}, valid={len(valid_ds):,}, test={len(test_ds):,}")

    # ── 3b. Concatenate with [SEP] as EOS and chunk into 1024-token blocks ──
    # Matches the genome_sequence / protein_sequence pretraining pipeline.
    from molcrawl.compounds.utils.tokenizer import CompoundsTokenizer as _Tok

    _tok = _Tok(vocab_file, max_len)
    eos_id = _tok.eos_token_id  # 13 ([SEP])
    context_length = 1024
    logger.info(f"Concatenating and chunking into {context_length}-token blocks (eos_id={eos_id}) …")

    chunked_splits = {}
    for split_name, split_ds in [("train", train_ds), ("valid", valid_ds), ("test", test_ds)]:
        split_ds = split_ds.map(partial(_concat_with_eos, eos_token_id=eos_id),
                                batched=True, batch_size=-1)
        split_ds = split_ds.map(partial(_create_chunks, context_length=context_length),
                                batched=True, batch_size=-1)
        chunked_splits[split_name] = split_ds
        logger.info(f"  {split_name}: {len(split_ds):,} chunks")

    dataset_dict = DatasetDict(chunked_splits)

    # ── 4. Save ─────────────────────────────────────────────────────────
    hf_path = out / "training_ready_hf_dataset"
    logger.info(f"Saving dataset to {hf_path} …")
    dataset_dict.save_to_disk(str(hf_path))
    for split_name, split_ds in dataset_dict.items():
        logger.info(f"  Saved {split_name}: {len(split_ds):,} chunks → {hf_path / split_name}")

    marker.touch()
    logger.info(f"ChEMBL preparation complete. Marker written to {marker}")
    return True

Tokenise ChEMBL SMILES and save a HuggingFace DatasetDict.
Args
source_dir: Directory containing smiles.txt (output of download_chembl.py).
output_dir: Destination for training_ready_hf_dataset/.
vocab_file: Path to the SMILES WordPiece vocabulary file.
max_len: Maximum token length (same as pretraining, default 256).
train_ratio: Fraction of data used for training.
valid_ratio: Fraction for validation (remainder → test).
random_seed: Random seed for shuffling.
force: Re-run even if the marker file already exists.
num_proc: Worker count for HuggingFace .map() parallelism.
Returns
True on success, False on failure.
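The helpers _tokenize_batch, _concat_with_eos, and _create_chunks referenced in the source above are module-private and not shown here. The following is a minimal sketch consistent with how they are called (batched .map() over a whole split at once); the bodies are assumptions, not the actual implementations:

    # Hypothetical sketches of the module-private helpers used above; the
    # real implementations live elsewhere in prepare_chembl.py.

    def _tokenize_batch(batch, tokenizer):
        # Encode each SMILES string; yield None for strings the tokenizer
        # rejects so the caller can count them as skipped. The encode()
        # method name is an assumption about CompoundsTokenizer.
        out = []
        for smi in batch:
            try:
                out.append(tokenizer.encode(smi))
            except Exception:
                out.append(None)
        return out

    def _concat_with_eos(batch, eos_token_id):
        # Flatten every sequence in the (whole-split) batch into one token
        # stream, appending [SEP] as an EOS marker after each sequence.
        stream = []
        for ids in batch["input_ids"]:
            stream.extend(ids)
            stream.append(eos_token_id)
        return {"input_ids": [stream]}

    def _create_chunks(batch, context_length):
        # Cut the concatenated stream into fixed-size blocks, dropping the
        # short remainder at the end.
        stream = [tok for ids in batch["input_ids"] for tok in ids]
        n_blocks = len(stream) // context_length
        return {"input_ids": [stream[i * context_length:(i + 1) * context_length]
                              for i in range(n_blocks)]}

Because both .map() calls use batched=True with batch_size=-1, each helper sees an entire split as one batch, which is what lets _concat_with_eos collapse the split to a single row and _create_chunks re-expand it into fixed-length chunks.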