Module molcrawl.compounds.dataset.prepare_gpt2

Functions

def concatenate_texts(examples, eos_token_id)
def concatenate_texts(examples, eos_token_id):
    """Concatenate all input_ids sequences, appending eos_token_id after each."""
    concatenated_ids = []
    for input_ids in examples["input_ids"]:
        concatenated_ids.extend(list(input_ids) + [eos_token_id])
    return {"input_ids": concatenated_ids}

Concatenate all input_ids sequences, appending eos_token_id after each.
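A minimal sketch of how concatenate_texts behaves on a small batch; the token ids are illustrative only, and eos_token_id=13 ([SEP]) is the value assumed later in this module.

# Illustrative toy batch; ids are arbitrary, eos_token_id=13 assumed ([SEP]).
batch = {"input_ids": [[5, 8, 2], [7, 4]]}
merged = concatenate_texts(batch, eos_token_id=13)
print(merged["input_ids"])  # [5, 8, 2, 13, 7, 4, 13]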

def create_chunks(examples, context_length)
def create_chunks(examples, context_length):
    """Split a flat input_ids list into fixed-length chunks."""
    concatenated_ids = examples["input_ids"]
    total_length = (len(concatenated_ids) // context_length) * context_length
    concatenated_ids = concatenated_ids[:total_length]
    input_ids = [concatenated_ids[i : i + context_length] for i in range(0, total_length, context_length)]
    return {"input_ids": input_ids}

Split a flat input_ids list into fixed-length chunks.
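A minimal sketch of create_chunks on a toy flat list, using an assumed context_length of 4 for readability (the pipeline itself uses 1024); tokens that do not fill a complete chunk are dropped.

# Illustrative toy input; context_length=4 chosen only for readability.
batch = {"input_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9]}
chunks = create_chunks(batch, context_length=4)
print(chunks["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]]; the trailing 9 is discarded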

def tokenize_batch_dataset(compounds_dir, vocab_path, max_length)
def tokenize_batch_dataset(compounds_dir, vocab_path, max_length):
    """
    Tokenize GuacaMol benchmark data for GPT-2 training.

    Each SMILES is encoded without padding; all sequences are concatenated with
    [SEP] (eos_token_id=13) as the end-of-sequence marker and chunked into
    blocks of 1024 tokens — matching the genome_sequence / protein_sequence
    preparation pipeline.

    Args:
        compounds_dir: Base directory for compounds data (from LEARNING_SOURCE_DIR)
        vocab_path: Path to vocabulary file
        max_length: Maximum token length per SMILES (used for truncation)
    """
    from functools import partial

    from datasets import Dataset, DatasetDict

    tokenizer = CompoundsTokenizer(vocab_path, max_length)

    # GuacaMol benchmark data directory
    benchmark_dir = Path(compounds_dir) / "benchmark" / "GuacaMol"

    dataset_dic = {}
    for split in ["train", "valid", "test"]:
        smiles_file = benchmark_dir / f"guacamol_v1_{split}.smiles"

        if not smiles_file.exists() or smiles_file.stat().st_size == 0:
            raise FileNotFoundError(
                f"GuacaMol benchmark file not found: {smiles_file}\n\n"
                f"Please download GuacaMol data by running:\n"
                f"  LEARNING_SOURCE_DIR={os.environ.get('LEARNING_SOURCE_DIR', 'learning_source_YYYYMMDD')} "
                f"python src/preparation/preparation_script_compounds.py assets/configs/compounds.yaml "
                f"--datasets guacamol --download-only\n\n"
                f"Or download manually from: https://figshare.com/projects/GuacaMol/56639\n"
                f"And place the files in: {benchmark_dir}/"
            )

        print(f"Loading {split} data from: {smiles_file}")
        with open(smiles_file) as f:
            lines = [line.strip() for line in f if line.strip()]

        # Encode without padding; [SEP] is appended per-sequence by concatenate_texts
        encoded = []
        for smi in lines:
            ids = tokenizer.encode(smi, add_special_tokens=False, truncation=True, max_length=max_length)
            if ids:
                encoded.append(ids)

        if not encoded:
            raise ValueError(
                f"No valid SMILES encoded from {smiles_file}. The file may be empty or all entries were filtered out."
            )
        print(f"{split} - {len(encoded)} molecules encoded; first decoded: {tokenizer.decode(encoded[0])}")
        dataset_dic[split] = encoded

    d = {
        "train": Dataset.from_dict({"input_ids": dataset_dic["train"]}),
        "valid": Dataset.from_dict({"input_ids": dataset_dic["valid"]}),
        "test": Dataset.from_dict({"input_ids": dataset_dic["test"]}),
    }
    dataset = DatasetDict(d)

    # Concatenate sequences with [SEP] (id=13) as EOS, then chunk into 1024-token blocks
    context_length = 1024
    eos_id = tokenizer.eos_token_id  # 13 ([SEP])

    concatenated = dataset.map(
        partial(concatenate_texts, eos_token_id=eos_id),
        batched=True,
        batch_size=-1,
    )
    chunked = concatenated.map(
        partial(create_chunks, context_length=context_length),
        batched=True,
        batch_size=-1,
    )

    # Save to compounds directory structure
    output_path = benchmark_dir / "compounds" / "training_ready_hf_dataset"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Saving dataset to: {output_path}")
    print("Match this path to the train_gpt2_config.py->dataset_dir parameter.")
    chunked.save_to_disk(str(output_path))

    # Print statistics
    print("\nDataset statistics:")
    for split in ["train", "valid", "test"]:
        print(f"  {split}: {len(chunked[split])} chunks of {context_length} tokens")

Tokenize GuacaMol benchmark data for GPT-2 training.

Each SMILES is encoded without padding; all sequences are concatenated with [SEP] (eos_token_id=13) as the end-of-sequence marker and chunked into blocks of 1024 tokens — matching the genome_sequence / protein_sequence preparation pipeline.

Args

compounds_dir
Base directory for compounds data (from LEARNING_SOURCE_DIR)
vocab_path
Path to vocabulary file
max_length
Maximum token length per SMILES (used for truncation)
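A hedged usage sketch: the compounds directory and vocabulary file below are placeholders (the real values come from LEARNING_SOURCE_DIR and the project configuration), and max_length=128 is an assumed value; the saved dataset can then be reloaded with datasets.load_from_disk for GPT-2 training.

from pathlib import Path
from datasets import load_from_disk
from molcrawl.compounds.dataset.prepare_gpt2 import tokenize_batch_dataset

compounds_dir = "learning_source_YYYYMMDD/compounds"  # placeholder; base directory for compounds data
vocab_path = "assets/vocabs/compounds_vocab.txt"       # placeholder vocabulary file
tokenize_batch_dataset(compounds_dir, vocab_path, max_length=128)  # max_length assumed

# Reload the chunked dataset from the path written by tokenize_batch_dataset
output_path = Path(compounds_dir) / "benchmark" / "GuacaMol" / "compounds" / "training_ready_hf_dataset"
dataset = load_from_disk(str(output_path))
print(dataset)  # DatasetDict with train/valid/test splits of 1024-token chunks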