Module `molcrawl.compounds.dataset.dataset_config`

Defining and configuring compound datasets

Provides a uniform definition for processing each dataset individually.

Functions

def get_all_dataset_types() ‑> List[CompoundDatasetType]

Expand source code

def get_all_dataset_types() -> List[CompoundDatasetType]:
    """Return every ``CompoundDatasetType`` member as a new list.

    Returns:
        All members of the enum, in the order the enum yields them
        when iterated.
    """
    return [*CompoundDatasetType]

Get list of all dataset types

def get_available_datasets(compounds_dir: pathlib.Path) ‑> List[CompoundDatasetType]

Expand source code

def get_available_datasets(compounds_dir: Path) -> List[CompoundDatasetType]:
    """
    List the datasets whose raw data is present.

    A dataset is considered available when the raw data path reported by
    its ``DATASET_DEFINITIONS`` entry exists.

    Args:
        compounds_dir: Path of the compounds directory.

    Returns:
        Dataset types with existing raw data, in ``DATASET_DEFINITIONS``
        iteration order.
    """
    return [
        kind
        for kind, definition in DATASET_DEFINITIONS.items()
        if definition.get_raw_path(compounds_dir).exists()
    ]

Get list of available datasets

Returns only datasets with raw data.

Args

compounds_dir: path of the compounds directory

Returns

List of available datasets

def get_dataset_info(dataset_type: CompoundDatasetType) ‑> DatasetInfo

Expand source code

def get_dataset_info(dataset_type: CompoundDatasetType) -> DatasetInfo:
    """Look up the dataset information registered for ``dataset_type``.

    Args:
        dataset_type: Dataset type used as the key into
            ``DATASET_DEFINITIONS``.

    Returns:
        The entry stored in ``DATASET_DEFINITIONS`` for that type.
    """
    info = DATASET_DEFINITIONS[dataset_type]
    return info

Get dataset information

Classes

class CompoundDatasetType (value, names=None, *, module=None, qualname=None, type=None, start=1)

Expand source code

class CompoundDatasetType(str, Enum):
    """Compound dataset type

    Each member's value is the lowercase string identifier of the dataset.
    Because the class also derives from ``str``, members compare equal to
    their plain string values.
    """

    ZINC20 = "zinc20"
    OPV = "opv"
    PC9_GAP = "pc9_gap"
    ZINC_QM9 = "zinc_qm9"
    REDDB = "reddb"
    CHEMBL = "chembl"
    PUBCHEMQC_2017 = "pubchemqc_2017"
    PUBCHEMQC_2020 = "pubchemqc_2020"
    GUACAMOL = "guacamol"  # Benchmark dataset for GPT2

Compound dataset type

Ancestors

builtins.str
enum.Enum

Class variables

var CHEMBL
var GUACAMOL
var OPV
var PC9_GAP
var PUBCHEMQC_2017
var PUBCHEMQC_2020
var REDDB
var ZINC20
var ZINC_QM9

class DatasetInfo (name: str, dataset_type: CompoundDatasetType, source_subdir: str, source_filename: str, requires_download: bool = True, requires_properties: bool = True, sample_size: int | None = None)

Expand source code

@dataclass
class DatasetInfo:
    """Description of one compound dataset and where its files live.

    All path helpers take the compounds directory and return a location
    beneath it; none of them touch the filesystem.
    """

    # Dataset name; also used to build the processed/tokenized/HF paths.
    name: str
    # Dataset type this entry describes.
    dataset_type: CompoundDatasetType
    # Subdirectory of ``data/`` that holds the raw data.
    source_subdir: str
    # File name of the raw data inside ``source_subdir``.
    source_filename: str
    # Whether the dataset has to be downloaded.
    requires_download: bool = True
    # Whether property calculation is required.
    requires_properties: bool = True
    # Sampling size; ``None`` means all data will be used.
    sample_size: Optional[int] = None

    def get_raw_path(self, compounds_dir: Path) -> Path:
        """Return ``<compounds_dir>/data/<source_subdir>/<source_filename>``."""
        raw_dir = compounds_dir / "data" / self.source_subdir
        return raw_dir / self.source_filename

    def get_processed_path(self, compounds_dir: Path) -> Path:
        """Return ``<compounds_dir>/processed/<name>.parquet``."""
        processed_dir = compounds_dir / "processed"
        return processed_dir / f"{self.name}.parquet"

    def get_tokenized_path(self, compounds_dir: Path) -> Path:
        """Return ``<compounds_dir>/tokenized/<name>_tokenized.parquet``."""
        tokenized_dir = compounds_dir / "tokenized"
        return tokenized_dir / f"{self.name}_tokenized.parquet"

    def get_hf_dataset_path(self, compounds_dir: Path) -> Path:
        """Return ``<compounds_dir>/hf_datasets/<name>`` (HuggingFace Dataset format)."""
        hf_root = compounds_dir / "hf_datasets"
        return hf_root / self.name

Dataset information

Instance variables

var dataset_type : CompoundDatasetType
var name : str
var requires_download : bool
var requires_properties : bool
var sample_size : int | None
var source_filename : str
var source_subdir : str

Methods

def get_hf_dataset_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path

Expand source code

def get_hf_dataset_path(self, compounds_dir: Path) -> Path:
    """Return ``<compounds_dir>/hf_datasets/<name>`` (HuggingFace Dataset format)."""
    hf_root = compounds_dir / "hf_datasets"
    return hf_root / self.name

Get the path in HuggingFace Dataset format

def get_processed_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path

Expand source code

def get_processed_path(self, compounds_dir: Path) -> Path:
    """Return ``<compounds_dir>/processed/<name>.parquet``."""
    processed_dir = compounds_dir / "processed"
    return processed_dir / f"{self.name}.parquet"

Get path to processed data

def get_raw_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path

Expand source code

def get_raw_path(self, compounds_dir: Path) -> Path:
    """Return ``<compounds_dir>/data/<source_subdir>/<source_filename>``."""
    raw_dir = compounds_dir / "data" / self.source_subdir
    return raw_dir / self.source_filename

Get raw data path

def get_tokenized_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path

Expand source code

def get_tokenized_path(self, compounds_dir: Path) -> Path:
    """Return ``<compounds_dir>/tokenized/<name>_tokenized.parquet``."""
    tokenized_dir = compounds_dir / "tokenized"
    return tokenized_dir / f"{self.name}_tokenized.parquet"

Get path to tokenized data