Module molcrawl.compounds.dataset.dataset_config
Defining and configuring compound datasets
Provides a uniform definition for processing each dataset individually.
Functions
def get_all_dataset_types() -> List[CompoundDatasetType]
Expand source code
def get_all_dataset_types() -> List[CompoundDatasetType]:
    """Return every known compound dataset type as a list."""
    return [dataset_type for dataset_type in CompoundDatasetType]
def get_available_datasets(compounds_dir: pathlib.Path) -> List[CompoundDatasetType]
Expand source code
def get_available_datasets(compounds_dir: Path) -> List[CompoundDatasetType]:
    """Return the dataset types whose raw data is present on disk.

    Only datasets whose raw data file exists under *compounds_dir* are
    included.

    Args:
        compounds_dir: Path of the compounds directory.

    Returns:
        List of available dataset types.
    """
    return [
        dataset_type
        for dataset_type, info in DATASET_DEFINITIONS.items()
        if info.get_raw_path(compounds_dir).exists()
    ]
Returns only datasets with raw data.
Args
compounds_dir: path of the compounds directory
Returns
List of available datasets
def get_dataset_info(dataset_type: CompoundDatasetType) -> DatasetInfo
Expand source code
def get_dataset_info(dataset_type: CompoundDatasetType) -> DatasetInfo:
    """Look up the registered definition for a dataset type.

    Raises:
        KeyError: If no definition is registered for *dataset_type*.
    """
    info = DATASET_DEFINITIONS[dataset_type]
    return info
Classes
class CompoundDatasetType (value, names=None, *, module=None, qualname=None, type=None, start=1)-
Expand source code
class CompoundDatasetType(str, Enum):
    """Enumeration of supported compound dataset types.

    Subclasses ``str`` so members compare equal to their string value and
    serialize naturally.
    """

    ZINC20 = "zinc20"
    OPV = "opv"
    PC9_GAP = "pc9_gap"
    ZINC_QM9 = "zinc_qm9"
    REDDB = "reddb"
    CHEMBL = "chembl"
    PUBCHEMQC_2017 = "pubchemqc_2017"
    PUBCHEMQC_2020 = "pubchemqc_2020"
    GUACAMOL = "guacamol"  # Benchmark dataset for GPT2
Ancestors
- builtins.str
- enum.Enum
Class variables
var CHEMBLvar GUACAMOLvar OPVvar PC9_GAPvar PUBCHEMQC_2017var PUBCHEMQC_2020var REDDBvar ZINC20var ZINC_QM9
class DatasetInfo (name: str,
dataset_type: CompoundDatasetType,
source_subdir: str,
source_filename: str,
requires_download: bool = True,
requires_properties: bool = True,
sample_size: int | None = None)-
Expand source code
@dataclass
class DatasetInfo:
    """Definition of a single compound dataset and its on-disk layout.

    Path helpers derive every artifact location (raw, processed, tokenized,
    HuggingFace) from a common compounds directory.
    """

    name: str  # Dataset name
    dataset_type: CompoundDatasetType  # Dataset type
    source_subdir: str  # Raw data subdirectory (under data/)
    source_filename: str  # Raw data file name
    requires_download: bool = True  # Is download required?
    requires_properties: bool = True  # Is property calculation required?
    sample_size: Optional[int] = None  # Sampling size (None means use all data)

    def get_raw_path(self, compounds_dir: Path) -> Path:
        """Return the raw data file path under data/<subdir>/."""
        raw_dir = compounds_dir / "data" / self.source_subdir
        return raw_dir / self.source_filename

    def get_processed_path(self, compounds_dir: Path) -> Path:
        """Return the parquet path for the processed data."""
        return compounds_dir.joinpath("processed", self.name + ".parquet")

    def get_tokenized_path(self, compounds_dir: Path) -> Path:
        """Return the parquet path for the tokenized data."""
        filename = f"{self.name}_tokenized.parquet"
        return compounds_dir.joinpath("tokenized", filename)

    def get_hf_dataset_path(self, compounds_dir: Path) -> Path:
        """Return the directory for the HuggingFace Dataset format."""
        return compounds_dir / "hf_datasets" / self.name
Instance variables
var dataset_type : CompoundDatasetTypevar name : strvar requires_download : boolvar requires_properties : boolvar sample_size : int | Nonevar source_filename : strvar source_subdir : str
Methods
def get_hf_dataset_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path-
Expand source code
def get_hf_dataset_path(self, compounds_dir: Path) -> Path:
    """Return the directory used for the HuggingFace Dataset format."""
    return compounds_dir.joinpath("hf_datasets", self.name)
def get_processed_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path-
Expand source code
def get_processed_path(self, compounds_dir: Path) -> Path:
    """Return the parquet file path for the processed data."""
    filename = self.name + ".parquet"
    return compounds_dir / "processed" / filename
def get_raw_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path-
Expand source code
def get_raw_path(self, compounds_dir: Path) -> Path:
    """Return the raw data file path under data/<source_subdir>/."""
    raw_dir = compounds_dir / "data" / self.source_subdir
    return raw_dir / self.source_filename
def get_tokenized_path(self, compounds_dir: pathlib.Path) ‑> pathlib.Path-
Expand source code
def get_tokenized_path(self, compounds_dir: Path) -> Path:
    """Return the parquet file path for the tokenized data."""
    return compounds_dir.joinpath("tokenized", f"{self.name}_tokenized.parquet")