Module molcrawl.compounds.dataset.hf_converter
Conversion to HuggingFace Dataset format
Convert tokenized data to HuggingFace Dataset format.
Functions
def convert_all_tokenized_datasets(compounds_dir: pathlib.Path,
dataset_types: List[CompoundDatasetType] | None = None,
train_ratio: float = 0.9,
valid_ratio: float = 0.05,
test_ratio: float = 0.05,
force: bool = False,
random_seed: int = 42) -> dict
Expand source code
def convert_all_tokenized_datasets( compounds_dir: Path, dataset_types: Optional[List[CompoundDatasetType]] = None, train_ratio: float = 0.9, valid_ratio: float = 0.05, test_ratio: float = 0.05, force: bool = False, random_seed: int = 42, ) -> dict: """ Convert all tokenized datasets to HuggingFace format Args: compounds_dir: compounds directorypath of dataset_types: List of dataset types to convert (if None, all tokenized) train_ratio: Ratio of training data valid_ratio: Ratio of valid data test_ratio: Ratio of test data force: forced reconversion flag random_seed: random seed Returns: Dictionary of {dataset_type: dataset_dict} """ from molcrawl.compounds.dataset.dataset_config import get_dataset_info, DATASET_DEFINITIONS # Determine the dataset to be processed if dataset_types is None: # Get the dataset with tokenized data dataset_types = [] for dt, info in DATASET_DEFINITIONS.items(): tokenized_path = info.get_tokenized_path(compounds_dir) if tokenized_path.exists(): dataset_types.append(dt) if not dataset_types: logger.warning("No tokenized datasets available for conversion") return {} else: # If the specified dataset is a string, convert it to an Enum if isinstance(dataset_types[0], str): dataset_types = [CompoundDatasetType(dt) for dt in dataset_types] logger.info(f"Converting {len(dataset_types)} datasets: {[dt.value for dt in dataset_types]}") results = {} for dataset_type in dataset_types: info = get_dataset_info(dataset_type) converter = HFDatasetConverter(info, compounds_dir) dataset_dict = converter.convert( train_ratio=train_ratio, valid_ratio=valid_ratio, test_ratio=test_ratio, force=force, random_seed=random_seed, ) if dataset_dict is not None: results[dataset_type] = dataset_dict logger.info(f"Successfully converted {len(results)}/{len(dataset_types)} datasets") return resultsConvert all tokenized datasets to HuggingFace format
Args: compounds_dir: path of the compounds directory; dataset_types: list of dataset types to convert (if None, all tokenized datasets); train_ratio: ratio of training data; valid_ratio: ratio of validation data; test_ratio: ratio of test data; force: forced reconversion flag; random_seed: random seed
Returns:Dictionary of {dataset_type: dataset_dict}
Classes
class HFDatasetConverter (dataset_info: DatasetInfo,
compounds_dir: pathlib.Path)
Expand source code
class HFDatasetConverter:
    """
    HuggingFace Dataset transformation class.

    Convert tokenized data (parquet) to HuggingFace Dataset format,
    shuffled and split into train/valid/test.
    """

    def __init__(self, dataset_info: DatasetInfo, compounds_dir: Path):
        """
        Args:
            dataset_info: Dataset information (paths and naming).
            compounds_dir: Path to the compounds directory.
        """
        self.dataset_info = dataset_info
        self.compounds_dir = Path(compounds_dir)

    def _load_existing(self, hf_path: Path) -> Optional[DatasetDict]:
        """Load a previously converted DatasetDict, or None if it is incomplete/broken."""
        try:
            logger.info(f"✓ {self.dataset_info.name}: Already converted, loading from {hf_path}")
            split_paths = {name: hf_path / name for name in ("train", "valid", "test")}
            # All three split directories must be present; otherwise reconvert.
            if all(p.exists() for p in split_paths.values()):
                return DatasetDict(
                    {name: Dataset.load_from_disk(str(p)) for name, p in split_paths.items()}
                )
        except Exception as e:
            logger.warning(f"Failed to load existing dataset: {e}, will reconvert")
        return None

    @staticmethod
    def _split(df, train_ratio: float, valid_ratio: float):
        """Split an already-shuffled DataFrame into (train, valid, test); test takes the remainder."""
        total_samples = len(df)
        train_end = int(total_samples * train_ratio)
        valid_end = train_end + int(total_samples * valid_ratio)
        return df[:train_end], df[train_end:valid_end], df[valid_end:]

    def convert(
        self,
        train_ratio: float = 0.9,
        valid_ratio: float = 0.05,
        test_ratio: float = 0.05,
        force: bool = False,
        random_seed: int = 42,
    ) -> Optional[DatasetDict]:
        """
        Convert to HuggingFace Dataset format.

        Args:
            train_ratio: Ratio of training data.
            valid_ratio: Ratio of validation data.
            test_ratio: Ratio of test data. NOTE: the split assigns whatever
                remains after train/valid to test, so this value is only
                documentary — it is never used in the arithmetic.
            force: Forced reconversion flag.
            random_seed: Seed for the deterministic shuffle.

        Returns:
            DatasetDict (None in case of error or missing tokenized data).
        """
        hf_path = self.dataset_info.get_hf_dataset_path(self.compounds_dir)

        # Skip if already converted (unless forced)
        if not force and hf_path.exists():
            existing = self._load_existing(hf_path)
            if existing is not None:
                return existing

        # Load tokenized data
        tokenized_path = self.dataset_info.get_tokenized_path(self.compounds_dir)
        if not tokenized_path.exists():
            logger.warning(
                f"⚠ {self.dataset_info.name}: Tokenized data not found at {tokenized_path}\n Please run tokenization first."
            )
            return None

        logger.info(f"🔄 {self.dataset_info.name}: Converting to HuggingFace Dataset format...")
        try:
            table = pq.read_table(tokenized_path)
            df = table.to_pandas()

            # Shuffle deterministically, then split by ratio.
            df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
            train_df, valid_df, test_df = self._split(df, train_ratio, valid_ratio)
            logger.info(f" Split: train={len(train_df)}, valid={len(valid_df)}, test={len(test_df)}")

            # Convert to HuggingFace Dataset format
            dataset_dict = DatasetDict(
                {
                    "train": Dataset.from_pandas(train_df, preserve_index=False),
                    "valid": Dataset.from_pandas(valid_df, preserve_index=False),
                    "test": Dataset.from_pandas(test_df, preserve_index=False),
                }
            )

            # Save as DatasetDict so `load_from_disk(hf_path)` works directly.
            hf_path.mkdir(parents=True, exist_ok=True)
            dataset_dict.save_to_disk(str(hf_path))
            for split_name, split_dataset in dataset_dict.items():
                logger.info(f" Saved {split_name}: {len(split_dataset)} samples to {hf_path / split_name}")

            logger.info(f"✓ {self.dataset_info.name}: Converted to HuggingFace Dataset format")
            return dataset_dict
        except Exception as e:
            logger.error(f"✗ {self.dataset_info.name}: Conversion failed - {e}")
            return None
Convert tokenized data to HuggingFace Dataset format.
Args: dataset_info: Dataset information; compounds_dir: Path to the compounds directory
Methods
def convert(self,
train_ratio: float = 0.9,
valid_ratio: float = 0.05,
test_ratio: float = 0.05,
force: bool = False,
random_seed: int = 42) -> datasets.dataset_dict.DatasetDict | None
Expand source code
def convert(
    self,
    train_ratio: float = 0.9,
    valid_ratio: float = 0.05,
    test_ratio: float = 0.05,
    force: bool = False,
    random_seed: int = 42,
) -> Optional[DatasetDict]:
    """
    Convert to HuggingFace Dataset format.

    Args:
        train_ratio: Ratio of training data.
        valid_ratio: Ratio of validation data.
        test_ratio: Ratio of test data (split takes the remainder).
        force: Forced reconversion flag.
        random_seed: Seed for the deterministic shuffle.

    Returns:
        DatasetDict (None in case of error).
    """
    hf_path = self.dataset_info.get_hf_dataset_path(self.compounds_dir)

    # Reuse a previous conversion unless the caller forces a redo.
    if not force and hf_path.exists():
        try:
            logger.info(f"✓ {self.dataset_info.name}: Already converted, loading from {hf_path}")
            # Every split directory must be present to short-circuit.
            split_dirs = {name: hf_path / name for name in ("train", "valid", "test")}
            if all(path.exists() for path in split_dirs.values()):
                return DatasetDict(
                    {name: Dataset.load_from_disk(str(path)) for name, path in split_dirs.items()}
                )
        except Exception as e:
            logger.warning(f"Failed to load existing dataset: {e}, will reconvert")

    # Without tokenized input there is nothing to convert.
    tokenized_path = self.dataset_info.get_tokenized_path(self.compounds_dir)
    if not tokenized_path.exists():
        logger.warning(
            f"⚠ {self.dataset_info.name}: Tokenized data not found at {tokenized_path}\n Please run tokenization first."
        )
        return None

    logger.info(f"🔄 {self.dataset_info.name}: Converting to HuggingFace Dataset format...")
    try:
        # Read the parquet table and materialize it as a DataFrame.
        frame = pq.read_table(tokenized_path).to_pandas()

        # Deterministic shuffle, then contiguous ratio-based slices;
        # the test split absorbs whatever rows remain.
        frame = frame.sample(frac=1, random_state=random_seed).reset_index(drop=True)
        n_rows = len(frame)
        cut_train = int(n_rows * train_ratio)
        cut_valid = cut_train + int(n_rows * valid_ratio)
        train_df = frame[:cut_train]
        valid_df = frame[cut_train:cut_valid]
        test_df = frame[cut_valid:]

        logger.info(f" Split: train={len(train_df)}, valid={len(valid_df)}, test={len(test_df)}")

        # Wrap each slice as a HuggingFace Dataset.
        dataset_dict = DatasetDict(
            {
                "train": Dataset.from_pandas(train_df, preserve_index=False),
                "valid": Dataset.from_pandas(valid_df, preserve_index=False),
                "test": Dataset.from_pandas(test_df, preserve_index=False),
            }
        )

        # Save as DatasetDict so `load_from_disk(hf_path)` works directly.
        hf_path.mkdir(parents=True, exist_ok=True)
        dataset_dict.save_to_disk(str(hf_path))
        for split_name, split_dataset in dataset_dict.items():
            logger.info(f" Saved {split_name}: {len(split_dataset)} samples to {hf_path / split_name}")

        logger.info(f"✓ {self.dataset_info.name}: Converted to HuggingFace Dataset format")
        return dataset_dict
    except Exception as e:
        logger.error(f"✗ {self.dataset_info.name}: Conversion failed - {e}")
        return None
Args: train_ratio: ratio of training data; valid_ratio: ratio of validation data; test_ratio: ratio of test data; force: forced reconversion flag; random_seed: random seed
Returns:DatasetDict (None in case of error)