Module molcrawl.compounds.utils.general
Functions
def combine_datasets(raw_data_dir: str, output_dir: str)-
Expand source code
def combine_datasets(raw_data_dir: str, output_dir: str):
    """Merge all raw datasets into the combined OrganiX13 dataset.

    Thin wrapper: both directories are forwarded unchanged to
    ``combine_all``.

    Args:
        raw_data_dir: Directory holding the raw data.
        output_dir: Directory that receives the integrated data.
    """
    combine_all(raw_data_dir, output_dir)
Args
raw_data_dir: raw data directory
output_dir: Integrated data output directory
def download_additional_datasets(raw_data_dir: str)-
Expand source code
def download_additional_datasets(raw_data_dir: str):
    """Backwards-compatible alias for ``download_llamol_datasets``.

    New code should call ``download_llamol_datasets`` directly.

    Args:
        raw_data_dir: Passed through unchanged to
            ``download_llamol_datasets``.
    """
    download_llamol_datasets(raw_data_dir)
def download_datasets(raw_data_dir: str, output_dir: str)-
Expand source code
def download_datasets(raw_data_dir: str, output_dir: str):
    """Download every dataset, then integrate them (legacy entry point).

    Runs the ZINC20, OPV and LlaMol download steps in that order and
    finishes by calling ``combine_datasets``.

    Args:
        raw_data_dir: Raw data storage directory.
        output_dir: Integrated data output directory.
    """
    # Each download step takes only the raw data directory; the tuple
    # fixes the execution order.
    download_steps = (download_zinc20, download_opv, download_llamol_datasets)
    for fetch in download_steps:
        fetch(raw_data_dir)

    # Integration runs last, after all downloads have returned.
    combine_datasets(raw_data_dir, output_dir)
Args
raw_data_dir: raw data storage directory
output_dir: Integrated data output directory
def download_llamol_datasets(raw_data_dir: str)-
Expand source code
def download_llamol_datasets(raw_data_dir: str):
    """Download the LlaMol dataset (Fraunhofer-SCAI/llamol repository).

    Files are stored under ``<raw_data_dir>/data/Fraunhofer-SCAI-llamol``.
    Once the download call returns, ``llamol_download.marker`` is touched
    in ``<raw_data_dir>/data``.

    Args:
        raw_data_dir: Raw data storage directory (COMPOUNDS_DIR).
    """
    data_root = os.path.join(raw_data_dir, "data")
    target_dir = os.path.join(data_root, "Fraunhofer-SCAI-llamol")
    # Creating the target also creates its parent ``data`` directory,
    # which is where the marker file is written below.
    os.makedirs(target_dir, exist_ok=True)

    # Integrity check of any parquet files already present, then download.
    _verify_llamol_parquet_files(target_dir)
    download_datasets_from_repo(target_dir)

    # Marker file for this download step.
    Path(data_root, "llamol_download.marker").touch()
Args
raw_data_dir: raw data storage directory (COMPOUNDS_DIR)
def download_opv(raw_data_dir: str)-
Expand source code
def download_opv(raw_data_dir: str):
    """Download the OPV dataset into ``<raw_data_dir>/data/opv``.

    Once ``OPV`` has been called on that directory,
    ``opv_download.marker`` is touched in ``<raw_data_dir>/data``.

    Args:
        raw_data_dir: Raw data storage directory (COMPOUNDS_DIR).
    """
    data_root = os.path.join(raw_data_dir, "data")
    target_dir = os.path.join(data_root, "opv")
    # Creating the target also creates its parent ``data`` directory,
    # which is where the marker file is written below.
    os.makedirs(target_dir, exist_ok=True)

    # The call is made for its side effects; its result is not kept.
    OPV(target_dir)

    # Marker file for this download step.
    Path(data_root, "opv_download.marker").touch()
Args
raw_data_dir: raw data storage directory (COMPOUNDS_DIR)
def download_zinc20(raw_data_dir: str)-
Expand source code
def download_zinc20(raw_data_dir: str):
    """Download the ZINC20 dataset and convert it to parquet.

    The converted data is targeted at ``<raw_data_dir>/data/zinc20``.
    Afterwards ``zinc20_download.marker`` is touched in
    ``<raw_data_dir>/data``.

    Args:
        raw_data_dir: Raw data storage directory (COMPOUNDS_DIR).
    """
    download_zinc_files()

    data_root = os.path.join(raw_data_dir, "data")
    convert_zinc_to_parquet(os.path.join(data_root, "zinc20"))

    # Make sure the ``data`` directory exists before touching the marker.
    os.makedirs(data_root, exist_ok=True)
    Path(data_root, "zinc20_download.marker").touch()
Args
raw_data_dir: raw data storage directory (COMPOUNDS_DIR)