Module molcrawl.compounds.dataset.organix13.combine_all
Functions
def calcLogPIfMol(smi)-
Expand source code
def calcLogPIfMol(smi):
    """Return the Crippen logP for a SMILES string, or None when RDKit cannot parse it."""
    Chem, Descriptors, _ = _get_rdkit_helpers()
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Unparsable SMILES: signal invalid molecule to the caller.
        return None
    return Descriptors.MolLogP(mol)
Expand source code
def calcMol(smi):
    """Parse a SMILES string into an RDKit Mol object (None when invalid)."""
    rdkit_chem = _get_rdkit_helpers()[0]
    return rdkit_chem.MolFromSmiles(smi)
Expand source code
def calcMolWeight(smi):
    """Return the exact molecular weight for a SMILES string, or None if unparsable."""
    Chem, Descriptors, _ = _get_rdkit_helpers()
    parsed = Chem.MolFromSmiles(smi)
    return None if parsed is None else Descriptors.ExactMolWt(parsed)
Expand source code
def calcSascore(smi):
    """Return the synthetic-accessibility score for a SMILES string, or None if unparsable."""
    Chem, _, sascorer = _get_rdkit_helpers()
    parsed = Chem.MolFromSmiles(smi)
    return sascorer.calculateScore(parsed) if parsed is not None else None
Expand source code
def calculateProperties(df):
    """Build a properties DataFrame (smiles, logp, mol_weight, sascore) from df["smiles"]."""
    import pandas as pd

    smiles, logps, weights, scores = calculateValues(df["smiles"])
    columns = {
        "smiles": smiles,
        "logp": logps,
        "mol_weight": weights,
        "sascore": scores,
    }
    return pd.DataFrame(columns)
Expand source code
def calculateValues(smi: "pd.Series", n_workers: int = 16):
    """Compute logP, molecular weight and SA score for each SMILES in *smi*.

    SMILES that fail to parse (logP comes back as None) are dropped before
    the weight and SA-score passes, so the returned collections are aligned
    element-for-element.

    Args:
        smi: Series (or sequence) of SMILES strings.
        n_workers: Size of the worker pool. Defaults to 16, the previously
            hard-coded value, so existing callers are unaffected.

    Returns:
        Tuple ``(smi, logps, mol_weights, sascores)`` — the first two are
        pandas Series re-indexed from 0, the last two are lists.
    """
    import pandas as pd

    logging.info("Calculating properties")
    with multiprocessing.Pool(n_workers) as pool:
        logging.info("Starting logps")
        logps_list = pool.map(calcLogPIfMol, smi)
        # None (unparsable SMILES) becomes NaN under isna; keep only valid rows.
        valid_mols = ~pd.isna(logps_list)
        logps = pd.Series(logps_list)[valid_mols]
        smi = pd.Series(smi)[valid_mols]
        logps.reset_index(drop=True, inplace=True)
        smi.reset_index(drop=True, inplace=True)
        logging.info("Starting mol weights")
        mol_weights = pool.map(calcMolWeight, smi)
        logging.info("Starting sascores")
        sascores = pool.map(calcSascore, smi)
    return smi, logps, mol_weights, sascores
Expand source code
def combine_all(raw_data_path: str, save_path: str):
    """
    Combine all datasets to generate OrganiX13

    Args:
        raw_data_path: COMPOUNDS_DIR (example: learning_20251104/compounds)
        save_path: Output directory (e.g. learning_20251104/compounds/organix13)
    """
    import pandas as pd

    # data directory path
    data_dir = os.path.join(raw_data_path, "data")
    llamol_dir = os.path.join(data_dir, "Fraunhofer-SCAI-llamol")

    def _process(label, path, dataset_name, sample_n=None):
        # One dataset: log, read (with corruption handling via
        # safe_read_parquet), optionally subsample, then compute the
        # logp/mol_weight/sascore property columns.
        logging.info(f"Processing {label}")
        df = safe_read_parquet(path, dataset_name)
        if sample_n is not None:
            df = df.sample(n=sample_n)
        return calculateProperties(df)

    # Processing order and log labels preserved from the original inline code.
    df_pc9 = _process("df_pc9", os.path.join(llamol_dir, "Full_PC9_GAP.parquet"), "PC9 GAP")
    df_zinc_full = _process(
        "df_zinc_full",
        os.path.join(data_dir, "zinc20", "zinc_processed.parquet"),
        "ZINC20 Full",
        sample_n=5_000_000,
    )
    df_zinc_qm9 = _process(
        "df_zinc_qm9", os.path.join(llamol_dir, "qm9_zinc250_cep.parquet"), "ZINC QM9"
    )
    df_opv = _process("df_opv", os.path.join(data_dir, "opv", "opv.parquet"), "OPV")
    # Source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/F3QFSQ
    df_reddb = _process("df_reddb", os.path.join(llamol_dir, "RedDB_Full.parquet"), "RedDB")
    df_chembl = _process(
        "df_chembl", os.path.join(llamol_dir, "chembl_log_sascore.parquet"), "ChEMBL"
    )
    df_pubchemqc_2017 = _process(
        "df_pubchemqc_2017", os.path.join(llamol_dir, "pubchemqc_energy.parquet"), "PubChemQC 2017"
    )
    df_pubchemqc_2020 = _process(
        "df_pubchemqc_2020",
        os.path.join(llamol_dir, "pubchemqc2020_energy.parquet"),
        "PubChemQC 2020",
    )

    df_list = [
        df_zinc_qm9,
        df_opv,
        df_pubchemqc_2017,
        df_pubchemqc_2020,
        df_zinc_full,
        df_reddb,
        df_pc9,
        df_chembl,
    ]
    logging.info(f"ZINC QM9 {len(df_zinc_qm9)}")
    logging.info(f"df_opv {len(df_opv)}")
    logging.info(f"df_pubchemqc_2017 {len(df_pubchemqc_2017)}")
    logging.info(f"df_pubchemqc_2020 {len(df_pubchemqc_2020)}")
    logging.info(f"df_zinc_full {len(df_zinc_full)}")
    logging.info(f"df_reddb {len(df_reddb)}")
    logging.info(f"df_pc9 {len(df_pc9)}")
    logging.info(f"df_chembl {len(df_chembl)}")

    all_columns = [
        "smiles",
        "logp",
        "sascore",
        "mol_weight",
    ]
    logging.info("concatenting")
    df = pd.concat(df_list, axis=0, ignore_index=True)
    df = df[all_columns]
    df.reset_index(drop=True, inplace=True)
    # mol_weight divided by 100 — presumably a normalization for downstream
    # consumers; TODO confirm against the training code that reads this file.
    df["mol_weight"] = df["mol_weight"] / 100.0
    logging.info(df.head())
    logging.info("saving")
    logging.info("Combined len: {}".format(len(df)))
    # Robustness fix: ensure the output directory exists before writing.
    os.makedirs(save_path, exist_ok=True)
    df.to_parquet(os.path.join(save_path, "OrganiX13.parquet"))
Args:raw_data_path: COMPOUNDS_DIR (example: learning_20251104/compounds) save_path: Output directory (e.g. learning_20251104/compounds/organix13)
def safe_read_parquet(file_path, dataset_name)-
Expand source code
def safe_read_parquet(file_path, dataset_name):
    """
    Safely read a parquet file with error handling

    Args:
        file_path: Path to the parquet file
        dataset_name: Name of the dataset for logging

    Returns:
        DataFrame with the file's contents

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file is corrupted or otherwise unreadable
    """
    # Fix: the docstring originally appeared AFTER the pandas import, making it
    # a discarded expression statement instead of the function's __doc__; it is
    # now the first statement. The "or None if corrupted" claim was also wrong:
    # a corrupted file raises ValueError, it never returns None.
    import pandas as pd

    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"{dataset_name} parquet file not found: {file_path}\nPlease re-run the download step to obtain this file."
        )
    try:
        logger.info(f"Reading {dataset_name} from {file_path}")
        df = pd.read_parquet(file_path)
        logger.info(f"Successfully loaded {dataset_name}: {len(df)} rows")
        return df
    except Exception as e:
        # Preserve the original failure as the cause and give the operator a
        # concrete recovery command in the message.
        error_msg = (
            f"Failed to read {dataset_name} parquet file: {file_path}\n"
            f"Error: {str(e)}\n"
            f"The file may be corrupted or incomplete.\n"
            f"Solution: Delete the file and re-run the download:\n"
            f" rm {file_path}\n"
            f" LEARNING_SOURCE_DIR=<your_dir> ./bootstraps/01_compounds_prepare.sh"
        )
        logger.error(error_msg)
        raise ValueError(error_msg) from e