Module molcrawl.compounds.utils.preprocessing
Functions
def get_invalid_smiles_stats()-
Expand source code
def get_invalid_smiles_stats():
    """
    Get statistics for invalid SMILES.

    Reads the module-level counters ``_invalid_smiles_count`` and
    ``_total_smiles_count`` and the ``_invalid_smiles_examples`` list.

    Returns:
        tuple: (number of invalid SMILES, total number of SMILES,
        invalid rate as a percentage, list of invalid examples).
        ``(0, 0, 0.0, [])`` is returned while the total count is zero.
    """
    if _total_smiles_count == 0:
        # Nothing counted yet; this also avoids dividing by zero below.
        return 0, 0, 0.0, []

    invalid_rate = (_invalid_smiles_count / _total_smiles_count) * 100
    # Return a copy so callers cannot mutate the module's own example list
    # (the empty branch above already hands back a fresh list).
    return (
        _invalid_smiles_count,
        _total_smiles_count,
        invalid_rate,
        list(_invalid_smiles_examples),
    )
Returns: tuple: (number of invalid SMILES, total number of SMILES, invalid rate as a percentage, list of invalid examples)
def prepare_scaffolds(smiles: str)-
Expand source code
def _record_invalid_smiles(reason, example):
    """Count one invalid SMILES, keep the first few examples, log periodically."""
    global _invalid_smiles_count
    _invalid_smiles_count += 1

    # Save only the first 10 invalid examples.
    if len(_invalid_smiles_examples) < 10:
        _invalid_smiles_examples.append((reason, example))

    # Log statistics every 1000 invalid items, whichever branch recorded them.
    if _invalid_smiles_count % 1000 == 0:
        invalid_count, total_count, invalid_rate, _ = get_invalid_smiles_stats()
        logger.warning(
            "Invalid SMILES detected: %d/%d (%.2f%%)",
            invalid_count,
            total_count,
            invalid_rate,
        )


def prepare_scaffolds(smiles: str):
    """
    Prepare the scaffolds of a molecule.

    Every call increments the module-level total counter; every input that
    cannot be turned into a scaffold is recorded as invalid (counter, first
    10 examples, and a warning every 1000 invalid items).

    Args:
        smiles: SMILES string

    Returns:
        str: scaffold SMILES string, empty string if invalid

    Raises:
        ModuleNotFoundError: if rdkit is unavailable (``Chem`` or
            ``GetScaffoldForMol`` is ``None``) and the input is not
            empty or ".".

    Note:
        Large databases such as ZINC20 may contain invalid SMILES for the
        following reasons:
        1. Notation problems for ionic structures such as quaternary
           ammonium (N+)
        2. Conversion errors from other formats
        3. Representation of special stereochemistry
        4. Automatic processing errors during database creation
        These are usually within an acceptable range of a few percent of
        the total database.
    """
    global _total_smiles_count
    _total_smiles_count += 1

    if smiles == "." or not smiles:
        _record_invalid_smiles("empty or dot", smiles)
        return ""

    if Chem is None or GetScaffoldForMol is None:
        raise ModuleNotFoundError("rdkit is required to prepare scaffolds")

    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            _record_invalid_smiles("parse_failed", smiles[:100])
            return ""

        scaffold = GetScaffoldForMol(molecule)
        return Chem.MolToSmiles(scaffold)
    except Exception as e:
        # Best-effort by design: any failure maps to "" for the caller.
        # str() keeps the handler itself from raising when the input is not
        # sliceable (e.g. a float NaN coming out of a dataframe column).
        shown = str(smiles)
        _record_invalid_smiles("exception", f"{shown[:100]} | Error: {str(e)[:50]}")
        logger.debug("Error processing SMILES '%s...': %s", shown[:50], e)
        return ""
Args: smiles: SMILES string
Returns: str: scaffold SMILES string, empty string if invalid
Note: Large databases such as ZINC20 may contain invalid SMILES for the following reasons: 1. Notation problems for ionic structures such as quaternary ammonium (N+) 2. Conversion errors from other formats 3. Representation of special stereochemistry 4. Automatic processing errors during database creation
These are usually within an acceptable range of a few percent of the total database.