Module molcrawl.compounds.utils.tokenizer
Functions
def load_vocab(vocab_file)
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

Loads a vocabulary file into a dictionary.
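A minimal usage sketch (the file name and tokens below are illustrative): the vocabulary file holds one token per line, and load_vocab maps each token to its zero-based line index.

# Write a tiny illustrative vocabulary, then load it back.
with open("tiny_vocab.txt", "w", encoding="utf-8") as f:
    f.write("[PAD]\n[unused1]\n[CLS]\n[SEP]\nC\nO\n")
vocab = load_vocab("tiny_vocab.txt")
assert vocab["C"] == 4  # tokens are numbered in file order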
def tokenize_smiles(smi: List[str],
tokenizer: SmilesTokenizer = None,
max_smiles_len=256,
log_output=True)
def tokenize_smiles(
    smi: List[str],
    tokenizer: SmilesTokenizer = None,
    max_smiles_len=256,
    log_output=True,
):
    # Encode a single SMILES string; despite the List[str] annotation,
    # tokenizer.encode() is called on `smi` directly.
    tokens = tokenizer.encode(smi)
    if len(tokens) > max_smiles_len:
        if log_output:
            print(f"Removing too long {smi} with smiles len of {len(tokens)}")
        return None
    return tokens
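A hedged usage sketch; "vocab.txt" is a hypothetical path to a vocabulary covering the tokens in the input, and None signals a sequence that exceeded max_smiles_len.

# "vocab.txt" is a hypothetical vocabulary path.
tokenizer = SmilesTokenizer("vocab.txt")
token_ids = tokenize_smiles("CC(=O)OC1=CC=CC=C1C(=O)O", tokenizer=tokenizer, max_smiles_len=256)
if token_ids is None:
    print("SMILES exceeded max_smiles_len and was dropped")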
Classes
class BasicSmilesTokenizer (regex_pattern: str = '(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])')
class BasicSmilesTokenizer(object):
    """
    Run basic SMILES tokenization using a regex pattern developed by
    Schwaller et al. Use this tokenizer when you need one that does not
    depend on Hugging Face's transformers library.

    Examples
    --------
    >>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']

    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar,
       Christopher A. Hunter, Costas Bekas, and Alpha A. Lee. "Molecular
       Transformer: A Model for Uncertainty-Calibrated Chemical Reaction
       Prediction." ACS Central Science 2019, 5 (9), 1572-1583.
       DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Constructs a BasicSmilesTokenizer.

        Parameters
        ----------
        regex_pattern: str
            SMILES token regex
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        tokens = [token for token in self.regex.findall(text)]
        return tokens

Run basic SMILES tokenization using a regex pattern developed by Schwaller et al. Use this tokenizer when you need one that does not depend on Hugging Face's transformers library.
Examples
>>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
>>> tokenizer = BasicSmilesTokenizer()
>>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']

References
.. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee. "Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction." ACS Central Science 2019, 5 (9), 1572-1583. DOI: 10.1021/acscentsci.9b00576
Constructs a BasicSmilesTokenizer.

Parameters
regex_pattern : str
    SMILES token regex
Methods
def tokenize(self, text)
def tokenize(self, text):
    """Basic tokenization of a SMILES string."""
    tokens = [token for token in self.regex.findall(text)]
    return tokens

Basic tokenization of a SMILES string.
class BertTokenizer (*args, **kwargs)
class BertTokenizer:  # type: ignore
    def __init__(self, *args, **kwargs):
        raise ModuleNotFoundError("transformers is required to use SmilesTokenizer")

Fallback stub that raises ModuleNotFoundError when the transformers library is not installed.

Subclasses

SmilesTokenizer
class CompoundsTokenizer (vocab_file: str, max_len: int = 256)
class CompoundsTokenizer(UnTrainableTokenizer, SmilesTokenizer):
    def __init__(self, vocab_file: str, max_len: int = 256):
        self.max_len = max_len
        UnTrainableTokenizer.__init__(self)
        SmilesTokenizer.__init__(self, vocab_file)

    def tokenize_text(self, text: str, verbose: bool = False):
        # Encode with padding and truncation to max_len; sequences that still
        # exceed max_len are dropped (None is returned).
        tokens = self.encode(text, padding="max_length", truncation=True, max_length=self.max_len)
        if len(tokens) > self.max_len:
            if verbose:
                logger.info(f"Removing too long {text} with len of {len(tokens)}")
            return None
        return tokens

SMILES tokenizer that pads and truncates every encoding to a fixed max_len.
Ancestors

UnTrainableTokenizer
SmilesTokenizer
BertTokenizer
Subclasses

ScaffoldsTokenizer
Methods
def tokenize_text(self, text: str, verbose: bool = False)
def tokenize_text(self, text: str, verbose: bool = False):
    # Encode with padding and truncation to max_len; sequences that still
    # exceed max_len are dropped (None is returned).
    tokens = self.encode(text, padding="max_length", truncation=True, max_length=self.max_len)
    if len(tokens) > self.max_len:
        if verbose:
            logger.info(f"Removing too long {text} with len of {len(tokens)}")
        return None
    return tokens
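A sketch of the intended call pattern, assuming a hypothetical vocab.txt that contains the required tokens:

# "vocab.txt" is a hypothetical vocabulary path.
tokenizer = CompoundsTokenizer("vocab.txt", max_len=128)
ids = tokenizer.tokenize_text("CC(=O)OC1=CC=CC=C1C(=O)O")
if ids is not None:
    assert len(ids) == 128  # encodings come back padded to max_len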
Inherited members
class ScaffoldsTokenizer (vocab_file: str, max_len: int = 256)
class ScaffoldsTokenizer(CompoundsTokenizer):
    def tokenize_text(self, text: str, verbose: bool = False):
        # Reduce the SMILES to its scaffold before tokenizing.
        return super().tokenize_text(prepare_scaffolds(text), verbose=verbose)

CompoundsTokenizer variant that reduces each SMILES to its scaffold (via prepare_scaffolds) before tokenizing.
Ancestors

CompoundsTokenizer
UnTrainableTokenizer
SmilesTokenizer
BertTokenizer
Methods
def tokenize_text(self, text: str, verbose: bool = False)
def tokenize_text(self, text: str, verbose: bool = False):
    return super().tokenize_text(prepare_scaffolds(text), verbose=verbose)
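The call pattern matches CompoundsTokenizer; the only difference is that prepare_scaffolds (a module helper not shown on this page, assumed here to extract a molecular scaffold) is applied first. The vocab path is hypothetical.

tokenizer = ScaffoldsTokenizer("vocab.txt", max_len=128)
ids = tokenizer.tokenize_text("CC(=O)OC1=CC=CC=C1C(=O)O")  # encodes the scaffold, not the full molecule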
Inherited members
class SmilesTokenizer (vocab_file, **kwargs)
class SmilesTokenizer(BertTokenizer):
    """
    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the
    BertTokenizer implementation found in Hugging Face's transformers library.
    It runs a WordPiece tokenization algorithm over SMILES strings using the
    SMILES tokenization regex developed by Schwaller et al.

    Please see https://github.com/huggingface/transformers and
    https://github.com/rxn4chemistry/rxnfp for more details.

    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H;
       Kreutter, David; Laino, Teodoro; et al. (2019): Mapping the Space of
       Chemical Reactions using Attention-Based Neural Networks. ChemRxiv.
       Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Notes
    -----
    This class requires Hugging Face's transformers and tokenizers libraries
    to be installed.
    """

    def __init__(self, vocab_file, **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a vocabulary file with one SMILES token per line.
            Default vocab file is found in deepchem/feat/tests/data/vocab.txt
        """
        super().__init__(vocab_file, **kwargs)
        self.sos = "[CLS]"
        self.eos = "[SEP]"
        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocab file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.highest_unused_index = max(
            [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
        )
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()]
        )
        self.basic_tokenizer = BasicSmilesTokenizer()

    @property
    def eos_token_id(self):
        return self.added_tokens_encoder[self.eos]

    @property
    def vocab_size(self):
        return len(self.vocab)

    @property
    def vocab_list(self):
        return list(self.vocab.keys())

    def _tokenize(self, text: str):
        """
        Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        """
        split_tokens = [token for token in self.basic_tokenizer.tokenize(text)]
        return split_tokens

    def _convert_token_to_id(self, token):
        """
        Converts a token (str/unicode) into an id using the vocab.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) into a token (string/unicode) using the vocab.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part
            of a larger sequence.
        """
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens into a single string.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """
        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: list[int]
            List of tokenized input ids. Can be obtained using the encode or
            encode_plus methods.
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int], token_ids_1: List[int]) -> List[int]:
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self, token_ids: List[int], length: int, right: bool = True) -> List[int]:
        """
        Adds padding tokens to return a sequence of the requested length.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: list[int]
            List of tokenized input ids. Can be obtained using the encode or
            encode_plus methods.
        length: int
            Target length of the padded sequence.
        right: bool (True by default)
            Whether to pad on the right (True) or the left (False).

        Returns
        -------
        List[int]
            The input ids padded with pad_token_id up to the requested length.
        """
        padding = [self.pad_token_id] * (length - len(token_ids))
        if right:
            return token_ids + padding
        else:
            return padding + token_ids

    def save_vocabulary(self, vocab_path: str):  # -> tuple[str]: doctest issue raised with this return type annotation
        """
        Save the tokenizer vocabulary to a file.

        Parameters
        ----------
        vocab_path: str
            The directory in which to save the one-token-per-line vocabulary file.
            Default vocab file is found in deepchem/feat/tests/data/vocab.txt

        Returns
        -------
        vocab_file: Tuple(str)
            Paths to the files saved: a one-element tuple with the path to the
            written vocabulary file.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, "vocab.txt")
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer implementation found in Hugging Face's transformers library. It runs a WordPiece tokenization algorithm over SMILES strings using the SMILES tokenization regex developed by Schwaller et al.
Please see https://github.com/huggingface/transformers and https://github.com/rxn4chemistry/rxnfp for more details.
References
.. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David; Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3
Notes
This class requires Hugging Face's transformers and tokenizers libraries to be installed.
Constructs a SmilesTokenizer.
Parameters
vocab_file : str
    Path to a vocabulary file with one SMILES token per line. Default vocab file is found in deepchem/feat/tests/data/vocab.txt
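A round-trip sketch, assuming transformers is installed and a hypothetical vocab.txt containing the BERT special tokens plus the SMILES tokens used below:

# "vocab.txt" is a hypothetical vocabulary path.
tokenizer = SmilesTokenizer("vocab.txt")
ids = tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O")
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokenizer.convert_tokens_to_string(tokens))  # e.g. "[CLS] C C ( = O ) O ... [SEP]"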
Ancestors

BertTokenizer
Subclasses

CompoundsTokenizer
Instance variables
prop eos_token_id
@property
def eos_token_id(self):
    return self.added_tokens_encoder[self.eos]

prop vocab_list
@property
def vocab_list(self):
    return list(self.vocab.keys())

prop vocab_size
@property
def vocab_size(self):
    return len(self.vocab)
Methods
def add_padding_tokens(self, token_ids: List[int], length: int, right: bool = True) -> List[int]
def add_padding_tokens(self, token_ids: List[int], length: int, right: bool = True) -> List[int]:
    """
    Adds padding tokens to return a sequence of the requested length.
    By default padding tokens are added to the right of the sequence.

    Parameters
    ----------
    token_ids: list[int]
        List of tokenized input ids. Can be obtained using the encode or
        encode_plus methods.
    length: int
        Target length of the padded sequence.
    right: bool (True by default)
        Whether to pad on the right (True) or the left (False).

    Returns
    -------
    List[int]
        The input ids padded with pad_token_id up to the requested length.
    """
    padding = [self.pad_token_id] * (length - len(token_ids))
    if right:
        return token_ids + padding
    else:
        return padding + token_ids

Adds padding tokens to return a sequence of the requested length. By default padding tokens are added to the right of the sequence.
Parameters
token_ids : list[int]
    List of tokenized input ids. Can be obtained using the encode or encode_plus methods.
length : int
    Target length of the padded sequence.
right : bool (True by default)
    Whether to pad on the right (True) or the left (False).
Returns
List[int]
    The input ids padded with pad_token_id up to the requested length.
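For illustration (the ids are hypothetical, and pad_token_id is assumed to be 0):

ids = [12, 7, 13]
tokenizer.add_padding_tokens(ids, length=6)               # [12, 7, 13, 0, 0, 0]
tokenizer.add_padding_tokens(ids, length=6, right=False)  # [0, 0, 0, 12, 7, 13]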
def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int], token_ids_1: List[int]) -> List[int]
def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int], token_ids_1: List[int]) -> List[int]:
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

    Parameters
    ----------
    token_ids_0: List[int]
        List of ids for the first string sequence in the sequence pair (A).
    token_ids_1: List[int]
        List of ids for the second string sequence in the sequence pair (B).
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    return cls + token_ids_0 + sep + token_ids_1 + sep

Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
Parameters
token_ids_0 : List[int]
    List of ids for the first string sequence in the sequence pair (A).
token_ids_1 : List[int]
    List of ids for the second string sequence in the sequence pair (B).
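For illustration with hypothetical ids, assuming cls_token_id = 2 and sep_token_id = 3:

tokenizer.add_special_tokens_ids_sequence_pair([12, 7], [13])
# -> [2, 12, 7, 3, 13, 3], i.e. [CLS] A [SEP] B [SEP]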
def add_special_tokens_ids_single_sequence(self, token_ids: List[int])
def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    A BERT sequence has the following format: [CLS] X [SEP]

    Parameters
    ----------
    token_ids: list[int]
        List of tokenized input ids. Can be obtained using the encode or
        encode_plus methods.
    """
    return [self.cls_token_id] + token_ids + [self.sep_token_id]

Adds special tokens to a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP]
Parameters
token_ids : list[int]
    List of tokenized input ids. Can be obtained using the encode or encode_plus methods.
def add_special_tokens_single_sequence(self, tokens: List[str])
def add_special_tokens_single_sequence(self, tokens: List[str]):
    """
    Adds special tokens to a sequence for sequence classification tasks.
    A BERT sequence has the following format: [CLS] X [SEP]

    Parameters
    ----------
    tokens: List[str]
        List of tokens for a given string sequence.
    """
    return [self.cls_token] + tokens + [self.sep_token]

Adds special tokens to a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP]
Parameters
tokens : List[str]
    List of tokens for a given string sequence.
def convert_tokens_to_string(self, tokens: List[str])
def convert_tokens_to_string(self, tokens: List[str]):
    """Converts a sequence of tokens into a single string.

    Parameters
    ----------
    tokens: List[str]
        List of tokens for a given string sequence.

    Returns
    -------
    out_string: str
        Single string from combined tokens.
    """
    out_string: str = " ".join(tokens).replace(" ##", "").strip()
    return out_string

Converts a sequence of tokens into a single string.
Parameters
tokens : List[str]
    List of tokens for a given string sequence.
Returns
out_string : str
    Single string from combined tokens.
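A short sketch: tokens are joined with spaces, and WordPiece continuation markers ("##", assuming the vocabulary uses them) are merged into the preceding token.

tokenizer.convert_tokens_to_string(["C", "C", "(", "=", "O", ")", "O"])  # "C C ( = O ) O"
tokenizer.convert_tokens_to_string(["Br", "##c"])                        # "Brc"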
def save_vocabulary(self, vocab_path: str)
def save_vocabulary(self, vocab_path: str):  # -> tuple[str]: doctest issue raised with this return type annotation
    """
    Save the tokenizer vocabulary to a file.

    Parameters
    ----------
    vocab_path: str
        The directory in which to save the one-token-per-line vocabulary file.
        Default vocab file is found in deepchem/feat/tests/data/vocab.txt

    Returns
    -------
    vocab_file: Tuple(str)
        Paths to the files saved: a one-element tuple with the path to the
        written vocabulary file.
    """
    index = 0
    if os.path.isdir(vocab_path):
        vocab_file = os.path.join(vocab_path, "vocab.txt")
    else:
        vocab_file = vocab_path
    with open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
            if index != token_index:
                index = token_index
            writer.write(token + "\n")
            index += 1
    return (vocab_file,)

Save the tokenizer vocabulary to a file.
Parameters
vocab_path : str
    The directory in which to save the one-token-per-line vocabulary file. Default vocab file is found in deepchem/feat/tests/data/vocab.txt
Returns
vocab_file : Tuple(str)
    Paths to the files saved: a one-element tuple with the path to the written vocabulary file.
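A save/reload round-trip sketch (the directory is hypothetical and must already exist): passing a directory writes vocab.txt inside it, while passing a file path writes directly to that file.

(path,) = tokenizer.save_vocabulary("/tmp/vocab_dir")  # writes /tmp/vocab_dir/vocab.txt
# Holds when vocab indices are consecutive from 0, as produced by load_vocab.
assert load_vocab(path) == tokenizer.vocab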