Module molcrawl.molecule_nat_lang.utils.bert_tokenizer

BERT-compatible tokenizer wrapper for Molecule Natural Language. Wraps MoleculeNatLangTokenizer in a format compatible with BERT training.

Functions

def create_bert_molecule_nat_lang_tokenizer(**kwargs) -> BertMoleculeNlTokenizer
def create_bert_molecule_nat_lang_tokenizer(**kwargs) -> BertMoleculeNlTokenizer:
    """
    Create a BERT-compatible molecule natural language tokenizer

    Returns:
        BertMoleculeNlTokenizer instance
    """
    return BertMoleculeNlTokenizer(**kwargs)

Create a BERT-compatible molecule natural language tokenizer

Returns

BertMoleculeNlTokenizer instance
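
A minimal usage sketch. It assumes MoleculeNatLangTokenizer can be constructed with default arguments; any keyword arguments are forwarded to it unchanged, so the accepted kwargs depend on that class:

from molcrawl.molecule_nat_lang.utils.bert_tokenizer import (
    create_bert_molecule_nat_lang_tokenizer,
)

tokenizer = create_bert_molecule_nat_lang_tokenizer()
print(len(tokenizer))          # vocabulary size of the wrapped tokenizer
print(tokenizer.cls_token_id)  # falls back to 101 (bert-base-uncased) if unset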

Classes

class BertMoleculeNlTokenizer (**kwargs)
class BertMoleculeNlTokenizer:
    """
    MoleculeNatLangTokenizer wrapper for BERT compatibility

    This class wraps the MoleculeNatLangTokenizer to make it compatible
    with BERT training and testing pipelines
    """

    # Override model input names to use standard BERT format
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, **kwargs):
        self.tokenizer = MoleculeNatLangTokenizer(**kwargs)

        # BERT compatibility attributes
        self.pad_token = self.tokenizer.tokenizer.pad_token
        self.unk_token = self.tokenizer.tokenizer.unk_token
        self.cls_token = getattr(self.tokenizer.tokenizer, "cls_token", "[CLS]")
        self.sep_token = getattr(self.tokenizer.tokenizer, "sep_token", "[SEP]")
        self.mask_token = getattr(self.tokenizer.tokenizer, "mask_token", "[MASK]")

        self.pad_token_id = self.tokenizer.tokenizer.pad_token_id
        self.unk_token_id = self.tokenizer.tokenizer.unk_token_id
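        # Fallback IDs below are the standard bert-base-uncased vocabulary
        # IDs for [CLS], [SEP], and [MASK]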
        self.cls_token_id = getattr(self.tokenizer.tokenizer, "cls_token_id", 101)
        self.sep_token_id = getattr(self.tokenizer.tokenizer, "sep_token_id", 102)
        self.mask_token_id = getattr(self.tokenizer.tokenizer, "mask_token_id", 103)

    def get_vocab(self):
        """Get vocabulary dictionary"""
        return self.tokenizer.tokenizer.get_vocab()

    def __len__(self):
        """Return vocabulary size"""
        return len(self.tokenizer.tokenizer)

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text using the underlying tokenizer
        """
        return self.tokenizer.tokenizer.tokenize(text)

    def encode(
        self,
        text: Union[str, List[str]],
        add_special_tokens: bool = True,
        max_length: Optional[int] = None,
        padding: bool = False,
        truncation: bool = False,
        return_tensors: Optional[str] = None,
    ) -> Union[List[int], torch.Tensor]:
        """
        Encode text to token IDs
        """
        return self.tokenizer.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            return_tensors=return_tensors,
        )

    def __call__(
        self,
        text: Union[str, List[str]],
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Tokenize and encode text using the underlying tokenizer
        """
        return self.tokenizer.tokenizer(
            text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
            **kwargs,
        )

    def decode(
        self,
        token_ids: Union[List[int], torch.Tensor],
        skip_special_tokens: bool = True,
    ) -> str:
        """
        Decode token IDs back to text
        """
        return self.tokenizer.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Convert tokens to string
        """
        return self.tokenizer.tokenizer.convert_tokens_to_string(tokens)

    def pad(self, encoded_inputs, **kwargs):
        """
        Pad encoded inputs
        """
        return self.tokenizer.tokenizer.pad(encoded_inputs, **kwargs)

MoleculeNatLangTokenizer wrapper for BERT compatibility

This class wraps the MoleculeNatLangTokenizer to make it compatible with BERT training and testing pipelines
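
A sketch of a typical call, assuming the wrapped tokenizer follows the Hugging Face interface; the example texts are illustrative:

from molcrawl.molecule_nat_lang.utils.bert_tokenizer import BertMoleculeNlTokenizer

tokenizer = BertMoleculeNlTokenizer()
batch = tokenizer(
    ["caffeine is a purine alkaloid", "aspirin is an analgesic"],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
# model_input_names lists the keys a BERT-style model consumes
inputs = {name: batch[name] for name in tokenizer.model_input_names}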

Class variables

var model_input_names

Methods

def convert_tokens_to_string(self, tokens: List[str]) -> str
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Convert tokens to string
    """
    return self.tokenizer.tokenizer.convert_tokens_to_string(tokens)

Convert tokens to string
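
A brief round-trip sketch, with tokenizer constructed as in the class example above; the subword split shown is hypothetical and depends on the wrapped vocabulary:

tokens = tokenizer.tokenize("caffeine")            # e.g. ["ca", "##ffe", "##ine"]
text = tokenizer.convert_tokens_to_string(tokens)  # "caffeine"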

def decode(self,
token_ids: Union[List[int], torch.Tensor],
skip_special_tokens: bool = True) -> str
def decode(
    self,
    token_ids: Union[List[int], torch.Tensor],
    skip_special_tokens: bool = True,
) -> str:
    """
    Decode token IDs back to text
    """
    return self.tokenizer.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

Decode token IDs back to text
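
A decoding sketch, with tokenizer constructed as above; the decoded strings assume standard BERT special tokens:

ids = tokenizer.encode("caffeine")
print(tokenizer.decode(ids))                             # "caffeine"
print(tokenizer.decode(ids, skip_special_tokens=False))  # "[CLS] caffeine [SEP]"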

def encode(self,
text: Union[str, List[str]],
add_special_tokens: bool = True,
max_length: Optional[int] = None,
padding: bool = False,
truncation: bool = False,
return_tensors: Optional[str] = None) -> Union[List[int], torch.Tensor]
def encode(
    self,
    text: Union[str, List[str]],
    add_special_tokens: bool = True,
    max_length: Optional[int] = None,
    padding: bool = False,
    truncation: bool = False,
    return_tensors: Optional[str] = None,
) -> Union[List[int], torch.Tensor]:
    """
    Encode text to token IDs
    """
    return self.tokenizer.tokenizer.encode(
        text,
        add_special_tokens=add_special_tokens,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        return_tensors=return_tensors,
    )

Encode text to token IDs
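
A sketch of the two return modes, with tokenizer constructed as above; shapes assume Hugging Face encode semantics in the wrapped tokenizer:

ids = tokenizer.encode("aspirin", max_length=16, truncation=True)  # List[int]
tensor = tokenizer.encode("aspirin", return_tensors="pt")          # torch.Tensor, shape (1, seq_len)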

def get_vocab(self)
def get_vocab(self):
    """Get vocabulary dictionary"""
    return self.tokenizer.tokenizer.get_vocab()

Get vocabulary dictionary
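
A quick sketch of inspecting the vocabulary, with tokenizer constructed as above:

vocab = tokenizer.get_vocab()  # Dict[str, int], token -> ID
assert vocab[tokenizer.pad_token] == tokenizer.pad_token_id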

def pad(self, encoded_inputs, **kwargs)
def pad(self, encoded_inputs, **kwargs):
    """
    Pad encoded inputs
    """
    return self.tokenizer.tokenizer.pad(encoded_inputs, **kwargs)

Pad encoded inputs
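
A batching sketch, assuming the wrapped tokenizer is a Hugging Face tokenizer whose pad accepts a list of single-example encodings; tokenizer is constructed as above and the texts are illustrative:

encoded = [tokenizer(text) for text in ["caffeine", "acetylsalicylic acid"]]
batch = tokenizer.pad(encoded, padding=True, return_tensors="pt")
# batch["input_ids"] and batch["attention_mask"] now share one padded length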

def tokenize(self, text: str) -> List[str]
def tokenize(self, text: str) -> List[str]:
    """
    Tokenize text using the underlying tokenizer
    """
    return self.tokenizer.tokenizer.tokenize(text)

Tokenize text using the underlying tokenizer
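
A short sketch, with tokenizer constructed as above; the pieces shown are hypothetical WordPiece output, and note that tokenize adds no [CLS]/[SEP] tokens:

tokens = tokenizer.tokenize("benzene ring")  # e.g. ["benz", "##ene", "ring"]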