Module molcrawl.utils.model_evaluator

Base evaluator class for all model evaluation classes.

This module provides a common base class that all evaluator classes should inherit from, ensuring consistent interface and shared functionality across different model evaluation tasks.

Classes

class ModelEvaluator (model_path: str, tokenizer_path: str, device: str = 'cuda', **kwargs)
Expand source code
class ModelEvaluator(ABC):
    """
    Abstract base class shared by every model evaluator.

    Subclasses provide the model- and tokenizer-loading logic through the
    abstract hooks below; this base handles path validation, device
    management, and basic introspection.

    Attributes:
        model_path (str): Path to the trained model.
        tokenizer_path (str): Path to the tokenizer (may be None when a
            built-in tokenizer is used).
        device (str): Device the model runs on (e.g., 'cuda', 'cpu').
        model: The loaded model instance (produced by ``_init_model``).
        tokenizer: The loaded tokenizer instance (produced by ``_init_tokenizer``).
        vocab_size (Optional[int]): Vocabulary size; None until a subclass sets it.
    """

    def __init__(self, model_path: str, tokenizer_path: str, device: str = "cuda", **kwargs):
        """
        Set up common evaluator state and invoke the subclass loading hooks.

        Args:
            model_path (str): Path to the trained model.
            tokenizer_path (str): Path to the tokenizer.
            device (str): Device to run the model on.
            **kwargs: Extra options consumed by specific evaluator subclasses.
        """
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = device

        # Fail fast on bad paths before any heavy loading starts.
        self._validate_paths()

        # Pre-declare attributes so they exist even if a hook fails midway.
        self.tokenizer = None
        self.model = None
        self.vocab_size: Optional[int] = None

        # Subclass hooks construct and return the actual instances.
        self.tokenizer = self._init_tokenizer()
        self.model = self._init_model()

        logger.info(f"Initialized {self.__class__.__name__} with model: {model_path}")

    def _validate_paths(self):
        """Raise FileNotFoundError when a configured path does not exist on disk."""
        if not os.path.exists(self.model_path):
            raise FileNotFoundError(f"Model path not found: {self.model_path}")

        if self.tokenizer_path is None:
            # Built-in tokenizers (e.g., protein_sequence) have no on-disk path.
            return
        if not os.path.exists(self.tokenizer_path):
            raise FileNotFoundError(f"Tokenizer path not found: {self.tokenizer_path}")

    @abstractmethod
    def _init_tokenizer(self):
        """
        Build and return the tokenizer for this evaluator.

        Each concrete evaluator implements this to load and configure the
        tokenizer appropriate for its model type.
        """
        pass

    @abstractmethod
    def _init_model(self):
        """
        Build, load, and return the model for this evaluator.

        Each concrete evaluator implements this to load and configure the
        model it evaluates.
        """
        pass

    @abstractmethod
    def encode_sequence(self, sequence: str) -> Union[list, torch.Tensor]:
        """
        Convert an input sequence into the representation the model consumes.

        Args:
            sequence (str): Input sequence to encode.

        Returns:
            The encoded form of ``sequence``.
        """
        pass

    def set_device(self, device: str):
        """
        Switch the evaluation device, relocating the model if one is loaded.

        Args:
            device (str): Target device (e.g., 'cuda', 'cpu').
        """
        previous = self.device
        self.device = device

        if self.model is None:
            return
        self.model = self.model.to(device)
        logger.info(f"Moved model from {previous} to {device}")

    def get_model_info(self) -> Dict[str, Any]:
        """
        Collect a summary of the evaluator's configuration and loaded model.

        Returns:
            Dictionary describing paths, device, vocab size, class name, and
            (when available) the model's config namespace.
        """
        details: Dict[str, Any] = {}
        details["model_path"] = self.model_path
        details["tokenizer_path"] = self.tokenizer_path
        details["device"] = self.device
        details["vocab_size"] = self.vocab_size
        details["model_class"] = self.__class__.__name__

        # Only transformers-style models expose a .config attribute.
        if hasattr(self.model, "config"):
            details["model_config"] = vars(self.model.config)

        return details

    def __repr__(self) -> str:
        """Concise, eval-unfriendly summary for logs and debugging."""
        return f"{self.__class__.__name__}(model_path='{self.model_path}', device='{self.device}')"

Base class for all model evaluators.

This abstract base class defines the common interface and shared functionality that all specific evaluator classes should implement.

Attributes

model_path : str
Path to the trained model
tokenizer_path : str
Path to the tokenizer
device : str
Device to run the model on (e.g., 'cuda', 'cpu')
model
The loaded model instance
tokenizer
The loaded tokenizer instance
vocab_size : Optional[int]
Size of the vocabulary; None until a concrete evaluator sets it

Initialize the base evaluator.

Args

model_path : str
Path to the trained model
tokenizer_path : str
Path to the tokenizer
device : str
Device to run the model on
**kwargs
Additional keyword arguments for specific evaluators

Ancestors

  • abc.ABC

Methods

def encode_sequence(self, sequence: str) ‑> Union[list, torch.Tensor]
Expand source code
@abstractmethod
def encode_sequence(self, sequence: str) -> Union[list, torch.Tensor]:
    """
    Convert an input sequence into the representation the model consumes.

    Args:
        sequence (str): Input sequence to encode.

    Returns:
        The encoded form of ``sequence``.
    """
    ...

Encode input sequence to model input format.

Args

sequence : str
Input sequence to encode

Returns

Encoded representation of the input sequence

def get_model_info(self) ‑> Dict[str, Any]
Expand source code
def get_model_info(self) -> Dict[str, Any]:
    """
    Collect a summary of the evaluator's configuration and loaded model.

    Returns:
        Dictionary describing paths, device, vocab size, class name, and
        (when available) the model's config namespace.
    """
    details: Dict[str, Any] = {}
    details["model_path"] = self.model_path
    details["tokenizer_path"] = self.tokenizer_path
    details["device"] = self.device
    details["vocab_size"] = self.vocab_size
    details["model_class"] = self.__class__.__name__

    # Only transformers-style models expose a .config attribute.
    if hasattr(self.model, "config"):
        details["model_config"] = vars(self.model.config)

    return details

Get information about the loaded model.

Returns

Dictionary containing model information

def set_device(self, device: str)
Expand source code
def set_device(self, device: str):
    """
    Switch the evaluation device, relocating the model if one is loaded.

    Args:
        device (str): Target device (e.g., 'cuda', 'cpu').
    """
    previous = self.device
    self.device = device

    if self.model is None:
        return
    self.model = self.model.to(device)
    logger.info(f"Moved model from {previous} to {device}")

Change the device for model evaluation.

Args

device : str
New device to use (e.g., 'cuda', 'cpu')