Module molcrawl.config.paths

Defines path constants and helper functions for building the paths used throughout the project.

Functions

def get_bert_model_output_path(domain, model_size)
Expand source code
def get_bert_model_output_path(domain, model_size):
    """
    Return the BERT model output path for a domain and model size.

    This is a thin alias: both arguments are forwarded unchanged to
    ``get_bert_output_path`` and its result is returned as-is.

    Args:
        domain (str): domain name
        model_size (str): model size

    Returns:
        str: BERT output directory path
    """
    model_output_dir = get_bert_output_path(domain, model_size)
    return model_output_dir

Get BERT model output path

def get_bert_output_path(domain, model_size)
Expand source code
def get_bert_output_path(domain, model_size):
    """
    Build the output directory path of a BERT model.

    The result has the layout
    ``LEARNING_SOURCE_DIR/<domain>/BERT_OUTPUT_BASE_DIR/<domain>-<model_size>``.

    Args:
        domain (str): domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
        model_size (str): Model size ('small', 'medium', 'large')

    Returns:
        str: BERT output directory path
    """
    # The leaf directory repeats the domain so each run directory is self-describing.
    path_parts = (
        LEARNING_SOURCE_DIR,
        domain,
        BERT_OUTPUT_BASE_DIR,
        f"{domain}-{model_size}",
    )
    return os.path.join(*path_parts)

Function to get output path of BERT model

Args

domain : str
domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
model_size : str
Model size ('small', 'medium', 'large')

Returns

str
BERT output directory path
def get_bert_tensorboard_path(domain, model_size)
Expand source code
def get_bert_tensorboard_path(domain, model_size):
    """
    Return the BERT TensorBoard output path for a domain and model size.

    The value is exactly what ``get_bert_output_path`` returns for the same
    arguments, i.e. the same directory as the model output.

    Args:
        domain (str): domain name
        model_size (str): model size

    Returns:
        str: BERT TensorBoard output directory path
    """
    tensorboard_dir = get_bert_output_path(domain, model_size)
    return tensorboard_dir

Get BERT TensorBoard output path

def get_custom_tokenizer_path(domain, model_type='bert')
Expand source code
def get_custom_tokenizer_path(domain, model_type="bert"):
    """
    Build the directory path where a custom tokenizer is written.

    The result has the layout
    ``LEARNING_SOURCE_DIR/<domain>/custom_tokenizer_<model_type>``.

    Args:
        domain (str): domain name ('genome_sequence', 'rna', etc.)
        model_type (str): model type ('bert', 'rnaformer', 'dnabert2')

    Returns:
        str: custom tokenizer directory path
    """
    tokenizer_dir_name = f"custom_tokenizer_{model_type}"
    domain_root = os.path.join(LEARNING_SOURCE_DIR, domain)
    return os.path.join(domain_root, tokenizer_dir_name)

Get path for custom tokenizer output

Args

domain : str
domain name ('genome_sequence', 'rna', etc.)
model_type : str
model type ('bert', 'rnaformer', 'dnabert2')

Returns

str
custom tokenizer directory path
def get_dataset_path(dataset_type, relative_path='')
Expand source code
def get_dataset_path(dataset_type, relative_path=""):
    """
    Resolve the path of a dataset, optionally extended by a sub-path.

    ``'molecule_nat_lang'`` maps to ``PROJECT_ROOT/MOLECULE_NAT_LANG_DATASET_DIR``.
    Every other ``dataset_type`` maps to
    ``PROJECT_ROOT/GENOME_SEQUENCE_DIR/<dataset_type>``.

    Args:
        dataset_type (str): dataset type ('uniprot', 'refseq', 'cellxgene', 'molecule_nat_lang')
        relative_path (str): relative path within the dataset; when empty,
            the dataset base directory itself is returned

    Returns:
        str: complete path
    """
    root_parts = (
        (PROJECT_ROOT, MOLECULE_NAT_LANG_DATASET_DIR)
        if dataset_type == "molecule_nat_lang"
        else (PROJECT_ROOT, GENOME_SEQUENCE_DIR, dataset_type)
    )
    dataset_root = os.path.join(*root_parts)

    # Guard clause: nothing to append, so hand back the base directory.
    if not relative_path:
        return dataset_root
    return os.path.join(dataset_root, relative_path)

Function to get dataset path

Args

dataset_type : str
dataset type ('uniprot', 'refseq', 'cellxgene', 'molecule_nat_lang')
relative_path : str
relative path within the dataset

Returns

str
complete path
def get_genome_tokenizer_path()
Expand source code
def get_genome_tokenizer_path():
    """
    Return the tokenizer path used for genome sequences.

    Genome sequences use the RefSeq tokenizer, so this simply returns
    the result of ``get_refseq_tokenizer_path()``.

    Returns:
        str: Genome sequence tokenizer path
    """
    genome_tokenizer = get_refseq_tokenizer_path()
    return genome_tokenizer

Get tokenizer path for genome sequence

Returns

str
Genome sequence tokenizer path (string)
def get_gpt2_model_output_path(domain, model_size)
Expand source code
def get_gpt2_model_output_path(domain, model_size):
    """
    Return the GPT-2 model output path for a domain and model size.

    This is a thin alias: both arguments are forwarded unchanged to
    ``get_gpt2_output_path`` and its result is returned as-is.

    Args:
        domain (str): domain name
        model_size (str): model size

    Returns:
        str: GPT-2 output directory path
    """
    model_output_dir = get_gpt2_output_path(domain, model_size)
    return model_output_dir

Get GPT-2 model output path

def get_gpt2_output_path(domain, model_size)
Expand source code
def get_gpt2_output_path(domain, model_size):
    """
    Build the output directory path of a GPT-2 model.

    The result has the layout
    ``LEARNING_SOURCE_DIR/<domain>/GPT2_OUTPUT_BASE_DIR/<domain>-<size>``,
    where ``<size>`` is ``model_size`` except that ``'xl'`` is written as
    ``'ex-large'``.

    Args:
        domain (str): domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
        model_size (str): Model size ('small', 'medium', 'large', 'xl', 'ex-large')

    Returns:
        str: GPT-2 output directory path
    """
    # 'xl' and 'ex-large' share one directory name; all other sizes pass through.
    size_suffix = "ex-large" if model_size == "xl" else model_size

    path_parts = (
        LEARNING_SOURCE_DIR,
        domain,
        GPT2_OUTPUT_BASE_DIR,
        f"{domain}-{size_suffix}",
    )
    return os.path.join(*path_parts)

Function to get output path of GPT-2 model

Args

domain : str
domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
model_size : str
Model size ('small', 'medium', 'large', 'xl', 'ex-large')

Returns

str
GPT-2 output directory path
def get_gpt2_tensorboard_path(domain, model_size)
Expand source code
def get_gpt2_tensorboard_path(domain, model_size):
    """
    Return the GPT-2 TensorBoard output path for a domain and model size.

    The value is exactly what ``get_gpt2_output_path`` returns for the same
    arguments, i.e. the same directory as the model output.

    Args:
        domain (str): domain name
        model_size (str): model size

    Returns:
        str: GPT-2 TensorBoard output directory path
    """
    tensorboard_dir = get_gpt2_output_path(domain, model_size)
    return tensorboard_dir

Get GPT-2 TensorBoard output path

def get_refseq_tokenizer_path()
Expand source code
def get_refseq_tokenizer_path():
    """
    Return the path of the RefSeq tokenizer model file.

    The file is ``spm_tokenizer.model`` located directly under
    ``PROJECT_ROOT/GENOME_SEQUENCE_DIR``.

    Returns:
        str: RefSeq tokenizer model file path
    """
    genome_root = os.path.join(PROJECT_ROOT, GENOME_SEQUENCE_DIR)
    return os.path.join(genome_root, "spm_tokenizer.model")