Module molcrawl.config.paths
Define constants for path settings used throughout the project
Functions
def get_bert_model_output_path(domain, model_size)-
Expand source code
def get_bert_model_output_path(domain, model_size):
    """Get BERT model output path.

    Thin alias that forwards both arguments, unchanged, to
    ``get_bert_output_path``.

    Args:
        domain (str): domain name.
        model_size (str): model size.

    Returns:
        The value returned by ``get_bert_output_path(domain, model_size)``.
    """
    model_dir = get_bert_output_path(domain, model_size)
    return model_dir
def get_bert_output_path(domain, model_size)-
Expand source code
def get_bert_output_path(domain, model_size):
    """Build the output directory path of a BERT model.

    The path is composed as
    ``LEARNING_SOURCE_DIR/<domain>/BERT_OUTPUT_BASE_DIR/<domain>-<model_size>``.

    Args:
        domain (str): domain name ('protein_sequence', 'genome_sequence',
            'rna', 'compounds', 'molecule_nat_lang').
        model_size (str): model size ('small', 'medium', 'large').

    Returns:
        str: BERT output directory path.
    """
    # The leaf directory repeats the domain so runs stay identifiable by name.
    run_dir_name = f"{domain}-{model_size}"
    return os.path.join(
        LEARNING_SOURCE_DIR,
        domain,
        BERT_OUTPUT_BASE_DIR,
        run_dir_name,
    )
Args
domain (str): domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
model_size (str): model size ('small', 'medium', 'large')
Returns
str: BERT output directory path
def get_bert_tensorboard_path(domain, model_size)-
Expand source code
def get_bert_tensorboard_path(domain, model_size):
    """Get BERT TensorBoard output path.

    Delegates to ``get_bert_output_path``, so the TensorBoard path is the
    same directory as the BERT model output path for the given arguments.

    Args:
        domain (str): domain name.
        model_size (str): model size.

    Returns:
        The value returned by ``get_bert_output_path(domain, model_size)``.
    """
    tensorboard_dir = get_bert_output_path(domain, model_size)
    return tensorboard_dir
def get_custom_tokenizer_path(domain, model_type='bert')-
Expand source code
def get_custom_tokenizer_path(domain, model_type="bert"):
    """Build the directory path for custom tokenizer output.

    The path is composed as
    ``LEARNING_SOURCE_DIR/<domain>/custom_tokenizer_<model_type>``.

    Args:
        domain (str): domain name ('genome_sequence', 'rna', etc.).
        model_type (str): model type ('bert', 'rnaformer', 'dnabert2').

    Returns:
        str: custom tokenizer directory path.
    """
    tokenizer_dir_name = f"custom_tokenizer_{model_type}"
    return os.path.join(LEARNING_SOURCE_DIR, domain, tokenizer_dir_name)
Args
domain (str): domain name ('genome_sequence', 'rna', etc.)
model_type (str): model type ('bert', 'rnaformer', 'dnabert2')
Returns
str: custom tokenizer directory path
def get_dataset_path(dataset_type, relative_path='')-
Expand source code
def get_dataset_path(dataset_type, relative_path=""):
    """Resolve the path of a dataset, optionally to a location inside it.

    ``'molecule_nat_lang'`` resolves to
    ``PROJECT_ROOT/MOLECULE_NAT_LANG_DATASET_DIR``; every other
    ``dataset_type`` resolves to
    ``PROJECT_ROOT/GENOME_SEQUENCE_DIR/<dataset_type>``.

    Args:
        dataset_type (str): dataset type ('uniprot', 'refseq', 'cellxgene',
            'molecule_nat_lang').
        relative_path (str): relative path within the dataset. When empty,
            the dataset base path itself is returned.

    Returns:
        str: complete path.
    """
    if dataset_type == "molecule_nat_lang":
        base_parts = (PROJECT_ROOT, MOLECULE_NAT_LANG_DATASET_DIR)
    else:
        base_parts = (PROJECT_ROOT, GENOME_SEQUENCE_DIR, dataset_type)
    dataset_root = os.path.join(*base_parts)

    # Only join when a sub-path was given: joining with "" would append a
    # trailing separator to the base path.
    return os.path.join(dataset_root, relative_path) if relative_path else dataset_root
Args
dataset_type (str): dataset type ('uniprot', 'refseq', 'cellxgene', 'molecule_nat_lang')
relative_path (str): relative path within the dataset
Returns
str: complete path
def get_genome_tokenizer_path()-
Expand source code
def get_genome_tokenizer_path():
    """Get tokenizer path for genome sequence.

    Returns:
        str: Genome sequence tokenizer path, as returned by
        ``get_refseq_tokenizer_path``.
    """
    # Genome sequences use the tokenizer for RefSeq genome sequences.
    tokenizer_path = get_refseq_tokenizer_path()
    return tokenizer_path
Returns
str: genome sequence tokenizer path
def get_gpt2_model_output_path(domain, model_size)-
Expand source code
def get_gpt2_model_output_path(domain, model_size):
    """Get GPT-2 model output path.

    Thin alias that forwards both arguments, unchanged, to
    ``get_gpt2_output_path``.

    Args:
        domain (str): domain name.
        model_size (str): model size.

    Returns:
        The value returned by ``get_gpt2_output_path(domain, model_size)``.
    """
    model_dir = get_gpt2_output_path(domain, model_size)
    return model_dir
def get_gpt2_output_path(domain, model_size)-
Expand source code
def get_gpt2_output_path(domain, model_size):
    """Build the output directory path of a GPT-2 model.

    The path is composed as
    ``LEARNING_SOURCE_DIR/<domain>/GPT2_OUTPUT_BASE_DIR/<domain>-<size>``,
    where ``<size>`` is ``model_size`` except that ``'xl'`` is written as
    ``'ex-large'``.

    Args:
        domain (str): domain name ('protein_sequence', 'genome_sequence',
            'rna', 'compounds', 'molecule_nat_lang').
        model_size (str): model size ('small', 'medium', 'large', 'xl',
            'ex-large').

    Returns:
        str: GPT-2 output directory path.
    """
    # Standardize model_size: "xl" and "ex-large" share one directory name.
    normalized_size = "ex-large" if model_size == "xl" else model_size
    run_dir_name = f"{domain}-{normalized_size}"
    return os.path.join(
        LEARNING_SOURCE_DIR,
        domain,
        GPT2_OUTPUT_BASE_DIR,
        run_dir_name,
    )
Args
domain (str): domain name ('protein_sequence', 'genome_sequence', 'rna', 'compounds', 'molecule_nat_lang')
model_size (str): model size ('small', 'medium', 'large', 'xl', 'ex-large')
Returns
str: GPT-2 output directory path
def get_gpt2_tensorboard_path(domain, model_size)-
Expand source code
def get_gpt2_tensorboard_path(domain, model_size):
    """Get GPT-2 TensorBoard output path.

    Delegates to ``get_gpt2_output_path``, so the TensorBoard path is the
    same directory as the GPT-2 model output path for the given arguments.

    Args:
        domain (str): domain name.
        model_size (str): model size.

    Returns:
        The value returned by ``get_gpt2_output_path(domain, model_size)``.
    """
    tensorboard_dir = get_gpt2_output_path(domain, model_size)
    return tensorboard_dir
def get_refseq_tokenizer_path()-
Expand source code
def get_refseq_tokenizer_path():
    """Get the path of the RefSeq tokenizer model file.

    Returns:
        str: ``PROJECT_ROOT/GENOME_SEQUENCE_DIR/spm_tokenizer.model``.
    """
    tokenizer_filename = "spm_tokenizer.model"
    return os.path.join(PROJECT_ROOT, GENOME_SEQUENCE_DIR, tokenizer_filename)