Module molcrawl.genome_sequence.utils.config
Classes
class GenomeSequenceConfig (data_preparation: RefSeqPreparationConfig = <factory>)
Expand source code
@dataclass
class GenomeSequenceConfig(Config):
    data_preparation: RefSeqPreparationConfig = field(default_factory=RefSeqPreparationConfig)

    def __post_init__(self):
        if not isinstance(self.data_preparation, RefSeqPreparationConfig):  # type: ignore[misc]
            self.data_preparation = RefSeqPreparationConfig(**self.data_preparation)  # type: ignore[arg-type]

GenomeSequenceConfig(data_preparation: molcrawl.genome_sequence.utils.config.RefSeqPreparationConfig =
<factory>)
Ancestors
Instance variables
var data_preparation : RefSeqPreparationConfig
Inherited members
class RefSeqPreparationConfig (output_dir: str = 'learning_source_dummy/genome_sequence',
path_species: str = 'assets/genome_species_list/filtered_species_refseq',
num_worker: int = 16,
max_lines_per_file: int = 10000,
vocab_size: int = 4096,
input_sentence_size: int = 700000,
species_timeout: int = 1800,
max_retries: int = 2,
num_proc_parquet: int | None = None,
parquet_batch_size: int | None = None,
local_base_dir: str | None = None)
Expand source code
@dataclass
class RefSeqPreparationConfig:
    # Output directory where the preparation will be made
    output_dir: str = GENOME_SEQUENCE_DIR

    # Path to a directory containing one file per species to download from refseq (see assets/genome_species_list/species for example)
    # Possible groups are archaea, bacteria, fungi, invertebrate, metagenomes, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral.
    path_species: str = "assets/genome_species_list/filtered_species_refseq"

    # Num of parallel worker to use, note that for download the worker are capped to 3
    num_worker: int = 16

    max_lines_per_file: int = 10000

    # Size of the vocabulary of the BPE tokenizer
    vocab_size: int = 4096

    # Number of genome sequence to use to train the BPE tokenizer.
    # We will sample input_sentence_size randomly from input_sentence_size * 2 number of sequence.
    # So input_sentence_size * 2 / max_lines_per_file will be randomly selected for the BPE training.
    input_sentence_size: int = 700000

    # Per-species download timeout in seconds (default: 30 min)
    # If a species download takes longer than this, the child process is killed.
    species_timeout: int = 30 * 60

    # Maximum number of retries per species before giving up
    max_retries: int = 2

    # Added: Speed-up options (optional)
    num_proc_parquet: Optional[int] = None
    parquet_batch_size: Optional[int] = None
    local_base_dir: Optional[str] = None

RefSeqPreparationConfig(output_dir: str = 'learning_source_dummy/genome_sequence', path_species: str = 'assets/genome_species_list/filtered_species_refseq', num_worker: int = 16, max_lines_per_file: int = 10000, vocab_size: int = 4096, input_sentence_size: int = 700000, species_timeout: int = 1800, max_retries: int = 2, num_proc_parquet: Optional[int] = None, parquet_batch_size: Optional[int] = None, local_base_dir: Optional[str] = None)
Instance variables
var input_sentence_size : int
var local_base_dir : str | None
var max_lines_per_file : int
var max_retries : int
var num_proc_parquet : int | None
var num_worker : int
var output_dir : str
var parquet_batch_size : int | None
var path_species : str
var species_timeout : int
var vocab_size : int