Module molcrawl.protein_sequence.utils.configs
Classes
class ProteinSequenceConfig (data_preparation: UniProtPreparationConfig = <factory>)
Expand source code
@dataclass
class ProteinSequenceConfig(Config):
    """Top-level configuration for the protein-sequence pipeline.

    Holds the UniProt data-preparation settings; accepts either an
    already-built ``UniProtPreparationConfig`` or a plain mapping
    (e.g. parsed from a config file) which is coerced in
    ``__post_init__``.
    """

    data_preparation: UniProtPreparationConfig = field(
        default_factory=UniProtPreparationConfig
    )

    def __post_init__(self):
        # Coerce a raw mapping into the typed config so downstream code
        # can rely on attribute access.
        if not isinstance(self.data_preparation, UniProtPreparationConfig):  # type: ignore[misc]
            self.data_preparation = UniProtPreparationConfig(**self.data_preparation)  # type: ignore[arg-type]
Ancestors
Instance variables
var data_preparation : UniProtPreparationConfig
Inherited members
class UniProtPreparationConfig (dataset: str = 'UniRef50',
output_dir: str = 'learning_source_dummy/protein_sequence',
use_md5: bool = False,
num_worker: int = 4,
max_lines_per_file: int = 1000000)
Expand source code
@dataclass
class UniProtPreparationConfig:
    """Settings for downloading and preparing a UniProt dataset."""

    # Which UniProt dataset to download; must be one of:
    # "UniprotKB_reviewed", "UniprotKB_unreviewed", "UniRef100",
    # "UniRef90", "UniRef50", "UniParc".
    dataset: str = "UniRef50"

    # Output directory where the preparation will be made.
    output_dir: str = PROTEIN_SEQUENCE_DIR

    # If True, use md5 to decide whether a file needs to be downloaded
    # again; md5 checks are very time-consuming for large files.
    # Otherwise we only check whether the path exists.
    use_md5: bool = False

    # Special case for the UniParc download: number of workers to use.
    num_worker: int = 4

    # Number of sequences per file for raw files and parquet output.
    # It also reflects the number of sequences held in memory while
    # those files are processed.
    max_lines_per_file: int = 10**6
Instance variables
var dataset : str
var max_lines_per_file : int
var num_worker : int
var output_dir : str
var use_md5 : bool