Module molcrawl.rna.utils.compute_stats
Functions
def compute_stats(output_dir)-
Expand source code
def compute_stats(output_dir):
    """Compute gene-count and array-length statistics for a Parquet dataset.

    Reads the Parquet files under ``<output_dir>/parquet_files`` with Dask,
    runs ``process_partition`` on every partition, and merges the
    per-partition results into one gene counter and one list of array
    lengths. Gene IDs are then mapped back to gene names with the vocabulary
    stored in ``<output_dir>/gene_vocab.json``.

    Two files are written to ``<output_dir>/stats``:

    * ``gene_counts.json`` -- gene name -> number of occurrences.
    * ``array_shapes.pkl`` -- pickled list holding the length of each
      ``genes`` array.

    Args:
        output_dir: Directory containing ``parquet_files`` and
            ``gene_vocab.json``; the ``stats`` directory is created inside it.

    Returns:
        None. Results are written to disk.
    """
    # Imported lazily so that importing this module does not require dask.
    import dask.dataframe as dd
    from dask.diagnostics import ProgressBar

    output_dir = Path(output_dir)

    # Load Parquet files with Dask.
    ddf = dd.read_parquet(output_dir / "parquet_files")

    # map_partitions lets every partition be processed in parallel.
    with ProgressBar():
        partition_results = ddf.map_partitions(process_partition).compute()

    # Aggregate the per-partition results.
    genes_counter = Counter()
    array_shapes = []
    for local_counter, local_shapes in partition_results:
        genes_counter.update(local_counter)
        array_shapes.extend(local_shapes)

    with open(output_dir / "gene_vocab.json", "r", encoding="utf-8") as file:
        vocab = json.load(file)

    # Replace numerical gene IDs with gene names before serializing.
    gene_counts = reverse_remap_genes_dict(genes_counter, vocab)

    save_dir = output_dir / "stats"
    save_dir.mkdir(parents=True, exist_ok=True)

    with open(save_dir / "gene_counts.json", "w", encoding="utf-8") as file:
        json.dump(gene_counts, file, indent=4)

    with open(save_dir / "array_shapes.pkl", "wb") as handle:
        pickle.dump(array_shapes, handle)
Expand source code
def process_partition(partition):
    """Count gene occurrences and collect array lengths for one partition.

    Args:
        partition: Mapping-like object (e.g. a DataFrame partition) whose
            ``"genes"`` entry is an iterable of gene-ID sequences.

    Returns:
        tuple: ``(local_counter, local_shapes)`` where ``local_counter`` is a
        ``Counter`` of gene IDs over all sequences in the partition and
        ``local_shapes`` is a list with the length of each sequence, in
        iteration order.
    """
    local_counter = Counter()
    local_shapes = []

    for genes_array in partition["genes"]:
        # Update the counter and record this array's length.
        local_counter.update(genes_array)
        local_shapes.append(len(genes_array))

    return local_counter, local_shapes
Expand source code
def reverse_remap_genes_dict(genes_dict, vocab_dict):
    """
    Remap the numerical keys of a dictionary back to gene names using a given vocabulary dictionary.

    Keys of ``genes_dict`` that have no entry in the vocabulary are dropped.

    Args:
        genes_dict (dict): The original dictionary with numerical values as keys.
        vocab_dict (dict): A dictionary where gene names are mapped to numerical values.

    Returns:
        dict: A new dictionary with the remapped keys (gene names).
    """
    # Invert the vocabulary: numerical value -> gene name.
    reverse_vocab_dict = {gene_id: gene_name for gene_name, gene_id in vocab_dict.items()}

    # Keep only the IDs that are present in the vocabulary.
    return {
        reverse_vocab_dict[gene_id]: count
        for gene_id, count in genes_dict.items()
        if gene_id in reverse_vocab_dict
    }
Args
genes_dict (dict): The original dictionary, keyed by numerical gene IDs.
vocab_dict (dict): A dictionary mapping gene names to numerical values.
Returns
dict: A new dictionary with the remapped keys (gene names).