Module molcrawl.rna.dataset.tokenization

Functions

def tokenize(output_dir)
    Tokenize loom files under output_dir into parquet files and preload them as a Hugging Face dataset.
def tokenize(output_dir):
    """Tokenize loom files into parquet files of token sequences.

    For every entry in ``<output_dir>/loom_dir``, tokenize it with
    ``TranscriptomeTokenizer.tokenize_loom`` and write a parquet file with
    columns ``token`` (the token sequence) and ``token_count`` (its length)
    to ``<output_dir>/parquet_files``. Files that fail to tokenize are
    reported and skipped, so one bad file does not abort the batch.

    Finally, the parquet directory is loaded once via Hugging Face
    ``datasets`` so the dataset cache under ``<output_dir>/hf_cache`` is
    pre-populated (this step may be commented out if preloading is not
    wanted).

    Parameters
    ----------
    output_dir : str or pathlib.Path
        Root directory that contains ``loom_dir`` with the input loom files;
        output subdirectories are created beneath it.
    """
    import pandas as pd
    from datasets import load_dataset

    # NOTE(review): tokenizer configuration is all defaults here — presumably
    # a Geneformer-style transcriptome tokenizer; confirm against its import.
    tokenizer = TranscriptomeTokenizer()

    loom_outdir = Path(output_dir) / "loom_dir"
    parquet_outdir = Path(output_dir) / "parquet_files"

    parquet_outdir.mkdir(exist_ok=True, parents=True)
    # iterdir() lists every entry; assumes loom_dir holds only loom files.
    paths = list(loom_outdir.iterdir())

    for path in tqdm(paths):
        parquet_path = parquet_outdir / path.with_suffix(".parquet").name
        try:
            tokens, _ = tokenizer.tokenize_loom(loom_file_path=path)
            # One (sequence, length) row per tokenized cell/line; a
            # comprehension replaces the original manual append loop.
            rows = [(seq, len(seq)) for seq in tokens]

            df = pd.DataFrame(rows, columns=["token", "token_count"])
            df.to_parquet(parquet_path, index=False)

        except Exception as e:
            # Best-effort batch conversion: report the failing file and
            # continue with the remaining ones.
            print(f"Error with {path}: {e}")

    # Read dataset once with Hugging Face to warm the cache; can be
    # commented out.
    cache_dir = Path(output_dir) / "hf_cache"
    load_dataset("parquet", data_dir=str(parquet_outdir), cache_dir=str(cache_dir))