def tokenize_batch_dataset(output_dir, context_length, number_sample):
    from functools import partial
    from pathlib import Path

    from datasets import DatasetDict, load_dataset
    import sentencepiece as spm

    # Load the parquet files written by the earlier pipeline step, shuffle the rows,
    # and keep only number_sample of them.
    data = (
        load_dataset(
            "parquet",
            data_files=[str(Path(output_dir) / "parquet_files")],
            cache_dir=str(Path(output_dir) / "hf_cache"),
            split="train",
        )
        .shuffle()
        .select(range(number_sample))
    )

    # Split into 80% train, 10% validation, 10% test.
    tokenized_datasets = data.train_test_split(test_size=0.2)
    valid_test_split = tokenized_datasets["test"].train_test_split(test_size=0.5)
    tokenized_datasets = DatasetDict(
        {
            "train": tokenized_datasets["train"],
            "valid": valid_test_split["train"],
            "test": valid_test_split["test"],
        }
    )

    # Load the SentencePiece tokenizer trained in a previous step.
    tokenizer = spm.SentencePieceProcessor(model_file=str(Path(output_dir) / "spm_tokenizer.model"))
    # tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

    # Join each split into one long token stream with concatenate_texts (defined elsewhere
    # in this module), using the tokenizer's EOS id as the document separator.
    concatenated_dataset = tokenized_datasets.map(
        partial(concatenate_texts, eos_token_id=tokenizer.eos_id()),
        # partial(concatenate_texts, eos_token_id=tokenizer.eos_token_id),
        batched=True,
        batch_size=-1,
        remove_columns=["num_tokens"],
    )

    # Cut the concatenated stream into fixed-size chunks of context_length tokens
    # with create_chunks (also defined elsewhere in this module).
    chunked_dataset = concatenated_dataset.map(
        partial(create_chunks, context_length=context_length),
        batched=True,
        batch_size=context_length * 10,
    )

    # Save the training-ready dataset to disk.
    path_dataset = str(Path(output_dir) / "training_ready_hf_dataset")
    print(f"Saving dataset to: {path_dataset}. Match this path to the train_gpt2_config.py->dataset_dir parameter.")
    chunked_dataset.save_to_disk(path_dataset)
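

# Usage sketch (not part of the original module): the directory and numeric values below
# are illustrative assumptions. The directory is expected to already contain the parquet
# files and the trained spm_tokenizer.model produced by the earlier pipeline steps.
if __name__ == "__main__":
    tokenize_batch_dataset(
        output_dir="pipeline_output",  # hypothetical output directory
        context_length=512,            # hypothetical model context window
        number_sample=100_000,         # hypothetical number of samples to keep
    )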