Module molcrawl.preparation.convert_parquet_to_arrow
Convert a combined parquet file with a split column into separate arrow files
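The converter documented below expects the input parquet file to carry a "split" column next to the data columns. A minimal sketch of building such a file with pandas follows; the "smiles" column and the concrete split names are illustrative assumptions, not requirements of this module (only the "split" column itself is required):

import pandas as pd

# Hypothetical combined dataset: arbitrary data columns plus a "split" column.
df = pd.DataFrame(
    {
        "smiles": ["CCO", "c1ccccc1", "CC(=O)O", "CCN"],    # placeholder data column
        "split": ["train", "train", "validation", "test"],  # required by the converter
    }
)
df.to_parquet("combined.parquet", index=False)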
Functions
def convert_parquet_to_arrow(parquet_path: str, output_dir: str)
def convert_parquet_to_arrow(parquet_path: str, output_dir: str): """ Convert a parquet file with split column into separate arrow files Args: parquet_path: Path to the input parquet file output_dir: Directory to save the arrow files """ parquet_path_obj = Path(parquet_path) output_dir_obj = Path(output_dir) if not parquet_path_obj.exists(): raise FileNotFoundError(f"Parquet file not found: {parquet_path}") logger.info(f"Reading parquet file from {parquet_path}") # Read the parquet file table = pq.read_table(str(parquet_path_obj)) df = table.to_pandas() logger.info(f"Loaded {len(df)} total samples") # Check if split column exists if "split" not in df.columns: raise ValueError("Parquet file does not contain 'split' column") # Get unique splits splits = df["split"].unique() logger.info(f"Found splits: {list(splits)}") # Create output directory os.makedirs(output_dir_obj, exist_ok=True) # Process each split for split_name in splits: logger.info(f"Processing {split_name} split...") # Filter data for this split split_df = df[df["split"] == split_name].copy() # Remove the split column (no longer needed) split_df = split_df.drop(columns=["split"]) logger.info(f" {split_name}: {len(split_df)} samples") # Convert to Dataset dataset = Dataset.from_pandas(split_df) # Save as arrow file output_path = output_dir_obj / f"{split_name}.arrow" logger.info(f" Saving to {output_path}") # Save to disk in arrow format dataset.save_to_disk(str(output_path)) logger.info(f" Successfully saved {split_name} split") logger.info("=" * 70) logger.info("Conversion completed successfully!") logger.info(f"Output directory: {output_dir_obj}") logger.info("=" * 70)Convert a parquet file with split column into separate arrow files
Args
parquet_path: Path to the input parquet file
output_dir: Directory to save the arrow files
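A minimal usage sketch, assuming the input path and output directory shown here. Each split is written with Dataset.save_to_disk, so it can be reloaded with datasets.load_from_disk:

from datasets import load_from_disk

from molcrawl.preparation.convert_parquet_to_arrow import convert_parquet_to_arrow

# Split a combined parquet file into one arrow dataset per split value.
convert_parquet_to_arrow(
    parquet_path="combined.parquet",  # assumed input path
    output_dir="arrow_splits",        # assumed output directory
)

# Each split directory (e.g. arrow_splits/train.arrow) holds a saved Dataset.
train_ds = load_from_disk("arrow_splits/train.arrow")
print(train_ds.num_rows)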