Module molcrawl.esm2.main
ESM-2 Training Script for Protein Sequence Data
ESM-2 (Evolutionary Scale Modeling 2) is a protein language model developed by Meta. It is a state-of-the-art Transformer model for protein sequences.
Main features: - Pretraining specialized for protein sequences - Scalable architecture with up to 650 million parameters - Supports a wide range of tasks such as structure prediction, function annotation, and variant effect prediction - Faster and more accurate than ESM-1b
reference: - Language models of protein sequences at the scale of evolution enable accurate structure prediction - https://github.com/facebookresearch/esm
Classes
class ProteinDatasetLoader (data_dir, split='train', test_size=0.1)-
Expand source code
class ProteinDatasetLoader:
    """Loader for protein sequence datasets.

    Reads Arrow IPC stream files from a directory, converts them into a
    HuggingFace ``Dataset``, and exposes a train/valid split for ESM-2
    training.
    """

    def __init__(self, data_dir, split="train", test_size=0.1):
        """Load all ``*.arrow`` files under ``data_dir`` and select a split.

        Args:
            data_dir: Directory containing ``*.arrow`` IPC stream files.
            split: Which portion to expose: ``"train"`` or one of
                ``"valid"``/``"val"``/``"test"``.
            test_size: Fraction held out for the test split when the data is
                split locally; ``0`` (or less) disables splitting.

        Raises:
            FileNotFoundError: If the directory holds no readable arrow files.
            ValueError: If ``split`` is not a recognized split name.
        """
        self.data_dir = data_dir
        self.split = split
        self.test_size = test_size

        print(f"📂 Loading protein dataset from {data_dir}")

        # Load data from Arrow files or HuggingFace format
        try:
            arrow_files = list(Path(data_dir).glob("*.arrow"))
            if not arrow_files:
                # Caught below and re-raised with a uniform message.
                raise FileNotFoundError(f"No .arrow files found in {data_dir}")

            print(f"📁 Found {len(arrow_files)} arrow files")
            all_batches = []
            for arrow_file in arrow_files:
                try:
                    print(f"📖 Reading {arrow_file.name}...")
                    # Memory-map each file and read the entire IPC stream.
                    with pa.memory_map(str(arrow_file), "r") as mmap:
                        with pa.ipc.open_stream(mmap) as reader:
                            table = reader.read_all()
                            print(f"🔢 Read table: {len(table)} rows")
                            all_batches.append(table)
                except Exception as e:
                    # Best-effort: skip unreadable files, keep the rest.
                    print(f"❌ Failed to read {arrow_file.name}: {e}")
                    continue

            if not all_batches:
                raise ValueError("No arrow files could be read successfully")

            combined_table = pa.concat_tables(all_batches)
            print(f"📊 Combined {len(all_batches)} tables: {len(combined_table)} total rows")
            df = combined_table.to_pandas()
            # Convert numpy arrays to lists for HuggingFace compatibility
            for col in ("token", "sequence_tokens"):
                if col in df.columns:
                    df[col] = df[col].apply(
                        lambda x: x.tolist() if hasattr(x, "tolist") else x
                    )
            self.dataset = Dataset.from_pandas(df)
            print("✅ Created HuggingFace Dataset")
            print(f"🔍 Dataset columns: {self.dataset.column_names}")
        except Exception as e:
            print(f"❌ Arrow loading failed: {e}")
            raise FileNotFoundError(f"Could not load data from {data_dir}") from e

        # Split into train/valid if needed
        if hasattr(self.dataset, "keys") and isinstance(self.dataset, dict) and "train" in self.dataset:
            # NOTE(review): defensive branch for a pre-split mapping of
            # datasets; with Dataset.from_pandas above it is normally not
            # taken — confirm whether it can be removed.
            if split == "train":
                self.data = self.dataset["train"]
            elif split in ["valid", "val", "test"]:
                self.data = self.dataset.get("valid", self.dataset.get("test", self.dataset["train"]))
            else:
                # Bug fix: an unknown split previously left self.data unset,
                # causing an AttributeError at len(self.data) below.
                raise ValueError(f"Unknown split: {split!r}")
        else:
            if test_size > 0:
                # Fixed seed keeps the train/valid partition deterministic
                # across runs, so the two splits stay disjoint.
                split_dataset = self.dataset.train_test_split(test_size=test_size, seed=42)
                if split == "train":
                    self.data = split_dataset["train"]
                elif split in ["valid", "val", "test"]:
                    self.data = split_dataset["test"]
                else:
                    # Bug fix: an unknown split previously left self.data
                    # unset, causing an AttributeError at len(self.data) below.
                    raise ValueError(f"Unknown split: {split!r}")
            else:
                self.data = self.dataset

        print(f"✅ Loaded {len(self.data)} samples for {split}")

        # Show sample
        if len(self.data) > 0:
            sample = self.data[0]
            print("Sample keys:", list(sample.keys()))
            for key, value in sample.items():
                if isinstance(value, list):
                    print(f" {key}: {type(value)} of length {len(value)}")
                else:
                    print(f" {key}: {type(value)} = {value}")

    def get_dataset(self):
        """Return the HuggingFace Dataset object"""
        return self.data
Loads the existing protein_sequence dataset and performs preprocessing for ESM-2.
Methods
def get_dataset(self)-
Expand source code
def get_dataset(self):
    """Return the loaded HuggingFace ``Dataset`` split."""
    selected = self.data
    return selected