AGiottonini
/

ProtBertDistilled

Safetensors

bert

Model card Files Files and versions

xet

Community

AGiottonini committed on Oct 30, 2025

Commit

6569534

verified ·

1 Parent(s): eea6f25

Upload bert-distilled-pretrain.py

Browse files

Files changed (1) hide show

bert-distilled-pretrain.py +295 -0

bert-distilled-pretrain.py ADDED Viewed

	@@ -0,0 +1,295 @@

+from typing import Tuple
+import os
+import tqdm
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from transformers.models.bert import (
+    BertForPreTraining,
+    BertTokenizer,
+    BertConfig
+)
+from muon import SingleDeviceMuonWithAuxAdam
class Dataset():
    """Map-style dataset of protein sequences read from a FASTA file.

    All records are parsed into memory at construction time and only those
    whose length lies in ``[min_length - 2, max_length - 2]`` are kept.
    Items are tokenized lazily in ``__getitem__``.

    Args:
        file_path: Path to the FASTA file.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding=..., return_tensors=...)``.
        min_length: Lower bound used by the length filter (see above).
        max_length: Upper bound used by the length filter; also the padding
            length passed to the tokenizer.
    """

    def __init__(self, file_path, tokenizer, min_length=32, max_length=512):
        self.sequences = []
        self.tokenizer = tokenizer
        self.min_length = min_length
        self.max_length = max_length
        self._load_data(file_path)

    @staticmethod
    def _parse_fasta(lines):
        """Return the sequences found in an iterable of FASTA lines.

        Header lines (starting with ``>``) open a new record; every other
        non-blank line is appended to the current record. Surrounding
        whitespace (including ``\\r`` from CRLF files) is stripped and blank
        lines are skipped.

        Raises:
            ValueError: If sequence data appears before the first header.
        """
        sequences = []
        chunks = None  # pieces of the record currently being read
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if chunks is not None:
                    sequences.append("".join(chunks))
                chunks = []
            elif chunks is None:
                raise ValueError(
                    "Malformed FASTA: sequence data found before the first '>' header"
                )
            else:
                chunks.append(line)
        if chunks is not None:
            sequences.append("".join(chunks))
        return sequences

    def _load_data(self, file_path):
        """Parse ``file_path`` and store the length-filtered sequences."""
        # First pass only counts lines so the progress bar has a total.
        with open(file_path, "rb") as f:
            n_lines = sum(1 for _ in f)
        with open(file_path, "r") as f:
            with tqdm.tqdm(f, total=n_lines) as lines:
                all_sequences = self._parse_fasta(lines)
        shortest = self.min_length - 2
        longest = self.max_length - 2
        self.sequences = [s for s in all_sequences if shortest <= len(s) <= longest]

    def __len__(self):
        """Return the number of sequences kept after filtering."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Tokenize sequence ``idx`` with residues separated by single spaces.

        Returns whatever the tokenizer returns when called with
        ``padding="max_length"`` and ``return_tensors="pt"``.
        """
        sequence = self.sequences[idx]
        return self.tokenizer(
            " ".join(sequence),
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
class DistillationLoss(nn.Module):
    """Weighted sum of a hard cross-entropy loss and a soft KL-divergence loss.

    The result is ``alpha * hard_loss + (1 - alpha) * soft_loss`` where

    * ``hard_loss`` is the cross-entropy between the student logits and the
      labels, skipping positions whose label equals ``ignore_index``;
    * ``soft_loss`` is the KL divergence between the temperature-softened
      teacher and student distributions, averaged over all token positions.

    Args:
        alpha: Weight of the hard loss; the soft loss gets ``1 - alpha``.
        temperature: Divisor applied to both logits before the softmax.
        num_labels: Size of the last logits dimension.
        ignore_index: Label value excluded from the hard loss.
    """

    def __init__(self, alpha: float, temperature: float, num_labels: int, ignore_index: int):
        super().__init__()
        self.alpha = alpha
        self.temperature = temperature
        self.num_labels = num_labels
        self.ignore_index = ignore_index
        self.soft_loss_fn = nn.KLDivLoss(reduction="batchmean", log_target=True)
        self.hard_loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

    def forward(self, student_logits, teacher_logits, labels):
        """
        Compute the distillation loss.
        Args:
            student_logits (torch.Tensor): Logits from the student model,
                with last dimension ``num_labels``.
            teacher_logits (torch.Tensor): Logits from the teacher model,
                same shape as ``student_logits``.
            labels (torch.Tensor): Ground truth labels, one per logits row.
        Returns:
            torch.Tensor: The computed distillation loss (scalar).
        """
        # Flatten to (tokens, num_labels). "batchmean" divides by the size of
        # the first dimension only, so without flattening a (batch, seq, vocab)
        # input would be summed over the sequence axis instead of averaged.
        # Upcast so a reduced-precision teacher does not degrade the softmax.
        student_flat = student_logits.reshape(-1, self.num_labels).float()
        teacher_flat = teacher_logits.reshape(-1, self.num_labels).float()
        flat_labels = labels.reshape(-1)

        # Soft loss: KL(teacher || student) on temperature-scaled distributions.
        # NOTE(review): positions labelled `ignore_index` still contribute here.
        soft_loss = self.soft_loss_fn(
            torch.log_softmax(student_flat / self.temperature, dim=-1),
            torch.log_softmax(teacher_flat / self.temperature, dim=-1)
        )
        # Hard loss
        hard_loss = self.hard_loss_fn(student_flat, flat_labels)
        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss
def _save_checkpoint(model, optimizer, directory):
    """Write the model and optimizer state dicts to ``<directory>/checkpoint.pt``."""
    os.makedirs(directory, exist_ok=True)
    torch.save(
        dict(
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        ),
        os.path.join(directory, "checkpoint.pt")
    )


def main(
    model_name: str,
    student_hidden_size: int,
    student_intermediate_size: int,
    student_num_attention_heads: int,
    student_num_hidden_layers: int,
    train_data_path: str,
    batch_size: int,
    epochs: int,
    lr: float,
    default_lr: float,
    teacher_use_bf16: bool=True,
    teacher_use_sdpa: bool=True,
    min_length: int=32,
    max_length: int=512,
    alpha: float=0.1,
    temperature: float=10.0,
    use_muon: bool=True,
    weight_decay: float=0.01,
    betas: Tuple[float, float]=(0.9,0.95),
    default_weight_decay: float=0.01,
    default_betas: Tuple[float, float]=(0.9, 0.95),
    device: torch.device=torch.device("cpu"),
    num_workers: int=256,
    wandb_entity: str="giottonini-axel-unibe"
):
    """Distil a pretrained BERT teacher into a smaller, freshly initialised student.

    Loads the teacher ``model_name``, builds a student from the same
    configuration with the given size overrides, and trains the student on the
    FASTA file at ``train_data_path`` using ``DistillationLoss``. The loss is
    logged to Weights & Biases; a checkpoint is written to
    ``<wandb project>/<wandb run name>/<step>/checkpoint.pt`` every 1000
    optimizer steps and once more at the end of training.

    Args:
        model_name: Hub name or path of the teacher model and tokenizer.
        student_hidden_size: ``hidden_size`` of the student config.
        student_intermediate_size: ``intermediate_size`` of the student config.
        student_num_attention_heads: ``num_attention_heads`` of the student config.
        student_num_hidden_layers: ``num_hidden_layers`` of the student config.
        train_data_path: Path to the FASTA training file.
        batch_size: DataLoader batch size.
        epochs: Number of passes over the dataset.
        lr: Learning rate of the encoder weight matrices (parameters with ndim >= 2).
        default_lr: Learning rate of every other optimized parameter.
        teacher_use_bf16: Load the teacher weights in bfloat16.
        teacher_use_sdpa: Load the teacher with the ``"sdpa"`` attention implementation.
        min_length: Passed to ``Dataset`` as ``min_length``.
        max_length: Passed to ``Dataset`` as ``max_length``.
        alpha: Weight of the hard loss in ``DistillationLoss``.
        temperature: Softmax temperature in ``DistillationLoss``.
        use_muon: ``use_muon`` flag of the encoder-weights parameter group.
        weight_decay: Weight decay of the encoder-weights group.
        betas: Betas of the encoder-weights group; only set when ``use_muon`` is False.
        default_weight_decay: Weight decay of the default group.
        default_betas: Betas of the default group.
        device: Device both models and every batch are moved to.
        num_workers: Number of DataLoader worker processes.
        wandb_entity: Weights & Biases entity the run is logged under.
    """
    import wandb

    checkpoint_every = 1000

    # Built outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError before Python 3.12.
    teacher_slug = model_name.replace("/", "_")
    wandb_project = f"{teacher_slug}-distilled"
    wandb_run = wandb.init(
        entity=wandb_entity,
        project=wandb_project,
        config=dict(
            hidden_size=student_hidden_size,
            intermediate_size=student_intermediate_size,
            num_attention_heads=student_num_attention_heads,
            num_hidden_layers=student_num_hidden_layers,
            alpha=alpha,
            temperature=temperature,
            use_muon=use_muon,
            lr=lr,
            weight_decay=weight_decay,
            betas=betas,
            default_lr=default_lr,
            default_weight_decay=default_weight_decay,
            default_betas=default_betas
        )
    )

    # Initialize tokenizer, teacher model and student model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    teacher_model_kwargs = dict()
    if teacher_use_bf16:
        teacher_model_kwargs["torch_dtype"] = torch.bfloat16
    if teacher_use_sdpa:
        teacher_model_kwargs["attn_implementation"] = "sdpa"
    teacher_model = BertForPreTraining.from_pretrained(model_name, **teacher_model_kwargs)
    teacher_model.eval()
    teacher_model_compiled = torch.compile(teacher_model, mode="max-autotune", fullgraph=True)
    # Derive the student from the teacher's own config so the vocabulary (and
    # therefore the logits width) matches whichever teacher was requested.
    student_config = BertConfig.from_pretrained(
        model_name,
        hidden_size=student_hidden_size,
        intermediate_size=student_intermediate_size,
        num_attention_heads=student_num_attention_heads,
        num_hidden_layers=student_num_hidden_layers
    )
    student_model = BertForPreTraining(student_config)
    teacher_model_compiled.to(device) # type: ignore
    student_model.to(device) # type: ignore

    # Load dataset
    dataset = Dataset(train_data_path, tokenizer, min_length=min_length, max_length=max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True) # type: ignore

    # Loss function. The hard loss must skip padding *tokens*, so the ignore
    # index is the pad token id (not the pad token-type/segment id).
    loss_fn = DistillationLoss(alpha, temperature, tokenizer.vocab_size, tokenizer.pad_token_id)

    # Initialize optimizer: encoder weight matrices form one group, encoder
    # biases/1-D parameters plus embeddings and heads form the default group.
    encoder_params = list(student_model.bert.encoder.parameters())
    hidden_weights = [p for p in encoder_params if p.ndim >= 2]
    hidden_biases = [p for p in encoder_params if p.ndim < 2]
    nonhidden_params = [
        *student_model.bert.embeddings.parameters(),
        *student_model.cls.parameters()
    ]
    hidden_param_group = dict(
        params=hidden_weights,
        use_muon=use_muon,
        lr=lr,
        weight_decay=weight_decay
    )
    if not use_muon:
        hidden_param_group["betas"] = betas # type: ignore
    default_param_group = dict(
        params=hidden_biases + nonhidden_params,
        use_muon=False,
        lr=default_lr,
        betas=default_betas,
        weight_decay=default_weight_decay
    )
    optimizer = SingleDeviceMuonWithAuxAdam([hidden_param_group, default_param_group])

    # Training loop
    wandb_run.watch(student_model)
    checkpoint_root = os.path.join(wandb_run.project, wandb_run.name) # type: ignore
    step = 0
    for epoch in range(epochs):
        with tqdm.tqdm(dataloader) as pbar:
            for batch in pbar:
                # Clear optimizer and model gradients
                optimizer.zero_grad()
                student_model.zero_grad()
                # Each item is tokenized with a leading dimension of 1, so the
                # collated tensors carry an extra axis at dim 1; drop it and
                # send the data to the device.
                batch = {k : v.squeeze(1).to(device) for k, v in batch.items()}
                # Compute teacher logits (through the compiled module)
                with torch.no_grad():
                    teacher_logits = teacher_model_compiled(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"]
                    ).prediction_logits
                # Compute student logits
                student_logits = student_model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                ).prediction_logits
                # Loss backpropagation and optimization step
                # NOTE(review): the labels are the unmasked input ids, so the
                # hard loss asks the student to reproduce its own input;
                # confirm whether MLM-style masking was intended.
                loss = loss_fn(student_logits, teacher_logits, batch["input_ids"])
                loss.backward()
                optimizer.step()
                step += 1
                loss_value = loss.item()  # read once; shared by both consumers
                pbar.set_description(f"Epoch {epoch} | Loss: {loss_value:.4f}")
                wandb_run.log(dict(loss=loss_value))
                # Save checkpoint
                if step % checkpoint_every == 0:
                    _save_checkpoint(student_model, optimizer, os.path.join(checkpoint_root, str(step)))

    # Keep the steps trained since the last periodic checkpoint.
    if step % checkpoint_every != 0:
        _save_checkpoint(student_model, optimizer, os.path.join(checkpoint_root, str(step)))
    wandb_run.finish()
# Command-line entry point: parse the arguments, resolve the device and train.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="Rostlab/prot_bert", help="Name of the teacher model")
    parser.add_argument("--student_hidden_size", type=int, default=16, help="Hidden size of the student model")
    parser.add_argument("--student_intermediate_size", type=int, default=64, help="Intermediate size of the student model")
    parser.add_argument("--student_num_attention_heads", type=int, default=4, help="Number of attention heads of the student model")
    parser.add_argument("--student_num_hidden_layers", type=int, default=12, help="Number of hidden layers of the student model")
    # Required: there is no sensible default, and a missing path would
    # otherwise only fail later when the dataset tries to open `None`.
    parser.add_argument("--train_data_path", type=str, required=True, help="Path to the training data (fasta file)")
    parser.add_argument("--batch_size", type=int, default=1024, help="Batch size for training")
    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs for training")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate for the hidden parameters")
    parser.add_argument("--default_lr", type=float, default=1e-4, help="Learning rate for the non-hidden parameters and biases")
    parser.add_argument("--device", type=int, default=-1, help="GPU device to use (-1 for CPU)")
    args = vars(parser.parse_args())
    # Use the requested GPU only when CUDA is available; otherwise run on CPU.
    gpu_index = args["device"]
    use_gpu = torch.cuda.is_available() and gpu_index >= 0
    device = torch.device(f"cuda:{gpu_index}" if use_gpu else "cpu")
    main(
        model_name=args["model_name"],
        student_hidden_size=args["student_hidden_size"],
        student_intermediate_size=args["student_intermediate_size"],
        student_num_attention_heads=args["student_num_attention_heads"],
        student_num_hidden_layers=args["student_num_hidden_layers"],
        train_data_path=args["train_data_path"],
        batch_size=args["batch_size"],
        epochs=args["epochs"],
        lr=args["lr"],
        default_lr=args["default_lr"],
        device=device
    )