macwiatrak
/

bacformer-causal-MAG

@@ -16,112 +16,8 @@ from torch.nn.functional import (
 from transformers import PreTrainedModel
 from transformers.utils import ModelOutput
-from bacformer_model.configuration_bacformer import SPECIAL_TOKENS_DICT, BacformerConfig
-def compute_contrastive_loss(
-    protein_embeddings: torch.Tensor,
-    last_hidden_state: torch.Tensor,
-    special_tokens_mask: torch.Tensor,
-) -> torch.Tensor:
-    """Compute contrastive loss between protein embeddings and masked items."""
-    # keep protein embeddings and masked items
-    # ensure the batch size is 1, the model currently does not work with batch size > 1
-    assert protein_embeddings.shape[0] == last_hidden_state.shape[0] == 1
-    # subset to mask and protein embedding tokens
-    special_tokens_mask = special_tokens_mask.squeeze(0)
-    mask = (special_tokens_mask == SPECIAL_TOKENS_DICT["PROT_EMB"]) | (
-        special_tokens_mask == SPECIAL_TOKENS_DICT["MASK"]
-    )
-    protein_embeddings = protein_embeddings.squeeze(0)[mask]
-    last_hidden_state = last_hidden_state.squeeze(0)[mask]
-    # Normalize embeddings
-    last_hidden_state = last_hidden_state / last_hidden_state.norm(dim=1, keepdim=True)
-    protein_embeddings = protein_embeddings / protein_embeddings.norm(dim=1, keepdim=True)
-    # Compute similarity matrix and loss as before
-    similarity_matrix = torch.matmul(last_hidden_state, protein_embeddings.T)
-    n_prots = protein_embeddings.shape[0]
-    labels = torch.arange(n_prots).to(protein_embeddings.device)
-    # Compute the loss
-    loss = cross_entropy(similarity_matrix, labels)
-    return loss
-def top_k_filtering(logits: torch.Tensor, top_k: int = 50):
-    """
-    Keep only top_k logits and set the rest to -inf.
-    Args:
-        logits (torch.Tensor): Logits of shape (batch_size, vocab_size).
-        top_k (int): The number of highest probability logits to keep.
-    Returns
-    -------
-        torch.Tensor: Filtered logits where only the top k values remain, and all others are -inf.
-    """
-    if top_k <= 0:
-        return logits
-    # Find top_k values
-    top_k = min(top_k, logits.size(-1))
-    vals, idx = torch.topk(logits, top_k, dim=-1)
-    # Get the smallest logit in the top_k
-    min_vals = vals[:, -1].unsqueeze(-1)
-    # Mask all logits that are < this min value
-    mask = logits < min_vals
-    logits[mask] = float("-inf")
-    return logits
-def top_p_filtering(logits: torch.Tensor, top_p: float = 0.9):
-    """
-    Keep the smallest set of logits whose cumulative probability >= top_p.
-    Args:
-        logits (torch.Tensor): Logits of shape (batch_size, vocab_size).
-        top_p (float): Cumulative probability threshold.
-    Returns
-    -------
-        torch.Tensor: Filtered logits where only tokens within the top_p cumulative
-                      probability mass are kept; the rest are set to -inf.
-    """
-    if top_p >= 1.0:
-        return logits
-    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-    cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)
-    # Identify where cumulative probability exceeds top_p
-    sorted_indices_to_remove = cumulative_probs > top_p
-    # Shift the mask to ensure we always keep at least one token
-    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-    sorted_indices_to_remove[..., 0] = False
-    # Scatter to replicate the mask in the original ordering
-    for i in range(logits.size(0)):
-        remove_indices = sorted_indices[i, sorted_indices_to_remove[i]]
-        logits[i, remove_indices] = float("-inf")
-    return logits
-def create_4d_from_2d_attn_mask(attn_mask: torch.Tensor, num_attn_heads: int):
-    """Helper function to reshape attn_mask to 3D from 2D"""
-    assert (
-        len(attn_mask.shape) == 2
-    ), f"Please provide attn_mask of shape (batch_size, seq_len), current shape {attn_mask.shape}"
-    bs, seq_len = attn_mask.shape
-    attn_mask = attn_mask.view(bs, 1, 1, seq_len)
-    attn_mask = attn_mask.expand(-1, num_attn_heads, -1, -1)
-    attn_mask = attn_mask.view(bs, num_attn_heads, -1, seq_len)
-    return attn_mask
 @dataclass
@@ -186,23 +82,6 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
     return freqs_cos, freqs_sin
-def symmetrize(x):
-    """Make layer symmetric in final two dimensions, used for protein-protein interaction prediction."""
-    return x + x.transpose(-1, -2)
-def average_product_correct(x):
-    """Perform average product correct, used for protein-protein interaction prediction."""
-    a1 = x.sum(-1, keepdims=True)
-    a2 = x.sum(-2, keepdims=True)
-    a12 = x.sum((-1, -2), keepdims=True)
-    avg = a1 * a2
-    # avg.div_(a12)  # in-place to reduce memory
-    normalized = x - avg.div_(a12)
-    return normalized
 def scaled_dot_product_attention_w_attn_weights(
     query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -416,7 +295,7 @@ class BacformerTransformerEncoder(nn.Module):
 class BacformerEmbeddings(nn.Module):
     """Construct the protein embeddings from protein sequence, position embeddings and sequence type embeddings."""
-    def __init__(self, config: BacformerConfig):
         super().__init__()
         self.config = config
         self.linear = nn.Linear(config.hidden_size, config.hidden_size)
@@ -469,7 +348,7 @@ class BacformerProteinFamilyEmbeddings(nn.Module):
     def __init__(
         self,
-        config: BacformerConfig,
         protein_family_embeddings: torch.Tensor = None,
         token_type_embeddings: torch.Tensor = None,
         special_tokens_embeddings: torch.Tensor = None,
@@ -573,7 +452,7 @@ class BacformerProteinFamilyEmbeddings(nn.Module):
 class BacformerEncoder(nn.Module):
     """Bacformer encoder model"""
-    def __init__(self, config: BacformerConfig):
         super().__init__()
         self.config = config

 from transformers import PreTrainedModel
 from transformers.utils import ModelOutput
+from .configuration_bacformer import SPECIAL_TOKENS_DICT, BacformerConfig
+from .utils_bacformer import compute_contrastive_loss, create_4d_from_2d_attn_mask, top_k_filtering, top_p_filtering
 @dataclass
     return freqs_cos, freqs_sin
 def scaled_dot_product_attention_w_attn_weights(
     query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
 ) -> tuple[torch.Tensor, torch.Tensor]:
 class BacformerEmbeddings(nn.Module):
     """Construct the protein embeddings from protein sequence, position embeddings and sequence type embeddings."""
+    def __init__(self, config):
         super().__init__()
         self.config = config
         self.linear = nn.Linear(config.hidden_size, config.hidden_size)
     def __init__(
         self,
+        config,
         protein_family_embeddings: torch.Tensor = None,
         token_type_embeddings: torch.Tensor = None,
         special_tokens_embeddings: torch.Tensor = None,
 class BacformerEncoder(nn.Module):
     """Bacformer encoder model"""
+    def __init__(self, config):
         super().__init__()
         self.config = config

utils_bacformer.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import torch
+from torch.nn.functional import cross_entropy, softmax
+from .configuration_bacformer import SPECIAL_TOKENS_DICT
+def compute_contrastive_loss(
+    protein_embeddings: torch.Tensor,
+    last_hidden_state: torch.Tensor,
+    special_tokens_mask: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Compute a contrastive loss between model outputs and input protein embeddings.
+    Only positions whose special-token id is ``PROT_EMB`` or ``MASK`` are used. Both
+    sets of vectors are L2-normalised, a similarity matrix (outputs x embeddings) is
+    built, and cross entropy is applied with the diagonal as the target, i.e. every
+    output vector should be most similar to the embedding at the same position.
+    Args:
+        protein_embeddings (torch.Tensor): Input embeddings; the leading (batch) dim must be 1.
+        last_hidden_state (torch.Tensor): Model outputs; the leading (batch) dim must be 1.
+        special_tokens_mask (torch.Tensor): Special-token ids per position, compared
+            against ``SPECIAL_TOKENS_DICT["PROT_EMB"]`` and ``SPECIAL_TOKENS_DICT["MASK"]``.
+    Returns
+    -------
+        torch.Tensor: Scalar cross-entropy loss.
+    """
+    # the model currently does not work with batch size > 1
+    assert protein_embeddings.shape[0] == last_hidden_state.shape[0] == 1, (
+        "compute_contrastive_loss only supports batch size 1"
+    )
+    # subset to mask and protein embedding tokens
+    token_ids = special_tokens_mask.squeeze(0)
+    keep = (token_ids == SPECIAL_TOKENS_DICT["PROT_EMB"]) | (token_ids == SPECIAL_TOKENS_DICT["MASK"])
+    targets = protein_embeddings.squeeze(0)[keep]
+    preds = last_hidden_state.squeeze(0)[keep]
+    # L2-normalise; clamp the norm so an all-zero row yields zeros instead of NaN
+    preds = preds / preds.norm(dim=1, keepdim=True).clamp_min(torch.finfo(preds.dtype).tiny)
+    targets = targets / targets.norm(dim=1, keepdim=True).clamp_min(torch.finfo(targets.dtype).tiny)
+    similarity_matrix = torch.matmul(preds, targets.T)
+    # row i should match column i; build the labels directly on the target device
+    labels = torch.arange(targets.shape[0], device=targets.device)
+    return cross_entropy(similarity_matrix, labels)
+def top_k_filtering(logits: torch.Tensor, top_k: int = 50):
+    """
+    Keep only top_k logits along the last dimension and set the rest to -inf.
+    The input tensor is modified in place and also returned. Logits that tie with
+    the k-th largest value are kept.
+    Args:
+        logits (torch.Tensor): Logits of shape (..., vocab_size), e.g. (batch_size, vocab_size).
+        top_k (int): The number of highest probability logits to keep. Values <= 0 disable filtering.
+    Returns
+    -------
+        torch.Tensor: Filtered logits where only the top k values remain, and all others are -inf.
+    """
+    if top_k <= 0:
+        return logits
+    # never ask for more entries than the vocabulary holds
+    top_k = min(top_k, logits.size(-1))
+    top_vals = torch.topk(logits, top_k, dim=-1).values
+    # smallest logit that is still inside the top_k, kept broadcastable over the last dim
+    threshold = top_vals[..., -1:]
+    # in-place so callers holding a reference to `logits` see the filtered values
+    logits.masked_fill_(logits < threshold, float("-inf"))
+    return logits
+def top_p_filtering(logits: torch.Tensor, top_p: float = 0.9):
+    """
+    Keep the smallest set of logits whose cumulative probability >= top_p.
+    Filtering is applied along the last dimension. The input tensor is modified in
+    place and also returned. The most probable token is always kept.
+    Args:
+        logits (torch.Tensor): Logits of shape (..., vocab_size), e.g. (batch_size, vocab_size).
+        top_p (float): Cumulative probability threshold. Values >= 1.0 disable filtering.
+    Returns
+    -------
+        torch.Tensor: Filtered logits where only tokens within the top_p cumulative
+                      probability mass are kept; the rest are set to -inf.
+    """
+    if top_p >= 1.0:
+        return logits
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+    cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)
+    # Identify where cumulative probability exceeds top_p
+    sorted_remove = cumulative_probs > top_p
+    # Shift right by one so the token that crosses the threshold is kept,
+    # which also guarantees at least one token survives
+    sorted_remove[..., 1:] = sorted_remove[..., :-1].clone()
+    sorted_remove[..., 0] = False
+    # Map the mask from sorted order back to the original ordering in one vectorized step
+    remove = sorted_remove.scatter(-1, sorted_indices, sorted_remove)
+    logits.masked_fill_(remove, float("-inf"))
+    return logits
+def create_4d_from_2d_attn_mask(attn_mask: torch.Tensor, num_attn_heads: int):
+    """
+    Helper function to reshape attn_mask from 2D to 4D.
+    Args:
+        attn_mask (torch.Tensor): Mask of shape (batch_size, seq_len).
+        num_attn_heads (int): Number of attention heads to broadcast the mask over.
+    Returns
+    -------
+        torch.Tensor: Mask of shape (batch_size, num_attn_heads, 1, seq_len). The head
+                      dimension is produced with ``expand``, so no data is copied.
+    """
+    assert (
+        len(attn_mask.shape) == 2
+    ), f"Please provide attn_mask of shape (batch_size, seq_len), current shape {attn_mask.shape}"
+    bs, seq_len = attn_mask.shape
+    # add singleton head and query dims, then broadcast over the heads
+    return attn_mask.view(bs, 1, 1, seq_len).expand(-1, num_attn_heads, -1, -1)