# bacformer-causal-MAG / modeling_bacformer.py
import math
from collections import OrderedDict
from dataclasses import dataclass
from typing import Literal, Optional, Union
import torch
from torch import nn
from torch.nn.functional import (
binary_cross_entropy_with_logits,
cross_entropy,
gelu,
mse_loss,
scaled_dot_product_attention,
softmax,
)
from transformers import PreTrainedModel
from transformers.utils import ModelOutput
from .configuration_bacformer import SPECIAL_TOKENS_DICT, BacformerConfig
from .utils_bacformer import compute_contrastive_loss, create_4d_from_2d_attn_mask, top_k_filtering, top_p_filtering
@dataclass
class BacformerModelOutput(ModelOutput):
"""Base class for outputs of the Bacformer model."""
    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    attentions: torch.FloatTensor | None = None
    pooler_output: torch.FloatTensor | None = None
# Taken from facebookresearch/llama/model.py
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
"""Reshape the rotary embeddings for broadcasting."""
ndim = x.ndim
assert 0 <= 1 < ndim
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
# Taken from facebookresearch/llama/model.py
def apply_rotary_emb(
xq: torch.Tensor,
xk: torch.Tensor,
freqs_cos: torch.Tensor,
freqs_sin: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Apply rotary embeddings to the query and key tensors."""
# reshape xq and xk to match the complex representation
xq_r, xq_i = xq.float().reshape(*xq.shape[:-1], -1, 2).unbind(-1)
xk_r, xk_i = xk.float().reshape(*xk.shape[:-1], -1, 2).unbind(-1)
# reshape freqs_cos and freqs_sin for broadcasting
freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
# apply rotation using real numbers
xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos
# flatten last two dimensions
xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
return xq_out.type_as(xq), xk_out.type_as(xk)
# Taken from facebookresearch/llama/model.py
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
"""Precompute the freqs cis for rotary embeddings."""
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device) # type: ignore
freqs = torch.outer(t, freqs).float() # type: ignore
freqs_cos = torch.cos(freqs) # real part
freqs_sin = torch.sin(freqs) # imaginary part
return freqs_cos, freqs_sin
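# Example (illustrative shapes only; not used by the model code): with a 64-dim head and a maximum
# length of 128, precompute_freqs_cis returns two (128, 32) tensors, which apply_rotary_emb
# broadcasts over (batch, seq_len, n_heads, head_dim) query/key tensors without changing their shapes.
#
#     freqs_cos, freqs_sin = precompute_freqs_cis(dim=64, end=128)  # each of shape (128, 32)
#     xq = torch.randn(2, 128, 8, 64)
#     xk = torch.randn(2, 128, 8, 64)
#     xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)  # same shapes as xq, xk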
def scaled_dot_product_attention_w_attn_weights(
query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
) -> tuple[torch.Tensor, torch.Tensor]:
"""PyTorch Native implementation, modified to return attention weights."""
L, S = query.size(-2), key.size(-2)
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
attn_bias = torch.zeros(L, S, dtype=query.dtype).to(query.device)
if is_causal:
assert attn_mask is None
temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
        attn_bias = attn_bias.to(query.dtype)
    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            # apply the boolean mask to the attention bias instead of mutating the caller's mask in place
            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
        else:
            attn_bias += attn_mask
attn_weight = query @ key.transpose(-2, -1) * scale_factor
attn_weight += attn_bias
attn_weight = torch.softmax(attn_weight, dim=-1)
attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
attn_output = attn_weight @ value
return attn_output, attn_weight
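# Example (a minimal sanity-check sketch, not part of the model): with dropout_p left at 0.0 the
# output should match torch.nn.functional.scaled_dot_product_attention, and the returned weights are
# the post-softmax attention probabilities (each row sums to 1).
#
#     q, k, v = (torch.randn(2, 8, 16, 32) for _ in range(3))  # (batch, heads, seq, d_head)
#     out, attn = scaled_dot_product_attention_w_attn_weights(q, k, v, is_causal=True)
#     ref = scaled_dot_product_attention(q, k, v, is_causal=True)
#     assert torch.allclose(out, ref, atol=1e-6) and attn.shape == (2, 8, 16, 16)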
class RotarySelfAttention(nn.Module):
"""Rotary self-attention module."""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.1,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dim_head = embed_dim // num_heads
self.dropout_rate = dropout
self.q = nn.Linear(embed_dim, embed_dim, bias=False)
self.k = nn.Linear(embed_dim, embed_dim, bias=False)
self.v = nn.Linear(embed_dim, embed_dim, bias=False)
self.att_proj_linear = nn.Linear(embed_dim, embed_dim)
def forward(
self,
x: torch.Tensor,
attn_mask: torch.Tensor,
freqs_cos: torch.Tensor,
freqs_sin: torch.Tensor,
is_causal: bool = False,
return_attn_weights: bool = False,
):
"""Forward pass for the rotary self-attention module."""
batch_size, seq_len, _ = x.shape
xq, xk, xv = self.q(x), self.k(x), self.v(x)
# Reshape for rotary embeddings
xq = xq.view(batch_size, seq_len, self.num_heads, self.dim_head)
xk = xk.view(batch_size, seq_len, self.num_heads, self.dim_head)
xv = xv.view(batch_size, seq_len, self.num_heads, self.dim_head)
xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
# Reshape for attention calculation: (b_sz, n_head, s_len, d_head)
xq = xq.transpose(1, 2)
xk = xk.transpose(1, 2)
xv = xv.transpose(1, 2)
attn_weights = None
if return_attn_weights:
att, attn_weights = scaled_dot_product_attention_w_attn_weights(
query=xq,
key=xk,
value=xv,
attn_mask=attn_mask,
dropout_p=self.dropout_rate if self.training else 0.0,
is_causal=is_causal,
)
else:
att = scaled_dot_product_attention(
query=xq,
key=xk,
value=xv,
attn_mask=attn_mask,
dropout_p=self.dropout_rate if self.training else 0.0,
is_causal=is_causal,
)
# Shape (b_sz, s_len, n_head, d_head)
out = att.transpose(1, 2).contiguous()
out = out.view(batch_size, seq_len, self.num_heads * self.dim_head)
return self.att_proj_linear(out), attn_weights
class BacformerTransformerLayer(nn.Module):
"""Own implementation of transformer layer which uses pytorch native MHA but returns attention weights"""
def __init__(
self,
hidden_size: int,
intermediate_size: int,
num_attention_heads: int,
dropout: float = 0.1,
activation: Literal["gelu", "relu"] = "gelu",
):
super().__init__()
self.self_mha = RotarySelfAttention(
embed_dim=hidden_size,
num_heads=num_attention_heads,
dropout=dropout,
)
self.fc1 = nn.Linear(hidden_size, intermediate_size)
self.fc2 = nn.Linear(intermediate_size, hidden_size)
self.activation = nn.GELU() if activation == "gelu" else nn.ReLU()
self.norm1 = nn.LayerNorm(hidden_size)
self.norm2 = nn.LayerNorm(hidden_size)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
def forward(
self,
hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None,
freqs_cos: torch.Tensor = None,
freqs_sin: torch.Tensor = None,
return_attn_weights: bool = False,
is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""Forward pass"""
attn_outputs, attn_weights = self.self_mha(
hidden_state,
attn_mask=attention_mask,
freqs_cos=freqs_cos,
freqs_sin=freqs_sin,
return_attn_weights=return_attn_weights,
is_causal=is_causal,
)
x = self.norm1(hidden_state + self.dropout1(attn_outputs))
ff_output = self.fc2(self.dropout2(self.activation(self.fc1(x))))
x = self.norm2(x + self.dropout3(ff_output))
return x, attn_weights
class BacformerTransformerEncoder(nn.Module):
"""Own implementation of Transformer which return attention weights"""
def __init__(
self,
num_hidden_layers: int,
hidden_size: int,
intermediate_size: int,
num_attention_heads: int,
dropout: float = 0.1,
activation: Literal["gelu", "relu"] = "gelu",
):
super().__init__()
self.layers = nn.ModuleList(
[
BacformerTransformerLayer(
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_attention_heads=num_attention_heads,
dropout=dropout,
activation=activation,
)
for _ in range(num_hidden_layers)
]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_state: torch.Tensor,
attention_mask: torch.Tensor = None,
freqs_cos: torch.Tensor = None,
freqs_sin: torch.Tensor = None,
return_attn_weights: bool = False,
is_causal: bool = False,
) -> tuple[torch.Tensor, list[torch.Tensor | None]]:
"""Forward pass"""
attn_weights_arr = []
for layer in self.layers:
if self.gradient_checkpointing and self.training:
hidden_state, attn_weights = self._gradient_checkpointing_func(
layer.__call__,
hidden_state,
attention_mask,
freqs_cos,
freqs_sin,
return_attn_weights,
is_causal,
)
else:
hidden_state, attn_weights = layer(
hidden_state=hidden_state,
attention_mask=attention_mask,
freqs_cos=freqs_cos,
freqs_sin=freqs_sin,
return_attn_weights=return_attn_weights,
is_causal=is_causal,
)
# keep the attention weights from each layer
attn_weights_arr.append(attn_weights)
return hidden_state, attn_weights_arr
class BacformerEmbeddings(nn.Module):
"""Construct the protein embeddings from protein sequence, position embeddings and sequence type embeddings."""
def __init__(self, config):
super().__init__()
self.config = config
self.linear = nn.Linear(config.hidden_size, config.hidden_size)
self.token_type_embeddings = nn.Embedding(
num_embeddings=config.max_token_type_embeddings + 1,
embedding_dim=config.hidden_size,
padding_idx=config.max_token_type_embeddings,
)
self.special_tokens_embeddings = nn.Embedding(
num_embeddings=config.num_special_tokens,
embedding_dim=config.hidden_size,
)
self.prot_emb_token_id = config.prot_emb_token_id
self.pad_token_id = config.pad_token_id
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
protein_embeddings: torch.Tensor = None,
special_tokens_mask: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
labels: torch.Tensor = None, # used for causal protein family modeling
property_ids: torch.Tensor = None, # used for conditional fine-tuning for desired property
) -> torch.Tensor:
"""Forward pass for protein embeddings."""
bs, seq_length, dim = protein_embeddings.shape
# pass the pooled ESM protein embeddings through a linear layer
protein_embeddings = self.linear(protein_embeddings.type_as(self.linear.weight))
protein_embeddings = torch.where(
special_tokens_mask.unsqueeze(-1).repeat(1, 1, dim) == self.prot_emb_token_id,
protein_embeddings,
self.special_tokens_embeddings(special_tokens_mask),
)
if token_type_ids is not None:
protein_embeddings += self.token_type_embeddings(token_type_ids)
protein_embeddings = self.LayerNorm(protein_embeddings)
protein_embeddings = self.dropout(protein_embeddings)
return protein_embeddings
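# Example (illustrative only; CLS_ID and END_ID below are placeholder values, not the real special-token IDs):
# positions where `special_tokens_mask` equals config.prot_emb_token_id keep the linearly projected ESM
# protein embedding, while every other position is replaced by a learned special-token embedding.
#
#     emb = BacformerEmbeddings(config)                  # assumes a BacformerConfig instance
#     prot_embs = torch.randn(1, 4, config.hidden_size)  # (batch, n_proteins + special tokens, hidden)
#     mask = torch.tensor([[CLS_ID, config.prot_emb_token_id, config.prot_emb_token_id, END_ID]])
#     out = emb(protein_embeddings=prot_embs, special_tokens_mask=mask)  # (1, 4, config.hidden_size)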
class BacformerProteinFamilyEmbeddings(nn.Module):
"""Construct the protein embeddings from protein family tokens, special tokens and sequence type embeddings."""
def __init__(
self,
config,
protein_family_embeddings: torch.Tensor = None,
token_type_embeddings: torch.Tensor = None,
special_tokens_embeddings: torch.Tensor = None,
n_conditional_properties: int = None,
):
super().__init__()
self.config = config
if protein_family_embeddings is not None:
self.protein_family_embeddings = nn.Embedding.from_pretrained(
protein_family_embeddings,
freeze=False,
padding_idx=config.pad_token_id,
)
else:
self.protein_family_embeddings = nn.Embedding(
num_embeddings=config.protein_clusters_vocab_size + 1,
embedding_dim=config.hidden_size,
padding_idx=config.pad_token_id,
)
if token_type_embeddings is not None:
self.token_type_embeddings = nn.Embedding.from_pretrained(
token_type_embeddings,
freeze=False,
padding_idx=config.max_token_type_embeddings,
)
else:
self.token_type_embeddings = nn.Embedding(
num_embeddings=config.max_token_type_embeddings + 1,
embedding_dim=config.hidden_size,
padding_idx=config.max_token_type_embeddings,
)
if special_tokens_embeddings is not None:
self.special_tokens_embeddings = nn.Embedding.from_pretrained(
special_tokens_embeddings,
freeze=False,
padding_idx=config.pad_token_id,
)
else:
self.special_tokens_embeddings = nn.Embedding(
num_embeddings=config.num_special_tokens,
embedding_dim=config.hidden_size,
padding_idx=config.pad_token_id,
)
# add layer for conditional properties
if n_conditional_properties is not None:
self.conditional_properties_layer = nn.Embedding(n_conditional_properties, config.hidden_size)
self.prot_emb_token_id = config.prot_emb_token_id
self.pad_token_id = config.pad_token_id
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
protein_embeddings: torch.Tensor = None,
special_tokens_mask: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
labels: torch.Tensor = None, # used for causal protein family modeling
property_ids: torch.Tensor = None, # used for conditional fine-tuning for desired property
) -> torch.Tensor:
"""Forward pass for protein embeddings."""
        # look up the protein family embedding for each label token
        # replace -100 (ignore index) with pad_token_id before the embedding lookup
labels[labels == -100] = self.pad_token_id
protein_embeddings = self.protein_family_embeddings(labels)
bs, seq_length, dim = protein_embeddings.shape
protein_embeddings = torch.where(
special_tokens_mask.unsqueeze(-1).repeat(1, 1, dim) == self.prot_emb_token_id,
protein_embeddings,
self.special_tokens_embeddings(special_tokens_mask),
)
if token_type_ids is not None:
protein_embeddings += self.token_type_embeddings(token_type_ids)
if property_ids is not None:
# get the embeddings for the conditional properties
property_embedding = self.conditional_properties_layer(property_ids).unsqueeze(1)
# concatenate the protein embeddings with the conditional properties embeddings
# property embeddings are added to the beginning of the protein embeddings after the CLS token
protein_embeddings = torch.cat(
[
protein_embeddings[:, :1, :], # CLS token
property_embedding, # conditional properties embeddings
protein_embeddings[:, 1:, :],
], # protein embeddings
dim=1,
)
protein_embeddings = self.LayerNorm(protein_embeddings)
protein_embeddings = self.dropout(protein_embeddings)
return protein_embeddings
class BacformerEncoder(nn.Module):
"""Bacformer encoder model"""
def __init__(self, config):
super().__init__()
self.config = config
self.encoder = BacformerTransformerEncoder(
num_hidden_layers=config.num_hidden_layers,
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
intermediate_size=config.intermediate_size,
activation="gelu",
dropout=config.attention_probs_dropout_prob,
)
        # Note that config.max_position_embeddings is multiplied by 1.5 because the token limit for the
        # Bacformer family of models is 6000. Using this multiplier instead of hard-coding 6000 allows the
        # token length to vary while training or fine-tuning.
freqs_cos, freqs_sin = precompute_freqs_cis(
config.hidden_size // config.num_attention_heads, int(config.max_position_embeddings * 1.5)
)
self.register_buffer("freqs_cos", freqs_cos, persistent=False)
self.register_buffer("freqs_sin", freqs_sin, persistent=False)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor = None,
return_attn_weights: Union[bool, None] = None,
is_causal: bool = False,
) -> tuple[torch.Tensor, list[torch.Tensor | None]]:
"""Pass the input through the encoder layers in turn.
Args:
hidden_states: hidden states from the BacformerEmbeddings layer
attention_mask: mask for the attention in the transformer
"""
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
bs, seq_len, _ = hidden_states.shape
last_hidden_state, attn_weights = self.encoder(
hidden_state=hidden_states,
attention_mask=attention_mask,
freqs_cos=self.freqs_cos[:seq_len, :],
freqs_sin=self.freqs_sin[:seq_len, :],
return_attn_weights=return_attn_weights,
is_causal=is_causal,
)
return last_hidden_state, attn_weights
class BacformerPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models."""
config_class = BacformerConfig
base_model_prefix = "bacformer"
supports_gradient_checkpointing = True
_no_split_modules = ["BacformerEmbeddings", "BacformerTransformerLayer"]
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class BacformerModel(BacformerPreTrainedModel):
"""Bacformer model."""
def __init__(self, config: BacformerConfig, add_pooling_layer: bool = False):
super().__init__(config)
self.config = config
self.embeddings = BacformerEmbeddings(config)
self.encoder = BacformerEncoder(config)
self.pooler = BacformerPooler(config) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
protein_embeddings: torch.Tensor = None,
special_tokens_mask: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
labels: torch.Tensor = None,
property_ids: torch.Tensor = None,
return_attn_weights: bool = False,
return_dict: Union[bool, None] = None,
is_causal: bool = False,
) -> Optional[BacformerModelOutput]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
# get embeddings
protein_embeddings = self.embeddings(
protein_embeddings=protein_embeddings,
labels=labels,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
property_ids=property_ids,
)
        # create a 4D attention mask from the 2D padding mask if not doing causal GM
if attention_mask is not None and not is_causal:
attention_mask = create_4d_from_2d_attn_mask(
attn_mask=attention_mask, num_attn_heads=self.config.num_attention_heads
).bool()
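            # the expanded boolean mask carries a per-head dimension, i.e. (batch, num_heads, seq_len, seq_len),
            # which is the form consumed by scaled_dot_product_attention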
last_hidden_state, attentions = self.encoder(
hidden_states=protein_embeddings,
attention_mask=attention_mask,
return_attn_weights=return_attn_weights,
is_causal=is_causal,
)
pooler_output = (
self.pooler(hidden_states=last_hidden_state, padding_mask=attention_mask)
if self.pooler is not None
else None
)
if not return_dict:
return (last_hidden_state, pooler_output, attentions)
return BacformerModelOutput(
last_hidden_state=last_hidden_state,
pooler_output=pooler_output,
attentions=attentions,
)
class BacformerForCausalGM(BacformerPreTrainedModel):
"""Bacformer model with genomic modeling head on top"""
_tied_weights_keys = ["gm_head.decoder.weight"]
def __init__(self, config: BacformerConfig):
super().__init__(config)
self.config = config
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.gm_head = BacformerGMHead(config)
# Initialize weights
self.init_weights()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Optional[BacformerModelOutput]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=None, # attention mechanism handles the causal mask
return_attn_weights=return_attn_weights,
return_dict=return_dict,
is_causal=True,
)
last_hidden_state = outputs[0]
prediction_scores = self.gm_head(last_hidden_state)
loss = None
if labels is not None:
labels = labels.to(prediction_scores.device)
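            # shift: the score at position i is evaluated against the label at position i + 1 (causal LM objective)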
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous().view(-1, prediction_scores.shape[-1])
labels = labels[:, 1:].contiguous().view(-1)
loss = cross_entropy(shifted_prediction_scores, labels)
if not return_dict:
return (
loss,
prediction_scores,
) + outputs
return BacformerModelOutput(
loss=loss,
logits=prediction_scores,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
class BacformerForMaskedGM(BacformerPreTrainedModel):
"""Bacformer model with genomic modeling head on top"""
_tied_weights_keys = ["gm_head.decoder.weight"]
def __init__(self, config: BacformerConfig):
super().__init__(config)
self.config = config
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.gm_head = BacformerGMHead(config)
# Initialize weights
self.init_weights()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Union[BacformerModelOutput, None]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
return_attn_weights=return_attn_weights,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
        loss = None
        if labels is not None:
            # to speed up the forward pass, only score the masked positions (labels != -100)
            last_hidden_state = last_hidden_state[labels != -100]
            prediction_scores = self.gm_head(last_hidden_state)
            labels = labels.to(prediction_scores.device)
            # keep only the masked positions in the labels so they line up with the filtered hidden states
            labels = labels[labels != -100]
loss = cross_entropy(prediction_scores, labels)
else:
prediction_scores = self.gm_head(last_hidden_state)
if not return_dict:
return (
loss,
prediction_scores,
) + outputs
return BacformerModelOutput(
loss=loss,
logits=prediction_scores,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
class BacformerForCausalProteinFamilyModeling(BacformerPreTrainedModel):
"""Bacformer model for causal modeling of protein families. Using protein family as tokens rather than protein embeddings"""
_tied_weights_keys = ["gm_head.decoder.weight"]
def __init__(
self,
config: BacformerConfig,
n_conditional_properties: int = None,
initialise_from_non_pfm_model: bool = False,
):
super().__init__(config)
self.config = config
self.cls_token_id = SPECIAL_TOKENS_DICT["CLS"]
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.gm_head = BacformerGMHead(config)
if initialise_from_non_pfm_model:
# Initialize weights
self.init_weights()
# overwrite the embeddings with the pretrained
# protein family embeddings from the decoder of the GM Head
self.bacformer.embeddings = BacformerProteinFamilyEmbeddings(
config,
protein_family_embeddings=self.gm_head.decoder.weight,
token_type_embeddings=self.bacformer.embeddings.token_type_embeddings.weight,
special_tokens_embeddings=self.bacformer.embeddings.special_tokens_embeddings.weight,
n_conditional_properties=n_conditional_properties,
)
else:
self.bacformer.embeddings = BacformerProteinFamilyEmbeddings(
config,
n_conditional_properties=n_conditional_properties,
)
self.init_weights()
def forward(
self,
labels: torch.Tensor = None,
special_tokens_mask: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
property_ids: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Optional[BacformerModelOutput]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
outputs = self.bacformer(
protein_embeddings=None,
labels=labels,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
property_ids=property_ids,
return_attn_weights=return_attn_weights,
return_dict=return_dict,
is_causal=True,
)
last_hidden_state = outputs[0]
prediction_scores = self.gm_head(last_hidden_state)
loss = None
if labels is not None:
if property_ids is not None:
labels = torch.cat(
[
torch.tensor([-100], dtype=torch.long)
.unsqueeze(0)
.to(labels.device), # account for the property token
labels,
],
dim=1,
) # ignore index
labels = labels.to(prediction_scores.device)
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous().view(-1, prediction_scores.shape[-1])
labels = labels[:, 1:].contiguous().view(-1)
loss = cross_entropy(shifted_prediction_scores, labels)
if not return_dict:
return (
loss,
prediction_scores,
) + outputs
return BacformerModelOutput(
loss=loss,
logits=prediction_scores,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
def generate(
self,
protein_family_ids: torch.LongTensor,
special_tokens_mask: torch.LongTensor = None,
token_type_ids: torch.LongTensor = None,
max_length: int = 6000,
end_token_id: int = 50000,
do_sample: bool = False,
top_k: int = 50,
top_p: float = 1.0,
temperature: float = 1.0,
property_ids: torch.LongTensor = None,
return_last_hidden_states: bool = False,
):
"""
Generate a sequence of tokens autoregressively from a given prompt.
Args:
protein_family_ids (torch.LongTensor): Tensor of shape (batch, seq_len) with token indices.
max_length (int): Maximum length of the generated sequence (prompt + newly generated).
end_token_id (int, optional): Token ID signifying end-of-sequence (END).
If encountered, generation stops.
do_sample (bool): Whether to sample from the probability distribution (True)
or use greedy decoding (False).
top_k (int): If >0, use top-k filtering in sampling mode.
top_p (float): If <1.0, use nucleus (top-p) filtering in sampling mode.
temperature (float): Softmax temperature for scaling logits.
Higher => more random, lower => more deterministic.
return_last_hidden_states (bool): If True, return final hidden states as well.
        Returns
        -------
        torch.LongTensor: The generated token sequence of shape (batch, final_seq_len).
        (Optional) torch.FloatTensor: Final hidden states of shape (batch, final_seq_len, hidden_dim)
            if `return_last_hidden_states=True`.
        """
# Default END token
if end_token_id is None:
end_token_id = getattr(self, "end_token_id", None)
# Switch to eval mode and move input to correct device
self.eval()
device = next(self.parameters()).device
protein_family_ids = protein_family_ids.to(device)
# create a special tokens mask if not provided
if special_tokens_mask is None:
# add a cls token at the beginning
protein_family_ids = torch.cat(
[torch.tensor([[-100]]).to(device), protein_family_ids],
dim=1,
)
special_tokens_mask = [self.cls_token_id] + [self.config.prot_emb_token_id] * (
protein_family_ids.shape[1] - 1
)
            # make it 2D (batch=1, seq_len) so it matches the generated IDs and can be extended below
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.long).unsqueeze(0).to(device)
# create a token type mask if not provided
if token_type_ids is None:
token_type_ids = torch.zeros_like(protein_family_ids)
# Prepare the initial sequence and define max new tokens
generated = protein_family_ids.clone()
batch_size, prompt_length = generated.shape
max_new_tokens = max_length - prompt_length
if max_new_tokens <= 0:
max_new_tokens = 0
# Disable gradient calculations for generation
with torch.no_grad():
for _step in range(max_new_tokens):
# Forward pass
logits = self.forward(
labels=generated,
special_tokens_mask=special_tokens_mask,
# assume it's all on one chromosome
token_type_ids=token_type_ids,
property_ids=property_ids,
return_dict=True,
).logits
# Focus on the last token's logits
next_token_logits = logits[:, -1, :] # (batch_size, vocab_size)
# Apply temperature
if temperature != 1.0:
next_token_logits = next_token_logits / temperature
# Sampling or greedy?
if do_sample:
# Top-k filter
next_token_logits = top_k_filtering(next_token_logits, top_k=top_k)
# Top-p filter
next_token_logits = top_p_filtering(next_token_logits, top_p=top_p)
probs = softmax(next_token_logits, dim=-1)
next_token_id = torch.multinomial(probs, num_samples=1)
else:
# Greedy decoding
next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
# Append predicted token
generated = torch.cat([generated, next_token_id], dim=1)
special_tokens_mask = torch.cat(
[special_tokens_mask, torch.tensor([[self.config.prot_emb_token_id]]).to(generated.device)], dim=1
)
last_token_type_id = token_type_ids[:, -1].unsqueeze(1)
token_type_ids = torch.cat([token_type_ids, last_token_type_id], dim=1)
# Check for END in all sequences
if end_token_id is not None:
if (next_token_id.squeeze(1) == end_token_id).all():
# If every sequence ended, break early
break
if not return_last_hidden_states:
return generated
# Optionally compute final hidden states
if return_last_hidden_states:
last_hidden_state = self.forward(
labels=generated,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
return_dict=True,
).last_hidden_state
return generated, last_hidden_state
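# Example usage of `generate` (a minimal sketch; the checkpoint path and the prompt family IDs below
# are placeholders, not values shipped with this module):
#
#     model = BacformerForCausalProteinFamilyModeling.from_pretrained("path/to/checkpoint")
#     prompt = torch.tensor([[17, 1024, 305]])  # (batch=1, seq_len=3) protein family IDs
#     generated = model.generate(prompt, max_length=50, do_sample=True, top_k=50, top_p=0.9)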
class BacformerForMaskedGMWithContrastiveLoss(BacformerPreTrainedModel):
"""Bacformer model with genomic modeling head on top"""
_tied_weights_keys = ["gm_head.decoder.weight"]
def __init__(self, config: BacformerConfig):
super().__init__(config)
self.config = config
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.gm_head = BacformerGMHead(config)
# Initialize weights
self.init_weights()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Union[BacformerModelOutput, None]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
return_attn_weights=return_attn_weights,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
        loss = None
if labels is not None:
# contrastive loss
contrastive_loss = compute_contrastive_loss(protein_embeddings, last_hidden_state, special_tokens_mask)
# to speed up the forward pass, let's only consider the masked tokens
last_hidden_state = last_hidden_state[labels != -100]
prediction_scores = self.gm_head(last_hidden_state)
labels = labels.to(prediction_scores.device)
# only considering the masked tokens
labels = labels[labels != -100]
masked_loss = cross_entropy(prediction_scores, labels)
loss = masked_loss + self.config.alpha_contrastive_loss * contrastive_loss
else:
prediction_scores = self.gm_head(last_hidden_state)
if not return_dict:
return (
loss,
prediction_scores,
) + outputs
return BacformerModelOutput(
loss=loss,
logits=prediction_scores,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
class BacformerForProteinClassification(BacformerPreTrainedModel):
"""Bacformer model with a classification head on top for protein classification tasks."""
def __init__(self, config: BacformerConfig, benchmark_esm: bool = False):
super().__init__(config)
self.config = config
self.benchmark_esm = benchmark_esm
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Optional[BacformerModelOutput]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
if self.benchmark_esm:
outputs = [protein_embeddings]
else:
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
return_attn_weights=return_attn_weights,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
last_hidden_state = self.dropout(last_hidden_state)
logits = self.classifier(last_hidden_state)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type == "regression":
loss = mse_loss(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss = cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
elif (
self.config.problem_type == "multi_label_classification"
or self.config.problem_type == "binary_classification"
):
# remove the -100 labels from loss computation
mask = torch.ones_like(labels.view(-1)) - (labels.view(-1) == -100.0).float()
loss = binary_cross_entropy_with_logits(
logits.view(-1), labels.view(-1).type_as(logits), reduction="none"
)
loss = (loss * mask).sum() / mask.sum()
if not return_dict:
return (
loss,
None,
logits,
) # + outputs
return BacformerModelOutput(
loss=loss,
logits=logits,
last_hidden_state=last_hidden_state,
attentions=outputs.attentions,
)
class BacformerForGenomeClassification(BacformerPreTrainedModel):
"""Bacformer model with a classification head on top for genome classification tasks."""
def __init__(self, config: BacformerConfig):
super().__init__(config)
self.config = config
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.classifier = BacformerGenomeClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Optional[BacformerModelOutput]:
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
return_attn_weights = (
return_attn_weights if return_attn_weights is not None else self.config.return_attn_weights
)
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
return_attn_weights=return_attn_weights,
return_dict=return_dict,
)
last_hidden_state = outputs[0]
logits = self.classifier(last_hidden_state, attention_mask)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type == "regression":
loss = mse_loss(logits.view(-1), labels.view(-1))
elif self.config.problem_type == "binary_classification":
loss = binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits))
elif self.config.problem_type == "single_label_classification":
loss = cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss = binary_cross_entropy_with_logits(logits, labels)
if not return_dict:
return (
loss,
None,
logits,
)
return BacformerModelOutput(
loss=loss,
logits=logits,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
class BacformerForProteinProteinInteraction(BacformerPreTrainedModel):
"""Bacformer model with a protein-protein interaction head on top."""
def __init__(self, config: BacformerConfig, benchmark_esm: bool = False):
super().__init__(config)
self.config = config
self.benchmark_esm = benchmark_esm
print("Benchmark ESM:", self.benchmark_esm)
self.return_attn_weights = config.return_attn_weights
self.bacformer = BacformerModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.dense = nn.Sequential(
nn.Linear(config.hidden_size, config.hidden_size),
nn.GELU(),
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
nn.Dropout(0.2),
)
self.ppi_head = BacformerProteinProteinInteractionHead(
in_features=config.hidden_size, prot_emb_idx=config.prot_emb_token_id
)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
protein_embeddings: torch.Tensor,
special_tokens_mask: torch.Tensor,
labels: torch.Tensor = None,
token_type_ids: torch.Tensor = None,
attention_mask: torch.Tensor = None,
return_attn_weights: bool = None,
return_dict: Union[bool, None] = None,
) -> Union[OrderedDict, None]: # TODO: change it from token classifier output
"""Forward method for the model."""
return_dict = return_dict if return_dict is not None else self.config.return_dict
if self.benchmark_esm:
last_hidden_state = protein_embeddings.squeeze(0)[1:-2, :]
else:
outputs = self.bacformer(
protein_embeddings=protein_embeddings,
special_tokens_mask=special_tokens_mask,
token_type_ids=token_type_ids,
attention_mask=attention_mask,
return_attn_weights=False,
return_dict=True,
)
last_hidden_state = outputs.last_hidden_state.squeeze(0)[1:-2, :]
assert labels.shape[0] == 1, "Batch size should be 1 for protein-protein interaction task"
last_hidden_state = self.dense(self.dropout(last_hidden_state))
last_hidden_state = torch.cat([last_hidden_state[labels[:, 0]], last_hidden_state[labels[:, 1]]], dim=0).mean(
dim=0
)
logits = self.ppi_head(last_hidden_state)
loss = binary_cross_entropy_with_logits(logits, labels[:, 2].type_as(logits).squeeze(0))
if not return_dict:
return (
loss,
logits,
)
return BacformerModelOutput(
loss=loss,
logits=logits,
last_hidden_state=outputs.last_hidden_state,
attentions=outputs.attentions,
)
# Copied from transformers.models.bert.modeling_bert.BertPooler
class BacformerPooler(nn.Module):
"""Pooler for Bacformer model."""
def __init__(self, config: BacformerConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor, padding_mask: torch.Tensor = None) -> torch.Tensor:
"""Forward method for the pooler."""
# We "pool" the model by taking the mean of non-padding tokens
padding_mask = padding_mask.to(hidden_states.device) if padding_mask is not None else None
if padding_mask is not None:
mean_hidden_states = torch.einsum("ijk,ij->ik", hidden_states, padding_mask) / padding_mask.sum(
1
).unsqueeze(1)
else:
mean_hidden_states = hidden_states.mean(dim=1)
pooled_output = self.dense(mean_hidden_states)
pooled_output = self.activation(pooled_output)
return pooled_output
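# Note on the masked mean above (an equivalence sketch, not executed by the model): the einsum is the
# mask-weighted sum over the sequence dimension, assuming a float padding mask of shape (batch, seq_len)
# with 1.0 for real tokens and 0.0 for padding; dividing by padding_mask.sum(1) then gives the mean over
# non-padding positions.
#
#     masked_sum = torch.einsum("ijk,ij->ik", hidden_states, padding_mask)
#     same_sum = (hidden_states * padding_mask.unsqueeze(-1)).sum(dim=1)  # identical result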
class BacformerGMHead(nn.Module):
"""Bacformer Head for genomic modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # add 1 to config.protein_clusters_vocab_size to account for the end token
self.decoder = nn.Linear(config.hidden_size, config.protein_clusters_vocab_size + 1, bias=False)
self.bias = nn.Parameter(torch.zeros(config.protein_clusters_vocab_size + 1))
def forward(self, features, **kwargs):
"""Forward method for the head."""
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
# project back to nr of labels with bias
x = self.decoder(x) + self.bias
return x
class BacformerGenomeClassificationHead(nn.Module):
"""Head for genome-level classification tasks."""
def __init__(self, config: BacformerConfig):
super().__init__()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features: torch.Tensor, padding_mask: torch.Tensor, **kwargs):
"""Forward method for the head."""
if padding_mask is not None:
x = torch.einsum("ijk,ij->ik", features, padding_mask) / padding_mask.sum(1).unsqueeze(1)
else:
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.out_proj(x)
return x
class BacformerProteinProteinInteractionHead(nn.Module):
"""Head for protein-protein interaction task at a genome level."""
def __init__(self, in_features: int, prot_emb_idx: int = 4, bias: bool = True):
super().__init__()
self.in_features = in_features
self.prot_emb_idx = prot_emb_idx
self.dropout = nn.Dropout(0.2)
self.linear = nn.Linear(in_features, 1, bias=bias)
def forward(
self, hidden_states: torch.Tensor
) -> torch.Tensor: # special_tokens_mask: torch.Tensor, attentions: torch.Tensor):
"""Forward method for the head."""
return self.linear(self.dropout(hidden_states)).squeeze(-1)