import torch
import torch.nn as nn
import torch.distributions as dists
from torch.nn.utils.rnn import pad_sequence
from transformers import PreTrainedModel, AutoModelForMaskedLM, AutoConfig

try:
    from .configuration_dlm import DiscreteDiffusionConfig
except ImportError:
    from configuration_dlm import DiscreteDiffusionConfig

from collections import namedtuple
import math
import numpy as np
from typing import List, Optional, Tuple, Union

decoder_out_t = namedtuple(
    "decoder_out_t",
    ["output_tokens", "output_scores", "output_masks", "non_fixed_sym_masks",
     "attn", "step", "max_step", "history"],
)


def topk_masking(scores, cutoff_len, stochastic=False, temp=1.0):
    """
    scores: [b, n]
    cutoff_len: [b, 1]
    stochastic: bool, whether to add noise to select top_k or not
    returns:
        mask: [b, n], with 1 if the token is in top-k lowest scores, 0 otherwise
    """
    if stochastic:
        gumbel_noise = -torch.log(-torch.log(torch.rand_like(scores) + 1e-8) + 1e-8)
        _scores = scores + temp * gumbel_noise
    else:
        _scores = scores
    sorted_index = _scores.sort(-1)[0]
    cutoff = sorted_index.gather(dim=-1, index=cutoff_len)  # + 1e-10
    # cutoff_len = k -> select k + 1 tokens
    masking = _scores < cutoff
    return masking


class DiscreteDiffusionModel(PreTrainedModel):
    config_class = DiscreteDiffusionConfig
    _keys_to_ignore_on_load_missing = [
        "fake_layer", "length_trm", "length_predictor", "model.lm_head.decoder.weight"
    ]

    def __init__(self, config: DiscreteDiffusionConfig):
        super().__init__(config)
        self.config = config
        self.args = config  # Alias for compatibility with existing code

        # Initialize backbone
        if config.backbone_config:
            # We assume backbone_config is a dict
            backbone_config_obj = AutoConfig.for_model(**config.backbone_config)
            self.model = AutoModelForMaskedLM.from_config(backbone_config_obj)
        else:
            # Fallback or error
            raise ValueError("backbone_config must be provided in config")

        if config.tie_word_embeddings:
            self.model.lm_head.decoder.weight = self.model.roberta.embeddings.word_embeddings.weight

        self.mask_id = config.mask_token_id
        self.bos_id = config.bos_token_id
        self.eos_id = config.eos_token_id
        self.pad_id = config.pad_token_id

        # LoRA
        if config.lora:
            self.add_fake_layer()

        # Length predictor (optional, as in the original code)
        self.length_trm = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.config.hidden_size,
                nhead=self.config.num_attention_heads,
                dim_feedforward=self.config.intermediate_size,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.length_predictor = nn.Sequential(
            nn.Linear(self.config.hidden_size, self.config.intermediate_size),
            nn.Tanh(),
            nn.Linear(self.config.intermediate_size, self.config.max_position_embeddings),
        )

    def add_fake_layer(self):
        self.fake_layer = nn.Parameter(torch.zeros((self.config.hidden_size,)))

    def gradient_checkpointing_enable(self):
        self.model.gradient_checkpointing_enable()

    def _tie_weights(self):
        """Tie the weights between the input embeddings and the output embeddings."""
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(
                self.model.lm_head.decoder, self.model.roberta.embeddings.word_embeddings
            )

    def _init_weights(self, module):
        """Initialize the weights - called after loading a checkpoint."""
        # Call parent init_weights
        super()._init_weights(module)
        # Ensure weights are tied after initialization
        self._tie_weights()

    @property
    def _tied_weights_keys(self):
        """Return the keys of tied weights."""
        if self.config.tie_word_embeddings:
            return ["model.lm_head.decoder.weight"]
        return []

    def q_sample_coupled(self, x_0, t1, t2, maskable_mask):
        # ... copy from DiscreteDiffusionBase ...
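        # Descriptive note: this draws two coupled absorbing-state corruptions of the
        # same x_0. After reordering so that t1 >= t2, the t2 mask is sampled as a
        # subset of the t1 mask, so every position masked at the smaller timestep is
        # also masked at the larger one; rows with t1 == t2 fall back to an independent
        # draw. Both corrupted sequences are concatenated along the batch dimension so
        # a single forward pass can score both timesteps.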
        assert self.config.diffusion_type == "absorbing", "we only support absorbing diffusion temporarily"
        t1_eq_t2_mask = (t1 == t2)
        t1, t2 = torch.maximum(t1, t2).float(), torch.minimum(t1, t2).float()

        u = torch.rand_like(x_0, dtype=torch.float)
        t1_mask = (u < (t1 / self.config.num_diffusion_timesteps)[:, None]) & maskable_mask
        x_t1 = x_0.masked_fill(t1_mask, self.mask_id)

        u = torch.rand_like(x_0, dtype=torch.float)
        t2_mask = t1_mask & (u > ((t1 - t2) / t1)[:, None])
        u = torch.rand_like(x_0[t1_eq_t2_mask], dtype=torch.float)
        t2_mask[t1_eq_t2_mask] = (
            u < (t1[t1_eq_t2_mask] / self.config.num_diffusion_timesteps)[:, None]
        ) & (maskable_mask[t1_eq_t2_mask])
        x_t2 = x_0.masked_fill(t2_mask, self.mask_id)

        return {
            "x_t": torch.cat([x_t1, x_t2], dim=0),
            "t": torch.cat([t1, t2]),
            "mask_mask": torch.cat([t1_mask, t2_mask], dim=0),
        }

    def initialize_decode_samples(self, tokens, partial_masks, prefix_masks,
                                  oracle_length=False, length_beam=1, mbr=1):
        # ... copy from DiscreteDiffusionBase ...
        if tokens is None:
            raise NotImplementedError
        else:
            if not oracle_length:
                inputs_tokens = tokens.masked_fill(~prefix_masks, self.pad_id)
                src_length = inputs_tokens.ne(self.pad_id).sum(dim=-1)
                inputs_tokens = inputs_tokens[:, :src_length.max()]
                length_logits = self.forward_length(inputs_tokens)

                # Cap the output length: at most 3x the source length and at most 100 tokens.
                max_allowed_length = torch.min(
                    torch.tensor([100]).to(src_length.device),
                    (src_length * 3)[:, None]
                )
                length = torch.min(
                    torch.min(
                        length_logits.topk(length_beam, dim=-1).indices + 1,
                        max_allowed_length
                    ),
                    self.config.max_position_embeddings - 2 - src_length[:, None] - 1
                )

                output_tokens = []
                new_partial_masks = []
                for i, token in enumerate(inputs_tokens):
                    for b in range(length_beam):
                        for m in range(mbr):
                            # Create the output token sequence
                            seq = torch.cat([
                                token[:src_length[i]],
                                torch.tensor([self.mask_id] * length[i][b] + [self.eos_id]).to(token)
                            ])
                            output_tokens.append(seq)

                            # Create the corresponding partial mask: True for fixed (source)
                            # positions, False for generated (mask/eos) positions.
                            # We assume partial_masks[i] has the same length as token[i]
                            # (or at least src_length[i]).
                            p_mask = torch.cat([
                                partial_masks[i][:src_length[i]],
                                torch.tensor([False] * (length[i][b] + 1)).to(partial_masks)
                            ])
                            new_partial_masks.append(p_mask)

                output_tokens = pad_sequence(output_tokens, batch_first=True, padding_value=self.pad_id)
                # Pad the partial masks with True so padded positions are treated as fixed
                # (non-generated). Padding tokens are filtered out in finalized_hypos anyway
                # via tokens.ne(self.pad_id), but padding with True keeps them out of the
                # maskable set.
                partial_masks = pad_sequence(new_partial_masks, batch_first=True, padding_value=True)

                output_mask = output_tokens.eq(self.mask_id)
                # non_fixed_sym_masks covers all positions that can be modified
                # (not source, not pad, not special tokens).
                # This is critical for _reparam_decoding to work correctly!
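                # Illustrative layout (hypothetical ids) for src = [s1, s2] and predicted length 3:
                #   output_tokens:       [s1,    s2,    MASK,  MASK,  MASK,  EOS,   PAD  ]
                #   partial_masks:       [True,  True,  False, False, False, False, True ]
                #   non_fixed_sym_masks: [False, False, True,  True,  True,  True,  False]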
                non_fixed_sym_masks = (
                    output_tokens.ne(self.pad_id)
                    & output_tokens.ne(self.bos_id)
                    & ~partial_masks  # Not source tokens
                )
            else:
                output_tokens = torch.stack([token for token in tokens for m in range(mbr)])
                partial_masks = torch.stack([mask for mask in partial_masks for m in range(mbr)])
                prefix_masks = torch.stack([mask for mask in prefix_masks for m in range(mbr)])
                output_mask = (
                    output_tokens.ne(self.pad_id)
                    & output_tokens.ne(self.bos_id)
                    & output_tokens.ne(self.eos_id)
                    & ~prefix_masks
                )
                output_tokens = output_tokens.masked_fill(output_mask, self.mask_id)
                non_fixed_sym_masks = output_mask.clone()

        output_scores = torch.zeros_like(output_tokens, dtype=torch.float)
        return partial_masks, decoder_out_t(
            output_tokens=output_tokens,
            output_scores=output_scores,
            output_masks=output_mask,
            non_fixed_sym_masks=non_fixed_sym_masks,
            attn=None,
            step=0,
            max_step=math.inf,
            history=None,
        )

    def forward_length(self, input_ids):
        attention_mask = input_ids.ne(self.pad_id).int()
        with torch.no_grad():
            _feature = self.model.roberta(input_ids, attention_mask=attention_mask)[0]
        feature = self.length_trm(_feature, src_key_padding_mask=(1 - attention_mask).bool())
        length = attention_mask.sum(dim=-1)
        pooled_feature = feature.masked_fill((attention_mask == 0)[:, :, None], 0).float().sum(1) / length[:, None]
        length_logits = self.length_predictor(pooled_feature.to(feature))
        return length_logits

    def forward(self, prev_output_tokens, partial_mask, attention_mask=None, loss_mask=None, cache=None):
        input_ids = prev_output_tokens
        if attention_mask is None:
            attention_mask = prev_output_tokens.ne(self.pad_id).int()

        embeddings = self.model.roberta.embeddings.word_embeddings(input_ids)
        if hasattr(self, "fake_layer") and self.training:
            self.fake_layer.requires_grad = True
            embeddings = embeddings + self.fake_layer * 0

        if self.config.attention_strategy == "prefix_lm":
            # ... simplified for now, assuming full attention or handling it ...
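            # Descriptive note on the mask built below: ext_mask is a [batch, seq, seq]
            # attention matrix. Rows whose query position lies in partial_mask (the
            # prefix/source) are overwritten with the outer product of the prefix
            # indicator, so prefix tokens attend only to other prefix tokens, while
            # generated positions keep the ordinary padding-based attention row and can
            # attend to the full sequence.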
            # Copying logic from original
            ext_partial_mask = partial_mask.float()
            ext_partial_mask = torch.bmm(ext_partial_mask[:, :, None], ext_partial_mask[:, None, :]).int()
            ext_mask = attention_mask[:, None, :].repeat(1, attention_mask.size(-1), 1)
            ext_mask[partial_mask] = ext_partial_mask[partial_mask]
            outputs = self.model.roberta(inputs_embeds=embeddings, attention_mask=ext_mask)[0]
        else:
            outputs = self.model.roberta(inputs_embeds=embeddings, attention_mask=attention_mask)[0]

        if torch.isnan(outputs).any():
            outputs.masked_fill_(outputs.isnan(), 0)
        outputs = outputs[loss_mask] if loss_mask is not None else outputs
        return self.model.lm_head(outputs)

    def _reparam_decoding(
        self, output_tokens, output_scores, cur_tokens, cur_scores,
        decoding_strategy, xt_neq_x0, non_special_sym_mask, t, max_step, noise
    ):
        _, condition, topk_mode, schedule = decoding_strategy.split("-")

        # rate: fraction of maskable positions to keep noised at step t (decays over steps).
        if schedule == "linear":
            rate = 1 - t / max_step
        elif schedule == "cosine":
            rate = np.cos(t / max_step * np.pi * 0.5)
        else:
            raise NotImplementedError

        cutoff_len = (
            non_special_sym_mask.sum(1, keepdim=True).type_as(output_scores) * rate
        ).long()
        # Keep fixed/special symbols out of the bottom-k by assigning them a large score.
        _scores_for_topk = cur_scores.masked_fill(~non_special_sym_mask, 1000.0)

        if topk_mode.startswith("stochastic"):
            noise_scale = float(topk_mode.replace("stochastic", ""))
            lowest_k_mask = topk_masking(_scores_for_topk, cutoff_len, stochastic=True, temp=noise_scale * rate)
        elif topk_mode == "deterministic":
            lowest_k_mask = topk_masking(_scores_for_topk, cutoff_len, stochastic=False)
        else:
            raise NotImplementedError

        if condition == "cond":
            not_v1_t = (cur_tokens == output_tokens) & (cur_scores < output_scores) & lowest_k_mask
        elif condition == "uncond":
            not_v1_t = lowest_k_mask
        else:
            raise NotImplementedError
        not_v2_t = lowest_k_mask

        masked_to_noise = (~xt_neq_x0 & not_v1_t) | (xt_neq_x0 & not_v2_t)
        if isinstance(noise, torch.Tensor):
            output_tokens.masked_scatter_(masked_to_noise, noise[masked_to_noise])
        elif isinstance(noise, (int, float)):
            output_tokens.masked_fill_(masked_to_noise, noise)
        else:
            raise NotImplementedError("noise should be either a tensor or a scalar")
        output_scores.masked_fill_(masked_to_noise, -math.inf)

        masked_to_x0 = xt_neq_x0 & ~not_v2_t
        output_tokens.masked_scatter_(masked_to_x0, cur_tokens[masked_to_x0])
        output_scores.masked_scatter_(masked_to_x0, cur_scores[masked_to_x0])

        new_xt_neq_x0 = (xt_neq_x0 | not_v1_t) & not_v2_t
        return new_xt_neq_x0

    def denoise_step(self, decoder_out, partial_masks, temperature=1.0,
                     strategy="reparam-uncond-deterministic-cosine"):
        output_tokens = decoder_out.output_tokens
        output_scores = decoder_out.output_scores
        prev_step, cur_step = decoder_out.step, decoder_out.step + 1
        max_step = decoder_out.max_step

        logits = self.forward(output_tokens, partial_masks)
        logits[..., self.mask_id] = -math.inf
        scores = torch.log_softmax(logits, dim=-1)

        if strategy == "cmlm":
            # get the mask; special tokens (pad/bos/eos) are ignored in this case
            # since they are not equal to unk (the mask token).
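            # Descriptive note on the schedule below: at step t (0-indexed) out of
            # max_step, each still-masked position is revealed with probability
            # 1 / (max_step - t), so roughly an equal share of the remaining masks is
            # committed at every iteration and the probability reaches 1 at the final step.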
            output_masks = output_tokens.eq(self.mask_id)
            unmask_prob = 1 / (max_step - prev_step)
            # where to unmask
            changes = torch.rand(output_tokens.shape, device=output_tokens.device) < unmask_prob
            # don't unmask somewhere already unmasked
            changes = torch.bitwise_and(changes, output_masks)

            if getattr(self.config, "argmax_decoding", False):
                output_scores, new_tokens = scores.max(-1)
            else:
                new_tokens = dists.Categorical(logits=scores / temperature).sample()
                output_scores = torch.gather(scores, -1, new_tokens.unsqueeze(-1)).squeeze(-1)
            output_tokens[changes] = new_tokens[changes]

        elif strategy == "ar":
            output_masks = output_tokens.eq(self.mask_id)
            # Unmask the left-most still-masked position in each sequence (left-to-right decoding).
            unmask_indices = (
                output_tokens.ne(self.mask_id) & output_tokens.ne(self.eos_id) & output_tokens.ne(self.pad_id)
            ).sum(dim=-1)
            indices = torch.arange(output_tokens.size(-1)).expand(output_tokens.shape).to(output_masks.device)

            if getattr(self.config, "argmax_decoding", False):
                output_scores, new_tokens = scores.max(-1)
            else:
                new_tokens = dists.Categorical(logits=scores / temperature).sample()
                output_scores = torch.gather(scores, -1, new_tokens.unsqueeze(-1)).squeeze(-1)
            output_tokens[unmask_indices[:, None] == indices] = new_tokens[unmask_indices[:, None] == indices]

        else:
            if getattr(self.config, "argmax_decoding", False):
                cur_scores, cur_tokens = scores.max(-1)
            else:
                cur_tokens = dists.Categorical(logits=scores / temperature).sample()
                cur_scores = torch.gather(scores, -1, cur_tokens.unsqueeze(-1)).squeeze(-1)
            cur_scores = cur_scores.to(output_scores)

            output_masks = self._reparam_decoding(
                output_tokens=output_tokens,
                output_scores=output_scores,
                cur_tokens=cur_tokens,
                cur_scores=cur_scores,
                decoding_strategy=strategy,
                xt_neq_x0=decoder_out.output_masks,
                non_special_sym_mask=decoder_out.non_fixed_sym_masks,
                t=cur_step,
                max_step=max_step,
                noise=self.mask_id,
            )

        history = (
            decoder_out.history + [output_tokens.clone()]
            if decoder_out.history is not None
            else None
        )

        return decoder_out._replace(
            step=cur_step,
            output_tokens=output_tokens,
            output_scores=output_scores,
            output_masks=output_masks,
            history=history,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids,
        attention_mask=None,
        max_iterations=10,
        strategy="reparam-uncond-deterministic-cosine",
        temperature=1.0,
        return_history=False,
        max_length=128,  # Fixed generation length hyperparameter (like LLaDA)
        **kwargs
    ):
        # Prepare inputs
        src_tokens = input_ids
        if attention_mask is None:
            partial_masks = torch.ones_like(src_tokens).bool()
        else:
            partial_masks = attention_mask.bool()
        prefix_masks = partial_masks

        # Initialize the canvas with a fixed length (LLaDA approach):
        # instead of predicting the length, use max_length as a hyperparameter.
        batch_size = src_tokens.size(0)
        src_length = src_tokens.ne(self.pad_id).sum(dim=-1)

        # Create a fully masked response of fixed length
        output_tokens = []
        new_partial_masks = []
        for i in range(batch_size):
            # Format: [source tokens] [MASK] * max_length [EOS]
            # Remove EOS from source if it exists
            src_len = src_length[i].item()
            src_seq = src_tokens[i, :src_len]
            # Remove trailing EOS from source
            if src_seq[-1] == self.eos_id:
                src_seq = src_seq[:-1]
                src_len -= 1

            seq = torch.cat([
                src_seq,
                torch.full((max_length,), self.mask_id, dtype=src_tokens.dtype, device=src_tokens.device),
                torch.tensor([self.eos_id], dtype=src_tokens.dtype, device=src_tokens.device),
            ])
            output_tokens.append(seq)

            # Mask: True for source (fixed), False for the generated part
            mask = torch.cat([
                torch.ones(src_len, dtype=torch.bool, device=src_tokens.device),
                torch.zeros(max_length + 1, dtype=torch.bool, device=src_tokens.device),  # +1 for eos
            ])
            new_partial_masks.append(mask)

        output_tokens = pad_sequence(output_tokens, batch_first=True, padding_value=self.pad_id)
        partial_masks = pad_sequence(new_partial_masks, batch_first=True, padding_value=True)

        # Create masks for decoding
        output_mask = output_tokens.eq(self.mask_id)
        non_fixed_sym_masks = (
            output_tokens.ne(self.pad_id)
            & output_tokens.ne(self.bos_id)
            & ~partial_masks  # Not source tokens
        )
        output_scores = torch.zeros_like(output_tokens, dtype=torch.float)

        prev_decoder_out = decoder_out_t(
            output_tokens=output_tokens,
            output_scores=output_scores,
            output_masks=output_mask,
            non_fixed_sym_masks=non_fixed_sym_masks,
            attn=None,
            step=0,
            max_step=max_iterations,
            history=None,
        )
        if return_history:
            prev_decoder_out = prev_decoder_out._replace(history=[])

        for step in range(max_iterations):
            prev_decoder_out = self.denoise_step(
                prev_decoder_out, partial_masks, temperature=temperature, strategy=strategy
            )

        # Finalize: discard tokens after EOS (LLaDA approach)
        def finalized_hypos(tokens, scores, partial_mask, history=None):
            # First, find the EOS position and cut there
            eos_positions = (tokens == self.eos_id).nonzero(as_tuple=True)[0]
            if len(eos_positions) > 0:
                first_eos = eos_positions[0].item()
                # Cut everything after EOS (and exclude the EOS itself)
                tokens = tokens[:first_eos]
                if scores is not None:
                    scores = scores[:first_eos]
                partial_mask = partial_mask[:first_eos]

            # Then apply the cutoff logic: keep only generated tokens (not source, not special)
            cutoff = (
                tokens.ne(self.pad_id)
                & tokens.ne(self.bos_id)
                & tokens.ne(self.eos_id)
                & (~partial_mask)  # Not source tokens (partial_mask=False for generated)
            )
            tokens = tokens[cutoff]
            if scores is None:
                score = None
            else:
                scores = scores[cutoff]
                score = scores.mean().item() if len(scores) > 0 else 0.0

            ret_dict = {
                "tokens": tokens,
                "positional_scores": scores,
                "score": score,
                "alignment": None,
            }
            if history is not None:
                ret_dict["history"] = [
                    finalized_hypos(history_tokens, None, partial_mask, history=None)
                    for history_tokens in history
                ]
            return ret_dict

        def score_select(hyps):
            index = np.argmax([hyp["score"] for hyp in hyps])
            return hyps[index]

        output_tokens, output_scores = prev_decoder_out.output_tokens, prev_decoder_out.output_scores

        # Handle history if needed
        if return_history and prev_decoder_out.history is not None:
            full_history = prev_decoder_out.history
            histories = [
                [full_history[j][i] for j in range(max_iterations)]
                for i in range(output_tokens.size(0))
            ]
            hyps = []
            for tokens, scores, partial_mask, history in zip(output_tokens, output_scores, partial_masks, histories):
                hyps.append(finalized_hypos(tokens, scores, partial_mask, history))
        else:
            hyps = [
                finalized_hypos(tokens, scores, partial_mask, None)
                for tokens, scores, partial_mask in zip(output_tokens, output_scores, partial_masks)
            ]

        # Keep the best-scoring hypothesis per input when mbr/length_beam produced duplicates.
        repetition = kwargs.get("mbr", 1) * kwargs.get("length_beam", 1)
        if repetition > 1:
            hyps = [score_select(hyps[i:i + repetition]) for i in range(0, len(hyps), repetition)]

        finalized = pad_sequence([h["tokens"] for h in hyps], batch_first=True, padding_value=self.pad_id)
        # If the user expects just tokens, we return finalized tokens.
        # The original model.generate returned just tokens.
        return finalized
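

# Minimal usage sketch, assuming a trained checkpoint directory and a tokenizer whose
# mask/bos/eos/pad ids match DiscreteDiffusionConfig; the path below is a placeholder.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    checkpoint = "path/to/diffusion-lm-checkpoint"  # placeholder path
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = DiscreteDiffusionModel.from_pretrained(checkpoint)
    model.eval()

    batch = tokenizer(["an example source sentence"], return_tensors="pt")
    generated = model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_iterations=10,   # number of denoising steps
        max_length=64,       # fixed generation length (LLaDA-style canvas)
        strategy="reparam-uncond-deterministic-cosine",
        temperature=1.0,
    )
    print(tokenizer.batch_decode(generated, skip_special_tokens=True))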