"""
Defines a wrapper class of RobertaPreTrainedModel model to do regression on text data.
Based on: https://www.kaggle.com/code/sumantindurkhya/bert-for-regression
"""

from typing import Optional, Tuple, Union
from tqdm import tqdm, trange

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import BertModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel

class RobertaRegressor(RobertaPreTrainedModel):

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        super().__init__(config)

        self.num_outputs = num_outputs

        self.roberta = RobertaModel(config)
        if freeze_bert:
            # freeze the roberta parameters
            for param in self.roberta.parameters():
                param.requires_grad = False
        self.classifier = nn.Linear(config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()
        self.regressor = nn.Linear(128, num_outputs)
        

    def forward(self, input_ids, attention_mask):
        # forward pass of the model
        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = base_out.pooler_output
        out = self.classifier(logits)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.tanh(out)
        out = self.dropout(out)
        out = self.regressor(out)
        return out
    
    def predict(self, text:str, tokenizer, device, numpy=True) -> Tuple[float, float, float, float]:
        input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        output = self(input_ids, attention_mask).squeeze()
        # free up memory
        del input_ids, attention_mask
        out = output.detach()
        if numpy:
            return out.cpu().numpy()
        return out
    

class RobertaSeqMultiRegressor(RobertaPreTrainedModel):
    """
    A wrapper class of RobertaPreTrainedModel model to do multi-output regression on text data.
    This models the task of predicting multiple outputs from a single text input.
    The problem is formulated in a sequential manner, where the model predicts the next output
    conditioned on the previous outputs.

    This approach is ideal for modeling problems where the outputs are correlated
    such as probability distributions, where the sum of the outputs must be 1.
    Or, for example, in the case of predicting the next word in a sentence, where the
    model must predict the next word conditioned on the previous words.

    The model is similar to the one described in the RobertaRegressor class, with the
    exception that the head of the model is a sequential model, where the output of the
    previous layer is fed as input to the next layer similar to how a RNN works.
    """

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        super().__init__(config)

        self.num_outputs = num_outputs

        self.roberta = RobertaModel(config)
        if freeze_bert:
            # freeze the roberta parameters
            for param in self.roberta.parameters():
                param.requires_grad = False
        # head of the model is a model that takes the output of the previous layer as input
        # and outputs a single value until the number of outputs is reached
        for i in range(num_outputs):
            setattr(self, f"regressor_{i}", nn.Linear(config.hidden_size, 128))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        # forward pass of the model
        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = base_out.pooler_output
        outputs = []
        for i in range(self.num_outputs):
            out = getattr(self, f"regressor_{i}")(logits)
            out = self.dropout(out)
            out = self.relu(out)
            out = self.tanh(out)
            outputs.append(out)
        return outputs


def sum_diff_loss(output, target):
    return torch.sum(torch.abs(output - target))

def evaluate(model, criterion, dataloader, device, sum_diff_penalty=False):
    model.eval()
    mean_acc, mean_loss, count = 0, 0, 0

    with torch.no_grad():
        for input_ids, attention_mask, target in (dataloader):
            
            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
            output = model(input_ids, attention_mask)
            
            mean_loss += criterion(output.squeeze(), target.type_as(output)).item()
            count += 1
            
    return mean_loss/count    

# def predict(model, dataloader, device):
#     predicted_label = []
#     actual_label = []
#     with torch.no_grad():
#         for input_ids, attention_mask, target in (dataloader):
            
#             input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
#             output = model(input_ids, attention_mask)
                        
#             predicted_label += output
#             actual_label += target
            
#     return predicted_label

def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    best_acc = 0
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        train_loss = 0
        for i, (input_ids, attention_mask, target) in enumerate(iterable=train_loader):
            optimizer.zero_grad()  
            
            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
            
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            # out = model.classifier(output)
            loss = criterion(output.squeeze(), target.type_as(output))
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        print(f"Training loss is {train_loss/len(train_loader)}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))

def multi_reg_loss(loss='mse', sum_diff_penalty:float=0.0):
    """
    A custom loss function that penalizes the sum of differences
    between the predicted and actual values for multi-output regression.
    This is done to guide the model to predict outputs where 
    sum(y_hat1, y_hat2, ...) = sum(y1, y2, ...)

    e.g: in task d, we have that sum(label1, label2, label3, label4) = 1
    since its a probability distribution. 

    Parameters
    ----------
    loss : str, optional
        The loss function to be used, by default 'mse'
        Available options: 'mse' and 'cross_entropy'
        for mean squared error and cross entropy loss respectively
    sum_diff_penalty : float, optional
        The penalty to be applied to the sum of differences between the predicted and actual values, by default 0.0 (no penalty)
    """
    if loss == 'mse':
        loss_func = F.mse_loss
    elif loss == 'cross_entropy':
        loss_func = F.cross_entropy
    else:
        raise ValueError("Invalid loss function. Available options: 'mse' and 'cross_entropy'")
    def reg_loss(input, target):
        # first compute the normal MSE loss
        mse = loss_func(input, target)
        # then penalize the sum of differences between the predicted and actual values
        sum_diff = torch.square(torch.sum(input) - torch.sum(target))
        return mse + sum_diff_penalty*sum_diff
    return reg_loss