#%%
import torch
import numpy as np
from torch.autograd import Variable
# from sklearn import metrics

import datetime
from typing import Dict, Tuple, List
import logging
import os
import utils
import pickle as pkl
import json 
import torch.backends.cudnn as cudnn

from tqdm import tqdm

import sys
sys.path.append("..")
import Parameters

parser = utils.get_argument_parser()
parser = utils.add_attack_parameters(parser)
parser.add_argument('--mode', type=str, default='sentence', help='sentence, biogpt or finetune')
parser.add_argument('--ratio', type = str, default='', help='ratio of the number of changed words')
args = parser.parse_args()
args = utils.set_hyperparams(args)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
utils.seed_all(args.seed)
np.set_printoptions(precision=5)
cudnn.benchmark = False

data_path = os.path.join('processed_data', args.data)
target_path = os.path.join(data_path, 'DD_target_{0}_{1}_{2}_{3}_{4}_{5}.txt'.format(args.model, args.data, args.target_split, args.target_size, 'exists:'+str(args.target_existed), args.attack_goal))
attack_path = os.path.join('attack_results', args.data, 'cos_{0}_{1}_{2}_{3}_{4}_{5}_{6}_{7}.txt'.format(args.model, 
                                                        args.target_split, 
                                                        args.target_size, 
                                                        'exists:'+str(args.target_existed),
                                                        args.neighbor_num,
                                                        args.candidate_mode,
                                                        args.attack_goal,
                                                        str(args.reasonable_rate)))
# target_data = utils.load_data(target_path)
attack_data = utils.load_data(attack_path, drop=False)
# assert target_data.shape == attack_data.shape
#%%

with open(os.path.join(data_path, 'entities_reverse_dict.json')) as fl:
    id_to_meshid = json.load(fl)
with open(Parameters.GNBRfile+'entity_raw_name', 'rb') as fl:
    entity_raw_name = pkl.load(fl)
with open(Parameters.GNBRfile+'retieve_sentence_through_edgetype', 'rb') as fl:
    retieve_sentence_through_edgetype = pkl.load(fl)
with open(Parameters.GNBRfile+'raw_text_of_each_sentence', 'rb') as fl:
    raw_text_sen = pkl.load(fl)

if not os.path.exists('generate_abstract/valid_entity.json'):
    valid_entity = set()
    for paper_id, paper in raw_text_sen.items():
        for sen_id, sen in paper.items():
            text = sen['text'].split(' ')
            for a in text:
                if '_' in a:
                    valid_entity.add(a.replace('_', ' '))
    with open('valid_entity.json', 'w') as fl:
        json.dump(list(valid_entity), fl, indent=4)
    print('Valid entity saved!!')

if args.mode == 'sentence':
    import torch 
    from torch.nn.modules.loss import CrossEntropyLoss
    from transformers import AutoTokenizer
    from transformers import BioGptForCausalLM
    criterion = CrossEntropyLoss(reduction="none")

    print('Generating GPT input ...')

    tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')
    tokenizer.pad_token = tokenizer.eos_token
    model = BioGptForCausalLM.from_pretrained('microsoft/biogpt', pad_token_id=tokenizer.eos_token_id)
    model.to(device)
    model.eval()
    GPT_batch_size = 32
    single_sentence = {}
    test_text = []
    test_dp = []
    test_parse = []
    for i, (s, r, o) in enumerate(tqdm(attack_data)):

        if int(s) != -1:

            dependency_sen_dict = retieve_sentence_through_edgetype[int(r)]['manual']
            candidate_sen = []
            Dp_path = []
            L = len(dependency_sen_dict.keys())
            bound = 500 // L
            if bound == 0:
                bound = 1
            for dp_path, sen_list in dependency_sen_dict.items():
                if len(sen_list) > bound:
                    index = np.random.choice(np.array(range(len(sen_list))), bound, replace=False)
                    sen_list = [sen_list[aa] for aa in index]
                candidate_sen += sen_list
                Dp_path += [dp_path] * len(sen_list)

            text_s = entity_raw_name[id_to_meshid[s]]
            text_o = entity_raw_name[id_to_meshid[o]]
            candidate_text_sen = []
            candidate_ori_sen = []
            candidate_parse_sen = []

            for paper_id, sen_id in candidate_sen:
                sen = raw_text_sen[paper_id][sen_id]
                text = sen['text']
                candidate_ori_sen.append(text)
                ss = sen['start_formatted']
                oo = sen['end_formatted']
                text = text.replace('-LRB-', '(')
                text = text.replace('-RRB-', ')')
                text = text.replace('-LSB-', '[')
                text = text.replace('-RSB-', ']')
                text = text.replace('-LCB-', '{')
                text = text.replace('-RCB-', '}')
                parse_text = text
                parse_text = parse_text.replace(ss, text_s.replace(' ', '_'))
                parse_text = parse_text.replace(oo, text_o.replace(' ', '_'))
                text = text.replace(ss, text_s)
                text = text.replace(oo, text_o)
                text = text.replace('_', ' ')
                candidate_text_sen.append(text)
                candidate_parse_sen.append(parse_text)
            tokens = tokenizer( candidate_text_sen,
                                truncation = True,
                                padding = True,
                                max_length = 300,
                                return_tensors="pt")
            target_ids = tokens['input_ids'].to(device)
            attention_mask = tokens['attention_mask'].to(device)

            L = len(candidate_text_sen)
            assert L > 0
            ret_log_L = []
            for l in range(0, L, GPT_batch_size):
                R = min(L, l + GPT_batch_size)
                target = target_ids[l:R, :]
                attention = attention_mask[l:R, :]
                outputs = model(input_ids = target,
                                attention_mask = attention,
                                labels = target)
                logits = outputs.logits
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = target[..., 1:].contiguous()
                Loss = criterion(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))
                Loss = Loss.view(-1, shift_logits.shape[1])
                attention = attention[..., 1:].contiguous()
                log_Loss = (torch.mean(Loss * attention.float(), dim = 1) / torch.mean(attention.float(), dim = 1))
                ret_log_L.append(log_Loss.detach())
            

            ret_log_L = list(torch.cat(ret_log_L, -1).cpu().numpy())
            sen_score = list(zip(candidate_text_sen, ret_log_L, candidate_ori_sen, Dp_path, candidate_parse_sen))
            sen_score.sort(key = lambda x: x[1])
            test_text.append(sen_score[0][2])
            test_dp.append(sen_score[0][3])
            test_parse.append(sen_score[0][4])
            single_sentence.update({f'{s}_{r}_{o}_{i}': sen_score[0][0]})

        else:
            single_sentence.update({f'{s}_{r}_{o}_{i}': ''})

    with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}_sentence.json', 'w') as fl:
        json.dump(single_sentence, fl, indent=4)
    # with open('generate_abstract/test.txt', 'w') as fl:
    #     fl.write('\n'.join(test_text))
    # with open('generate_abstract/dp.txt', 'w') as fl:
    #     fl.write('\n'.join(test_dp))
    with open (f'generate_abstract/path/{args.target_split}_{args.reasonable_rate}_path.json', 'w') as fl:
        fl.write('\n'.join(test_dp))
    with open (f'generate_abstract/path/{args.target_split}_{args.reasonable_rate}_temp.json', 'w') as fl:
        fl.write('\n'.join(test_text))

elif args.mode == 'finetune':

    import spacy
    import pprint
    from transformers import AutoModel, AutoTokenizer,BartForConditionalGeneration

    print('Finetuning ...')

    with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}_chat.json', 'r') as fl:
        draft = json.load(fl)
    with open (f'generate_abstract/path/{args.target_split}_{args.reasonable_rate}_path.json', 'r') as fl:
        dpath = fl.readlines()
    
    nlp = spacy.load("en_core_web_sm")
    if os.path.exists(f'generate_abstract/bioBART/{args.target_split}_{args.reasonable_rate}{args.ratio}_candidates.json'):
        with open(f'generate_abstract/bioBART/{args.target_split}_{args.reasonable_rate}{args.ratio}_candidates.json', 'r') as fl:
            ret_candidates = json.load(fl)
    # if False:
    #     pass
    else:

        def find_mini_span(vec, words, check_set):
            

            def cal(text, sset):
                add = 0
                for tt in sset:
                    if tt in text:
                        add += 1
                return add
            text = ' '.join(words)
            max_add = cal(text, check_set)

            minn = 10000000
            span = ''
            rc = None
            for i  in range(len(vec)):
                if vec[i] == True:
                    p = -1
                    for j in range(i+1, len(vec)+1):
                        if vec[j-1] == True:
                            text = ' '.join(words[i:j])
                            if cal(text, check_set) == max_add:
                                p = j
                                break
                    if p > 0:
                        if (p-i) < minn:
                            minn = p-i
                            span = ' '.join(words[i:p])
                            rc = (i, p)
            if rc:
                for i in range(rc[0], rc[1]):
                    vec[i] = True
            return vec, span
        
        def mask_func(tokenized_sen):

            if len(tokenized_sen) == 0:
                return []
            token_list = []
            # for sen in tokenized_sen:
            #     for token in sen:
            #         token_list.append(token)
            for sen in tokenized_sen:
                token_list += sen.text.split(' ')
            if args.ratio == '':
                P = 0.3
            else:
                P = float(args.ratio)

            ret_list = []
            i = 0
            mask_num = 0
            while i < len(token_list):
                t = token_list[i]
                if '.' in t or '(' in t or ')' in t or '[' in t or ']' in t:
                    ret_list.append(t)
                    i += 1
                    mask_num = 0
                else:
                    length = np.random.poisson(3)
                    if np.random.rand() < P and length > 0:
                        if mask_num < 8:
                            ret_list.append('<mask>')
                            mask_num += 1
                        i += length
                    else:
                        ret_list.append(t)
                        i += 1
                        mask_num = 0
            return [' '.join(ret_list)]
                            
        model = BartForConditionalGeneration.from_pretrained('GanjinZero/biobart-large')
        model.eval()
        model.to(device)
        tokenizer = AutoTokenizer.from_pretrained('GanjinZero/biobart-large')

        ret_candidates = {}
        dpath_i = 0

        for i,(k, v) in enumerate(tqdm(draft.items())):

            input = v['in'].replace('\n', '')
            output = v['out'].replace('\n', '')
            s, r, o = attack_data[i]

            if int(s) == -1:
                ret_candidates[str(i)] = {'span': '', 'prompt' : '', 'out' : [], 'in': [], 'assist': []}
                continue

            path_text = dpath[dpath_i].replace('\n', '')
            dpath_i += 1
            text_s = entity_raw_name[id_to_meshid[s]]
            text_o = entity_raw_name[id_to_meshid[o]]

            doc = nlp(output)
            words= input.split(' ')
            tokenized_sens = [sen for sen in doc.sents]
            sens = np.array([sen.text for sen in doc.sents])

            checkset = set([text_s, text_o])
            e_entity = set(['start_entity', 'end_entity'])
            for path in path_text.split(' '):
                a, b, c = path.split('|')
                if a not in e_entity:
                    checkset.add(a)
                if c not in e_entity:
                    checkset.add(c)
            vec = []
            l = 0
            while(l < len(words)):
                bo =False
                for j in range(len(words), l, -1): # reversing is important !!!
                    cc = ' '.join(words[l:j])
                    if (cc in checkset):
                        vec += [True] * (j-l)
                        l = j
                        bo = True
                        break
                if not bo:
                    vec.append(False)
                    l += 1
            vec, span = find_mini_span(vec, words, checkset)
            # vec = np.vectorize(lambda x: x in checkset)(words)
            vec[-1] = True
            prompt = []
            mask_num = 0
            for j, bo in enumerate(vec):
                if not bo:
                    mask_num += 1
                else:
                    if mask_num > 0:
                        # mask_num = mask_num // 3 # span length ~ poisson distribution (lambda = 3)
                        mask_num = max(mask_num, 1)
                        mask_num= min(8, mask_num)
                        prompt += ['<mask>'] * mask_num
                    prompt.append(words[j])
                    mask_num = 0
            prompt = ' '.join(prompt)
            Text = []
            Assist = []

            for j in range(len(sens)):
                Bart_input = list(sens[:j]) + [prompt] +list(sens[j+1:])
                assist = list(sens[:j]) + [input] +list(sens[j+1:])
                Text.append(' '.join(Bart_input))
                Assist.append(' '.join(assist))
            
            for j in range(len(sens)):
                Bart_input = mask_func(tokenized_sens[:j]) + [input] + mask_func(tokenized_sens[j+1:])
                assist = list(sens[:j]) + [input] +list(sens[j+1:])
                Text.append(' '.join(Bart_input))
                Assist.append(' '.join(assist))

            batch_size = len(Text) // 2
            Outs = []
            for l in range(2):
                A = tokenizer(Text[batch_size * l:batch_size * (l+1)],
                truncation = True,
                padding = True,
                max_length = 1024,
                return_tensors="pt")
                input_ids = A['input_ids'].to(device)
                attention_mask = A['attention_mask'].to(device)
                aaid = model.generate(input_ids, attention_mask = attention_mask, num_beams = 5, max_length = 1024)
                outs = tokenizer.batch_decode(aaid, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                Outs += outs
            ret_candidates[str(i)] = {'span': span, 'prompt' : prompt, 'out' : Outs, 'in': Text, 'assist': Assist}
            with open(f'generate_abstract/bioBART/{args.target_split}_{args.reasonable_rate}{args.ratio}_candidates.json', 'w') as fl:
                json.dump(ret_candidates, fl, indent = 4)
    
    from torch.nn.modules.loss import CrossEntropyLoss
    from transformers import BioGptForCausalLM
    criterion = CrossEntropyLoss(reduction="none")

    tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')
    tokenizer.pad_token = tokenizer.eos_token
    model = BioGptForCausalLM.from_pretrained('microsoft/biogpt', pad_token_id=tokenizer.eos_token_id)
    model.to(device)
    model.eval()

    scored = {}
    ret = {}
    dpath_i = 0
    for i,(k, v) in enumerate(tqdm(draft.items())):

        span = ret_candidates[str(i)]['span']
        prompt = ret_candidates[str(i)]['prompt']
        sen_list = ret_candidates[str(i)]['out']
        BART_in = ret_candidates[str(i)]['in']
        Assist = ret_candidates[str(i)]['assist']

        s, r, o = attack_data[i]

        if int(s) == -1:
            ret[k] = {'prompt': '', 'in':'', 'out': ''}
            continue

        text_s = entity_raw_name[id_to_meshid[s]]
        text_o = entity_raw_name[id_to_meshid[o]]

        def process(text):

            for i in range(ord('A'), ord('Z')+1):
               text = text.replace(f'.{chr(i)}', f'. {chr(i)}')            
            return text

        sen_list = [process(text) for text in sen_list]
        path_text = dpath[dpath_i].replace('\n', '')
        dpath_i += 1

        checkset = set([text_s, text_o])
        e_entity = set(['start_entity', 'end_entity'])
        for path in path_text.split(' '):
            a, b, c = path.split('|')
            if a not in e_entity:
                checkset.add(a)
            if c not in e_entity:
                checkset.add(c)

        input = v['in'].replace('\n', '')
        output = v['out'].replace('\n', '')

        doc = nlp(output)
        gpt_sens = [sen.text for sen in doc.sents]
        assert len(gpt_sens) == len(sen_list) // 2

        word_sets = []
        for sen in gpt_sens:
            word_sets.append(set(sen.split(' ')))

        def sen_align(word_sets, modified_word_sets):
            
            l = 0
            while(l < len(modified_word_sets)):
                if len(word_sets[l].intersection(modified_word_sets[l])) > len(word_sets[l]) * 0.8:
                    l += 1
                else:
                    break
            if l == len(modified_word_sets):
                return -1, -1, -1, -1
            r = l + 1
            r1 = None
            r2 = None
            for pos1 in range(r, len(word_sets)):
                for pos2 in range(r, len(modified_word_sets)):
                    if len(word_sets[pos1].intersection(modified_word_sets[pos2])) > len(word_sets[pos1]) * 0.8:
                        r1 = pos1
                        r2 = pos2
                        break
                if r1 is not None:
                    break
            if r1 is None:
                r1 = len(word_sets)
                r2 = len(modified_word_sets)
            return l, r1, l, r2

        replace_sen_list = []
        boundary = []
        assert len(sen_list) % 2 == 0
        for j in range(len(sen_list) // 2):
            doc = nlp(sen_list[j])
            sens = [sen.text for sen in doc.sents]
            modified_word_sets = [set(sen.split(' ')) for sen in sens]
            l1, r1, l2, r2 = sen_align(word_sets, modified_word_sets)
            boundary.append((l1, r1, l2, r2))
            if l1 == -1:
                replace_sen_list.append(sen_list[j])
                continue
            check_text = ' '.join(sens[l2: r2])
            replace_sen_list.append(' '.join(gpt_sens[:l1] + [check_text] + gpt_sens[r1:]))
        sen_list = replace_sen_list + sen_list[len(sen_list) // 2:]

        old_L = len(sen_list)
        sen_list.append(output)
        sen_list += Assist
        tokens = tokenizer( sen_list,
                            truncation = True,
                            padding = True,
                            max_length = 1024,
                            return_tensors="pt")
        target_ids = tokens['input_ids'].to(device)
        attention_mask = tokens['attention_mask'].to(device)
        L = len(sen_list)
        ret_log_L = []
        for l in range(0, L, 5):
            R = min(L, l + 5)
            target = target_ids[l:R, :]
            attention = attention_mask[l:R, :]
            outputs = model(input_ids = target,
                            attention_mask = attention,
                            labels = target)
            logits = outputs.logits
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = target[..., 1:].contiguous()
            Loss = criterion(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))
            Loss = Loss.view(-1, shift_logits.shape[1])
            attention = attention[..., 1:].contiguous()
            log_Loss = (torch.mean(Loss * attention.float(), dim = 1) / torch.mean(attention.float(), dim = 1))
            ret_log_L.append(log_Loss.detach())
        log_Loss = torch.cat(ret_log_L, -1).cpu().numpy()

        real_log_Loss = log_Loss.copy()

        log_Loss = log_Loss[:old_L]

        p = np.argmin(log_Loss)
        content = []
        for i in range(len(real_log_Loss)):
            content.append([sen_list[i], str(real_log_Loss[i])])
        scored[k] = {'path':path_text, 'prompt': prompt, 'in':input, 's':text_s, 'o':text_o, 'out': content, 'bound': boundary}
        p_p = p
        # print('Old_L:', old_L)

        if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
            p_p = p+1+old_L

        if real_log_Loss[p] > real_log_Loss[old_L]:
            if real_log_Loss[p] > real_log_Loss[p+1+old_L]:
                p = p+1+old_L
        ret[k] = {'prompt': prompt, 'in':input, 'out': sen_list[p]}
    with open(f'generate_abstract/{args.target_split}_{args.reasonable_rate}{args.ratio}_bioBART_finetune.json', 'w') as fl:
        json.dump(ret, fl, indent=4)
    with open(f'generate_abstract/bioBART/{args.target_split}_{args.reasonable_rate}{args.ratio}_scored.json', 'w') as fl:
        json.dump(scored, fl, indent=4)
else:
    raise Exception('Wrong mode !!')