import streamlit as st
import re
import requests
from newspaper import Article
from newspaper import Config
import preprocessor as p
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch.nn.functional as F
from goose3 import Goose
from goose3.configuration import Configuration
from bs4 import BeautifulSoup

st.write("""
# ESG Prediction App

This is a Proof of Concept for a company ESG (Environmental, Social, and Governance) risk prediction application.
""")

company = st.text_input("Company", placeholder="PT Adaro Minerals Indonesia Tbk")

# Leftover from a direct Google Search scraping approach; not used by the
# Custom Search API flow below.
GOOGLE = 'https://www.google.com/search'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}

# NOTE: hard-coded credentials; in production these belong in st.secrets or
# environment variables rather than in source code.
API_KEY = 'AIzaSyDCfIltnvAQ3lvpovRXydRMhGQ-VxkboQ4'
SEARCH_ENGINE_ID = 'e586ee8a6c7e64d7b'

from googleapiclient.discovery import build
import math


def google_search(search_term, api_key, cse_id, **kwargs):
    """Query the Google Custom Search API, paging through results 10 at a time."""
    service = build("customsearch", "v1", developerKey=api_key)
    num_search_results = kwargs['num']
    if num_search_results > 100:
        raise NotImplementedError('Google Custom Search API supports max of 100 results')
    elif num_search_results > 10:
        kwargs['num'] = 10  # this cannot be > 10 in a single API call
        calls_to_make = math.ceil(num_search_results / 10)
    else:
        calls_to_make = 1

    kwargs['start'] = start_item = 1
    items_to_return = []
    while calls_to_make > 0:
        res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
        items_to_return.extend(res.get('items', []))  # a page may return no items
        calls_to_make -= 1
        start_item += 10
        kwargs['start'] = start_item
        leftover = num_search_results - start_item + 1
        if 0 < leftover < 10:
            kwargs['num'] = leftover
    return items_to_return


if company:
    # print(f'Run: {company}')
    links = []
    news_text = []
    query = f'{company}'
    response = google_search(query, API_KEY, SEARCH_ENGINE_ID, num=50)
    url_collection = [item['link'] for item in response]

    import os
    os.environ['ST_USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 60
    config.fetch_images = False
    config.memoize_articles = True
    config.language = 'id'

    # p.set_options(p.OPT.MENTION, p.OPT.EMOJI, p.OPT.HASHTAG, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.URL)
    def cleaner(text):
        text = re.sub("@[A-Za-z0-9]+", "", text)  # remove @mentions
        text = text.replace("#", "").replace("_", "")  # remove hashtag sign but keep the text
        # text = p.clean(text)  # clean mentions, emoji, hashtags, reserved words (FAV, RT), smileys, and URLs
        text = text.strip().replace("\n", "")
        return text

    for url in url_collection:
        if "http" not in url:
            continue
        lang = "id"
        if "eco-business.com" in url or "thejakartapost.com" in url or "marketforces.org.au" in url or "jakartaglobe.id" in url:
            lang = "en"

        ### Selenium alternative (kept for reference)
        # from selenium import webdriver
        # from selenium.webdriver.chrome.options import Options
        # from goose3 import Goose
        # options = Options()
        # options.headless = True
        # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        # driver = webdriver.Chrome(options=options)
        # # url = 'https://example.com/news-article'
        # driver.get(url)
        # html = driver.page_source
        # driver.quit()
        # g = Goose()
        # article = g.extract(raw_html=html)
        # print(article.cleaned_text)
        # news_text.append(article.cleaned_text)
        ###

        ### newspaper3k alternative (kept for reference)
        # article = Article(url, language=lang, config=config)
        # article.download()
        # article.parse()
        # article_clean = cleaner(article.text)

        # url = 'https://example.com/news-article'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers)
        # html = response.text
        soup = BeautifulSoup(response.content, 'html.parser')
        g = Goose()
        article = g.extract(raw_html=str(soup))
        # print(url)
        # print(soup)
        # news_empty = True
        possible_class = ['detail', 'body-content', 'article-content', 'detail-konten', 'DetailBlock']
        excluded_sentence = [
            'Komentar menjadi tanggung-jawab Anda sesuai UU ITE',
            'Dapatkan berita terbaru dari kami Ikuti langkah ini untuk mendapatkan notifikasi:',
        ]
        if not article.cleaned_text:
            # Goose found nothing; fall back to known article-body div classes.
            article_content = soup.find('div', class_=possible_class)
            if article_content and article_content.get_text() not in excluded_sentence:
                news_text.append(article_content.get_text())
                news_empty = False
                # print(f'{url} News Exist using POSSIBLE CLASS')
        else:
            if article.cleaned_text not in excluded_sentence:
                news_text.append(article.cleaned_text)
                news_empty = False
                # print(f'{url} News Exist using ARTICLE CLEANED TEXT')
        # if news_empty:
        #     print(f'Cannot Get URL: {url}')
        #     print(soup)
        #     print(article.cleaned_text)

        ### Goose-with-config alternative (kept for reference)
        # goose = Goose()
        # config = Configuration()
        # config.strict = False  # turn off strict exception handling
        # config.browser_user_agent = 'Mozilla 5.0'  # set the browser agent string
        # config.http_timeout = 5.05  # set http timeout in seconds
        # with Goose(config) as g:
        #     article = goose.extract(url=url)
        # news_text.append(article.cleaned_text)

    df = pd.DataFrame({
        'news': news_text
    })

    # Load the ESG classification tokenizer and model
    tokenizer_esg = AutoTokenizer.from_pretrained("didev007/ESG-indobert-model")
    model_esg = AutoModelForSequenceClassification.from_pretrained("didev007/ESG-indobert-model")

    # Load the sentiment analysis tokenizer and model
    tokenizer_sentiment = AutoTokenizer.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")
    model_sentiment = AutoModelForSequenceClassification.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")

    def get_chunk_weights(num_chunks):
        # Gaussian weighting: chunks near the middle of the article count more.
        center = num_chunks / 2
        sigma = num_chunks / 4
        weights = [np.exp(-0.5 * ((i - center) / sigma) ** 2) for i in range(num_chunks)]
        weights = np.array(weights)
        return weights / weights.sum()

    def tokenize_and_chunk(text, tokenizer, chunk_size=512):
        # Do not truncate here; long articles are split into chunk_size pieces below.
        inputs = tokenizer(text, return_tensors="pt", truncation=False)
        input_ids = inputs['input_ids'][0]
        chunks = [input_ids[i:i + chunk_size] for i in range(0, len(input_ids), chunk_size)]
        return chunks

    def esg_category(chunks, model):
        num_chunks = len(chunks)
        weights = get_chunk_weights(num_chunks)
        esg_scores = np.zeros(4)
        labels = ["none", "E", "S", "G"]
        for i, chunk in enumerate(chunks):
            inputs = {'input_ids': chunk.unsqueeze(0)}
            outputs = model(**inputs)
            logits = outputs.logits
            probs = F.softmax(logits, dim=1).detach().numpy()[0]
            esg_scores += weights[i] * probs
        predicted_class = esg_scores.argmax()
        aggregated_esg = labels[predicted_class]
        return aggregated_esg

    def sentiment_analysis(text, tokenizer, model):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        labels = ["positive", "neutral", "negative"]
        predicted_sentiment = labels[predicted_class]
        return predicted_sentiment

    def apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment, text_column='news'):
        esg_categories = []
        sentiments = []
        for text in df[text_column]:
            if isinstance(text, str):
                chunks = tokenize_and_chunk(text, tokenizer_esg)
                esg = esg_category(chunks, model_esg)
                sentiment = sentiment_analysis(text, tokenizer_sentiment, model_sentiment)
                esg_categories.append(esg)
                sentiments.append(sentiment)
            else:
                esg_categories.append("none")
                sentiments.append("neutral")
        df['aggregated_esg'] = esg_categories
        df['sentiment'] = sentiments
        return df

    result_data = apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment)

    # Count articles per (ESG category, sentiment) pair and pivot to one row per category.
    grouped_counts = result_data.groupby(['aggregated_esg', 'sentiment']).size().reset_index(name='count')
    data = grouped_counts.pivot(index='aggregated_esg', columns='sentiment', values='count')

    required_columns_sentiment = ['negative', 'positive', 'neutral']
    for col in required_columns_sentiment:
        if col not in data.columns:
            data[col] = 0

    # Handle potential missing values
    data['negative'] = data['negative'].fillna(0)
    data['positive'] = data['positive'].fillna(0)
    data['neutral'] = data['neutral'].fillna(0)
    # print(data)

    # Per-category score: share of negative coverage, minus a small credit for positive coverage.
    data['count'] = data['negative'] + data['positive'] + data['neutral']
    data['total'] = data['negative'] / data['count'] + data['positive'] * (-0.2) / data['count']
    # data['total'] = data['negative'] + data['positive'] * (-1)

    # Drop the "none" ESG row before aggregating.
    if 'none' in data.index:
        data = data.drop('none')
    # data

    total = data['total'].sum()

    # Min-max normalization: map total from [-1, 2] onto a 0-60 ESG risk score.
    min_esg = -1
    max_esg = 2
    min_score = 0
    max_score = 60
    ESG_score = ((total - min_esg) / (max_esg - min_esg)) * (max_score - min_score) + min_score

    def esg_risk_categorization(esg_score):
        if esg_score <= 10:
            return 'Negligible'
        elif 10 < esg_score <= 20:
            return 'Low'
        elif 20 < esg_score <= 30:
            return 'Medium'
        elif 30 < esg_score <= 40:
            return 'High'
        else:
            return 'Severe'

    risk = esg_risk_categorization(ESG_score)

    st.write(company)

    log_detail = """
    Company: {}
    ESG Score Prediction: {}
    ESG Category Risk Prediction: {}
    """.format(company, ESG_score, risk)
    print(log_detail)

    st.write(f'ESG Category Risk Prediction: {risk}')
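
# ---------------------------------------------------------------------------
# Usage note (a sketch, not part of the app logic): assuming this script is
# saved as app.py and the imported packages (streamlit, newspaper3k, goose3,
# beautifulsoup4, transformers, torch, pandas, numpy, google-api-python-client,
# tweet-preprocessor) are installed, the app can be started locally with:
#
#   streamlit run app.py
#
# Streamlit then serves the UI at http://localhost:8501 by default.
# ---------------------------------------------------------------------------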