import gradio as gr
from data_collect import collect
from model_predict import predict_text
from wide_analysis.data.process_data import prepare_dataset

# Display name (shown in the task drop-down) -> internal model key.
model_dict = {
    "Outcome Prediction": "outcome",
    "Stance Detection": "stance",
    "Policy Prediction": "policy",
    "Sentiment Analysis": "sentiment",
    "Offensive Language Detection": "offensive"
}

platform_choices = ["wikipedia", "wikidata_entity", "wikidata_property", "wikinews", "wikiquote"]
lang_choices = ["en", "es", "gr"]

title = 'Wide-Analysis: A Wikipedia Deletion Discussion Analysis Suite'
desc = """ Wide-Analysis is a suite of tools for analyzing deletion discussions across various Wikimedia platforms in multiple languages. The platform currently supports Outcome Prediction, Stance Detection, Policy Prediction, Sentiment Detection and Offensive Language Detection, for the languages English (en), Spanish (es), and Greek (gr) and the platforms: Wikipedia, Wikidata (entity and property), Wikinews, and Wikiquote.
    The package contains the following functionalities
   
     - Outcome Prediction: Predicting the outcome of a deletion discussion, the outcome can be the decision made with the discussion (e.g., keep, delete, merge, etc.) (determined from the complete discussion)
     - Stance Detection: Identifying the stance of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
     - Policy Prediction: Predicting the policy that is most relevant to the comments of the participants in the discussion.(determined from each individual comment in discussion)
     - Sentiment Prediction: Predicting the sentiment of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
     - Offensive Language Detection: Detecting offensive language in the comments of the participants in the discussion.(determined from each individual comment in discussion)

The input to the classifier is a URL of a Wikipedia deletion discussion page with the task, language and platform listed in the drop-down box, along with additional information for Greek language (gr), and the output is the predicted label of the discussion, along with the probability of the predicted label, and the probabilities of all the labels.

"""


def process_url(url, task_name, lang, platform, date):
    model_name = model_dict[task_name]
    # The last path segment of the URL (before any '#' fragment) is taken as
    # the log date, and the fragment as the discussion title.
    derived_date = url.split('/')[-1].split('#')[0]
    derived_title = url.split('#')[-1]

    if model_name == 'outcome':
        # English Wikipedia has a dedicated dataset pipeline; all other supported
        # language/platform combinations go through the generic collect() scraper.
        if lang == 'en' and platform == 'wikipedia':
            df = prepare_dataset(mode='title', start_date=derived_date, url=url, title=derived_title)
            if df.empty:
                return "", "", "", {"error": "No data found"}
            processed_text = df['discussion'].iloc[0]

        else:
            if lang == 'es' and platform == 'wikipedia':
                df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date='')
            elif lang == 'gr' and platform == 'wikipedia':
                if not date or len(date.split('/')) != 2:
                    return "", "", "", {"error": "For Greek Wikipedia (title mode), please provide 'date' in 'mm/yyyy' format."}
                df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date, years=[date])

            elif lang == 'en' and platform == 'wikidata_entity':
                df = collect(mode='url', url=url, platform=platform, lang=lang)
            
            elif lang == 'en' and platform == 'wikidata_property':
                df = collect(mode='url', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)

            elif lang == 'en' and platform == 'wikinews':
                df = collect(mode='url', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)
                
            elif lang == 'en' and platform == 'wikiquote':
                df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)
                
            else:
                return "", "", "", {"error": f"No implementation available for lang={lang}, platform={platform}."}

            # collect() signals failure either with an error string or an empty DataFrame.
            if isinstance(df, str) or df.empty:
                return "", "", "", {"error": "No data returned from collect function."}

            processed_text = df['discussion'].iloc[0]

    else:
        # The comment-level tasks (stance, policy, sentiment, offensive language)
        # are currently implemented for English Wikipedia only.
        if lang == 'en' and platform == 'wikipedia':
            df = prepare_dataset(mode='title', start_date=derived_date, url=url, title=derived_title)
            if df.empty:
                return "", "", "", {"error": "No data found"}
            processed_text = df['discussion'].iloc[0]
        else:
            return "", "", "", {"error": f"Currently only English Wikipedia supported for {task_name}."}

    final_scores = predict_text(processed_text, model_name, lang=lang, platform=platform, date=date)

    if model_name == 'outcome':
        # Surface the most likely outcome label alongside the full score distribution.
        highest_prob_item = max(final_scores, key=lambda x: x['score'])
        highest_prob_label = highest_prob_item['outcome']
        highest_prob = highest_prob_item['score']
        progress_bars = {item['outcome']: item['score'] for item in final_scores}
        
        return processed_text, highest_prob_label, str(highest_prob), progress_bars
    else:
        return processed_text, "", "", final_scores
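
# Assumed predict_text() contract for the 'outcome' task, inferred from the
# handling above rather than from the model_predict source:
#
#   final_scores = [{"outcome": "keep",   "score": 0.71},
#                   {"outcome": "delete", "score": 0.18},
#                   ...]
#
# For the other tasks, the raw return value is passed through to the JSON output.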


url_input = gr.Textbox(label="URL")
model_name_input = gr.Dropdown(label="Choose the Task", choices=list(model_dict.keys()), value=list(model_dict.keys())[0])
lang_input = gr.Dropdown(label="Language", choices=lang_choices, value="en")
platform_input = gr.Dropdown(label="Platform", choices=platform_choices, value="wikipedia")
date_input = gr.Textbox(label="Date (if required)", placeholder="ES year mode: dd/mm/yyyy; GR title mode: mm/yyyy")

outputs = [
    gr.Textbox(label="Processed Text"),
    gr.Textbox(label="Label with Highest Probability"),  # Only for outcome
    gr.Textbox(label="Probability"),  # Only for outcome
    gr.JSON(label="All Labels and Probabilities")  # For all tasks
]

demo = gr.Interface(fn=process_url, 
                    inputs=[url_input, model_name_input, lang_input, platform_input, date_input],
                    outputs=outputs, 
                    title=title, 
                    description=desc)
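# Note (standard Gradio behavior, not specific to this app): share=True requests
# a temporary public *.gradio.live link in addition to the local server; use
# demo.launch() for local-only serving.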
if __name__ == "__main__":
    demo.launch(share=True)