hsuvaskakoty committed
Commit 27be2d9 · verified · 1 Parent(s): 0d0a4e0
Files changed (1)
  1. app.py +159 -32
app.py CHANGED
@@ -1,13 +1,9 @@
- import data_prep
- import model_predict
  import gradio as gr
-
- # model_dict = {
- #     "BERT-Base": "research-dump/bert-base-uncased_deletion_multiclass_complete_Final",
- #     "BERT-Large": "research-dump/bert-large-uncased_deletion_multiclass_complete_final",
- #     "RoBERTa-Base": "research-dump/roberta-base_deletion_multiclass_complete_final",
- #     "RoBERTa-Large": "research-dump/roberta-large_deletion_multiclass_complete_final"
- # }
+ import pandas as pd
+ import torch
+ from data_collect import collect
+ from model_predict import predict_text
+ from wide_analysis.data.process_data import prepare_dataset

  model_dict = {
      "Outcome Prediction": "outcome",
@@ -17,42 +13,173 @@ model_dict = {
      "Offensive Language Detection": "offensive"
  }

- def process_url(url, model_name):
-     model_name = model_dict[model_name]
-     processed_text = data_prep.process_data(url)
-     final_scores = model_predict.predict_text(processed_text, model_name)
+ platform_dict = ["wikipedia", "wikidata_entity", "wikidata_property", "wikinews", "wikiquote"]
+ lang_dict = ["en", "es", "gr"]
+
+ title = 'Wide-Analysis: A Wikipedia Deletion Discussion Analysis Suite'
+ desc = """ Wide-Analysis is a suite of tools for analyzing deletion discussions across various Wikimedia platforms in multiple languages... """
+
+
+ def process_url(url, task_name, lang, platform, date):
+     model_name = model_dict[task_name]
+     derived_date = url.split('/')[-1].split('#')[0]
+     derived_title = url.split('#')[-1]
+
+     if model_name == 'outcome':
+         if lang == 'en' and platform == 'wikipedia':
+             df = prepare_dataset(mode='title', start_date=derived_date, url=url, title=derived_title)
+             if df.empty:
+                 return "", "", "", {"error": "No data found"}
+             processed_text = df['discussion'].iloc[0]
+
+         else:
+             if lang == 'es' and platform == 'wikipedia':
+                 df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date='')
+             elif lang == 'gr' and platform == 'wikipedia':
+                 if not date or len(date.split('/')) != 2:
+                     return "", "", "", {"error": "For Greek Wikipedia (title mode), please provide 'date' in 'mm/yyyy' format."}
+                 df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date, years=[date])
+
+             elif lang == 'en' and platform == 'wikidata_entity':
+                 df = collect(mode='url', url=url, platform=platform, lang=lang)
+
+             elif lang == 'en' and platform == 'wikidata_property':
+                 df = collect(mode='url', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)
+
+             elif lang == 'en' and platform == 'wikinews':
+                 df = collect(mode='url', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)
+
+             elif lang == 'en' and platform == 'wikiquote':
+                 df = collect(mode='title', start_date=derived_date, url=url, title=derived_title, platform=platform, lang=lang, date=date)
+
+             else:
+                 return "", "", "", {"error": f"No implementation detail for lang={lang} platform={platform} provided."}
+
+             if isinstance(df, str) or df.empty:
+                 return "", "", "", {"error": "No data returned from collect function."}
+
+             processed_text = df['discussion'].iloc[0]
+
+     else:
+         if lang == 'en' and platform == 'wikipedia':
+             df = prepare_dataset(mode='title', start_date=derived_date, url=url, title=derived_title)
+             if df.empty:
+                 return "", "", "", {"error": "No data found"}
+             processed_text = df['discussion'].iloc[0]
+         else:
+             return "", "", "", {"error": f"Currently only English Wikipedia supported for {task_name}."}
+
+     final_scores = predict_text(processed_text, model_name, lang=lang, platform=platform, date=date)
+
      if model_name == 'outcome':
          highest_prob_item = max(final_scores, key=lambda x: x['score'])
          highest_prob_label = highest_prob_item['outcome']
          highest_prob = highest_prob_item['score']
          progress_bars = {item['outcome']: item['score'] for item in final_scores}

-         return processed_text, highest_prob_label, highest_prob, progress_bars
-
+         return processed_text, highest_prob_label, str(highest_prob), progress_bars
      else:
          return processed_text, "", "", final_scores

- title = 'Wide-Analysis: A Wikipedia Deletion Discussion Analysis Suite'
- desc = """ Wide-Analysis is a suite of tools for analyzing Wikipedia deletion discussions. It is designed to help researchers and practitioners to understand the dynamics of deletion discussions, and to develop tools for supporting the decision-making process in Wikipedia. The suite includes a set of tools for collecting, processing, and analyzing deletion discussions. The package contains the following functionalities
-
- - Outcome Prediction: Predicting the outcome of a deletion discussion, the outcome can be the decision made with the discussion (e.g., keep, delete, merge, etc.) (determined from the complete discussion)
- - Stance Detection: Identifying the stance of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
- - Policy Prediction: Predicting the policy that is most relevant to the comments of the participants in the discussion.(determined from each individual comment in discussion)
- - Sentiment Prediction: Predicting the sentiment of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
- - Offensive Language Detection: Detecting offensive language in the comments of the participants in the discussion.(determined from each individual comment in discussion)
-
- The input to the classifier is a URL of a Wikipedia deletion discussion page with the task listed in the drop-down box, and the output is the predicted label of the discussion, along with the probability of the predicted label, and the probabilities of all the labels.
-
- """

  url_input = gr.Textbox(label="URL")
  model_name_input = gr.Dropdown(label="Choose the Task", choices=list(model_dict.keys()), value=list(model_dict.keys())[0])
+ lang_input = gr.Dropdown(label="Language", choices=lang_dict, value="en")
+ platform_input = gr.Dropdown(label="Platform", choices=platform_dict, value="wikipedia")
+ date_input = gr.Textbox(label="Date (if required)", placeholder="For ES year mode: dd/mm/yyyy, for GR title mode: mm/yyyy")
+
  outputs = [
      gr.Textbox(label="Processed Text"),
-     gr.Textbox(label="Label with Highest Probability"), # This will only be used for the outcome task
-     gr.Textbox(label="Probability"), # This will only be used for the outcome task
-     gr.JSON(label="All Labels and Probabilities") # This will be used for all tasks
+     gr.Textbox(label="Label with Highest Probability"),  # Only for outcome
+     gr.Textbox(label="Probability"),  # Only for outcome
+     gr.JSON(label="All Labels and Probabilities")  # For all tasks
  ]

- demo = gr.Interface(fn=process_url, inputs=[url_input, model_name_input], outputs=outputs, title=title, description=desc)
- demo.launch() # share=True
+ demo = gr.Interface(fn=process_url,
+                     inputs=[url_input, model_name_input, lang_input, platform_input, date_input],
+                     outputs=outputs,
+                     title=title,
+                     description=desc)
+ demo.launch(share=True)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # import data_prep
+ # import model_predict
+ # import gradio as gr
+
+ # # model_dict = {
+ # #     "BERT-Base": "research-dump/bert-base-uncased_deletion_multiclass_complete_Final",
+ # #     "BERT-Large": "research-dump/bert-large-uncased_deletion_multiclass_complete_final",
+ # #     "RoBERTa-Base": "research-dump/roberta-base_deletion_multiclass_complete_final",
+ # #     "RoBERTa-Large": "research-dump/roberta-large_deletion_multiclass_complete_final"
+ # # }
+
+ # model_dict = {
+ #     "Outcome Prediction": "outcome",
+ #     "Stance Detection": "stance",
+ #     "Policy Prediction": "policy",
+ #     "Sentiment Analysis": "sentiment",
+ #     "Offensive Language Detection": "offensive"
+ # }
+
+ # def process_url(url, model_name):
+ #     model_name = model_dict[model_name]
+ #     processed_text = data_prep.process_data(url)
+ #     final_scores = model_predict.predict_text(processed_text, model_name)
+ #     if model_name == 'outcome':
+ #         highest_prob_item = max(final_scores, key=lambda x: x['score'])
+ #         highest_prob_label = highest_prob_item['outcome']
+ #         highest_prob = highest_prob_item['score']
+ #         progress_bars = {item['outcome']: item['score'] for item in final_scores}
+
+ #         return processed_text, highest_prob_label, highest_prob, progress_bars
+
+ #     else:
+ #         return processed_text, "", "", final_scores
+
+ # title = 'Wide-Analysis: A Wikipedia Deletion Discussion Analysis Suite'
+ # desc = """ Wide-Analysis is a suite of tools for analyzing Wikipedia deletion discussions. It is designed to help researchers and practitioners to understand the dynamics of deletion discussions, and to develop tools for supporting the decision-making process in Wikipedia. The suite includes a set of tools for collecting, processing, and analyzing deletion discussions. The package contains the following functionalities
+
+ # - Outcome Prediction: Predicting the outcome of a deletion discussion, the outcome can be the decision made with the discussion (e.g., keep, delete, merge, etc.) (determined from the complete discussion)
+ # - Stance Detection: Identifying the stance of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
+ # - Policy Prediction: Predicting the policy that is most relevant to the comments of the participants in the discussion.(determined from each individual comment in discussion)
+ # - Sentiment Prediction: Predicting the sentiment of the participants in the discussion, in relation to the deletion decision.(determined from each individual comment in discussion)
+ # - Offensive Language Detection: Detecting offensive language in the comments of the participants in the discussion.(determined from each individual comment in discussion)
+
+ # The input to the classifier is a URL of a Wikipedia deletion discussion page with the task listed in the drop-down box, and the output is the predicted label of the discussion, along with the probability of the predicted label, and the probabilities of all the labels.
+
+ # """
+
+ # url_input = gr.Textbox(label="URL")
+ # model_name_input = gr.Dropdown(label="Choose the Task", choices=list(model_dict.keys()), value=list(model_dict.keys())[0])
+ # outputs = [
+ #     gr.Textbox(label="Processed Text"),
+ #     gr.Textbox(label="Label with Highest Probability"), # This will only be used for the outcome task
+ #     gr.Textbox(label="Probability"), # This will only be used for the outcome task
+ #     gr.JSON(label="All Labels and Probabilities") # This will be used for all tasks
+ # ]
+
+ # demo = gr.Interface(fn=process_url, inputs=[url_input, model_name_input], outputs=outputs, title=title, description=desc)
+ # demo.launch() # share=True
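
For reference, a minimal sketch of how the revised process_url entry point introduced by this commit could be exercised directly, without launching the Gradio UI. The AfD URL is illustrative, and the expectation that the outcome task returns the top label with its score as a string plus a label-to-score dict is inferred from the function body above; nothing beyond what app.py already imports is assumed.

# Hypothetical smoke test for the new signature. The URL is illustrative and
# follows the layout the parser expects (.../Log/<date>#<article>), so
# derived_date = "2024_January_1" and derived_title = "Some_Article".
url = ("https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/"
       "Log/2024_January_1#Some_Article")

text, label, prob, scores = process_url(
    url,
    task_name="Outcome Prediction",
    lang="en",
    platform="wikipedia",
    date="",  # only the Greek Wikipedia path requires an mm/yyyy value here
)

print(label, prob)  # top outcome and its score (returned as a string)
print(scores)       # all outcome -> score pairs, rendered as JSON in the UI

On error (empty dataset, unsupported lang/platform combination, or a malformed Greek date), the function instead returns three empty strings and an {"error": ...} dict in the JSON slot, so a caller can branch on "error" in the fourth element.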