import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pysbd

################################################
# Functions from Code 1 (collapsible approach) #
################################################
def extract_result(sentence):
    # Look for an explicit outcome keyword in the closing sentence.
    match = re.search(r"(Διαγραφή|Παραμονή|Άλλο αποτέλεσμα|διαγραφή|Συγχώνευση|Διατήρηση)", sentence, flags=re.IGNORECASE)
    # Closing sentences that imply deletion without using the keyword itself.
    delete_cases = [
        'Μη εγκυκλοπαιδικό', 'Πράγματι δεν φαίνεται πως το λήμμα είναι εγκυκλοπαιδικό',
        'Δεν διαπιστώθηκε εγκυκλοπαιδικότητα', 'Μη εγκυκλοπαιδικό λήμμα',
        'Το λήμμα κρίθηκε ότι είναι καταλληλότερο για κάποιο άλλο αδελφό εγχείρημα, παρά για την Βικιπαίδεια + ατεκμηρίωτο.',
        'Δεν υπάρχουν επαρκείς αναφορές για την βιογραφούμενη'
    ]
    if match:
        outcome = match.group(1).strip()
    elif sentence in delete_cases:
        outcome = 'Διαγραφή'
    else:
        outcome = 'Δεν υπάρχει συναίνεση'
    return normalize_outcome(outcome)
def normalize_outcome(o):
    # Map raw outcome phrasings onto one of four canonical labels.
    lowered = o.lower()
    if 'διαγρ' in lowered:  # covers 'διαγραφή' and inflected forms
        return 'Διαγραφή'
    elif 'διατήρη' in lowered or 'παραμονή' in lowered:
        return 'Διατήρηση'
    elif 'συγχών' in lowered:
        return 'Συγχώνευση'
    else:
        # Covers 'Άλλο αποτέλεσμα' and unknown cases
        return 'Δεν υπάρχει συναίνεση'
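
# A quick sanity sketch of the normalization (hypothetical inputs; the
# expected labels follow the branches above):
#   extract_result('Παραμονή λόγω συναίνεσης')  -> 'Διατήρηση'
#   extract_result('Μη εγκυκλοπαιδικό')         -> 'Διαγραφή'
#   extract_result('κάτι άλλο')                 -> 'Δεν υπάρχει συναίνεση'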
def extract_discussions_from_page_collapsible(url):
    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
    soup = BeautifulSoup(response.content, 'html.parser')
    # Each discussion heading sits in a DiscussionTools section div.
    discussion_sections = soup.find_all('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    titles = []
    for section in discussion_sections:
        try:
            h2_tag = section.find('h2')
            if not h2_tag:
                continue
            title_link = h2_tag.find('a')
            title = title_link.text.strip() if title_link else h2_tag.get_text(strip=True)
            titles.append(title)
        except Exception:
            # Skip malformed headings rather than aborting the whole page.
            pass
    discussion_tables = soup.find_all('table')
    if not discussion_tables:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
    data = []
    for idx, table in enumerate(discussion_tables):
        try:
            # The first row's <th> holds the closing statement with the outcome.
            decision_row = table.find('tr')
            decision_cell = decision_row.find('th') if decision_row else None
            if decision_cell:
                result_match = re.search(
                    r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
                )
                result_sentence = result_match.group(1).strip() if result_match else "No result found"
            else:
                result_sentence = "No result found"
            # The discussion body lives in the sibling row's 'plainlinks' cell.
            discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
            discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
            discussion_content = discussion_cell.get_text(separator="\n") if discussion_cell else "No discussion content found"
            # Drop the boilerplate link header (ending in 'μητρώο') that precedes the text.
            discussion_content = discussion_content.split('\nμητρώο\n)\n\n\n\n\n')[-1].replace('\n', '')
            title = titles[idx] if idx < len(titles) else f"Discussion {idx + 1}"
            data.append({
                "title": title,
                "discussion": discussion_content,
                "result_sentence": result_sentence,
                "result": extract_result(result_sentence),
                "text_url": url
            })
        except Exception:
            # Skip tables that do not follow the expected structure.
            pass
    return pd.DataFrame(data, columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
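
# Minimal usage sketch (the archive URL shape matches collect_gr below; the
# specific month page is only an example):
#   df = extract_discussions_from_page_collapsible(
#       'https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/Ιανουαρίου_2022')
#   df.columns -> ['title', 'discussion', 'result_sentence', 'result', 'text_url']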
###########################################
# Functions from Code 2 (non-collapsible) #
###########################################
def extract_discussions_from_page_non_collapsible(url):
    # Same logic as the collapsible variant, except that only tables with the
    # 'pagediscussion' class are scanned (older, non-collapsible archive markup).
    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_sections = soup.find_all('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    titles = []
    for section in discussion_sections:
        try:
            h2_tag = section.find('h2')
            if not h2_tag:
                continue
            title_link = h2_tag.find('a')
            title = title_link.text.strip() if title_link else h2_tag.get_text(strip=True)
            titles.append(title)
        except Exception:
            pass
    discussion_tables = soup.find_all('table', class_='pagediscussion')
    if not discussion_tables:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
    data = []
    for idx, table in enumerate(discussion_tables):
        try:
            decision_row = table.find('tr')
            decision_cell = decision_row.find('th') if decision_row else None
            if decision_cell:
                result_match = re.search(
                    r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
                )
                result_sentence = result_match.group(1).strip() if result_match else "No result found"
            else:
                result_sentence = "No result found"
            discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
            discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
            discussion_content = discussion_cell.get_text(separator="\n") if discussion_cell else "No discussion content found"
            discussion_content = discussion_content.split('\nμητρώο\n)\n\n\n\n\n')[-1].replace('\n', '')
            title = titles[idx] if idx < len(titles) else f"Discussion {idx + 1}"
            data.append({
                "title": title,
                "discussion": discussion_content,
                "result_sentence": result_sentence,
                "result": extract_result(result_sentence),
                "text_url": url
            })
        except Exception:
            pass
    return pd.DataFrame(data, columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
###########################################
# Title-based extraction with fallback    #
###########################################
def html_to_plaintext(html_content):
    # Convert a discussion HTML fragment to plain text, preserving paragraph
    # and list boundaries as newlines.
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])
    return text
def split_text_into_sentences(text):
    # Segment Greek text into sentences with pysbd, then rejoin them with
    # single spaces so downstream consumers get normalized spacing.
    seg = pysbd.Segmenter(language="el", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences)
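
# Pipeline sketch on a tiny hypothetical fragment:
#   html_to_plaintext('<p>Πρόταση πρώτη. Πρόταση δεύτερη.</p>')
#       -> 'Πρόταση πρώτη. Πρόταση δεύτερη.'
#   split_text_into_sentences then segments that text with pysbd and
#   re-joins the sentences with single spaces.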
def clean_discussion_text(text):
    # Final pass over the assembled discussion text; currently just trims
    # surrounding whitespace.
    return text.strip()
def extract_outcome_from_text(text):
    # Scan the discussion text for the first known outcome keyword and fall
    # back to 'no consensus' when none is present.
    outcomes = ['Διαγραφή', 'Παραμονή', 'διαγραφή', 'Συγχώνευση', 'Διατήρηση', 'Άλλο αποτέλεσμα']
    lowered = text.lower()
    found_outcome = None
    for outcome in outcomes:
        if outcome.lower() in lowered:
            found_outcome = outcome
            break
    if not found_outcome:
        found_outcome = 'Δεν υπάρχει συναίνεση'
    return normalize_outcome(found_outcome)
def extract_discussion_section(soup, title):
    # Locate the <h2> whose id matches the (underscored) title, then walk the
    # siblings between its heading div and the next heading div to collect the
    # discussion HTML.
    t = title.replace(' ', '_')
    h2_tag = soup.find('h2', id=t)
    if not h2_tag:
        return '', '', ''
    heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    if not heading_div:
        return '', '', ''
    next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    html_fragments = []
    current = heading_div.next_sibling
    while current and current != next_heading_div:
        if hasattr(current, 'prettify'):
            html_fragments.append(current.prettify())
        else:
            html_fragments.append(str(current))
        current = current.next_sibling
    discussion_html = ''.join(html_fragments).strip()
    if not discussion_html:
        return '', '', ''
    sub_soup = BeautifulSoup(discussion_html, 'html.parser')
    discussion_tags = sub_soup.find_all(['p', 'ul', 'dl'])
    if not discussion_tags:
        return '', '', ''
    cleaned_parts = []
    for tag in discussion_tags:
        # Strip decorative markup (links, icons, nested tables) before reading text.
        for unwanted in tag.find_all(['span', 'img', 'a', 'div', 'table'], recursive=True):
            unwanted.decompose()
        text = tag.get_text(separator=' ', strip=True)
        if text:
            cleaned_parts.append(text)
    cleaned_discussion = ' '.join(cleaned_parts)
    label = extract_outcome_from_text(cleaned_discussion)
    return discussion_html, label, cleaned_discussion
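
# extract_discussion_section returns a (discussion_html, label, cleaned_text)
# triple; three empty strings signal that the titled section was not found.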
def extract_fallback_discussion(url, title):
    # Fallback for pages where the section walk fails: scan every table on the
    # archive page and use the first one whose text mentions the title.
    response = requests.get(url)
    if response.status_code != 200:
        return '', None
    soup = BeautifulSoup(response.text, 'html.parser')
    discussion_tables = soup.find_all('table')
    if not discussion_tables:
        return '', None
    for table in discussion_tables:
        table_text = table.get_text(separator='\n', strip=True)
        if title in table_text:
            decision_row = table.find('tr')
            decision_cell = decision_row.find('th') if decision_row else None
            if decision_cell:
                result_match = re.search(r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL)
                result_sentence = result_match.group(1).strip() if result_match else "No result found"
            else:
                result_sentence = "No result found"
            discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
            discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
            discussion_content = ''
            if discussion_cell:
                discussion_content = discussion_cell.get_text(separator=' ', strip=True)
            if discussion_content:
                outcome = extract_result(result_sentence)
                return discussion_content, outcome
    return '', None
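
# extract_fallback_discussion returns (discussion_text, outcome), or
# ('', None) when no table on the page mentions the title.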
def extract_div_from_title_with_fallback(title, date=''):
    # Build the archive-page URL for the given month, extract the titled
    # section, and fall back to table scanning when the section comes back empty.
    if not date:
        raise ValueError("For 'title' mode, 'date' must be provided in the format: mm/yyyy")
    month_map = {
        '01': 'Ιανουαρίου', '02': 'Φεβρουαρίου', '03': 'Μαρτίου', '04': 'Απριλίου', '05': 'Μαΐου', '06': 'Ιουνίου',
        '07': 'Ιουλίου', '08': 'Αυγούστου', '09': 'Σεπτεμβρίου', '10': 'Οκτωβρίου', '11': 'Νοεμβρίου', '12': 'Δεκεμβρίου'
    }
    if '_' in date and date.split('_')[0] in month_map.values():
        # Date is already in 'Month_Year' format.
        date_str = date
    else:
        # Otherwise parse 'mm/yyyy' and convert to 'Month_Year'.
        match = re.match(r'(\d{2})/(\d{4})', date)
        if not match:
            raise ValueError("Date must be in the format mm/yyyy or Month_Year")
        mm, yyyy = match.groups()
        if mm not in month_map:
            raise ValueError(f"Invalid month: {mm}")
        date_str = f"{month_map[mm]}_{yyyy}"
    base_url = 'https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή'
    url = f"{base_url}/{date_str}#{title}"
    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])
    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
    text_url = f"{base_url}/{date_str}"
    discussion_url = text_url + '#' + title
    cleaned_discussion = html_to_plaintext(cleaned_discussion)
    cleaned_discussion = split_text_into_sentences(cleaned_discussion)
    cleaned_discussion = clean_discussion_text(cleaned_discussion)
    if not cleaned_discussion.strip():
        # Section extraction came back empty; try the table-based fallback.
        fallback_url = f"{base_url}/{date_str}"
        discussion_content, outcome = extract_fallback_discussion(fallback_url, title)
        cleaned_discussion = html_to_plaintext(discussion_content)
        cleaned_discussion = split_text_into_sentences(cleaned_discussion)
        cleaned_discussion = clean_discussion_text(cleaned_discussion)
        if outcome:
            label = normalize_outcome(outcome)
    df = pd.DataFrame([[title, discussion_url, cleaned_discussion, label]],
                      columns=['title', 'discussion_url', 'discussion', 'outcome'])
    return df
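
# Usage sketch ('Τίτλος λήμματος' is a placeholder title):
#   df = extract_div_from_title_with_fallback('Τίτλος λήμματος', date='01/2022')
#   df.columns -> ['title', 'discussion_url', 'discussion', 'outcome']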
###################################
#   The collect_gr() function     #
###################################
def collect_gr(mode='url', title='', url='', years=None):
    # Avoid a mutable default argument; normalize 'years' to a list here.
    if years is None:
        years = []
    if mode not in ['title', 'year', 'url']:
        raise ValueError("mode must be one of 'title', 'year', or 'url'.")
    if mode == 'title':
        if not title or not years or len(years) != 1:
            raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be a single-element list like ['mm/yyyy'].")
        date = years[0]
        df = extract_div_from_title_with_fallback(title, date=date)
        return df[['title', 'discussion_url', 'discussion', 'outcome']]
    elif mode == 'url':
        if title or years:
            raise ValueError("For 'url' mode, 'title' must be empty and 'years' must be empty.")
        # Recover the date and title from a URL of the form:
        # https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/{date_str}#{title}
        match = re.search(r'Βικιπαίδεια:Σελίδες_για_διαγραφή/([^#]+)#(.+)', url)
        if not match:
            raise ValueError("URL format is incorrect.")
        date_str, title = match.groups()
        df = extract_div_from_title_with_fallback(title, date=date_str)
        return df[['title', 'discussion_url', 'discussion', 'outcome']]
    elif mode == 'year':
        if title or not years:
            raise ValueError("For 'year' mode, 'title' must be empty and 'years' must be provided.")
        if len(years) == 1:
            start_year = end_year = years[0]
        elif len(years) == 2:
            start_year, end_year = min(years), max(years)
        else:
            raise ValueError("Invalid years input. Provide one year or two years for a range.")
        months = ['Ιανουαρίου', 'Φεβρουαρίου', 'Μαρτίου', 'Απριλίου', 'Μαΐου', 'Ιουνίου',
                  'Ιουλίου', 'Αυγούστου', 'Σεπτεμβρίου', 'Οκτωβρίου', 'Νοεμβρίου', 'Δεκεμβρίου']
        all_data = []
        for year in range(start_year, end_year + 1):
            # One archive page per month; try the collapsible layout first and
            # fall back to the non-collapsible one.
            for month in months:
                url = f"https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/{month}_{year}"
                df = extract_discussions_from_page_collapsible(url)
                if df.empty:
                    df = extract_discussions_from_page_non_collapsible(url)
                if not df.empty:
                    df['result'] = df['result'].apply(normalize_outcome)
                    df['discussion_url'] = df.apply(lambda row: row['text_url'] + '#' + row['title'].replace(' ', '_'), axis=1)
                    df = df.rename(columns={'result': 'outcome'})
                    all_data.append(df[['title', 'discussion_url', 'discussion', 'outcome']])
        if all_data:
            return pd.concat(all_data, ignore_index=True)
        else:
            return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])
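
# Minimal usage sketch for the three modes (the titles and dates below are
# placeholders, not entries known to exist in the archive; network access
# is required):
if __name__ == '__main__':
    # By title and month:
    #   collect_gr(mode='title', title='Τίτλος λήμματος', years=['01/2022'])
    # By direct archive URL:
    #   collect_gr(mode='url', url='https://el.wikipedia.org/wiki/'
    #              'Βικιπαίδεια:Σελίδες_για_διαγραφή/Ιανουαρίου_2022#Τίτλος_λήμματος')
    # By year (fetches one archive page per month):
    df = collect_gr(mode='year', years=[2022])
    print(df.head())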