wide_analysis_space / data_collect.py
hsuvaskakoty's picture
Upload 9 files
0d0a4e0 verified
raw
history blame
9.09 kB
from datetime import datetime
from wide_analysis.data.process_data import prepare_dataset
from datasets import load_dataset
from collect_data_wikidata_ent import collect_wikidata_entity
from collect_data_wikidata_prop import collect_wikidata
from collect_data_wikinews import collect_wikinews
from collect_data_wikiquote import collect_wikiquote
from collect_data_es import collect_es
from collect_data_gr import collect_gr
def normalize_outcome(o):
    """Collapse a free-text Greek outcome string into a canonical label.

    The text is lower-cased and scanned for known Greek word stems. Rules
    are tried in order and the first match wins:

    * contains 'διαγρ'                 -> 'Διαγραφή'   (deletion)
    * contains 'διατήρη' or 'παραμονή' -> 'Διατήρηση'  (keep)
    * contains 'συγχών'                -> 'συγχώνευση' (merge)

    Anything else maps to 'Δεν υπάρχει συναίνεση' (no consensus).
    """
    text = o.lower()
    # Ordered (stems, label) pairs; order matters because a string may
    # contain more than one stem.
    rules = (
        (('διαγρ',), 'Διαγραφή'),
        (('διατήρη', 'παραμονή'), 'Διατήρηση'),
        (('συγχών',), 'συγχώνευση'),
    )
    for stems, label in rules:
        if any(stem in text for stem in stems):
            return label
    return 'Δεν υπάρχει συναίνεση'
def _years_from_dates(start_date, end_date, source_label):
    """Convert 'YYYY-MM-DD' date strings into a list of calendar years.

    Returns ``[start_year, end_year]`` when both dates are given and
    ``[start_year]`` when only ``start_date`` is given.

    Raises:
        ValueError: if ``start_date`` is missing, or if a date string does
            not match ``%Y-%m-%d`` (raised by ``datetime.strptime``).
    """
    if not start_date:
        raise ValueError(
            f"For 'year' mode in {source_label}, start_date (and optionally end_date) is required."
        )
    span = [datetime.strptime(start_date, "%Y-%m-%d").year]
    if end_date:
        span.append(datetime.strptime(end_date, "%Y-%m-%d").year)
    return span


def collect(mode, start_date=None, end_date=None, url=None, title=None, output_path=None,
            platform=None, lang=None, date=None, years=None):
    """Dispatch a data-collection request to the matching platform collector.

    Args:
        mode: One of 'date_range', 'date', 'title', 'url' or 'wide_2023'.
            'date' and 'date_range' are treated as a 'year' request for
            every platform other than English Wikipedia.
        start_date: 'YYYY-MM-DD' string; only its year is used by the
            wikidata, wikinews and Greek Wikipedia collectors.
        end_date: Optional 'YYYY-MM-DD' string closing a year range.
        url: Discussion URL for 'url' mode.
        title: Discussion title for 'title' mode.
        output_path: Forwarded to ``prepare_dataset`` (English Wikipedia only).
        platform: None, 'wikipedia', 'wikidata_entity', 'wikidata_property',
            'wikinews' or 'wikiquote'. None (with ``lang`` None) selects
            English Wikipedia.
        lang: For platform 'wikipedia': 'en', 'es' or 'gr'.
        date: Date string required by Spanish Wikipedia 'year' mode.
        years: Single-element list required by Greek Wikipedia 'title' mode;
            must be empty for wikidata 'title' mode.

    Returns:
        Whatever the selected collector returns. For English Wikipedia in
        'url' mode a message is printed and None is returned.

    Raises:
        ValueError: on an unknown mode, platform or language, on a missing
            or conflicting argument for the chosen mode, or on a date string
            that is not in 'YYYY-MM-DD' format.
    """
    if mode not in ['date_range', 'date', 'title', 'url', 'wide_2023']:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']")

    if mode == 'wide_2023':
        dataset = load_dataset('hsuvaskakoty/wide_analysis')
        print('Dataset loaded successfully as huggingface dataset')
        print('The dataset has the following columns:', dataset.column_names)
        return dataset

    # Non-English collectors only know 'title', 'url' and 'year'.
    underlying_mode = 'year' if mode in ('date', 'date_range') else mode

    # English Wikipedia (the default) goes through prepare_dataset.
    if (platform is None and lang is None) or (platform == 'wikipedia' and lang == 'en'):
        if mode == 'url':
            # 'url' is not supported here; the original best-effort behaviour
            # (print and return None) is kept rather than raising.
            print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']")
            return None
        return prepare_dataset(
            mode=mode,
            start_date=start_date,
            end_date=end_date,
            url=url,
            title=title,
            output_path=output_path,
        )

    if platform == 'wikidata_entity':
        if underlying_mode == 'title':
            if not title or years:
                raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata_entity(mode='title', title=title, years=[])
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date, 'wikidata entity')
            # A lone year is passed as a bare int and a range as a
            # two-element list, as this collector was always called.
            return collect_wikidata_entity(mode='year', years=span if len(span) == 2 else span[0])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.")
            return collect_wikidata_entity(mode='url', url=url)
        raise ValueError("Invalid mode for wikidata entity. Use 'title', 'year' or 'url'.")

    if platform == 'wikidata_property':
        if underlying_mode == 'title':
            if not title or years:
                raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata(mode='title', title=title, years=[])
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.")
            return collect_wikidata(mode='url', title='', url=url, years=[])
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date, 'wikidata property')
            # Same int-or-pair convention as the entity collector.
            return collect_wikidata(mode='year', years=span if len(span) == 2 else span[0])
        raise ValueError("Invalid mode for wikidata property. Use 'title', 'year' or 'url'.")

    if platform == 'wikinews':
        if underlying_mode == 'title':
            if not title:
                raise ValueError("For 'title' mode in wikinews, 'title' is required.")
            return collect_wikinews(mode='title', title=title)
        if underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikinews, 'url' is required.")
            return collect_wikinews(mode='url', url=url)
        if underlying_mode == 'year':
            span = _years_from_dates(start_date, end_date, 'wikinews')
            # Note the keyword is 'year' (singular) for this collector.
            return collect_wikinews(mode='year', year=span if len(span) == 2 else span[0])
        raise ValueError("Invalid mode for wikinews. Use 'title' or 'year' or 'url'.")

    if platform == 'wikiquote':
        if underlying_mode not in ['title', 'url']:
            raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.")
        if underlying_mode == 'title':
            # A missing title means "collect everything".
            return collect_wikiquote(mode='title', title=title if title else 'all')
        if not url:
            raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.")
        return collect_wikiquote(mode='url', url=url)

    if platform == 'wikipedia':
        if lang == 'es':
            if underlying_mode == 'title':
                if not title or date:
                    raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.")
                return collect_es(mode='title', title=title, date='')
            if underlying_mode == 'year':
                if not date:
                    raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.")
                return collect_es(mode='year', title='', date=date)
            raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.")
        if lang == 'gr':
            if underlying_mode == 'title':
                if not title or not years or len(years) != 1:
                    raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.")
                return collect_gr(mode='title', title=title, years=years)
            if underlying_mode == 'year':
                # The Greek collector always receives a list, even for one year.
                span = _years_from_dates(start_date, end_date, 'greek wikipedia')
                return collect_gr(mode='year', title='', years=span)
            raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.")
        raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.")

    raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', 'wikidata_property', 'wikinews', or 'wikiquote'.")