from datetime import datetime

from datasets import load_dataset

from process_data import prepare_dataset
from collect_data_wikidata_ent import collect_wikidata_entity
from collect_data_wikidata_prop import collect_wikidata
from collect_data_wikinews import collect_wikinews
from collect_data_wikiquote import collect_wikiquote
from collect_data_es import collect_es
from collect_data_gr import collect_gr


def normalize_outcome(o):
    """Map a free-text outcome string onto one of four fixed Greek labels.

    The input is lower-cased and tested for substrings in a fixed order; the
    first match wins and anything unmatched falls through to the last label.

    Args:
        o: Outcome text (must support ``.lower()``).

    Returns:
        One of 'Διαγραφή', 'Διατήρηση', 'συγχώνευση' or
        'Δεν υπάρχει συναίνεση'.
    """
    lowered = o.lower()
    if 'διαγρ' in lowered:
        # "delete"
        return 'Διαγραφή'
    elif 'διατήρη' in lowered or 'παραμονή' in lowered:
        # "keep" / "remain"
        return 'Διατήρηση'
    elif 'συγχών' in lowered:
        # "merge" -- NOTE(review): this label is lower-case, unlike the others;
        # left unchanged because consumers may match on the exact string.
        return 'συγχώνευση'
    else:
        # "no consensus"
        return 'Δεν υπάρχει συναίνεση'


def _year_bounds(start_date, end_date, target):
    """Convert 'YYYY-MM-DD' date strings into calendar years.

    Args:
        start_date: Required date string in ``%Y-%m-%d`` format.
        end_date: Optional date string in ``%Y-%m-%d`` format.
        target: Human-readable platform name used in the error message.

    Returns:
        ``(start_year, end_year)`` where ``end_year`` is ``None`` when
        ``end_date`` is falsy.

    Raises:
        ValueError: If ``start_date`` is missing, or (from ``strptime``) if a
            date string does not match ``%Y-%m-%d``.
    """
    if not start_date:
        raise ValueError(
            f"For 'year' mode in {target}, start_date (and optionally end_date) is required."
        )
    start_year = datetime.strptime(start_date, "%Y-%m-%d").year
    if not end_date:
        return start_year, None
    return start_year, datetime.strptime(end_date, "%Y-%m-%d").year


def collect(mode, start_date=None, end_date=None, url=None, title=None,
            output_path=None, platform=None, lang=None, date=None, years=None):
    """Dispatch a data-collection request to the platform-specific collector.

    ``mode`` is validated first. 'wide_2023' loads a prebuilt dataset and
    returns it. Otherwise 'date' and 'date_range' are both treated as a
    'year' request, and the call is routed by ``platform`` / ``lang``.

    Args:
        mode: One of 'date_range', 'date', 'title', 'url', 'wide_2023'.
        start_date: 'YYYY-MM-DD' string; its year is used for 'year' requests
            (wikidata, wikinews, Greek Wikipedia). Forwarded unchanged to
            ``prepare_dataset`` for English Wikipedia.
        end_date: Optional 'YYYY-MM-DD' string; with ``start_date`` it forms a
            ``[start_year, end_year]`` pair.
        url: Discussion URL for 'url' mode.
        title: Title for 'title' mode.
        output_path: Forwarded to ``prepare_dataset`` only.
        platform: None, 'wikipedia', 'wikidata_entity', 'wikidata_property',
            'wikinews' or 'wikiquote'.
        lang: For ``platform='wikipedia'``: 'en', 'es' or 'gr'.
        date: Used only by Spanish Wikipedia ('year' requests require it,
            'title' requests require it to be empty).
        years: Must be empty for wikidata 'title' requests; must be a
            single-element list for Greek Wikipedia 'title' requests.

    Returns:
        Whatever the selected collector returns. ``None`` (after printing a
        message) when English Wikipedia is selected with an unsupported mode.

    Raises:
        ValueError: On an unknown mode, platform or language, or when the
            arguments required by the selected branch are missing.
    """
    if mode not in ['date_range', 'date', 'title', 'url', 'wide_2023']:
        raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']")

    if mode == 'wide_2023':
        dataset = load_dataset('hsuvaskakoty/wide_analysis')
        print('Dataset loaded successfully as huggingface dataset')
        print('The dataset has the following columns:', dataset.column_names)
        return dataset

    # 'date' and 'date_range' are both served by the collectors' 'year' mode;
    # 'title' and 'url' pass through unchanged.
    underlying_mode = 'year' if mode in ['date', 'date_range'] else mode

    # Default target: English Wikipedia (either nothing specified, or explicit).
    if (platform is None and lang is None) or (platform == 'wikipedia' and lang == 'en'):
        if mode in ['date_range', 'date', 'title']:
            return prepare_dataset(
                mode=mode,
                start_date=start_date,
                end_date=end_date,
                url=url,
                title=title,
                output_path=output_path
            )
        else:
            # Best-effort: report and return None rather than raise.
            print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']")
            return None

    if platform == 'wikidata_entity':
        if underlying_mode == 'title':
            if not title or (years and len(years) > 0):
                raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata_entity(mode='title', title=title, years=[])
        elif underlying_mode == 'year':
            first, last = _year_bounds(start_date, end_date, 'wikidata entity')
            if last is not None:
                return collect_wikidata_entity(mode='year', years=[first, last])
            # NOTE(review): a single year is passed as a bare int, not a list;
            # confirm this is what collect_wikidata_entity expects.
            return collect_wikidata_entity(mode='year', years=first)
        elif underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.")
            return collect_wikidata_entity(mode='url', url=url)
        else:
            raise ValueError("Invalid mode for wikidata entity. Use 'title' or 'year'.")

    elif platform == 'wikidata_property':
        if underlying_mode == 'title':
            if not title or (years and len(years) > 0):
                raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.")
            return collect_wikidata(mode='title', title=title, years=[])
        elif underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.")
            return collect_wikidata(mode='url', title='', url=url, years=[])
        elif underlying_mode == 'year':
            first, last = _year_bounds(start_date, end_date, 'wikidata property')
            if last is not None:
                return collect_wikidata(mode='year', years=[first, last])
            # NOTE(review): bare int for a single year; confirm against callee.
            return collect_wikidata(mode='year', years=first)
        else:
            raise ValueError("Invalid mode for wikidata property. Use 'title' or 'year'.")

    elif platform == 'wikinews':
        if underlying_mode == 'title':
            if not title:
                raise ValueError("For 'title' mode in wikinews, 'title' is required.")
            return collect_wikinews(mode='title', title=title)
        elif underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikinews, 'url' is required.")
            return collect_wikinews(mode='url', url=url)
        elif underlying_mode == 'year':
            first, last = _year_bounds(start_date, end_date, 'wikinews')
            if last is not None:
                return collect_wikinews(mode='year', year=[first, last])
            return collect_wikinews(mode='year', year=first)
        else:
            raise ValueError("Invalid mode for wikinews. Use 'title' or 'year' or 'url'.")

    elif platform == 'wikiquote':
        if underlying_mode not in ['title', 'url']:
            raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.")
        if underlying_mode == 'title':
            # A missing title is replaced by the literal 'all'.
            if not title:
                title = 'all'
            return collect_wikiquote(mode='title', title=title)
        elif underlying_mode == 'url':
            if not url:
                raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.")
            return collect_wikiquote(mode='url', url=url)

    elif platform == 'wikipedia':
        if lang == 'es':
            if underlying_mode == 'title':
                if not title or date:
                    raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.")
                return collect_es(mode='title', title=title, date='')
            elif underlying_mode == 'year':
                if not date:
                    raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.")
                return collect_es(mode='year', title='', date=date)
            else:
                raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.")

        elif lang == 'gr':
            if underlying_mode == 'title':
                if not title or not years or len(years) != 1:
                    raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.")
                return collect_gr(mode='title', title=title, years=years)
            elif underlying_mode == 'year':
                first, last = _year_bounds(start_date, end_date, 'greek wikipedia')
                if last is not None:
                    return collect_gr(mode='year', title='', years=[first, last])
                # Unlike the wikidata/wikinews branches, a single year is
                # wrapped in a one-element list here.
                return collect_gr(mode='year', title='', years=[first])
            else:
                raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.")

        else:
            raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.")

    else:
        raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', 'wikidata_property', 'wikinews', or 'wikiquote'.")