"""Loads and prepares the Jeopardy question data used by the Mesop Jeopardy game."""

import json
import re
from collections import defaultdict

import trebek_bot

# NOTE: rows start out as string-to-string mappings, but the helpers below also
# attach integer-valued "raw_value" and "normalized_value" keys to each row.
QuestionSet = list[dict[str, str]]

_JEOPARDY_DATA = "data/jeopardy.json"
_NUM_QUESTIONS_PER_CATEGORY = 5
# Dollar increment between consecutive questions once values are normalized.
_NORMALIZED_VALUE_STEP = 200
# Matches a single HTML tag such as <i> or </a>; compiled once for reuse.
_HTML_TAG_PATTERN = re.compile(r"<[^<]+?>")


def load(use_gemini: bool = False) -> list[QuestionSet]:
    """Loads a cleaned up data set to use in Mesop Jeopardy game.

    Args:
      use_gemini: If enabled, use Gemini to generate questions instead of an
        existing data set.

    Returns:
      A list of question sets, one per (category, air date) pair, each sorted by
      raw value, annotated with a normalized value, and containing exactly
      `_NUM_QUESTIONS_PER_CATEGORY` questions.
    """
    if use_gemini:
        # NOTE(review): this path skips `_add_raw_value`, yet sorting below reads
        # "raw_value" from every row. Presumably `generate_questions()` already
        # supplies that key -- confirm against trebek_bot.
        data = trebek_bot.generate_questions()
    else:
        data = _load_raw_data()
        data = _add_raw_value(data)

    data = _clean_questions(data)
    question_sets = _group_into_question_sets(data)
    question_sets = _sort_question_sets(question_sets)
    question_sets = _normalize_values(question_sets)
    return _filter_out_incomplete_question_sets(question_sets)


def _load_raw_data() -> QuestionSet:
    """Load the raw data set.

    Format of each question/clue looks like this:

    {
      "category": "HISTORY",
      "air_date": "2004-12-31",
      "question": "'For the last 8 years of his life, Galileo was...",
      "value": "$200",
      "answer": "Copernicus",
      "round": "Jeopardy!",
      "show_number": "4680"
    }
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so the result
    # does not depend on the platform's locale default.
    with open(_JEOPARDY_DATA, "r", encoding="utf-8") as f:
        return json.load(f)


def _add_raw_value(data: QuestionSet) -> QuestionSet:
    """Add an integer "raw_value" to each row.

    The "value" field is formatted as a dollar string, which isn't as easy to
    sort. Rows are updated in place and the same list is returned.
    """
    for row in data:
        row["raw_value"] = _convert_dollar_amount(row["value"])
    return data


def _clean_questions(data: QuestionSet) -> QuestionSet:
    """Clean up questions.

    - Strip single quotes around each question
    - Replace escaped single quotes
    - Strip HTML tags

    Rows are updated in place and the same list is returned.
    """
    for row in data:
        unquoted = row["question"].strip("'").replace("\\'", "'")
        row["question"] = _HTML_TAG_PATTERN.sub("", unquoted)
    return data


def _convert_dollar_amount(value: str) -> int:
    """Converts raw value into an integer.

    The raw value is a string formatted as a dollar amount, such as $1,000.

    In this dataset the dollar amount is not given for Daily Doubles that were
    not answered, so we'll set those cases to a value of 0 for now.

    In addition, answered Daily Doubles will have odd dollar amounts.

    These values won't be used in the actual game. They are only used for
    roughly sorting the questions by difficulty.

    Args:
      value: Dollar-formatted string. Any falsy value (e.g. an empty string or
        None) is treated as a missing amount.

    Returns:
      The amount as an integer, or 0 when the amount is missing.
    """
    if not value:
        return 0
    return int(value.replace("$", "").replace(",", ""))


def _group_into_question_sets(data: QuestionSet) -> list[QuestionSet]:
    """Groups the questions by category for that air date.

    We want to mix and match questions across games, but we want to keep the
    questions within a category together.
    """
    question_sets = defaultdict(list)
    for row in data:
        question_sets[(row["category"], row["air_date"])].append(row)
    return list(question_sets.values())


def _sort_question_sets(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Returns a new list in which every question set is sorted by raw value."""
    return [_sort_question_set(question_set) for question_set in question_sets]


def _sort_question_set(question_set: QuestionSet) -> QuestionSet:
    """Sort the question set so it is ordered roughly in order of difficulty.

    This will not always be true due to Daily Doubles skewing the order. The
    data set did not store the Daily Double values separately from the normal
    game value.
    """
    return sorted(question_set, key=lambda q: q["raw_value"])


def _normalize_values(question_sets: list[QuestionSet]) -> list[QuestionSet]:
    """Normalizes question dollar amounts based on order of appearance.

    Since we are picking random categories across different rounds and years,
    the dollar values will differ. So we will normalize them here.

    Each question gets a "normalized_value" equal to its 1-based position within
    its set multiplied by `_NORMALIZED_VALUE_STEP`. Questions are updated in
    place and the same list is returned.
    """
    for question_set in question_sets:
        for position, question in enumerate(question_set, start=1):
            question["normalized_value"] = position * _NORMALIZED_VALUE_STEP
    return question_sets


def _filter_out_incomplete_question_sets(
    question_sets: list[QuestionSet],
) -> list[QuestionSet]:
    """Filters out question sets that are incomplete (do not contain five questions).

    Final Jeopardy categories only have one question so we want to ignore those.
    We also want to avoid anomalies in the data set.

    In addition there are cases where not all questions were answered for a
    category. This means that we will be missing a question on the board.
    """
    return [
        question_set
        for question_set in question_sets
        if len(question_set) == _NUM_QUESTIONS_PER_CATEGORY
    ]