Spaces:

reddit-tools-HF
/

dataset-creator-reddit-bestofredditorupdates

Running

App Files Files Community

derek-thomas committed on Oct 25, 2023

Commit

285612d

1 Parent(s): 1d46c26

Major updates, moving away from pushshift.io into PRAW

Browse files

Files changed (16) hide show

.gitignore +2 -1
Dockerfile +6 -2
app.py +2 -2
archive/subreddit_downloader.py +0 -145
main.py +40 -94
media/reddit_scraper.drawio.html +0 -11
media/reddit_scraper.drawio.png +0 -0
notebooks/data_processing.ipynb +0 -0
notebooks/explore.ipynb +0 -323
notebooks/validate.ipynb +0 -617
requirements.txt +5 -5
utilities/data_collator.py +55 -0
utilities/my_logger.py +22 -0
utilities/praw_downloader.py +54 -0
utilities/praw_processor.py +35 -0
utilities/readme_update.py +8 -12

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 .idea/
 notebooks/.ipynb_checkpoints
-mylog.log

 .idea/
 notebooks/.ipynb_checkpoints
+mylog.log
+.env

Dockerfile CHANGED Viewed

@@ -1,8 +1,10 @@
 # Use the official Python base image
-FROM python:3.9
 # Install Git LFS
-RUN apt-get update && apt-get install -y git-lfs
 # https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
 RUN useradd -m -u 1000 user
@@ -29,7 +31,9 @@ COPY . .
 COPY supervisord.conf .
 # Set permissions on the log file
 RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
 # RUN mkdir -m 777 -p /.cache/huggingface/hub/

 # Use the official Python base image
+FROM python:3.10
 # Install Git LFS
+RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
+RUN apt-get -o Acquire::AllowInsecureRepositories=true update && apt-get install -y git-lfs
+#RUN apt-get update && apt-get install -y git-lfs
 # https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
 RUN useradd -m -u 1000 user
 COPY supervisord.conf .
 # Set permissions on the log file
+USER root
 RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
+USER user
 # RUN mkdir -m 777 -p /.cache/huggingface/hub/

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ proj_dir = Path(__name__).parent
 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
-dataset_name = f"{username}/dataset-creator-{subreddit}"
 def log_file_to_html_string():
@@ -37,7 +37,7 @@ markdown = f"""
 # Reddit Scraper
 This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
-As shown below this space pulls data from pushshift.io, processes it, and puts it in a corresponding dataset.
 """
 with gr.Blocks() as demo:

 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
+dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 def log_file_to_html_string():
 # Reddit Scraper
 This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
+As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
 """
 with gr.Blocks() as demo:

archive/subreddit_downloader.py DELETED Viewed

@@ -1,145 +0,0 @@
-import csv
-import json
-import sys
-import time
-import traceback
-from datetime import datetime
-import requests
-username = ""  # put the username you want to download in the quotes
-subreddit = "BestofRedditorUpdates"  # put the subreddit you want to download in the quotes
-thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
-# leave either one blank to download an entire user's or subreddit's history
-# or fill in both to download a specific users history from a specific subreddit
-# change this to one of "human", "csv" or "json"
-# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
-# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
-# - json: the full json object
-output_format = "csv"
-# default start time is the current time and default end time is all history
-# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
-# start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
-start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
-end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")
-convert_to_ascii = False  # don't touch this unless you know what you're doing
-convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing
-def write_csv_line(writer, obj, is_submission):
-    output_list = []
-    output_list.append(str(obj['score']))
-    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
-    if is_submission:
-        output_list.append(obj['title'])
-    output_list.append(f"u/{obj['author']}")
-    output_list.append(f"https://www.reddit.com{obj['permalink']}")
-    if is_submission:
-        if obj['is_self']:
-            if 'selftext' in obj:
-                output_list.append(obj['selftext'])
-            else:
-                output_list.append("")
-        else:
-            output_list.append(obj['url'])
-    else:
-        output_list.append(obj['body'])
-    writer.writerow(output_list)
-def write_json_line(handle, obj):
-    handle.write(json.dumps(obj))
-    handle.write("\n")
-def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
-    print(f"Saving to {filename}")
-    count = 0
-    if output_format == "human" or output_format == "json":
-        if convert_to_ascii:
-            handle = open(filename, 'w', encoding='ascii')
-        else:
-            handle = open(filename, 'w', encoding='UTF-8')
-    else:
-        handle = open(filename, 'w', encoding='UTF-8', newline='')
-        writer = csv.writer(handle)
-    previous_epoch = int(start_datetime.timestamp())
-    break_out = False
-    while True:
-        new_url = url_base + str(previous_epoch)
-        json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
-        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
-        try:
-            json_data = json_text.json()
-        except json.decoder.JSONDecodeError:
-            time.sleep(1)
-            continue
-        if 'data' not in json_data:
-            break
-        objects = json_data['data']
-        if len(objects) == 0:
-            break
-        for obj in objects:
-            previous_epoch = obj['created_utc'] - 1
-            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
-                break_out = True
-                break
-            count += 1
-            try:
-                if output_format == "csv":
-                    write_csv_line(writer, obj, is_submission)
-                elif output_format == "json":
-                    write_json_line(handle, obj)
-            except Exception as err:
-                if 'permalink' in obj:
-                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
-                else:
-                    print(f"Couldn't print object, missing permalink: {obj['id']}")
-                print(err)
-                print(traceback.format_exc())
-        if break_out:
-            break
-        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
-    print(f"Saved {count}")
-    handle.close()
-if __name__ == "__main__":
-    filter_string = None
-    if username == "" and subreddit == "" and thread_id == "":
-        print("Fill in username, subreddit or thread id")
-        sys.exit(0)
-    if output_format not in ("human", "csv", "json"):
-        print("Output format must be one of human, csv, json")
-        sys.exit(0)
-    filters = []
-    if username:
-        filters.append(f"author={username}")
-    if subreddit:
-        filters.append(f"subreddit={subreddit}")
-    if thread_id:
-        if convert_thread_id_to_base_ten:
-            filters.append(f"link_id={int(thread_id, 36)}")
-        else:
-            filters.append(f"link_id=t3_{thread_id}")
-    filter_string = '&'.join(filters)
-    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
-    if not thread_id:
-        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
-                          end_time, True, convert_to_ascii)
-    # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
-    #                   end_time, False, convert_to_ascii)

main.py CHANGED Viewed

@@ -3,17 +3,18 @@ import time
 from datetime import datetime, timedelta
 import pandas as pd
-from datasets import Dataset, DatasetDict, load_dataset
 from huggingface_hub import login
-from my_logger import setup_logger
-from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe
 from utilities.readme_update import update_readme
 # Set dataset name, path to README.md, and existing dataset details
 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
-dataset_name = f"{username}/dataset-creator-{subreddit}"
 dataset_readme_path = "README.md"
 # Authenticate with Hugging Face using an auth token
@@ -23,94 +24,6 @@ login(auth_token, add_to_git_credential=True)
 logger = setup_logger(__name__)
-def main(dataset, date_to_fetch):
-    """
-    Runs the main data processing function to fetch and process subreddit data for the specified date.
-    Args:
-        dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
-        date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
-    Returns:
-        most_recent_date (str): The most recent date in the updated dataset.
-    """
-    # Call get_subreddit_day with the calculated date
-    logger.info(f"Fetching data for {str(date_to_fetch)}")
-    submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
-    df = submissions_to_dataframe(submissions)
-    logger.debug(f"Data fetched for {str(date_to_fetch)}")
-    most_recent_date = date_to_fetch
-    # Append DataFrame to split 'all_days' or create new split
-    if "all_days" in dataset:
-        logger.debug("Appending data to split 'all_days'")
-        # Merge the new submissions
-        old_data = dataset['all_days'].to_pandas()
-        new_data = pd.concat([old_data, df], ignore_index=True)
-        if '__index_level_0__' in new_data.columns:
-            new_data = new_data.drop('__index_level_0__', axis=1)
-        # Drop duplicates just in case
-        new_data = new_data.drop_duplicates(subset=['id'], keep="first")
-        # Figure out dates when we restart
-        old_data_most_recent_date = old_data['date'].max()
-        old_data_most_recent_date = datetime.strptime(old_data_most_recent_date, '%Y-%m-%d').date()
-        most_recent_date = max(old_data_most_recent_date, most_recent_date)
-        if len(old_data) == len(new_data):
-            logger.warning("Data in hub is much more recent, using that next!")
-            return most_recent_date
-        # Convert back to dataset
-        dataset["all_days"] = Dataset.from_pandas(new_data)
-        # Update README
-        update_readme(dataset_name, subreddit, date_to_fetch)
-    else:
-        logger.debug("Creating new split 'all_days'")
-        dataset["all_days"] = Dataset.from_pandas(df)
-    # Log appending or creating split 'all'
-    logger.debug("Appended or created split 'all_days'")
-    # Push the augmented dataset to the Hugging Face hub
-    logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
-    dataset.push_to_hub(dataset_name, token=auth_token)
-    logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
-    return most_recent_date
-def run_main_continuously():
-    """
-    This function runs the given `main_function` continuously, starting from the date specified
-    in the environment variable "START_DATE" until two days ago. Once it reaches two days ago,
-    it will wait until tomorrow to start again at the same time as when it started today.
-    """
-    start_date_str = os.environ.get("START_DATE")
-    start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
-    # Calculate the start time for running the main_function every day.
-    start_time = datetime.now().time()
-    dataset = get_dataset()
-    while True:
-        today = datetime.now().date()
-        two_days_ago = today - timedelta(days=2)
-        if start_date <= two_days_ago:
-            logger.warning(f"Running main function for date: {start_date}")
-            most_recent_date = main(dataset, start_date)
-            start_date = most_recent_date + timedelta(days=1)
-        else:
-            tomorrow = today + timedelta(days=1)
-            now = datetime.now()
-            start_of_tomorrow = datetime.combine(tomorrow, start_time)
-            wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
-            logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
-            time.sleep(wait_until_tomorrow)
 def get_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
@@ -124,5 +37,38 @@ def get_dataset():
     return dataset
-if __name__ == '__main__':
-    run_main_continuously()

 from datetime import datetime, timedelta
 import pandas as pd
+import schedule
+from datasets import DatasetDict, load_dataset, Dataset
 from huggingface_hub import login
+from utilities.data_collator import merge_and_filter_data
+from utilities.my_logger import setup_logger
 from utilities.readme_update import update_readme
 # Set dataset name, path to README.md, and existing dataset details
 subreddit = os.environ["SUBREDDIT"]
 username = os.environ["USERNAME"]
+dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
 dataset_readme_path = "README.md"
 # Authenticate with Hugging Face using an auth token
 logger = setup_logger(__name__)
 def get_dataset():
     # Load the existing dataset from the Hugging Face hub or create a new one
     try:
     return dataset
+def main():
+    date = datetime.now().strftime('%Y-%m-%d')
+    logger.warning(f"Running main function for date: {date}")
+    dataset = get_dataset()
+    # Get Latest Data and merge with historic data
+    old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
+    new_df = merge_and_filter_data(old_df=old_df)
+    dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
+    # Update README
+    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date)
+    # Push the augmented dataset to the Hugging Face hub
+    logger.debug(f"Pushing data for {date} to the Hugging Face hub")
+    dataset.push_to_hub(dataset_name, token=auth_token)
+    logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
+def schedule_daily_task():
+    """
+    Run main() once immediately, then schedule it to run at the same time every day.
+    """
+    start_time = (datetime.now() + timedelta(seconds=5)).time().strftime('%H:%M')  # Now + 5 seconds
+    logger.info(f'Scheduling tasks to run every day at: {start_time}')
+    main()
+    schedule.every().day.at(start_time).do(main)
+    while True:
+        schedule.run_pending()
+        time.sleep(1)
+if __name__ == "__main__":
+    schedule_daily_task()

media/reddit_scraper.drawio.html DELETED Viewed

@@ -1,11 +0,0 @@
-<!--[if IE]><meta http-equiv="X-UA-Compatible" content="IE=5,IE=9" ><![endif]-->
-<!DOCTYPE html>
-<html>
-<head>
-<title>reddit_scraper</title>
-<meta charset="utf-8"/>
-</head>
-<body><div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{&quot;highlight&quot;:&quot;#0000ff&quot;,&quot;nav&quot;:true,&quot;resize&quot;:true,&quot;toolbar&quot;:&quot;zoom layers tags lightbox&quot;,&quot;edit&quot;:&quot;_blank&quot;,&quot;xml&quot;:&quot;&lt;mxfile host=\&quot;app.diagrams.net\&quot; modified=\&quot;2023-04-14T12:12:14.014Z\&quot; agent=\&quot;Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\&quot; etag=\&quot;puEjOIZigDmpONhGThsE\&quot; version=\&quot;21.1.7\&quot; type=\&quot;device\&quot;&gt;\n  &lt;diagram name=\&quot;Page-1\&quot; id=\&quot;14ddc1Tw5ZQC4xUkB2ri\&quot;&gt;\n    &lt;mxGraphModel dx=\&quot;1034\&quot; dy=\&quot;783\&quot; grid=\&quot;1\&quot; gridSize=\&quot;10\&quot; guides=\&quot;1\&quot; tooltips=\&quot;1\&quot; connect=\&quot;1\&quot; arrows=\&quot;1\&quot; fold=\&quot;1\&quot; page=\&quot;1\&quot; pageScale=\&quot;1\&quot; pageWidth=\&quot;850\&quot; pageHeight=\&quot;1100\&quot; math=\&quot;0\&quot; shadow=\&quot;0\&quot;&gt;\n      &lt;root&gt;\n        &lt;mxCell id=\&quot;0\&quot; /&gt;\n        &lt;mxCell id=\&quot;1\&quot; parent=\&quot;0\&quot; /&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-3\&quot; value=\&quot;\&quot; style=\&quot;edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;\&quot; edge=\&quot;1\&quot; parent=\&quot;1\&quot; source=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; target=\&quot;KhBTRBst3V2Bs5u7l5Na-2\&quot;&gt;\n          &lt;mxGeometry relative=\&quot;1\&quot; as=\&quot;geometry\&quot; /&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-7\&quot; value=\&quot;HF API\&quot; style=\&quot;edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\&quot; vertex=\&quot;1\&quot; connectable=\&quot;0\&quot; parent=\&quot;KhBTRBst3V2Bs5u7l5Na-3\&quot;&gt;\n          &lt;mxGeometry x=\&quot;-0.125\&quot; y=\&quot;1\&quot; 
relative=\&quot;1\&quot; as=\&quot;geometry\&quot;&gt;\n            &lt;mxPoint as=\&quot;offset\&quot; /&gt;\n          &lt;/mxGeometry&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; value=\&quot;HF SPACE&amp;lt;br&amp;gt;&amp;lt;a href=&amp;quot;SPACE_LINK&amp;quot;&amp;gt;SPACE_NAME&amp;lt;/a&amp;gt;\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n          &lt;mxGeometry x=\&quot;340\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-2\&quot; value=\&quot;HF DATASET &amp;lt;br&amp;gt;&amp;lt;a href=&amp;quot;DATASET_LINK&amp;quot;&amp;gt;DATASET_NAME&amp;lt;/a&amp;gt;\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n          &lt;mxGeometry x=\&quot;110\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-4\&quot; value=\&quot;&amp;lt;a href=&amp;quot;pushshift.io&amp;quot;&amp;gt;Pushshift.io&amp;lt;/a&amp;gt;&amp;lt;br&amp;gt;Hosts Reddit Data\&quot; style=\&quot;rounded=1;whiteSpace=wrap;html=1;fillColor=#d5e8d4;strokeColor=#82b366;\&quot; vertex=\&quot;1\&quot; parent=\&quot;1\&quot;&gt;\n          &lt;mxGeometry x=\&quot;590\&quot; y=\&quot;360\&quot; width=\&quot;160\&quot; height=\&quot;80\&quot; as=\&quot;geometry\&quot; /&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-8\&quot; value=\&quot;\&quot; style=\&quot;endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;\&quot; edge=\&quot;1\&quot; parent=\&quot;1\&quot; 
source=\&quot;KhBTRBst3V2Bs5u7l5Na-1\&quot; target=\&quot;KhBTRBst3V2Bs5u7l5Na-4\&quot;&gt;\n          &lt;mxGeometry width=\&quot;50\&quot; height=\&quot;50\&quot; relative=\&quot;1\&quot; as=\&quot;geometry\&quot;&gt;\n            &lt;mxPoint x=\&quot;470\&quot; y=\&quot;530\&quot; as=\&quot;sourcePoint\&quot; /&gt;\n            &lt;mxPoint x=\&quot;520\&quot; y=\&quot;480\&quot; as=\&quot;targetPoint\&quot; /&gt;\n          &lt;/mxGeometry&gt;\n        &lt;/mxCell&gt;\n        &lt;mxCell id=\&quot;KhBTRBst3V2Bs5u7l5Na-9\&quot; value=\&quot;HTTP\&quot; style=\&quot;edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\&quot; vertex=\&quot;1\&quot; connectable=\&quot;0\&quot; parent=\&quot;KhBTRBst3V2Bs5u7l5Na-8\&quot;&gt;\n          &lt;mxGeometry x=\&quot;0.225\&quot; y=\&quot;1\&quot; relative=\&quot;1\&quot; as=\&quot;geometry\&quot;&gt;\n            &lt;mxPoint x=\&quot;-9\&quot; y=\&quot;1\&quot; as=\&quot;offset\&quot; /&gt;\n          &lt;/mxGeometry&gt;\n        &lt;/mxCell&gt;\n      &lt;/root&gt;\n    &lt;/mxGraphModel&gt;\n  &lt;/diagram&gt;\n&lt;/mxfile&gt;\n&quot;}"></div>
-<script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>
-</body>
-</html>

media/reddit_scraper.drawio.png CHANGED Viewed

notebooks/data_processing.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/explore.ipynb DELETED Viewed

@@ -1,323 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "730ba509",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.core.interactiveshell import InteractiveShell\n",
-    "InteractiveShell.ast_node_interactivity = \"all\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d9acd4b6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pathlib import Path\n",
-    "import sys\n",
-    "proj_dir = Path.cwd().parent\n",
-    "\n",
-    "sys.path.append(str(proj_dir))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "62452860",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "a956a623",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "17df3f2812084d3591e914ffcfd948b0",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "0it [00:00, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
-      "2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
-      "2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
-      "2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
-      "2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
-      "2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
-      "2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
-     ]
-    }
-   ],
-   "source": [
-    "subreddit_to_scrape = \"askreddit\"\n",
-    "day_to_scrape = \"2013-03-01\"\n",
-    "submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "b1cc845b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>permalink</th>\n",
-       "      <th>selftext</th>\n",
-       "      <th>url</th>\n",
-       "      <th>created_utc</th>\n",
-       "      <th>author</th>\n",
-       "      <th>num_comments</th>\n",
-       "      <th>score</th>\n",
-       "      <th>title</th>\n",
-       "      <th>id</th>\n",
-       "      <th>downs</th>\n",
-       "      <th>ups</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
-       "      <td>Basically, do other parts of the world have th...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
-       "      <td>2013-03-01 19:58:55</td>\n",
-       "      <td>sjr63</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>In the way that popular English and American m...</td>\n",
-       "      <td>19hbm0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
-       "      <td>2013-03-01 19:58:50</td>\n",
-       "      <td>WeirdPlane</td>\n",
-       "      <td>13</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Could I buy an Android phone without a plan an...</td>\n",
-       "      <td>19hblp</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
-       "      <td>Yeah.\n",
-       "\n",
-       "How do I reddit? I don't use or read re...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
-       "      <td>2013-03-01 19:58:47</td>\n",
-       "      <td>xxnovaroxgg</td>\n",
-       "      <td>14</td>\n",
-       "      <td>0</td>\n",
-       "      <td>How do I reddit</td>\n",
-       "      <td>19hblj</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
-       "      <td>My brothers, dad and I have always been huge L...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
-       "      <td>2013-03-01 19:58:07</td>\n",
-       "      <td>tuffstough</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
-       "      <td>19hbjx</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
-       "      <td>I'm curious, guys tend to get asked the usual ...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
-       "      <td>2013-03-01 19:57:58</td>\n",
-       "      <td>redditredditx3</td>\n",
-       "      <td>13</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Female Redditors, which part of the male physi...</td>\n",
-       "      <td>19hbjk</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                           permalink  \\\n",
-       "0  /r/AskReddit/comments/19hbm0/in_the_way_that_p...   \n",
-       "1  /r/AskReddit/comments/19hblp/could_i_buy_an_an...   \n",
-       "2      /r/AskReddit/comments/19hblj/how_do_i_reddit/   \n",
-       "3  /r/AskReddit/comments/19hbjx/xpost_rsurvival_h...   \n",
-       "4  /r/AskReddit/comments/19hbjk/female_redditors_...   \n",
-       "\n",
-       "                                            selftext  \\\n",
-       "0  Basically, do other parts of the world have th...   \n",
-       "1                                                      \n",
-       "2  Yeah.\n",
-       "\n",
-       "How do I reddit? I don't use or read re...   \n",
-       "3  My brothers, dad and I have always been huge L...   \n",
-       "4  I'm curious, guys tend to get asked the usual ...   \n",
-       "\n",
-       "                                                 url          created_utc  \\\n",
-       "0  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:55   \n",
-       "1  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:50   \n",
-       "2  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:47   \n",
-       "3  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:07   \n",
-       "4  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:57:58   \n",
-       "\n",
-       "           author  num_comments  score  \\\n",
-       "0           sjr63             1      1   \n",
-       "1      WeirdPlane            13      1   \n",
-       "2     xxnovaroxgg            14      0   \n",
-       "3      tuffstough             0      1   \n",
-       "4  redditredditx3            13      2   \n",
-       "\n",
-       "                                               title      id  downs  ups  \n",
-       "0  In the way that popular English and American m...  19hbm0      0    1  \n",
-       "1  Could I buy an Android phone without a plan an...  19hblp      0    1  \n",
-       "2                                    How do I reddit  19hblj      0    0  \n",
-       "3  (x-post r/survival) Have any redditors seen Le...  19hbjx      0    1  \n",
-       "4  Female Redditors, which part of the male physi...  19hbjk      0    2  "
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = submissions_to_dataframe(submissions)\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "518addff",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6e5490dc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
-    "start_date"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf13555a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "48e413f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9e83befa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.dtypes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ba84be68",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

notebooks/validate.ipynb DELETED Viewed

@@ -1,617 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "730ba509",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from IPython.core.interactiveshell import InteractiveShell\n",
-    "InteractiveShell.ast_node_interactivity = \"all\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "d9acd4b6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pathlib import Path\n",
-    "import sys\n",
-    "proj_dir = Path.cwd().parent\n",
-    "\n",
-    "sys.path.append(str(proj_dir))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "62452860",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import load_dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "00affc9a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a106bb47c1194b15bc289d2ef24258af",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/804 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "705d55e70bf442f98a51dd0618a5c2c6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "139220a81674444997f7657a4c2e1a01",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0.00/702k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1a361406937144cebd4ff6168e56ec3d",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating all_days split:   0%|          | 0/3272 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset parquet downloaded and prepared to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4df7107473904386aebd66c543858abd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "dataset = load_dataset('derek-thomas/dataset-creator-askreddit', download_mode=\"reuse_cache_if_exists\", ignore_verifications=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "ba84be68",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>score</th>\n",
-       "      <th>num_comments</th>\n",
-       "      <th>title</th>\n",
-       "      <th>permalink</th>\n",
-       "      <th>selftext</th>\n",
-       "      <th>url</th>\n",
-       "      <th>created_utc</th>\n",
-       "      <th>author</th>\n",
-       "      <th>id</th>\n",
-       "      <th>downs</th>\n",
-       "      <th>ups</th>\n",
-       "      <th>date</th>\n",
-       "      <th>time</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>2</td>\n",
-       "      <td>4</td>\n",
-       "      <td>Reddit, if someone had to describe you to a st...</td>\n",
-       "      <td>/r/AskReddit/comments/15sn6y/reddit_if_someone...</td>\n",
-       "      <td>They would be talking about you without your p...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
-       "      <td>2013-01-01 23:59:40+00:00</td>\n",
-       "      <td>[deleted]</td>\n",
-       "      <td>15sn6y</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>23:59:40</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>5</td>\n",
-       "      <td>24</td>\n",
-       "      <td>What kind of car does the average \\nRedditor d...</td>\n",
-       "      <td>/r/AskReddit/comments/15sn6m/what_kind_of_car_...</td>\n",
-       "      <td>I've always wanted to know what kind of car th...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
-       "      <td>2013-01-01 23:59:31+00:00</td>\n",
-       "      <td>PaytonAdams</td>\n",
-       "      <td>15sn6m</td>\n",
-       "      <td>0</td>\n",
-       "      <td>5</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>23:59:31</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>5</td>\n",
-       "      <td>What movies have made you go back to the theat...</td>\n",
-       "      <td>/r/AskReddit/comments/15sn6b/what_movies_have_...</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
-       "      <td>2013-01-01 23:59:20+00:00</td>\n",
-       "      <td>[deleted]</td>\n",
-       "      <td>15sn6b</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>23:59:20</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0</td>\n",
-       "      <td>18</td>\n",
-       "      <td>Worst fear(s)?</td>\n",
-       "      <td>/r/AskReddit/comments/15sn4u/worst_fears/</td>\n",
-       "      <td>So what is your worst fear, reddit?</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
-       "      <td>2013-01-01 23:58:37+00:00</td>\n",
-       "      <td>[deleted]</td>\n",
-       "      <td>15sn4u</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>23:58:37</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>11</td>\n",
-       "      <td>29</td>\n",
-       "      <td>If there was a type of ink that lasted only fo...</td>\n",
-       "      <td>/r/AskReddit/comments/15sn44/if_there_was_a_ty...</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
-       "      <td>2013-01-01 23:58:15+00:00</td>\n",
-       "      <td>Honeybeard</td>\n",
-       "      <td>15sn44</td>\n",
-       "      <td>0</td>\n",
-       "      <td>11</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>23:58:15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3267</th>\n",
-       "      <td>0</td>\n",
-       "      <td>11</td>\n",
-       "      <td>Smokers of Reddit- What are your reasons for s...</td>\n",
-       "      <td>/r/AskReddit/comments/15qzen/smokers_of_reddit...</td>\n",
-       "      <td>I'm very curious as to what causes someone to ...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
-       "      <td>2013-01-01 00:01:36+00:00</td>\n",
-       "      <td>kelsofb</td>\n",
-       "      <td>15qzen</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>00:01:36</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3268</th>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>Hi</td>\n",
-       "      <td>/r/AskReddit/comments/15qzei/hi/</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
-       "      <td>2013-01-01 00:01:34+00:00</td>\n",
-       "      <td>ImJE5US</td>\n",
-       "      <td>15qzei</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>00:01:34</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3269</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>At the stroke of midnight I was writing this p...</td>\n",
-       "      <td>/r/AskReddit/comments/15qzdx/at_the_stroke_of_...</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
-       "      <td>2013-01-01 00:01:15+00:00</td>\n",
-       "      <td>Sangfroid_Sonder</td>\n",
-       "      <td>15qzdx</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>00:01:15</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3270</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>With all the rape stories in the news, why don...</td>\n",
-       "      <td>/r/AskReddit/comments/15qzdc/with_all_the_rape...</td>\n",
-       "      <td></td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
-       "      <td>2013-01-01 00:00:58+00:00</td>\n",
-       "      <td>[deleted]</td>\n",
-       "      <td>15qzdc</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>00:00:58</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3271</th>\n",
-       "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Do beautiful people have low entropy?</td>\n",
-       "      <td>/r/AskReddit/comments/15qzd3/do_beautiful_peop...</td>\n",
-       "      <td>I have been reading about entropy and arrows o...</td>\n",
-       "      <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
-       "      <td>2013-01-01 00:00:53+00:00</td>\n",
-       "      <td>[deleted]</td>\n",
-       "      <td>15qzd3</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2013-01-01</td>\n",
-       "      <td>00:00:53</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>3272 rows × 13 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      score  num_comments                                              title  \\\n",
-       "0         2             4  Reddit, if someone had to describe you to a st...   \n",
-       "1         5            24  What kind of car does the average \\nRedditor d...   \n",
-       "2         1             5  What movies have made you go back to the theat...   \n",
-       "3         0            18                                     Worst fear(s)?   \n",
-       "4        11            29  If there was a type of ink that lasted only fo...   \n",
-       "...     ...           ...                                                ...   \n",
-       "3267      0            11  Smokers of Reddit- What are your reasons for s...   \n",
-       "3268      1             4                                                 Hi   \n",
-       "3269      1             2  At the stroke of midnight I was writing this p...   \n",
-       "3270      1             2  With all the rape stories in the news, why don...   \n",
-       "3271      0             3              Do beautiful people have low entropy?   \n",
-       "\n",
-       "                                              permalink  \\\n",
-       "0     /r/AskReddit/comments/15sn6y/reddit_if_someone...   \n",
-       "1     /r/AskReddit/comments/15sn6m/what_kind_of_car_...   \n",
-       "2     /r/AskReddit/comments/15sn6b/what_movies_have_...   \n",
-       "3             /r/AskReddit/comments/15sn4u/worst_fears/   \n",
-       "4     /r/AskReddit/comments/15sn44/if_there_was_a_ty...   \n",
-       "...                                                 ...   \n",
-       "3267  /r/AskReddit/comments/15qzen/smokers_of_reddit...   \n",
-       "3268                   /r/AskReddit/comments/15qzei/hi/   \n",
-       "3269  /r/AskReddit/comments/15qzdx/at_the_stroke_of_...   \n",
-       "3270  /r/AskReddit/comments/15qzdc/with_all_the_rape...   \n",
-       "3271  /r/AskReddit/comments/15qzd3/do_beautiful_peop...   \n",
-       "\n",
-       "                                               selftext  \\\n",
-       "0     They would be talking about you without your p...   \n",
-       "1     I've always wanted to know what kind of car th...   \n",
-       "2                                                         \n",
-       "3                   So what is your worst fear, reddit?   \n",
-       "4                                                         \n",
-       "...                                                 ...   \n",
-       "3267  I'm very curious as to what causes someone to ...   \n",
-       "3268                                                      \n",
-       "3269                                                      \n",
-       "3270                                                      \n",
-       "3271  I have been reading about entropy and arrows o...   \n",
-       "\n",
-       "                                                    url  \\\n",
-       "0     http://www.reddit.com/r/AskReddit/comments/15s...   \n",
-       "1     http://www.reddit.com/r/AskReddit/comments/15s...   \n",
-       "2     http://www.reddit.com/r/AskReddit/comments/15s...   \n",
-       "3     http://www.reddit.com/r/AskReddit/comments/15s...   \n",
-       "4     http://www.reddit.com/r/AskReddit/comments/15s...   \n",
-       "...                                                 ...   \n",
-       "3267  http://www.reddit.com/r/AskReddit/comments/15q...   \n",
-       "3268  http://www.reddit.com/r/AskReddit/comments/15q...   \n",
-       "3269  http://www.reddit.com/r/AskReddit/comments/15q...   \n",
-       "3270  http://www.reddit.com/r/AskReddit/comments/15q...   \n",
-       "3271  http://www.reddit.com/r/AskReddit/comments/15q...   \n",
-       "\n",
-       "                   created_utc            author      id  downs  ups  \\\n",
-       "0    2013-01-01 23:59:40+00:00         [deleted]  15sn6y      0    2   \n",
-       "1    2013-01-01 23:59:31+00:00       PaytonAdams  15sn6m      0    5   \n",
-       "2    2013-01-01 23:59:20+00:00         [deleted]  15sn6b      0    1   \n",
-       "3    2013-01-01 23:58:37+00:00         [deleted]  15sn4u      0    0   \n",
-       "4    2013-01-01 23:58:15+00:00        Honeybeard  15sn44      0   11   \n",
-       "...                        ...               ...     ...    ...  ...   \n",
-       "3267 2013-01-01 00:01:36+00:00           kelsofb  15qzen      0    0   \n",
-       "3268 2013-01-01 00:01:34+00:00           ImJE5US  15qzei      0    1   \n",
-       "3269 2013-01-01 00:01:15+00:00  Sangfroid_Sonder  15qzdx      0    1   \n",
-       "3270 2013-01-01 00:00:58+00:00         [deleted]  15qzdc      0    1   \n",
-       "3271 2013-01-01 00:00:53+00:00         [deleted]  15qzd3      0    0   \n",
-       "\n",
-       "            date      time  \n",
-       "0     2013-01-01  23:59:40  \n",
-       "1     2013-01-01  23:59:31  \n",
-       "2     2013-01-01  23:59:20  \n",
-       "3     2013-01-01  23:58:37  \n",
-       "4     2013-01-01  23:58:15  \n",
-       "...          ...       ...  \n",
-       "3267  2013-01-01  00:01:36  \n",
-       "3268  2013-01-01  00:01:34  \n",
-       "3269  2013-01-01  00:01:15  \n",
-       "3270  2013-01-01  00:00:58  \n",
-       "3271  2013-01-01  00:00:53  \n",
-       "\n",
-       "[3272 rows x 13 columns]"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = dataset['all_days'].to_pandas()\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "28df4b06",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "score            Int64\n",
-       "num_comments     Int64\n",
-       "title           string\n",
-       "permalink       string\n",
-       "selftext        string\n",
-       "url             string\n",
-       "created_utc     string\n",
-       "author          string\n",
-       "id              string\n",
-       "downs            Int64\n",
-       "ups              Int64\n",
-       "dtype: object"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.convert_dtypes().dtypes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "e322b6c0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "ed1b06c3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['created_utc'] = pd.to_datetime(df['created_utc'])\n",
-    "df['date'] = df['created_utc'].dt.date\n",
-    "df['time'] = df['created_utc'].dt.time"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "ff477737",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "2013-01-01    3272\n",
-       "Name: date, dtype: int64"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.date.value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "1d11b967",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "new_df = df.drop_duplicates(subset=['id'], keep=\"first\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "eec00dd6",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<Axes: >"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+sklEQVR4nO3de3hU1b3/8U+CuXFJAiqEaJC0FgG5CgoRpVpiIlALlMJBsFqLUGmwYqwXWosBLxRURAGLHItIJZXSI4jAiaRwNCqRSyRVQan6UPEUE34WSSSUZCDr94dndjO5T7ImMzt5v55nnpC916z92WvW3vNlzyVhxhgjAAAANEt4sAMAAAC0BhRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGDBOcEOEEyVlZU6evSoOnXqpLCwsGDHAQAAjWCM0ddff63ExESFh4fO9aE2XVQdPXpUSUlJwY4BAACa4PPPP9eFF14Y7BiONl1UderUSdI3D0psbKy1fj0ej7Zv3660tDRFRERY6zfQ3Jpbcnd2ifzB5ObsknvzuzW35O7sUuvIv2nTJt12223O83ioaNNFlfclv9jYWOtFVfv27RUbG+uqCevW3JK7s0vkDyY3Z5fcm9+tuSV3Z5daT35JIffWndB5IRIAAMDFKKoAAAAs8LuoysvL0w033KDExESFhYVp06ZNzjqPx6P77rtP/fv3V4cOHZSYmKibb75ZR48e9enj+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15cI8uGDRvUu3dvRUdHq3///tq2bZu/uwMAAGCF30VVWVmZBg4cqBUrVtRYd+rUKb377rv6zW9+o3fffVcvv/yyDh06pB/84Ac+7aZNm6YDBw4oNzdXW7ZsUV5enmbOnOmsLy0tVVpami666CIVFBToscceU1ZWllatWuW02bVrl2688UZNnz5d+/fv1/jx4zV+/Hh98MEH/u4SAABAs/n9RvXRo0dr9OjRta6Li4tTbm6uz7Lly5friiuu0JEjR9SjRw99+OGHysnJ0d69ezV06FBJ0rJlyzRmzBg9/vjjSkxM1Lp161RRUaHVq1crMjJSl156qQoLC7VkyRKn+Hrqqad0/fXX65577pEkPfTQQ8rNzdXy5cu1cuVKf3cLAACgWQL+6b+SkhKFhYUpPj5ekpSfn6/4+HinoJKk1NRUhYeHa/fu3ZowYYLy8/M1cuRIRUZGOm3S09O1aNEiffXVV+rcubPy8/OVmZnps6309HSflyOrKy8vV3l5ufN7aWmppG9etvR4PBb2Vk5/VX+6hVtzS+7OLpE/mNycXXJvfrfmltydXWo9+UNRQIuq06dP67777tONN97ofGVBUVGRunbt6hvinHPUpUsXFRUVOW2Sk5N92nTr1s1Z17lzZxUVFTnLqrbx9lGbhQsXav78+TWWb9++3fl4pk3Vr9q5hVtzS+7OLpE/mNycXXJvfrfmltydXXJ//lAUsKLK4/Fo8uTJMsbod7/7XaA245e5c+f6XN0qLS1VUlKS0tLSrH9PVW5urq677jpXfQeIW3NL7s4ukT+Y3Jxdcm9+t+aW3J1dah35X3nllWDHqFVAiipvQfXZZ59p586dPgVLQkKCjh075tP+zJkzOn78uBISEpw2xcXFPm28vzfUxru+NlFRUYqKiqqxPCIiIiATK1D9Bppbc0vuzi6RP5jcnF1yb3635pbcnV1yf/5QZP17qrwF1ccff6y//OUvOvfcc33Wp6Sk6MSJEyooKHCW7dy5U5WVlRo2bJjTJi8vz+d109zcXF1yySXq3Lmz02bHjh0+fefm5iolJcX2LgEAADTI76Lq5MmTKiwsVGFhoSTp8OHDKiws1JEjR+TxePSjH/1I+/bt07p163T27FkVFRWpqKhIFRUVkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZ
MmaLExERJ0tSpUxUZGanp06frwIEDWr9+vZ566imfl+7uvPNO5eTk6IknntBHH32krKws7du3T7Nnz7YwLAAAAP7xu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0//+Mc/tHnzZv3v//6vBg0apO7duzu3Xbt2OX2sW7dOvXv31qhRozRmzBhdddVVPt9BFRcXp+3bt+vw4cMaMmSI7r77bs2bN8/nu6yuvPJKZWdna9WqVRo4cKD+/Oc/a9OmTerXr19zxgMAAKBJ/H5P1TXXXCNjTJ3r61vn1aVLF2VnZ9fbZsCAAXrzzTfrbTNp0iRNmjSpwe0BAAAEGn/7DwAAwAKKKgAA4Jee928NdoSQRFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAA1KLn/VuDHQEuQ1EFAABgAUUVAACoFVfr/ENRBQAAYAFFFQAAgAUUVQAA+KFf1mvBjoAQRVEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQCgWfiCSOAbFFUAEEQUJEDrQVEFAABgAUUVAACABRRVAACgWXrev5WXskVRBQAAYIXfRVVeXp5uuOEGJSYmKiwsTJs2bfJZb4zRvHnz1L17d8XExCg1NVUff/yxT5vjx49r2rRpio2NVXx8vKZPn66TJ0/6tHnvvfd09dVXKzo6WklJSVq8eHGNLBs2bFDv3r0VHR2t/v37a9u2bf7uDgAAgBV+F1VlZWUaOHCgVqxYUev6xYsX6+mnn9bKlSu1e/dudejQQenp6Tp9+rTTZtq0aTpw4IByc3O1ZcsW5eXlaebMmc760tJSpaWl6aKLLlJBQYEee+wxZWVladWqVU6bXbt26cYbb9T06dO1f/9+jR8/XuPHj9cHH3zg7y4BAAA02zn+3mH06NEaPXp0reuMMVq6dKkeeOABjRs3TpK0du1adevWTZs2bdKUKVP04YcfKicnR3v37tXQoUMlScuWLdOYMWP0+OOPKzExUevWrVNFRYVWr16tyMhIXXrppSosLNSSJUuc4uupp57S9ddfr3vuuUeS9NBDDyk3N1fLly/XypUrmzQYANDaed/38vffjg1yEqD18buoqs/hw4dVVFSk1NRUZ1lcXJyGDRum/Px8TZkyRfn5+YqPj3cKKklKTU1VeHi4du/erQkTJig/P18jR45UZGSk0yY9PV2LFi3SV199pc6dOys/P1+ZmZk+209PT6/xcmRV5eXlKi8vd34vLS2VJHk8Hnk8nubuvsPbl80+W4Jbc0vuzi6RP5iCnT2qnWnWtv3NH9XO+NW+sX3621+wx70x6tqvqHD7Y9iS/Bn7Oseg2vJAzKu6hPK4hxljTJPvHBamjRs3avz48ZK+eUluxIgROnr0qLp37+60mzx5ssLCwrR+/Xo9+uijeuGFF3To0CGfvrp27ar58+dr1qxZSktLU3Jysp599lln/cGDB3XppZfq4MGD6tOnjyIjI/XCCy/oxhtvdNo888wzmj9/voqLi2vNm5WVpfnz59dYnp2drfbt2zd1GAAAQAs6deqUpk6dqpKSEsXGxgY7jsPqlapQN3fuXJ+rW6WlpUpKSlJaWprVB8Xj8Sg3N1fXXXedIiIirPUbaG7NLbk7u0T+YAp29n5Zr+mDrPQm39/f/P2yXpOkZm2ztj797a+p497c8fJHXdsasiBHDw2tdOV8l/wb+7rGoPryQMyrung8Hr3yyisB305TWC2qEhISJEnFxcU+V6qKi4s1aNAgp82xY8d87nfmzBkdP37cuX9CQkK
Nq03e3xtq411fm6ioKEVFRdVYHhEREZADI1D9Bppbc0vuzi6RP5iClb38bJiV7daWv+f9W2u8d6r8bJjT3pbm7IO/425rvJqzrfLKf4+hW+e71Lj8dY5BteWBmFduZPV7qpKTk5WQkKAdO3Y4y0pLS7V7926lpKRIklJSUnTixAkVFBQ4bXbu3KnKykoNGzbMaZOXl+fzumlubq4uueQSde7c2WlTdTveNt7tAAAAtCS/i6qTJ0+qsLBQhYWFkr55c3phYaGOHDmisLAwzZkzRw8//LA2b96s999/XzfffLMSExOd91316dNH119/vWbMmKE9e/bo7bff1uzZszVlyhQlJiZKkqZOnarIyEhNnz5dBw4c0Pr16/XUU0/5vHR35513KicnR0888YQ++ugjZWVlad++fZo9e3bzRwUBwzfuAqhNU88NbjinuCGjTW1tf6vyu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0+SdO+99+qOO+7QzJkzdfnll+vkyZPKyclRdHS008e6devUu3dvjRo1SmPGjNFVV13l8x1UcXFx2r59uw4fPqwhQ4bo7rvv1rx583y+y+rKK69Udna2Vq1apYEDB+rPf/6zNm3apH79+jV5MACgLs19omjLTzRAW+H3e6quueYa1feBwbCwMC1YsEALFiyos02XLl2UnZ1d73YGDBigN998s942kyZN0qRJk+oPDAAhpLb3OQV7u8HKBLQ2/O0/AECbF4g/CMzVybaHogoAAMACiioAaGW4QhLaAnFVDKGBogohg5MM3Ka1Pzm25n0DAoGiCkHByRpAa9Gazmf+7ktr2ncbKKoAAAAsoKgCAIQkroLAbSiqAAAALKCoAgA0CleO3IXHq+VRVAEAWlQoPNmHQobqassUijlRN4oqAAAACyiqENJa+/cAAQBaD4oqAAhB/GeiZTDOsImiCgAAwAKKKuD/8FIjYB/HlDvwONlBURVCmNQAALgXRRUAVMN/cAA0BUUVAIQQCjrAvSiqAABWURhCapvzgKIKVrTFgwcAgoHzbeiiqELQcYIAALQGFFXwCwUQYAfHEmxiPoUGiiq4DicPAI3F+QItiaIKACziSbx2jEtwMf4tg6IK9eJAdBceLwAIHooqAAAACyiqAABAg7gS3jCKKgCAX/jj40DtKKrQZrnpScFNWQGgraKoQqvVmguRxu4bVxRQH+YGYBdFVRD484SI4ONxAAA0BkUVAAAtgP+gtX4UVQAAABZQVAFNwP84EWr6Zb0W7AghjfcXoiVQVLUCnCgAAAg+iiqgDWsrBXlr28/Wtj9Aa0FRhZDEkwYAwG0oqhAQvH+h9eHxBID6UVQBAABYQFHVSnFVAWgcPjUHwBaKKgCtBv+ZgBdzAcFAUeVCnCxaDx5LNBdX2oDQQVGFNodCpn6MD1qSm4tCN2dHYFBUAS2MT0YCCHWco5qGogoO/tcFwO2CXQzwn6a2jaKqBXGghSYeF/fisUMgMK/QVBRVAOACbeUKSGP3sS2MBdyHogoAgAZQxKExKKqANsLNTwq2srt5DACEPutF1dmzZ/Wb3/xGycnJiomJ0be//W099NBDMsY4bYwxmjdvnrp3766YmBilpqbq448/9unn+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15se3cAK3gyBxAKOBcFlvWiatGiRfrd736n5cuX68MPP9SiRYu0ePFiLVu2zGmzePFiPf3001q5cqV2796tDh06KD09XadPn3baTJs2TQcOHFBubq62bNmivLw8zZw501lfWlqqtLQ0XXTRRSooKNBjjz2mrKwsrVq1yvYutUr1HVj9sl7jwAMAwE/Wi6pdu3Zp3LhxGjt2rHr27Kkf/ehHSktL0549eyR9c5Vq6dKleuCBBzRu3DgNGDBAa9eu1dGjR7Vp0yZJ0ocffqicnBw999xzGjZsmK666iotW7ZML730ko4ePSpJWrdunSoqKrR
69WpdeumlmjJlin7xi19oyZIltncJaJS2Voi2xv1tzj61hvFoDfsABNM5tju88sortWrVKv3tb39Tr1699Ne//lVvvfWWU+wcPnxYRUVFSk1Nde4TFxenYcOGKT8/X1OmTFF+fr7i4+M1dOhQp01qaqrCw8O1e/duTZgwQfn5+Ro5cqQiIyOdNunp6Vq0aJG++uorde7cuUa28vJylZeXO7+XlpZKkjwejzwej7Ux8PZVvc+odkYej8f52S/rNX2QlV5jfV2/+9OurvvWe59w4/OzatamZPL2UVu7qu0bWla9n9oy1DbmDY1zbT/ry9yUZbWtq7od7/K65kxdfTflcWjuftSXoaH8zdWcY6Hq8vrmfW1zoqEMtY1vYzPU1b6+5dW3VTV/VLj/x1Rjs9SWzdtHQ49DfVmq5m6oXUPj2Jhxri1zY8altsy1ZW9oTgTyXNPQstr2qbHjVusY+Dl+NgXqPGNDmKn6ZicLKisr9atf/UqLFy9Wu3btdPbsWT3yyCOaO3eupG+uZI0YMUJHjx5V9+7dnftNnjxZYWFhWr9+vR599FG98MILOnTokE/fXbt21fz58zVr1iylpaUpOTlZzz77rLP+4MGDuvTSS3Xw4EH16dOnRrasrCzNnz+/xvLs7Gy1b9/e1hAAAIAAOnXqlKZOnaqSkhLFxsYGO86/Gcv++Mc/mgsvvND88Y9/NO+9955Zu3at6dKli1mzZo0xxpi3337bSDJHjx71ud+kSZPM5MmTjTHGPPLII6ZXr141+j7//PPNM888Y4wx5rrrrjMzZ870WX/gwAEjyRw8eLDWbKdPnzYlJSXO7fPPPzeSzJdffmkqKiqs3crKysymTZtMWVmZz/Jev3q11p/V19f1uz/tGlpW2/r+D2w2mzZtMv0f2NzojPVlqi9DbVnqWtaYDLWNeUP7UNvPpoxlQ2NT275V33Zdc8bG3KhrHPzdj/oyNJS/ubfmHAv1zYWKin/P+7KysiaNVUPHdWPmnb/zs+p9qh639e1vff0091ip73GoK4t3znhz15e5MZmamrkx49LQmDd0fAfqXOPP8V21XWPOlw2Ngb/jZ/NWVlZmsrOzjSRTUlLS7LrFJusv/91zzz26//77NWXKFElS//799dlnn2nhwoW65ZZblJCQIEkqLi72uVJVXFysQYMGSZISEhJ07Ngxn37PnDmj48ePO/dPSEhQcXGxTxvv79421UVFRSkqKqrG8oiICEVERDRhb+tXvd/ys2GKiIio8bP6+rp+96ddQ8tqXV8Z5vxsbMb6MnnHoLZ2tWWpa1n1furK4G1XX7+1La9vW3WOVSOX1bau6naq36euuWjjcWjufjQmQ6COpeZkqrq8vnlf37FZV9+1jW9jMzRnfvrMpyrHrb/HlI0sDT0O9WWpmruhdg1lamrmxoxLbZlry97QnLB9rvHn+K5tn+o7XzY4Bn6OX1th/Y3qp06dUni4b7ft2rVTZWWlJCk5OVkJCQnasWOHs760tFS7d+9WSkqKJCklJUUnTpxQQUGB02bnzp2qrKzUsGHDnDZ5eXk+r63m5ubqkksuqfX9VG7Cm0UB92uLx3Fb3GegKutF1Q033KBHHnlEW7du1d///ndt3LhRS5Ys0YQJEyRJYWFhmjNnjh5++GFt3rxZ77//vm6++WYlJiZq/PjxkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZMmaLExERJ0tSpUxUZGanp06frwIEDWr9+vZ566illZmba3iW4DCd29+MxBOBG1l/+W7ZsmX7zm9/o5z//uY4dO6bExET97Gc/07x585w29957r8rKyjRz5kydOHFCV111lXJychQdHe20WbdunWbPnq1Ro0YpPDxcEydO1NNPP+2sj4uL0/bt25WRkaEhQ4bovPPO07x583y+ywpA6PAWSn//7dggJ2m+nvdvbRX7AcAu60VVp06dtHTpUi1durTONmFhYVqwYIEWLFhQZ5suXbooOzu73m0
NGDBAb775ZlOjAgHHky9jADuYR3AD/vYfAACABRRVAAAAFlBUAQAAWEBRBYSAflmvNfm+tX1Sjk/PAUDLo6gCAACwgKIKLcLGlROuvgAAQhlFFQDUg2IegcYcaz0oqgAACCEUWe5FUQXUgRMbAMAfFFUAgioYxSsFM4BAoKiCVW3tycr2/jbnqxUAAMFFUQVAUtsriAG34RgNfRRVAdbz/q2t+kBozfvWWvGYAUBgUFQBAABYQFEFAABgAUVVK9KUlxp5KQg28UZ7AG0ZRRUAAIAFFFVAI7X2Dx0AAJqHogpAi6M4BdAaUVQBAABYQFEFICRw9QqA21FUAWi1KNQAtCSKKrgaT5oAgFBBUYVGoXgBAKB+FFVAAFCEMgYA2h6KKgAAAAsoqtoArhgAABB4FFUAmoWiHQC+QVEFuAgFDACELooqAAAACyiqXICrEwAAhD6KKgAAAAsoqlyMK1itV8/7twb98Q329v3hpqwAWi+KKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAIYs3oANwE4oqAAAACyiqAAAALKCoAgAAsICiCoBfeJ8TANSOogoAAMACiioAAAALKKoAuEoo/F1EAKgNRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgQUCKqn/84x+66aabdO655yomJkb9+/fXvn37nPXGGM2bN0/du3dXTEyMUlNT9fHHH/v0cfz4cU2bNk2xsbGKj4/X9OnTdfLkSZ827733nq6++mpFR0crKSlJixcvDsTuAAAANMh6UfXVV19pxIgRioiI0H//93/r4MGDeuKJJ9S5c2enzeLFi/X0009r5cqV2r17tzp06KD09HSdPn3aaTNt2jQdOHBAubm52rJli/Ly8jRz5kxnfWlpqdLS0nTRRRepoKBAjz32mLKysrRq1SrbuwQAANCgc2x3uGjRIiUlJen55593liUnJzv/NsZo6dKleuCBBzRu3DhJ0tq1a9WtWzdt2rRJU6ZM0YcffqicnBzt3btXQ4cOlSQtW7ZMY8aM0eOPP67ExEStW7dOFRUVWr16tSIjI3XppZeqsLBQS5Ys8Sm+AAAAWoL1omrz5s1KT0/XpEmT9MYbb+iCCy7Qz3/+c82YMUOSdPjwYRUVFSk1NdW5T1xcnIYNG6b8/HxNmTJF+fn5io+PdwoqSUpNTVV4eLh2796tCRMmKD8/XyNHjlRkZKTTJj09XYsWLdJXX33lc2XMq7y8XOXl5c7vpaWlkiSPxyOPx2NtDLx9eTweRbUzPv+u7adXXev9aVd9W7W1r3NZuPH5aStLfe0ak6+uLFXbVR/zQGVuTJbG7G+NfqqMfVPHL5CZGzs/vfkbm6Wx87PB8Wtm5saMkb/zs8WO72pzp6Uy+zs/65x34Q3f1/b4NXZ+1pe5tuyNnZ+BPL6rauhx8PtYaca5xqZA9GlLmDHG2OwwOjpakpSZmalJkyZp7969uvPOO7Vy5Urdcsst2rVrl0aMGKGjR4+qe/fuzv0mT56ssLAwrV+/Xo8++qheeOEFHTp0yKfvrl27av78+Zo1a5bS0tKUnJysZ5991ll/8OBBXXrppTp48KD69OlTI1tWVpbmz59fY3l2drbat29vawgAAEAAnTp1SlOnTlVJSYliY2ODHeffjGUREREmJSXFZ9kdd9xhhg8fbowx5u233zaSzNGjR33aTJo0yUyePNkYY8wjjzxievXqVaPv888/3zzzzDPGGGOuu+46M3PmTJ/1Bw4cMJLMwYMHa812+vRpU1JS4tw+//xzI8l8+eWXpqKiwtqtrKzMbNq0yZSVlZlev3rV9PrVq6aioqLOn96bjXa13aexy/o/sNls2rTJ9H9gs9UsDfXT0LLG3Lf6mAcqc2OyNGZ/qy+vOvZNHb9AZm5o3Lzj781ve342NH7NOaa8Y1/b3GnO/Gyp47v63GmpzM09vqvPmabOz2Ac37Udr/7Mz0Ae37WNRV3Ha1lZmV/j25zxs3krKysz2dnZRpI
pKSlpWrESINZf/uvevbv69u3rs6xPnz76r//6L0lSQkKCJKm4uNjnSlVxcbEGDRrktDl27JhPH2fOnNHx48ed+yckJKi4uNinjfd3b5vqoqKiFBUVVWN5RESEIiIiGruLjRYREaHys2E+/67tp1dd6/1pV31btbWvc1llmPPTZpb62jUmX13jV7Vd9TEPVObGZmlof2v0U2Xsmzp+gczc2Pnpzd/YLI2dnw2OXzMzN2aM/J2fLXZ8V5s7LZXZ3/lZ57yrbPi+tsevsfOzvsy1ZW/s/Azk8V1VQ49DREREgxlsnWvaCuuf/hsxYkSNl+3+9re/6aKLLpL0zZvWExIStGPHDmd9aWmpdu/erZSUFElSSkqKTpw4oYKCAqfNzp07VVlZqWHDhjlt8vLyfF5bzc3N1SWXXFLr+6kAAAACyXpRddddd+mdd97Ro48+qk8++UTZ2dlatWqVMjIyJElhYWGaM2eOHn74YW3evFnvv/++br75ZiUmJmr8+PGSvrmydf3112vGjBnas2eP3n77bc2ePVtTpkxRYmKiJGnq1KmKjIzU9OnTdeDAAa1fv15PPfWUMjMzbe8SAABAg6y//Hf55Zdr48aNmjt3rhYsWKDk5GQtXbpU06ZNc9rce++9Kisr08yZM3XixAldddVVysnJcd7kLknr1q3T7NmzNWrUKIWHh2vixIl6+umnnfVxcXHavn27MjIyNGTIEJ133nmaN28eX6cAAACCwnpRJUnf//739f3vf7/O9WFhYVqwYIEWLFhQZ5suXbooOzu73u0MGDBAb775ZpNzAgAA2MLf/gMAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAABaiZ73bw12hDaNogoAAMACiioAAAALKKoAAECj8RJj3SiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgCghQT7Db7B3j7Q2lFUAQAAWEBRBQAAYAFFFQAA8BsvJ9dEUQUAAGABRRUABAn/0wdaF4oqAAgxNootCjag5VFUAQBCSlMLQgpJBNs5wQ4AAABClz/FalsvbLlSBQBoUxrzxN/WiwM0DUUVAMAVAlXo1NcvxRX8QVEFAG1YWysaWnp/29r4tnUUVQgJnHjgBnXNU+avHW4ex8Zmd8M+ejO6IWuo4Y3qAADrRUFt7Xrev1V//+1Yv3I1pu+WevL3bieqXYtsDi7ElSoAAAALuFIFAEHmvQJi4ypOc/GST9vFY998XKkCgCbgCcgX48EYgCtVaAV63r9VUe2MFl8R7CQAGhLqhUeo5AuVHPAPV6oAwA+h9GQXSllsa837htaLogquxAkXgdbz/q3MM7QYG19AWtcnLpnHLYeiCiElWAc/Jx2gdhwbjAEaj6IqyFr7wdra988teBwQKMwt4N94o3oI4KRkT7+s1ySFNdiuNY95a963YGFMEWqYk6GJK1UAAKDJKPD+jaIKABBUPCmjteDlPzRboE6InGjr/1tpjE/o4rGxh7H0xXiENq5UoU7+Hrwt8Vfa2+IJpS3uM0IP87BlhPo4h3q+YKOochl/JnRLFDmhrrn7ZuO7YxCa6vr+nlB6XL1ZQikTgLpRVLUxbngiQcvgMQeCi2Ow9aGoQoM48NFcLTmHmK8AgoWiCkCbQLGFlsR8a5soquA3/paUf+oaq2COIY9f28LjDbQMvlIhRIXiSTAUMwFtVSi8N5JzAuCLK1UB9M2fTGkZbfHkxh9fBhrGfG0deBzdgStVLYwDA27DnAWAxgn4larf/va3CgsL05w5c5xlp0+fVkZGhs4991x17NhREydOVHFxsc/9jhw5orFjx6p9+/bq2rWr7rnnHp05c8anzeuvv67LLrtMUVFRuvjii7VmzZpA7w7agOZ+OWlbuoJGwQWgNm313BDQomrv3r169tlnNWDAAJ/ld911l1599VVt2LBBb7zxho4ePaof/vCHzvqzZ89q7Nixqqio0K5du/TCCy9ozZo1mjdvntP
m8OHDGjt2rK699loVFhZqzpw5uu222/Taay33kpsbtNWJ3Rq1xGPZWuYLH6aA1HrmM9wjYEXVyZMnNW3aNP3nf/6nOnfu7CwvKSnR73//ey1ZskTf+973NGTIED3//PPatWuX3nnnHUnS9u3bdfDgQb344osaNGiQRo8erYceekgrVqxQRUWFJGnlypVKTk7WE088oT59+mj27Nn60Y9+pCeffDJQuwQgiHiCRGMxV+ofg5Z8v29bE7D3VGVkZGjs2LFKTU3Vww8/7CwvKCiQx+NRamqqs6x3797q0aOH8vPzNXz4cOXn56t///7q1q2b0yY9PV2zZs3SgQMHNHjwYOXn5/v04W1T9WXG6srLy1VeXu78XlpaKknyeDzyeDzN3WWHt6+ocOOzLKpd3b831M7p8//WVV1evZ+mLvPmrZq7of30dx+qL6+6T/7mr6p69rq2VX17/u5DbRq7H/W2q5K/oX21+ZjXlau+x6zexzLcd67Wl6G2dnUtqz7vJemSX29RVLv6j4Xa9rmued/QuDdGbdvyPoFFtau7XUPL62vj73Hrz7aawu9552fuhvr1p11Tx8Cf4zVQGerro6FtVZ3zXoGYC9W3Ecp92hJmjLE+ki+99JIeeeQR7d27V9HR0brmmms0aNAgLV26VNnZ2br11lt9ihtJuuKKK3Tttddq0aJFmjlzpj777DOfl/JOnTqlDh06aNu2bRo9erR69eqlW2+9VXPnznXabNu2TWPHjtWpU6cUExNTI1dWVpbmz59fY3l2drbat29vcQQAAECgnDp1SlOnTlVJSYliY2ODHcdh/UrV559/rjvvvFO5ubmKjo623X2zzJ07V5mZmc7vpaWlSkpKUlpamtUHxePxKDc3V7/ZF67yyjBJ0gdZ6T6XXKv/XtfyhtrVtr6py6LCjR4aWumTuyn82Tep5qXoxuavqnr2urZVfXvNfRz87aP69r3LhyzIcfIXzLu+3n21+ZjXlauu+VXX8v2//p4z56vmry9D1e021K6x/TXmZY265v11112nwY/sbPD+/vTd2EzN0dTjtiWy1betqnOmOeebpm6/vuUNsXWubE6GpvTpXV51zkdEREgK3EuCVc+9tng8Hr3yyivW+7XBelFVUFCgY8eO6bLLLnOWnT17Vnl5eVq+fLlee+01VVRU6MSJE4qPj3faFBcXKyEhQZKUkJCgPXv2+PTr/XRg1TbVPzFYXFys2NjYWq9SSVJUVJSioqJqLI+IiHAmlk3llWEqPxvmbMP779p+r2t5Q+1qW9+cZdVzN4U/+yapWVmr82ava1vVt9fcx8HfPqpv32n3fyfm8sraH9OGttfcZdVzfec32yXVzFLXcqePavnry1B1uw21a2x/jZkjjX28msKfeWObv8dtS2ard94183zT1O1L/57PTWUjeyAeh8bO8cGP7NShR74vqeZ5yWaWtsT6G9VHjRql999/X4WFhc5t6NChmjZtmvPviIgI7dixw7nPoUOHdOTIEaWkpEiSUlJS9P777+vYsWNOm9zcXMXGxqpv375Om6p9eNt4+0DrEuw3ngZ7+21ZQ2Mfyo9NKGcDYJ/1K1WdOnVSv379fJZ16NBB5557rrN8+vTpyszMVJcuXRQbG6s77rhDKSkpGj58uCQpLS1Nffv21Y9//GMtXrxYRUVFeuCBB5SRkeFcabr99tu1fPly3XvvvfrpT3+qnTt36k9/+pO2bg3Nk1hTT65NuR8ncrgB87Tt4LFGWxGUb1R/8sknFR4erokTJ6q8vFzp6el65plnnPXt2rXTli1bNGvWLKWkpKhDhw665ZZbtGDBAqdNcnKytm7dqrvuuktPPfWULrzwQj333HNKT7f/+i0ANAbFA9C2tUhR9frrr/v8Hh0drRUrVmjFihV13ueiiy7Stm3b6u33mmuu0f79+21EdC1O4m0PjzlsYB4B9vEHlV2qNZ8QW/O+tQW2Hj/mAdA0HDvBQ1EVYlrzwdCa9w0AQh3n4MC
jqAoSJjeAtow/lYLWiKIKrRJFKwCgpVFUAQAAWBCUr1RA68ZVotaDxxIIrGAfY8HefmvDlSqgmdriSakt7jMANIQrVWhTKAZqctOYuCkrQgfzBi2FogpB0xZPdG1xnwGgreDlPyDIAlVoUcABQMuiqEKbFoqFRyhmaqrWtC8A0BBe/kObEApP7oHMEAr7BwBtHVeqgACh0AGAtoWiqo3gCd4/jJd/QuFPjvCYAQg2iiqgDaHwqFsoFIYA3I2iCkCLorAD0FpRVAEAAFhAUQWgTeFKGYBAoagCAACwgKIKAADAAoqqNoyXQQAAsIeiCgAAwAKKKgBBx1VTAK0BRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWWC+qFi5cqMsvv1ydOnVS165dNX78eB06dMinzenTp5WRkaFzzz1XHTt21MSJE1VcXOzT5siRIxo7dqzat2+vrl276p577tGZM2d82rz++uu67LLLFBUVpYsvvlhr1qyxvTsAAACNYr2oeuONN5SRkaF33nlHubm58ng8SktLU1lZmdPmrrvu0quvvqoNGzbojTfe0NGjR/XDH/7QWX/27FmNHTtWFRUV2rVrl1544QWtWbNG8+bNc9ocPnxYY8eO1bXXXqvCwkLNmTNHt912m1577TXbuwQAANCgc2x3mJOT4/P7mjVr1LVrVxUUFGjkyJEqKSnR73//e2VnZ+t73/ueJOn5559Xnz599M4772j48OHavn27Dh48qL/85S/q1q2bBg0apIceekj33XefsrKyFBkZqZUrVyo5OVlPPPGEJKlPnz5666239OSTTyo9Pd32bgEAANTLelFVXUlJiSSpS5cukqSCggJ5PB6lpqY6bXr37q0ePXooPz9fw4cPV35+vvr3769u3bo5bdLT0zVr1iwdOHBAgwcPVn5+vk8f3jZz5sypM0t5ebnKy8ud30tLSyVJHo9HHo+n2fvq5e0rKtxY67MlePO6Lbfk7uwS+YPJzdkl9+Z3a27J3dmlls1v87k1kH3aEtCiqrKyUnPmzNGIESPUr18/SVJRUZEiIyMVHx/v07Zbt24qKipy2lQtqLzrvevqa1NaWqp//etfiomJqZFn4cKFmj9/fo3l27dvV/v27Zu2k/V4aGil9T5bgltzS+7OLpE/mNycXXJvfrfmltydXWqZ/Nu2bQv4NkJJQIuqjIwMffDBB3rrrbcCuZlGmzt3rjIzM53fS0tLlZSUpLS0NMXGxlrbjsfjUW5urn6zL1zllWHW+g20qHCjh4ZWui635O7sEvmDyc3ZJffmd2tuyd3ZpZbN/0GW/bfjeDwevfLKK9b7tSFgRdXs2bO1ZcsW5eXl6cILL3SWJyQkqKKiQidOnPC5WlVcXKyEhASnzZ49e3z68346sGqb6p8YLC4uVmxsbK1XqSQpKipKUVFRNZZHREQoIiLC/51sQHllmMrPuu+Ac2tuyd3ZJfIHk5uzS+7N79bckruzSy2TPxDPraHM+qf/jDGaPXu2Nm7cqJ07dyo5Odln/ZAhQxQREaEdO3Y4yw4dOqQjR44oJSVFkpSSkqL3339fx44dc9rk5uYqNjZWffv2ddpU7cPbxtsHAABAS7J+pSojI0PZ2dl65ZVX1KlTJ+c9UHFxcYqJiVFcXJymT5+uzMxMdenSRbGxsbrjjjuUkpKi4cOHS5LS0tLUt29f/fjHP9bixYtVVFSkBx54QBkZGc6Vpttvv13Lly/Xvffeq5/+9KfauXOn/vSnP2nr1q22dwkAAKBB1q9U/e53v1NJSYmuueYade/e3bmtX7/eafPkk0/q+9//viZOnKi
RI0cqISFBL7/8srO+Xbt22rJli9q1a6eUlBTddNNNuvnmm7VgwQKnTXJysrZu3arc3FwNHDhQTzzxhJ577jm+TgEAAASF9StVxjT8Ec3o6GitWLFCK1asqLPNRRdd1OCnBq655hrt37/f74wAAAC28bf/AAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALHB9UbVixQr17NlT0dHRGjZsmPbs2RPsSAAAoA1ydVG1fv16ZWZm6sEHH9S7776rgQMHKj09XceOHQt2NAAA0Ma4uqhasmSJZsyYoVtvvVV9+/bVypUr1b59e61evTrY0QAAQBtzTrADNFVFRYUKCgo0d+5cZ1l4eLhSU1OVn59f633Ky8tVXl7u/F5SUiJJOn78uDwej7VsHo9Hp06d0jmecJ2tDLPWb6CdU2l06lSl63JL7s4ukT+Y3Jxdcm9+t+aW3J1datn8//znP6336X2OlSRjjPX+m8W41D/+8Q8jyezatctn+T333GOuuOKKWu/z4IMPGkncuHHjxo0bt1Zw+/zzz1ui5Gg0116paoq5c+cqMzPT+b2yslLHjx/Xueeeq7Awe9V6aWmpkpKS9Pnnnys2NtZav4Hm1tySu7NL5A8mN2eX3Jvfrbkld2eXWk/+gwcPKjExMdhxfLi2qDrvvPPUrl07FRcX+ywvLi5WQkJCrfeJiopSVFSUz7L4+PhARVRsbKwrJ6xbc0vuzi6RP5jcnF1yb3635pbcnV1yf/4LLrhA4eGh9dbw0Erjh8jISA0ZMkQ7duxwllVWVmrHjh1KSUkJYjIAANAWufZKlSRlZmbqlltu0dChQ3XFFVdo6dKlKisr06233hrsaAAAoI1xdVH1H//xH/p//+//ad68eSoqKtKgQYOUk5Ojbt26BTVXVFSUHnzwwRovNYY6t+aW3J1dIn8wuTm75N78bs0tuTu7RP5ACjMm1D6PCAAA4D6ufU8VAABAKKGoAgAAsICiCgAAwAKKKgAAAAvaTFG1cOFCXX755erUqZO6du2q8ePH69ChQz5tTp8+rYyMDJ177rnq2LGjJk6cWOPLRX/xi19oyJAhioqK0qBBg2ps59ChQ7r22mvVrVs3RUdH61vf+pYeeOCBRv1twRUrVqhnz56Kjo7WsGHDtGfPHp/csbGxOu+889SxY0eFhYXpxIkTIZvba+HCheratavCw8MVFhamqKgojRo1Sh999JHTJlTze8f+nHPOUVhYmM/t9ttvD+nsknTvvffWyO29bdiwIeTzL1y4UAMGDFBERITCw8MVERGh0aNH++QLZv68vDzdcMMNSkxMVFhYmDZt2uST/fLLL1dMTIwiIyMVGRmpsLAwFRYWWs1e1SeffKJOnTo1+guNg3m+sZ1b+veYR0ZGql27dmrXrp06d+6scePGOeebUM3uzR8bG1vnuSbU87fU+SZQ+SXp008/1YQJE3T++ecrNjZWkydPrpGvQcH+OzktJT093Tz//PPmgw8+MIWFhWbMmDGmR48e5uTJk06b22+/3SQlJZkdO3aYffv2meHDh5srr7zSp5877rjDLF++3Pz4xz82AwcOrLGdTz/91KxevdoUFhaav//97+aVV14xXbt2NXPnzq0330svvWQiIyPN6tW
rzYEDB8yMGTNMfHy8ufbaa53cv/zlL80ll1xi4uPjjSTz1VdfhWzu4uJiZ9xvueUWs2bNGrN161YzYsQIExMTYy644AJz5syZkM7vnTNDhw41P/zhD82oUaPMBRdcYD799FNTUlIS0tmNMSYtLc0sXbrUvP766+Yvf/mLGTVqlImLizMdOnQwX3/9dcjnT01NNeeff74ZNWqU2bBhg7n66qtNTEyMueyyy8zZs2eDnn/btm3m17/+tXn55ZeNJLNx40ZnnXfuPProo2bWrFmmf//+RvL9W6U2sntVVFSYoUOHmtGjR5u4uLh6cxsT/PON7dxVj9cHH3zQ/P73vzfXXnutSUhIMKNHjzZJSUnmzJkzIZvdmG/mzCWXXGImTpzoHK8XXHCBOXr0qNNHKOdvqfNNoPKfPHnSfOtb3zITJkww7733nnnvvffMuHHjzOWXX+6cbxqjzRRV1R07dsxIMm+88YYxxpgTJ06YiIgIs2HDBqfNhx9+aCSZ/Pz8Gvd/8MEH633Aq7rrrrvMVVddVW+bK664wmRkZDi/nz171iQmJpqFCxfWmtt7knNL7tryf/LJJ67I/93vftfceeedrp0zXt78Y8aMcUX+1157zYSHhzsFbNW5k5ubG/T8VVUvqqrbt2+fkWSee+45Y4z9sb/33nvNTTfdZJ5//vlGPcGEyvkmULmrZl+9erWRZPbv3x/y2b3nmqr5A3W+CeTYV80fqPON7fzVzzfezGFhYSY3N7fB/r3azMt/1ZWUlEiSunTpIkkqKCiQx+NRamqq06Z3797q0aOH8vPzm7ydTz75RDk5Ofrud79bZ5uKigoVFBT4bDs8PFypqak1tu3N7eWW3F5ffPGFpG/+ZlNSUpJr8q9bt069evWSJK1du1anTp1yTXavt99+W5J04403Sgr9uVNeXu68ZCz9e+6Hh4frrbfeCmp+f3399deSpLi4OEl2x37nzp3asGGDVqxY0aj2oXK+CWTuqtl37typ5ORkFRcXuyL7unXrdN555+nKK6+UJMXExEhy19hLgT3fBCJ/9fONJEVHRzvnm8Zqk0VVZWWl5syZoxEjRqhfv36SpKKiIkVGRtZ4bbZbt24qKiryextXXnmloqOj9Z3vfEdXX321FixYUGfbL7/8UmfPnq3xTfDVt+3N7c3sltyS9Mwzz6hDhw4aOHCgYmJi9MYbbygyMtIV+adOnaq1a9dqwIAB6tWrl3JycnTTTTe5IruXd+7ExMTopptukhT6c2f48OHq0KGD7rvvPp08eVJ33HGHunfvrsrKSn3xxRdBze+PyspKPfTQQ5Kkiy++WJK9sf/nP/+pn/zkJ1qzZk2j/zBuKJxvAplbkpYvX67evXtLkvbu3avc3Fz985//DPnsU6dO1YsvvqgdO3aoY8eOioyM1MKFCyW5Z+ylwJ5vApW/6vnm1KlTKisr0y9/+UudPXvWuRjQGG2yqMrIyNAHH3ygl156KWDbWL9+vd59911lZ2dr69atevzxxyVJb775pjp27Ojc1q1b1+g+vbnnzZsXqNgByS1J06ZN07hx45SQkKARI0Zo8uTJOn36tCvyz5w5U5s3b9Znn32mHTt2aO3atdq4caP/b2AMQnavn/3sZzpy5Ijuvvtuq5mrsp3//PPP14YNG/Tqq6+qU6dOysnJ0fDhw3XZZZcF5C/TB2r8MzIyanwoxpYZM2Zo6tSpGjlyZK3rQ/V8E8jckrR//34lJCRow4YN6tWrlyZPnqyKigob0QOafebMmUpPT9fKlSv11VdfOeeaTz/91Ep2KfBjLwX2fBOo/FXPNx07dlRcXJxOnDjh9/nG1X/7rylmz56tLVu2KC8vTxdeeKGzPCEhQRUVFTpx4oRPJV1cXKyEhAS/t5OUlCRJ6tu3r86ePauZM2fq7rvv1tChQ30+AdStWzdFRUWpXbt2NZ6kq267au7PPvvMNbm9fv3rX+vNN9/Url27dMEFF6hz587auHGjK/JXnzOdO3e
W9M0l5VDP7s3/8ssvq127dvrFL37hLHfD2KelpWn06NHauHGjtm3bpoEDByohIUHf+ta3gpq/sbxz549//KPPk4Ct7Dt37tTmzZudAtAYo8rKSp1zzjlatWqVbrzxxpA83wQqtzf79u3b9eabbyo5OVk/+MEP1LlzZ33yySchn92b3zv2Xbt2lfTNS9JuGHtv/kCebwKZPy0tTZ9++qm+/PJLnXPOOYqPj3fON43VZq5UGWM0e/Zsbdy40XmNvaohQ4YoIiJCO3bscJYdOnRIR44cUUpKSrO2XVlZKY/Ho8rKSsXExOjiiy92bp06dVJkZKSGDBnis+3Kykrt2LFDw4cPd2Vu77ZrG3fzzQckVF5eHtL565oz3gP2qquuCtnsku/Yf/vb39a4ceN0/vnnO+1Deeyr53/99dc1cOBA7dy5U8eOHdMPfvCDoOZvSPW54y3YvGxlz8/PV2FhoXNbsGCBOnXqpMLCQk2YMCFkzze2c9d3vHrPN4mJiSGb3Zuzen7vuaZ79+4hPfbV8wfyfBOo/FWdd955io+P9znfNFqj39LucrNmzTJxcXHm9ddfN1988YVzO3XqlNPm9ttvNz169DA7d+40+/btMykpKSYlJcWnn48//tjs37/f/OxnPzO9evUy+/fvN/v37zfl5eXGGGNefPFFs379enPw4EHz6aefmvXr15vExEQzbdq0evO99NJLJioqyqxZs8YcPHjQzJw508THx5tbbrnFyf3Xv/7V5Obmmscff9xIMnl5eWb//v3mJz/5ScjlLioqMsYYM23aNBMdHW2effZZs3fvXrN582Zz3XXXmc6dOzsfZQ3FcS8qKjKzZs0ynTp1MrfeeqvJyckxe/bsMWvWrDE9e/Y0I0eODOnsxvx7zr/44otGklm3bp0r5nzV/DExMWbFihUmPz/fLFu2zMTHx5s77rgjJPJ//fXXTl+SzJIlS8z+/fvNZ5995oz95s2bTW5urvnDH/5gJJm1a9ea/fv3my+++MJK9uoa+0moYJ9vbOeuerzedtttJicnxznfjBkzxnTp0sUUFxeH5Jh75/vUqVOdc6X3XNOjRw8zYsQIp49Qzt9S55tA5TfGmNWrV5v8/HzzySefmD/84Q+mS5cuJjMzs8G+q2ozRZX+72PB1W/PP/+80+Zf//qX+fnPf246d+5s2rdvbyZMmGC++OILn36++93v1trP4cOHjTHfPHCXXXaZ6dixo+nQoYPp27evefTRR82//vWvBjMuW7bM9OjRw0RGRporrrjCvPPOO3XmrnpbtWpVyOVuaNwfffTRkB73+rKPHj3a+dhtqGavL3+oz/mG8q9evTok8v/P//xPrf3ecsstDR6zDz74oJXs1TX2CaausW+p843t3MbUPV+GDx9uPvroI2OMnfkSiOz15X/mmWecNm7Mb/t8E6j8xhhz3333mW7dupmIiAjzne98xzzxxBOmsrKyUX17hRljjAAAANAsbeY9VQAAAIFEUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABb8f58N2gQoK7TXAAAAAElFTkSuQmCC\n",
-      "text/plain": [
-       "<Figure size 640x480 with 1 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "new_df.date.hist(bins=400)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1acf60dc",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

requirements.txt CHANGED Viewed

@@ -1,9 +1,9 @@
-praw==7.7.0
-gradio==3.23
 nbdev==2.3.12
-datasets==2.11.0
 requests==2.28.2
 loguru==0.7.0
 rich==13.3.4
-gradio==3.23.0
-supervisor==4.2.5

+praw==7.7.1
+gradio==3.50.2
 nbdev==2.3.12
+datasets==2.14.6
 requests==2.28.2
 loguru==0.7.0
 rich==13.3.4
+supervisor==4.2.5
+schedule==1.2.0

utilities/data_collator.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import pandas as pd
+from utilities.praw_downloader import praw_downloader
+from utilities.praw_processor import preprocess_praw_data
def get_latest_data():
    """Download the newest submissions and return them as a preprocessed DataFrame.

    The raw records come from ``praw_downloader()`` and are handed straight to
    ``preprocess_praw_data``; whatever that function returns is returned here.
    """
    return preprocess_praw_data(submissions=praw_downloader())
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    The input frame is never modified: content lengths are kept in a separate
    Series instead of a temporary column on ``df``. Missing content counts as
    length 0, so an id whose rows all lack content still keeps one row (the
    first one) instead of failing the label lookup.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.

    Returns:
    - pd.DataFrame: A new DataFrame with unique ids, where each id is associated
                    with the longest content available. Rows come back in the
                    order of the grouped ids and keep their original index labels.
    """
    # Nothing to de-duplicate; hand back a copy so callers always get a new object.
    if df.empty:
        return df.copy()

    # Length of each row's content, aligned with df's index; NaN (missing) -> 0.
    content_length = df['content'].str.len().fillna(0)

    # Index label of the longest content within each id (first one wins on ties).
    idx_to_keep = content_length.groupby(df['id']).idxmax().values

    return df.loc[idx_to_keep]
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Combine previously collected rows with a fresh download and de-duplicate them.

    Parameters:
    - old_df (pd.DataFrame): Rows gathered by earlier runs.

    Returns:
    - pd.DataFrame: The result of ``filter_redundant_ids`` applied to the old and
                    new rows after they were concatenated and sorted by 'date_utc'.
    """
    # Fetch first, then stack old and new rows under a fresh 0..n-1 index.
    frames = [old_df, get_latest_data()]
    combined = pd.concat(frames, ignore_index=True)

    # Sort chronologically before filtering so ties are resolved on the earliest row.
    ordered = combined.sort_values(by='date_utc').reset_index(drop=True)

    return filter_redundant_ids(ordered)

utilities/my_logger.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import logging
def setup_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, configured to log to 'mylog.log' and the console.

    ``logging.getLogger`` hands back the same object for the same name, so the
    handlers are attached only the first time. Calling this again with the same
    name used to add another pair of handlers, which duplicated every log line
    and opened the log file again.

    Parameters:
    - name (str): Logger name, typically the caller's ``__name__``.

    Returns:
    - logging.Logger: The logger, at DEBUG level.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Already configured (by an earlier call or by the application): leave handlers alone.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Create a file handler to write logs to a file
    file_handler = logging.FileHandler('mylog.log')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Create a stream handler to write logs to the console
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger

utilities/praw_downloader.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import os
+from datetime import datetime
+from typing import Any, Dict, List
+import praw
+from utilities.my_logger import setup_logger
+# Setup logging
+logger = setup_logger(__name__)
def get_reddit_instance() -> praw.Reddit:
    """Build a PRAW client from the REDDIT_* environment variables.

    The client id, client secret and user agent are read with ``os.getenv``, so
    an unset variable is forwarded as ``None``. ``ratelimit_seconds`` is fixed at 20.
    """
    settings = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
        'ratelimit_seconds': 20,
    }
    return praw.Reddit(**settings)
def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any]:
    """Extract and return relevant data from a given Reddit submission.

    Parameters:
    - submission: The PRAW submission to read from.

    Returns:
    - Dict[str, Any]: The keys 'content', 'poster', 'date_utc', 'flair', 'title',
                      'score' and 'permalink'. 'date_utc' is the creation time
                      formatted as 'YYYY-MM-DD HH:MM:SS' in UTC, 'poster' is
                      ``str(submission.author)`` and 'score' is ``submission.ups``.
    """
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware UTC
    # datetime formats to exactly the same string.
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

    return {
        "content": submission.selftext,
        "poster": str(submission.author),
        "date_utc": created.strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
        }
def praw_downloader(subreddit_name: str = 'bestofredditorupdates', limit: int = 200) -> List[Dict[str, Any]]:
    """Fetch the newest submissions of a subreddit and return them as dictionaries.

    Nothing is written to disk here; the records are only collected and returned.

    Parameters:
    - subreddit_name (str): Subreddit to read; defaults to 'bestofredditorupdates'.
    - limit (int): Value passed to ``subreddit.new(limit=...)``; defaults to 200.
      NOTE(review): the original inline comment says ``limit=None`` fetches all
      posts -- confirm against the PRAW documentation before relying on it.

    Returns:
    - List[Dict[str, Any]]: One ``extract_submission_data`` record per submission,
      in the order the listing yields them.
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_name)

    logger.info('Starting to fetch submissions from %s.', subreddit_name)

    submissions = []
    for submission in subreddit.new(limit=limit):
        logger.debug('Processing post %s - %s', submission.id, submission.title)
        submissions.append(extract_submission_data(submission))

    logger.info('Finished downloading %d submissions.', len(submissions))
    return submissions


if __name__ == "__main__":
    praw_downloader()

utilities/praw_processor.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from typing import Dict, List
+import pandas as pd
+from utilities.my_logger import setup_logger
+# Setup logging
+logger = setup_logger(__name__)
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    Parameters:
    - submissions: List of submission dictionaries. Each one is expected to carry
      at least 'date_utc' and 'permalink'.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame with 'date_utc' parsed to datetimes,
      any 'poster_link' column removed and an 'id' column taken from the permalink.
      An empty input yields an empty DataFrame.
    """
    # Convert the submissions list to a DataFrame
    praw_df = pd.DataFrame(submissions)

    # No rows means no 'date_utc'/'permalink' columns to work on; return as is
    # rather than failing on the column lookups below.
    if praw_df.empty:
        return praw_df

    # Convert 'date_utc' column to datetime format
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove 'poster_link' column if it exists
    praw_df = praw_df.drop(columns=['poster_link'], errors='ignore')

    # The permalink starts with '/', so after splitting on '/' the piece at
    # index 4 is the segment following 'comments'; that becomes 'id'.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df

utilities/readme_update.py CHANGED Viewed

@@ -10,24 +10,20 @@ def get_readme_path(dataset_name):
     return cached_path(readme_path, download_config=DownloadConfig())
-def update_readme(dataset_name, subreddit, date_to_fetch):
     path = get_readme_path(dataset_name=dataset_name)
     readme_text = f"""
 # Dataset Name
 {dataset_name}
 ## Update Frequency
-The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to {date_to_fetch}
-## Dataset Overview
-The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
-## Data Collection
-This has been collected with sequential calls that follow the pagination of the pushshift request.
-## Attribution
-Data sourced from the Pushshift API.
-    """
     append_readme(path=path, readme_text=readme_text)
     return readme_text

     return cached_path(readme_path, download_config=DownloadConfig())
def update_readme(dataset_name, subreddit, latest_date):
    """Render the README section for the dataset, pass it to append_readme and return it.

    The README location comes from ``get_readme_path(dataset_name=...)``. The
    rendered text interpolates ``subreddit``, ``dataset_name`` and ``latest_date``.
    """
    readme_path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging PRAW and the reddit API to get downloads.
There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.
# Dataset Name
{dataset_name}
## Update Frequency
The dataset is updated daily with the most recent day being: {latest_date}
"""
    append_readme(path=readme_path, readme_text=readme_text)
    return readme_text