{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine .csv files into one .jsonl file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'combined_data.jsonl'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Define the path for the uploaded files\n",
    "files = [\n",
    "    \"data/ai-tutor-csv-files/activeloop.csv\",\n",
    "    \"data/ai-tutor-csv-files/advanced_rag_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/filtered_tai_v2.csv\",\n",
    "    \"data/ai-tutor-csv-files/hf_transformers.csv\",\n",
    "    \"data/ai-tutor-csv-files/langchain_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/langchain_docs.csv\",\n",
    "    \"data/ai-tutor-csv-files/llm_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/openai.csv\",\n",
    "    \"data/ai-tutor-csv-files/wiki.csv\"\n",
    "]\n",
    "\n",
    "# Function to load and clean CSV data\n",
    "def load_and_clean_csv(file_path):\n",
    "    # Attempt to load the CSV file\n",
    "    df = pd.read_csv(file_path)\n",
    "    \n",
    "    # Check if the first column is unnamed and drop it if so\n",
    "    if 'Unnamed: 0' in df.columns or df.columns[0] == '':\n",
    "        df = df.drop(df.columns[0], axis=1)\n",
    "        \n",
    "    # Reorder columns based on expected headers\n",
    "    expected_headers = ['title', 'url', 'content', 'source']\n",
    "    df = df[expected_headers]\n",
    "    \n",
    "    return df\n",
    "\n",
    "# Load, clean, and combine all CSV files\n",
    "combined_df = pd.concat([load_and_clean_csv(file) for file in files], ignore_index=True)\n",
    "\n",
    "# Convert to JSON - list of JSON objects\n",
    "combined_json_list = combined_df.to_dict(orient=\"records\")\n",
    "\n",
    "# Save the combined JSON list to a new JSONL file, with each line representing a JSON object\n",
    "output_file_jsonl = \"combined_data.jsonl\"\n",
    "with open(output_file_jsonl, \"w\") as file:\n",
    "    for record in combined_json_list:\n",
    "        file.write(json.dumps(record) + '\\n')\n",
    "\n",
    "output_file_jsonl"
   ]
  },
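  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (illustrative sketch, not part of the original pipeline): read the first few lines of `combined_data.jsonl` back and confirm each record has the expected `title`, `url`, `content`, and `source` keys."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the first few records of the combined JSONL file\n",
    "# (relies on `json` already being imported in the cell above)\n",
    "with open(\"combined_data.jsonl\") as file:\n",
    "    for i, line in enumerate(file):\n",
    "        if i >= 3:\n",
    "            break\n",
    "        record = json.loads(line)\n",
    "        print(list(record.keys()), record[\"title\"])"
   ]
  },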
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge chunks together if they have the same title and url\n",
    "Also prepend the titles into the content for the sources that did not have that."
   ]
  },
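  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is a minimal, illustrative sketch of this step (an assumption, not the final pipeline): it groups records from `combined_data.jsonl` by `(title, url)`, concatenates their `content`, and prepends the title when the content does not already start with it. The output file name `merged_data.jsonl` and the `\\n\\n` separator between merged chunks are assumptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def merge_chunks(input_path, output_path):\n",
    "    # Group records by (title, url); dicts preserve insertion order\n",
    "    merged = {}\n",
    "    with open(input_path, 'r') as file:\n",
    "        for line in file:\n",
    "            record = json.loads(line)\n",
    "            key = (str(record['title']), str(record['url']))\n",
    "            if key in merged:\n",
    "                # Same title and url: append this chunk's content\n",
    "                merged[key]['content'] += '\\n\\n' + str(record['content'])\n",
    "            else:\n",
    "                record['content'] = str(record['content'])\n",
    "                merged[key] = record\n",
    "\n",
    "    # Prepend the title to the content when it is not already there,\n",
    "    # then write one JSON object per line\n",
    "    with open(output_path, 'w') as output_file:\n",
    "        for record in merged.values():\n",
    "            title = str(record['title'])\n",
    "            if not record['content'].startswith(title):\n",
    "                record['content'] = title + '\\n\\n' + record['content']\n",
    "            output_file.write(json.dumps(record) + '\\n')\n",
    "\n",
    "# Example usage (assumed file names)\n",
    "# merge_chunks('combined_data.jsonl', 'merged_data.jsonl')"
   ]
  },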
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import json\n",
    "\n",
    "# def remove_title_from_content_simple(jsonl_file_path, output_file_path):\n",
    "#     with open(jsonl_file_path, 'r') as file, open(output_file_path, 'w') as output_file:\n",
    "#         for line in file:\n",
    "#             # Parse the JSON line\n",
    "#             data = json.loads(line)\n",
    "\n",
    "#             content = str(data['content'])\n",
    "#             title = str(data['title'])\n",
    "            \n",
    "#             # Replace the title in the content with an empty string\n",
    "#             # This removes the exact match of the title from the content\n",
    "#             data['content'] = content.replace(title, '', 1)\n",
    "            \n",
    "#             # Write the updated data to the output file\n",
    "#             json.dump(data, output_file)\n",
    "#             output_file.write('\\n')  # Add newline to separate JSON objects\n",
    "\n",
    "# # Example usage\n",
    "# jsonl_file_path = 'combined_data.jsonl'\n",
    "# output_file_path = 'output.jsonl'\n",
    "# remove_title_from_content_simple(jsonl_file_path, output_file_path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}