{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine .csv files into one .jsonl file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'combined_data.jsonl'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Define the path for the uploaded files\n",
    "files = [\n",
    "    \"data/ai-tutor-csv-files/activeloop.csv\",\n",
    "    \"data/ai-tutor-csv-files/advanced_rag_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/filtered_tai_v2.csv\",\n",
    "    \"data/ai-tutor-csv-files/hf_transformers.csv\",\n",
    "    \"data/ai-tutor-csv-files/langchain_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/langchain_docs.csv\",\n",
    "    \"data/ai-tutor-csv-files/llm_course.csv\",\n",
    "    \"data/ai-tutor-csv-files/openai.csv\",\n",
    "    \"data/ai-tutor-csv-files/wiki.csv\"\n",
    "]\n",
    "\n",
    "# Function to load and clean CSV data\n",
    "def load_and_clean_csv(file_path):\n",
    "    # Attempt to load the CSV file\n",
    "    df = pd.read_csv(file_path)\n",
    "    \n",
    "    # Check if the first column is unnamed and drop it if so\n",
    "    if 'Unnamed: 0' in df.columns or df.columns[0] == '':\n",
    "        df = df.drop(df.columns[0], axis=1)\n",
    "        \n",
    "    # Reorder columns based on expected headers\n",
    "    expected_headers = ['title', 'url', 'content', 'source']\n",
    "    df = df[expected_headers]\n",
    "    \n",
    "    return df\n",
    "\n",
    "# Load, clean, and combine all CSV files\n",
    "combined_df = pd.concat([load_and_clean_csv(file) for file in files], ignore_index=True)\n",
    "\n",
    "# Convert to JSON - list of JSON objects\n",
    "combined_json_list = combined_df.to_dict(orient=\"records\")\n",
    "\n",
    "# Save the combined JSON list to a new JSONL file, with each line representing a JSON object\n",
    "output_file_jsonl = \"combined_data.jsonl\"\n",
    "with open(output_file_jsonl, \"w\") as file:\n",
    "    for record in combined_json_list:\n",
    "        file.write(json.dumps(record) + '\\n')\n",
    "\n",
    "output_file_jsonl"
   ]
  },
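  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (illustrative sketch, not part of the original pipeline): read the first few lines of `combined_data.jsonl` back and confirm each record has the expected `title`, `url`, `content`, and `source` keys."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the first few records of the combined JSONL file\n",
    "# (relies on `json` already being imported in the cell above)\n",
    "with open(\"combined_data.jsonl\") as file:\n",
    "    for i, line in enumerate(file):\n",
    "        if i >= 3:\n",
    "            break\n",
    "        record = json.loads(line)\n",
    "        print(list(record.keys()), record[\"title\"])"
   ]
  },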
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge chunks together if they have the same title and url\n",
    "Also prepend the titles into the content for the sources that did not have that."
   ]
  },
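  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is a minimal, illustrative sketch of this step (an assumption, not the final pipeline): it groups records from `combined_data.jsonl` by `(title, url)`, concatenates their `content`, and prepends the title when the content does not already start with it. The output file name `merged_data.jsonl` and the `\\n\\n` separator between merged chunks are assumptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def merge_chunks(input_path, output_path):\n",
    "    # Group records by (title, url); dicts preserve insertion order\n",
    "    merged = {}\n",
    "    with open(input_path, 'r') as file:\n",
    "        for line in file:\n",
    "            record = json.loads(line)\n",
    "            key = (str(record['title']), str(record['url']))\n",
    "            if key in merged:\n",
    "                # Same title and url: append this chunk's content\n",
    "                merged[key]['content'] += '\\n\\n' + str(record['content'])\n",
    "            else:\n",
    "                record['content'] = str(record['content'])\n",
    "                merged[key] = record\n",
    "\n",
    "    # Prepend the title to the content when it is not already there,\n",
    "    # then write one JSON object per line\n",
    "    with open(output_path, 'w') as output_file:\n",
    "        for record in merged.values():\n",
    "            title = str(record['title'])\n",
    "            if not record['content'].startswith(title):\n",
    "                record['content'] = title + '\\n\\n' + record['content']\n",
    "            output_file.write(json.dumps(record) + '\\n')\n",
    "\n",
    "# Example usage (assumed file names)\n",
    "# merge_chunks('combined_data.jsonl', 'merged_data.jsonl')"
   ]
  },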
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import json\n",
    "\n",
    "# def remove_title_from_content_simple(jsonl_file_path, output_file_path):\n",
    "#     with open(jsonl_file_path, 'r') as file, open(output_file_path, 'w') as output_file:\n",
    "#         for line in file:\n",
    "#             # Parse the JSON line\n",
    "#             data = json.loads(line)\n",
    "\n",
    "#             content = str(data['content'])\n",
    "#             title = str(data['title'])\n",
    "            \n",
    "#             # Replace the title in the content with an empty string\n",
    "#             # This removes the exact match of the title from the content\n",
    "#             data['content'] = content.replace(title, '', 1)\n",
    "            \n",
    "#             # Write the updated data to the output file\n",
    "#             json.dump(data, output_file)\n",
    "#             output_file.write('\\n')  # Add newline to separate JSON objects\n",
    "\n",
    "# # Example usage\n",
    "# jsonl_file_path = 'combined_data.jsonl'\n",
    "# output_file_path = 'output.jsonl'\n",
    "# remove_title_from_content_simple(jsonl_file_path, output_file_path)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}