File size: 4,552 Bytes
500b9e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Combine .csv files into one .jsonl file"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'combined_data.jsonl'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import json\n",
"\n",
"# Define the path for the uploaded files\n",
"files = [\n",
" \"data/ai-tutor-csv-files/activeloop.csv\",\n",
" \"data/ai-tutor-csv-files/advanced_rag_course.csv\",\n",
" \"data/ai-tutor-csv-files/filtered_tai_v2.csv\",\n",
" \"data/ai-tutor-csv-files/hf_transformers.csv\",\n",
" \"data/ai-tutor-csv-files/langchain_course.csv\",\n",
" \"data/ai-tutor-csv-files/langchain_docs.csv\",\n",
" \"data/ai-tutor-csv-files/llm_course.csv\",\n",
" \"data/ai-tutor-csv-files/openai.csv\",\n",
" \"data/ai-tutor-csv-files/wiki.csv\"\n",
"]\n",
"\n",
"# Function to load and clean CSV data\n",
"def load_and_clean_csv(file_path):\n",
" # Attempt to load the CSV file\n",
" df = pd.read_csv(file_path)\n",
" \n",
" # Check if the first column is unnamed and drop it if so\n",
" if 'Unnamed: 0' in df.columns or df.columns[0] == '':\n",
" df = df.drop(df.columns[0], axis=1)\n",
" \n",
" # Reorder columns based on expected headers\n",
" expected_headers = ['title', 'url', 'content', 'source']\n",
" df = df[expected_headers]\n",
" \n",
" return df\n",
"\n",
"# Load, clean, and combine all CSV files\n",
"combined_df = pd.concat([load_and_clean_csv(file) for file in files], ignore_index=True)\n",
"\n",
"# Convert to JSON - list of JSON objects\n",
"combined_json_list = combined_df.to_dict(orient=\"records\")\n",
"\n",
"# Save the combined JSON list to a new JSONL file, with each line representing a JSON object\n",
"output_file_jsonl = \"combined_data.jsonl\"\n",
"with open(output_file_jsonl, \"w\") as file:\n",
" for record in combined_json_list:\n",
" file.write(json.dumps(record) + '\\n')\n",
"\n",
"output_file_jsonl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge chunks together if they have the same title and url\n",
"Also prepend the titles into the content for the sources that did not have that."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# import json\n",
"\n",
"# def remove_title_from_content_simple(jsonl_file_path, output_file_path):\n",
"# with open(jsonl_file_path, 'r') as file, open(output_file_path, 'w') as output_file:\n",
"# for line in file:\n",
"# # Parse the JSON line\n",
"# data = json.loads(line)\n",
"\n",
"# content = str(data['content'])\n",
"# title = str(data['title'])\n",
" \n",
"# # Replace the title in the content with an empty string\n",
"# # This removes the exact match of the title from the content\n",
"# data['content'] = content.replace(title, '', 1)\n",
" \n",
"# # Write the updated data to the output file\n",
"# json.dump(data, output_file)\n",
"# output_file.write('\\n') # Add newline to separate JSON objects\n",
"\n",
"# # Example usage\n",
"# jsonl_file_path = 'combined_data.jsonl'\n",
"# output_file_path = 'output.jsonl'\n",
"# remove_title_from_content_simple(jsonl_file_path, output_file_path)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|