Spaces:
Sleeping
Sleeping
update fine-tuning
Browse files
notebooks/08-Finetune_Embedding.ipynb
CHANGED
@@ -27,7 +27,7 @@
|
|
27 |
},
|
28 |
"outputs": [],
|
29 |
"source": [
|
30 |
-
"!pip install -q llama-index==0.10.
|
31 |
]
|
32 |
},
|
33 |
{
|
@@ -114,7 +114,7 @@
|
|
114 |
"id": "6Lua8G8seyEx"
|
115 |
},
|
116 |
"source": [
|
117 |
-
"##
|
118 |
]
|
119 |
},
|
120 |
{
|
@@ -129,12 +129,13 @@
|
|
129 |
},
|
130 |
"outputs": [],
|
131 |
"source": [
|
|
|
132 |
"from llama_index.legacy.readers import SimpleWebPageReader\n",
|
133 |
"\n",
|
134 |
"# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.\n",
|
135 |
"TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)\n",
|
136 |
"VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)\n",
|
137 |
-
"print(len(TRAIN_DOCs), len(VALIDATION_DOCs))"
|
138 |
]
|
139 |
},
|
140 |
{
|
@@ -167,13 +168,20 @@
|
|
167 |
],
|
168 |
"source": [
|
169 |
"from llama_index.core.node_parser import SimpleNodeParser\n",
|
|
|
|
|
|
|
|
|
170 |
"\n",
|
171 |
-
"#
|
172 |
-
"
|
|
|
173 |
"\n",
|
174 |
-
"#
|
|
|
175 |
"TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)\n",
|
176 |
"VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)\n",
|
|
|
177 |
"print(len(TRAIN_NODEs), len(VALIDATION_NODEs))"
|
178 |
]
|
179 |
},
|
|
|
27 |
},
|
28 |
"outputs": [],
|
29 |
"source": [
|
30 |
+
"!pip install -q llama-index==0.10.65 llama-index-finetuning openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11 html2text llama-index-llms-openai llama-index-embeddings-huggingface"
|
31 |
]
|
32 |
},
|
33 |
{
|
|
|
114 |
"id": "6Lua8G8seyEx"
|
115 |
},
|
116 |
"source": [
|
117 |
+
"## Web Page Reader\n"
|
118 |
]
|
119 |
},
|
120 |
{
|
|
|
129 |
},
|
130 |
"outputs": [],
|
131 |
"source": [
|
132 |
+
"\n",
|
133 |
"from llama_index.legacy.readers import SimpleWebPageReader\n",
|
134 |
"\n",
|
135 |
"# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.\n",
|
136 |
"TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)\n",
|
137 |
"VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)\n",
|
138 |
+
"print(len(TRAIN_DOCs), len(VALIDATION_DOCs))\n"
|
139 |
]
|
140 |
},
|
141 |
{
|
|
|
168 |
],
|
169 |
"source": [
|
170 |
"from llama_index.core.node_parser import SimpleNodeParser\n",
|
171 |
+
"from llama_index.core.schema import Document\n",
|
172 |
+
"# Convert legacy documents to the new format\n",
|
173 |
+
"def convert_legacy_to_new(legacy_doc):\n",
|
174 |
+
" return Document(text=legacy_doc.text)\n",
|
175 |
"\n",
|
176 |
+
"# Assuming TRAIN_DOCs and VALIDATION_DOCs are lists of legacy documents\n",
|
177 |
+
"TRAIN_DOCs = [convert_legacy_to_new(doc) for doc in TRAIN_DOCs]\n",
|
178 |
+
"VALIDATION_DOCs = [convert_legacy_to_new(doc) for doc in VALIDATION_DOCs]\n",
|
179 |
"\n",
|
180 |
+
"# Now use the parser\n",
|
181 |
+
"parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
|
182 |
"TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)\n",
|
183 |
"VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)\n",
|
184 |
+
"\n",
|
185 |
"print(len(TRAIN_NODEs), len(VALIDATION_NODEs))"
|
186 |
]
|
187 |
},
|