GaoDalie committed on
Commit
41726e0
·
1 Parent(s): 009d017

update fine-tuning

notebooks/08-Finetune_Embedding.ipynb CHANGED
@@ -27,7 +27,7 @@
    },
    "outputs": [],
    "source": [
-    "!pip install -q llama-index==0.10.57 llama-index-finetuning openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11"
+    "!pip install -q llama-index==0.10.65 llama-index-finetuning openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11 html2text llama-index-llms-openai llama-index-embeddings-huggingface"
    ]
   },
   {
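
The bump from llama-index 0.10.57 to 0.10.65 also pulls in three new packages: html2text, which SimpleWebPageReader needs once html_to_text=True is used below, plus the split-out llama-index-llms-openai and llama-index-embeddings-huggingface integrations. A minimal post-install sanity check, assuming only the distribution names pinned above:

    from importlib.metadata import version

    # Print the resolved versions of the pinned distributions before
    # running the rest of the notebook.
    for dist in ("llama-index", "openai", "tiktoken", "chromadb", "cohere"):
        print(dist, version(dist))
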
@@ -114,7 +114,7 @@
     "id": "6Lua8G8seyEx"
    },
    "source": [
-    "## Read the Page\n"
+    "## Web Page Reader\n"
    ]
   },
   {
@@ -129,12 +129,13 @@
    },
    "outputs": [],
    "source": [
+    "\n",
     "from llama_index.legacy.readers import SimpleWebPageReader\n",
     "\n",
     "# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.\n",
     "TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)\n",
     "VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)\n",
-    "print(len(TRAIN_DOCs), len(VALIDATION_DOCs))"
+    "print(len(TRAIN_DOCs), len(VALIDATION_DOCs))\n"
    ]
   },
   {
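
This cell assumes TRAIN_URLs and VALIDATION_URLs were defined in an earlier cell of the notebook. A sketch of the shape load_data expects; the URLs themselves are placeholders, not the notebook's actual sources:

    # Two disjoint lists of pages: one to build the training set, one
    # held out for validation. The URLs below are illustrative only.
    TRAIN_URLs = [
        "https://example.com/article-1",
        "https://example.com/article-2",
    ]
    VALIDATION_URLs = [
        "https://example.com/article-3",
    ]
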
@@ -167,13 +168,20 @@
    ],
    "source": [
     "from llama_index.core.node_parser import SimpleNodeParser\n",
+    "from llama_index.core.schema import Document\n",
+    "# Convert legacy documents to the new format\n",
+    "def convert_legacy_to_new(legacy_doc):\n",
+    "    return Document(text=legacy_doc.text)\n",
     "\n",
-    "# Define a parser to perform the chunking process.\n",
-    "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
+    "# Assuming TRAIN_DOCs and VALIDATION_DOCs are lists of legacy documents\n",
+    "TRAIN_DOCs = [convert_legacy_to_new(doc) for doc in TRAIN_DOCs]\n",
+    "VALIDATION_DOCs = [convert_legacy_to_new(doc) for doc in VALIDATION_DOCs]\n",
     "\n",
-    "# Apply chunking on the training/validation sets.\n",
+    "# Now use the parser\n",
+    "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
     "TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)\n",
     "VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)\n",
+    "\n",
     "print(len(TRAIN_NODEs), len(VALIDATION_NODEs))"
    ]
   },
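
The added lines fix a schema mismatch: SimpleWebPageReader is imported from llama_index.legacy and returns legacy Document objects, while SimpleNodeParser from llama_index.core expects documents from the core schema. The committed helper copies only the text. A variant that also carries the reader's metadata over, assuming the legacy objects expose a metadata dict as they do in 0.10.x:

    from llama_index.core.schema import Document

    def convert_legacy_to_new(legacy_doc):
        # Copy the body and whatever metadata the legacy reader attached
        # (e.g. the source URL), so nodes stay traceable to their page.
        return Document(
            text=legacy_doc.text,
            metadata=getattr(legacy_doc, "metadata", {}) or {},
        )
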
 
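Downstream of this hunk, the notebook title (08-Finetune_Embedding) and the llama-index-finetuning dependency suggest the chunked nodes feed a QA-pair generator for embedding fine-tuning. A sketch of that step under those assumptions; the LLM choice here is illustrative, not the commit's code:

    from llama_index.finetuning import generate_qa_embedding_pairs
    from llama_index.llms.openai import OpenAI

    # Generate synthetic (question, chunk) pairs from each node set; these
    # act as positives when fine-tuning an embedding model.
    llm = OpenAI(model="gpt-3.5-turbo")  # assumed model choice
    train_dataset = generate_qa_embedding_pairs(nodes=TRAIN_NODEs, llm=llm)
    val_dataset = generate_qa_embedding_pairs(nodes=VALIDATION_NODEs, llm=llm)
    train_dataset.save_json("train_dataset.json")
    val_dataset.save_json("val_dataset.json")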