Spaces:

towardsai-tutors
/

ai-tutor-chatbot

Sleeping

App Files Community

GaoDalie committed on Aug 23, 2024

Commit

41726e0

1 Parent(s): 009d017

update fine-tuning

Browse files

Files changed (1) hide show

notebooks/08-Finetune_Embedding.ipynb +14 -6

notebooks/08-Finetune_Embedding.ipynb CHANGED Viewed

@@ -27,7 +27,7 @@
       },
       "outputs": [],
       "source": [
-        "!pip install -q llama-index==0.10.57 llama-index-finetuning openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11"
       ]
     },
     {
@@ -114,7 +114,7 @@
         "id": "6Lua8G8seyEx"
       },
       "source": [
-        "## Read the Page\n"
       ]
     },
     {
@@ -129,12 +129,13 @@
       },
       "outputs": [],
       "source": [
         "from llama_index.legacy.readers import SimpleWebPageReader\n",
         "\n",
         "# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.\n",
         "TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)\n",
         "VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)\n",
-        "print(len(TRAIN_DOCs), len(VALIDATION_DOCs))"
       ]
     },
     {
@@ -167,13 +168,20 @@
       ],
       "source": [
         "from llama_index.core.node_parser import SimpleNodeParser\n",
         "\n",
-        "# Define a parser to perform the chunking process.\n",
-        "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
         "\n",
-        "# Apply chunking on the training/validation sets.\n",
         "TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)\n",
         "VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)\n",
         "print(len(TRAIN_NODEs), len(VALIDATION_NODEs))"
       ]
     },

       },
       "outputs": [],
       "source": [
+        "!pip install -q llama-index==0.10.65 llama-index-finetuning openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11 html2text llama-index-llms-openai llama-index-embeddings-huggingface"
       ]
     },
     {
         "id": "6Lua8G8seyEx"
       },
       "source": [
+        "## Web Page Reader\n"
       ]
     },
     {
       },
       "outputs": [],
       "source": [
+        "\n",
         "from llama_index.legacy.readers import SimpleWebPageReader\n",
         "\n",
         "# Read the content of webpage into lists. We need two sets of documents for Training, and Validation.\n",
         "TRAIN_DOCs = SimpleWebPageReader(html_to_text=True).load_data(TRAIN_URLs)\n",
         "VALIDATION_DOCs = SimpleWebPageReader(html_to_text=True).load_data(VALIDATION_URLs)\n",
+        "print(len(TRAIN_DOCs), len(VALIDATION_DOCs))\n"
       ]
     },
     {
       ],
       "source": [
         "from llama_index.core.node_parser import SimpleNodeParser\n",
+        "from llama_index.core.schema import Document\n",
+        "# Convert legacy documents to the new format\n",
+        "def convert_legacy_to_new(legacy_doc):\n",
+        "    return Document(text=legacy_doc.text)\n",
         "\n",
+        "# Assuming TRAIN_DOCs and VALIDATION_DOCs are lists of legacy documents\n",
+        "TRAIN_DOCs = [convert_legacy_to_new(doc) for doc in TRAIN_DOCs]\n",
+        "VALIDATION_DOCs = [convert_legacy_to_new(doc) for doc in VALIDATION_DOCs]\n",
         "\n",
+        "# Now use the parser\n",
+        "parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)\n",
         "TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)\n",
         "VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)\n",
+        "\n",
         "print(len(TRAIN_NODEs), len(VALIDATION_NODEs))"
       ]
     },