|
|
|
"""nomic_embedding_rag.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1vAQoZx_07yU0nVCkFxJQkcVeymgNpzFF |
|
""" |
|
|
|
!pip install nomic |
|
!pip install --upgrade langchain |
|
|
|
! nomic login |
|
|
|
! nomic login <YOUR_NOMIC_API_KEY>
|
|
|
! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain |
|
|
|
|
|
import os |
|
|
|
os.environ["LANGCHAIN_TRACING_V2"] = "true" |
|
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" |
|
os.environ["LANGCHAIN_API_KEY"] = "api_key" |
|
|
|
"""## Document Loading |
|
|
|
Let's test 3 interesting blog posts. |
|
""" |
|
|
|
import json

from langchain.docstore.document import Document
|
|
|
|
|
# Minimal loader that wraps a single chat message as a LangChain Document
class JSONLoader:
|
def __init__(self, message): |
|
self.message = message |
|
|
|
def load(self): |
|
|
|
return Document( |
|
page_content=self.message['content'], |
|
metadata={ |
|
'role': self.message['role'], |
|
'conversation_id': self.message['conversation_id'], |
|
'message_id': self.message['message_id'] |
|
} |
|
) |
|
|
|
|
|
file_path = 'RAG_Datos.json' |
|
|
|
with open(file_path, 'r') as file: |
|
data = json.load(file) |
|
|
|
|
|
docs_list = [] |
|
for conversation in data: |
|
for message in conversation['messages']: |
|
docs_list.append(JSONLoader(message).load()) |
|
|
|
|
|
for doc in docs_list: |
|
print(doc.page_content, doc.metadata) |
|
|
|
"""from langchain_community.document_loaders import WebBaseLoader |
|
|
|
urls = [ |
|
"https://lilianweng.github.io/posts/2023-06-23-agent/", |
|
"https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/", |
|
"https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/", |
|
]""" |
|
|
|
"""docs = [WebBaseLoader(url).load() for url in urls]"" |
|
|
|
"""docs_list = [item for sublist in docs for item in sublist] |
|
|
|
|
|
|
|
Long context retrieval, |
|
Chunck_size -> tamaño de cada texto |
|
""" |
|
|
|
# Now docs_list can be used with the text splitter
|
from langchain.text_splitter import CharacterTextSplitter |
|
|
|
text_splitter = CharacterTextSplitter( |
|
chunk_size=7500, chunk_overlap=100 |
|
) |
|
doc_splits = text_splitter.split_documents(docs_list) |
|
|
|
# Inspect the content of the splits (optional)
|
for split in doc_splits: |
|
print(split.page_content, split.metadata) |
|
|
|
import tiktoken |
|
|
|
# Count tokens per split using the tokenizer for gpt-3.5-turbo
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
for d in doc_splits: |
|
print("The document is %s tokens" % len(encoding.encode(d.page_content))) |
|
|
|
""" |
|
|
|
Nomic embeddings [here](https://docs.nomic.ai/reference/endpoints/nomic-embed-text). |
|
""" |
|
|
|
import os |
|
|
|
from langchain_community.vectorstores import Chroma |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_core.runnables import RunnableLambda, RunnablePassthrough |
|
from langchain_nomic.embeddings import NomicEmbeddings
|
|
|
# Add to vectorDB |
|
vectorstore = Chroma.from_documents( |
|
documents=doc_splits, |
|
collection_name="rag-chroma", |
|
embedding=NomicEmbeddings(model="nomic-embed-text-v1"), |
|
) |
|
retriever = vectorstore.as_retriever() |
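"""
Optional sanity check on the retriever (a minimal sketch, not part of the original notebook): query the Chroma index directly and print the top match. The query string is just an example taken from the test set below, and the call assumes the standard LangChain retriever interface.
"""

sample_query = "How to write a function to calculate the factorial of a number?"  # example query
sample_hits = retriever.get_relevant_documents(sample_query)
if sample_hits:
    print(sample_hits[0].page_content, sample_hits[0].metadata)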
|
|
|
""" |
|
|
|
We can use the |
|
""" |
|
|
|
import os |
|
from sklearn.metrics import precision_score, recall_score, f1_score |
|
from nltk.translate.bleu_score import corpus_bleu |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from langchain_openai import ChatOpenAI |
|
from langchain.chains import LLMChain |
|
|
|
# Set the OpenAI API key as an environment variable (replace with your own key)
os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'
|
|
|
# Prompt |
|
template = """Answer the question based only on the following context: |
|
{context} |
|
|
|
Question: {question} |
|
""" |
|
prompt = ChatPromptTemplate.from_template(template) |
|
|
|
# LLM API |
|
model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview") |
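# A minimal sketch (an assumption, not part of the original notebook) of wiring
# the Chroma retriever, the prompt, and the chat model into a RAG chain using
# the RunnableLambda, RunnablePassthrough, and StrOutputParser imported above.
def format_docs(docs):
    # Join retrieved documents into a single context string for the prompt
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | RunnableLambda(format_docs), "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
# Example usage (requires the Nomic and OpenAI API keys set above):
# print(rag_chain.invoke("How to write a function to reverse a string?"))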
|
|
|
# Placeholder for `retriever` used in the evaluation below
# (note: this overrides the Chroma retriever defined above)
|
class DummyRetriever: |
|
def __call__(self, *args, **kwargs): |
|
return {"context": "This is a test context"} |
|
|
|
retriever = DummyRetriever() |
|
|
|
# Build the LLM chain
|
llm_chain = LLMChain( |
|
prompt=prompt, |
|
llm=model, |
|
) |
|
|
|
# Test data
|
test_data = [ |
|
{"context": "Write a Python function to sum all prime numbers up to 1000.", "question": "How to write a function to sum all prime numbers up to 1000?", "expected_answer": "def sum_primes(limit):\n def is_prime(n):\n if n <= 1:\n return False\n for i in range(2, int(n**0.5) + 1):\n if n % i == 0:\n return False\n return True\n return sum(x for x in range(limit) if is_prime(x))\n\nprint(sum_primes(1000))"}, |
|
{"context": "Write a Python function to calculate the factorial of a number.", "question": "How to write a function to calculate the factorial of a number?", "expected_answer": "def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))"}, |
|
{"context": "Write a Python function to check if a number is palindrome.", "question": "How to write a function to check if a number is palindrome?", "expected_answer": "def is_palindrome(n):\n return str(n) == str(n)[::-1]\n\nprint(is_palindrome(121))"}, |
|
{"context": "Write a Python function to generate Fibonacci sequence up to n.", "question": "How to write a function to generate Fibonacci sequence up to n?", "expected_answer": "def fibonacci(n):\n fib_sequence = [0, 1]\n while len(fib_sequence) < n:\n fib_sequence.append(fib_sequence[-1] + fib_sequence[-2])\n return fib_sequence\n\nprint(fibonacci(10))"}, |
|
{"context": "Write a Python function to find the greatest common divisor (GCD) of two numbers.", "question": "How to write a function to find the greatest common divisor (GCD) of two numbers?", "expected_answer": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n\nprint(gcd(48, 18))"}, |
|
{"context": "Write a Python function to check if a string is an anagram of another string.", "question": "How to write a function to check if a string is an anagram of another string?", "expected_answer": "def is_anagram(str1, str2):\n return sorted(str1) == sorted(str2)\n\nprint(is_anagram('listen', 'silent'))"}, |
|
{"context": "Write a Python function to find the maximum element in a list.", "question": "How to write a function to find the maximum element in a list?", "expected_answer": "def find_max(lst):\n return max(lst)\n\nprint(find_max([3, 5, 7, 2, 8]))"}, |
|
{"context": "Write a Python function to reverse a string.", "question": "How to write a function to reverse a string?", "expected_answer": "def reverse_string(s):\n return s[::-1]\n\nprint(reverse_string('hello'))"}, |
|
{"context": "Write a Python function to merge two sorted lists.", "question": "How to write a function to merge two sorted lists?", "expected_answer": "def merge_sorted_lists(lst1, lst2):\n return sorted(lst1 + lst2)\n\nprint(merge_sorted_lists([1, 3, 5], [2, 4, 6]))"}, |
|
{"context": "Write a Python function to remove duplicates from a list.", "question": "How to write a function to remove duplicates from a list?", "expected_answer": "def remove_duplicates(lst):\n return list(set(lst))\n\nprint(remove_duplicates([1, 2, 2, 3, 4, 4, 5]))"}, |
|
] |
|
|
|
# Evaluate retrieval precision, recall, and F1-score
# (DummyRetriever returns the same context for every query, so these scores
# mainly exercise the evaluation pipeline rather than measure real retrieval)
|
retrieved_contexts = [retriever()["context"] for _ in test_data] |
|
expected_contexts = [item["context"] for item in test_data] |
|
precision = precision_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1) |
|
recall = recall_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1) |
|
f1 = f1_score(expected_contexts, retrieved_contexts, average='macro', zero_division=1)
|
|
|
print(f"Retrieval Precision: {precision}") |
|
print(f"Retrieval Recall: {recall}") |
|
print(f"Retrieval F1 Score: {f1}") |
|
|
|
# Evaluate answer generation
|
generated_answers = [] |
|
for item in test_data: |
|
output = llm_chain.run({"context": item["context"], "question": item["question"]}) |
|
generated_answers.append(output) |
|
|
|
# BLEU Score |
|
reference_answers = [[item["expected_answer"].split()] for item in test_data] |
|
generated_answers_tokens = [answer.split() for answer in generated_answers] |
|
bleu_score = corpus_bleu(reference_answers, generated_answers_tokens) |
|
|
|
print(f"BLEU Score: {bleu_score}") |