# Spaces: Running
# import required libraries
import functools
import sys
import textwrap
import uuid

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
# from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
import tensorflow_datasets as tfds
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
import chromadb
import streamlit as st
import yaml

import Utilities as ut
@functools.lru_cache(maxsize=1)
def _load_bart(model_name):
    """Load the BART model and tokenizer for *model_name*, reusing a cached pair.

    ``from_pretrained`` is expensive, so the most recently used
    (model, tokenizer) pair is memoized instead of being rebuilt on every
    ``text_summarizer`` call. Only one pair is kept to bound memory use.
    """
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)
    return model, tokenizer


def text_summarizer(text):
    """Summarize *text* with the configured BART checkpoint.

    The checkpoint name is read from the ``"BART_model"`` entry of the
    mapping returned by ``ut.get_tokens()`` (an earlier revision of this
    code mentioned ``facebook/bart-large-cnn`` as an example value).

    Args:
        text: The text to summarize. The encoded input is truncated to
            1024 tokens.

    Returns:
        The generated summary (generation is bounded to 50-150 tokens),
        wrapped to 80 columns with the lines joined by newlines.
    """
    initdict = ut.get_tokens()
    model, tokenizer = _load_bart(initdict["BART_model"])

    # NOTE(review): the "summarize: " task prefix looks like a T5 convention;
    # BART summarization checkpoints are normally fed the raw text. It is
    # kept here to preserve existing output - confirm whether it is intended.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return "\n".join(textwrap.wrap(summary, width=80))
def load_patentBIGdata(max_records=10):
    """Ingest summarized BIG Patent abstracts into the configured Chroma collection.

    Configuration comes from ``ut.get_tokens()``: ``"embedding_model"``
    (SentenceTransformer model id), ``"dataset_chroma_db"`` (path of the
    persistent Chroma store) and ``"dataset_chroma_db_collection_name"``.

    For up to *max_records* rows of the first 1% of the ``big_patent`` "a"
    validation split, the abstract is summarized with ``text_summarizer``,
    embedded, and stored (summary text + embedding) under a random id.

    Args:
        max_records: Maximum number of dataset rows to ingest. Defaults to
            10, the value that was previously hard-coded.

    Returns:
        None. The collection is modified as a side effect.
    """
    initdict = ut.get_tokens()
    embedding_model_id = initdict["embedding_model"]
    chromadbpath = initdict["dataset_chroma_db"]
    chromadbcollname = initdict["dataset_chroma_db_collection_name"]

    embedding_model = SentenceTransformer(embedding_model_id)
    chroma_client = chromadb.PersistentClient(path=chromadbpath)
    collection = chroma_client.get_or_create_collection(name=chromadbcollname)

    # Load the BIG Patent dataset (config "a", first 1% of the validation split).
    ds = load_dataset(
        "big_patent", "a", split="validation[:1%]", trust_remote_code=True
    )

    # select() is available on map-style datasets across `datasets` releases,
    # whereas Dataset.take() only exists in newer ones; min() also keeps the
    # index range valid if the split has fewer rows than requested.
    sample = ds.select(range(min(max_records, len(ds))))

    # Summarize each abstract (see text_summarizer for the length limits).
    summaries = [text_summarizer(record["abstract"]) for record in sample]
    if not summaries:
        # Nothing to store; avoid calling add() with empty lists.
        return

    # Encode all summaries in one batch and write them with a single add().
    embeddings = embedding_model.encode(summaries).tolist()

    # NOTE(review): ids are the first 8 hex characters of a random UUID, as
    # before. That is only 32 bits of randomness, so collisions become
    # plausible as the collection grows, and every run inserts fresh ids
    # (re-running duplicates the same documents). Consider full UUIDs or
    # content-derived ids if that matters.
    ids = [str(uuid.uuid4())[:8] for _ in summaries]

    collection.add(documents=summaries, embeddings=embeddings, ids=ids)
# ---- Streamlit page: one-button ingestion UI ----
st.title("Patent Ingestion - BIG Patent")

# The form holds only the submit button; `submit_button` is True on the
# script run triggered by pressing it.
with st.form("chat_form"):
    submit_button = st.form_submit_button("Upload BIG Patent data...")

# NOTE(review): the original indentation was lost, so it is unclear whether
# this branch lived inside the form block; it is placed after the form here.
if submit_button:
    # Blocking call: summarizes, embeds and stores the sample records.
    load_patentBIGdata()
    response = "BIG Patent dataset was successfully loaded"
    st.write(response)