Spaces:

gutai123
/

AUTOMATIC_TICKET_CLASSIFICATION_TOOL

Sleeping

App Files Files Community

AUTOMATIC_TICKET_CLASSIFICATION_TOOL / admin_utils.py

gutai123

Update admin_utils.py

bac31f6 verified 7 days ago

raw

history blame

2.93 kB

	from pypdf import PdfReader
	from langchain_huggingface import HuggingFaceEndpoint
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	#from langchain.embeddings import OpenAIEmbeddings #This import has been replaced by the below one :)
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
	#from langchain.llms import OpenAI #This import has been replaced by the below one :)
	from langchain_openai import OpenAI
	#Pinecone team has been making a lot of changes to there code and here is how it should be used going forward :)
	from pinecone import Pinecone as PineconeClient
	#from langchain.vectorstores import Pinecone #This import has been replaced by the below one :)
	from langchain_community.vectorstores import Pinecone
	import pandas as pd
	from sklearn.model_selection import train_test_split




	#********Functions to help you load documents to PINECONE**********

	#Read PDF data
	def read_pdf_data(pdf_file):
	pdf_page = PdfReader(pdf_file)
	text = ""
	for page in pdf_page.pages:
	text += page.extract_text()
	return text

	#Split data into chunks
	def split_data(text):
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
	docs = text_splitter.split_text(text)
	docs_chunks =text_splitter.create_documents(docs)
	return docs_chunks

	#Create embeddings instance
	def create_embeddings_load_data():
	#embeddings = OpenAIEmbeddings()
	embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
	return embeddings

	#Function to push data to Pinecone
	def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,docs):

	PineconeClient(
	api_key=pinecone_apikey,
	environment=pinecone_environment
	)

	index_name = pinecone_index_name
	index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
	return index

	def read_data(data):
	df = pd.read_csv(data,delimiter=',', header=None)
	return df

	#Create embeddings instance
	def get_embeddings():
	embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
	return embeddings

	#Generating embeddings for our input dataset
	def create_embeddings(df,embeddings):
	df[2] = df[0].apply(lambda x: embeddings.embed_query(x))
	return df

	#Splitting the data into train & test
	def split_train_test__data(df_sample):
	# Split into training and testing sets
	sentences_train, sentences_test, labels_train, labels_test = train_test_split(
	list(df_sample[2]), list(df_sample[1]), test_size=0.25, random_state=0)
	print(len(sentences_train))
	return sentences_train, sentences_test, labels_train, labels_test

	#Get the accuracy score on test data
	def get_score(svm_classifier,sentences_test,labels_test):
	score = svm_classifier.score(sentences_test, labels_test)
	return score