giovannefeitosa commited on
Commit
f7db77c
·
1 Parent(s): ecbf46a

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ .venv
3
+ io/data/original-sample.txt
4
+ **/*.pyc
5
+ **/__pycache__
6
+ io/generated/**
7
+ !io/generated/.gitkeep
README.md CHANGED
@@ -1,3 +1,3 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
1
+ # Chatbot about Pele
2
+
3
+ This is a demo project.
commons/Configs.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
class Configs:
    """Central place for project-wide settings (models, generated file paths).

    Instantiating the class has a side effect: it sets the ``PROJECT_ROOT``
    environment variable to the current working directory, and every
    generated-file path below is derived from that value.
    """

    def __init__(self):
        # environment variables
        os.environ["PROJECT_ROOT"] = os.getcwd()
        projectRoot = os.environ["PROJECT_ROOT"]
        # openai
        # Default to the OPENAI_API_KEY environment variable (empty string when
        # unset) so that code which copies this value into ``openai.api_key``
        # does not overwrite an environment-provided key with "".
        self.OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
        self.chatCompletionModel = "gpt-3.5-turbo"
        self.embeddingsModel = "text-embedding-ada-002"
        # generated files
        self.generatedDatasetPath = f"{projectRoot}/io/generated/dataset.json"
        self.generatedEmbeddingsPath = f"{projectRoot}/io/generated/embeddings.json"
        # spacy
        self.spacyModel = 'en_core_web_sm'
        # model
        self.generatedModelPath = f"{projectRoot}/io/generated/model.sklearn"
        # NOTE(review): other modules read ``configs.promptsDir`` and several
        # ``configs.PROMPT_PERSON_*`` attributes that are not assigned here;
        # confirm where those are expected to come from.
19
+
20
+
21
# Shared module-level instance; other modules use it via
# ``from commons.Configs import configs``.
configs = Configs()
commons/File.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+
5
class File:
    """Small helpers for reading and writing the project's text/JSON files.

    All files are opened as UTF-8 explicitly so that non-ASCII text is read
    the same way regardless of the platform's default encoding.
    """

    def readFile(self, file):
        """Return the full text content of the file at path ``file``."""
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()

    def readJsonFile(self, file):
        """Parse the file at path ``file`` as JSON and return the result."""
        with open(file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def writeFile(self, outputFilePath, data):
        """Serialize ``data`` as JSON and write it to ``outputFilePath``.

        Despite the generic name, the payload is always JSON-encoded.
        """
        with open(outputFilePath, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data))

    def exists(self, filePath):
        """Return True when ``filePath`` exists on disk."""
        return os.path.exists(filePath)
20
+
21
+
22
# Shared module-level instance; other modules use it via
# ``from commons.File import file``.
file = File()
commons/Model.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.linear_model import LogisticRegression
2
+ import joblib
3
+ from commons.Configs import configs
4
+ from commons.File import file
5
+
6
+
7
class Model:
    """Train, persist and load the scikit-learn classifier used for answers."""

    def __init__(self, debug=False):
        self.debug = debug

    def train(self, x, y):
        """Fit a LogisticRegression on features ``x`` and labels ``y``.

        Returns the fitted estimator; ``random_state`` is fixed at 42.
        """
        return LogisticRegression(solver='lbfgs', random_state=42).fit(x, y)

    def save(self, clf):
        """Dump ``clf`` with joblib to ``configs.generatedModelPath``."""
        joblib.dump(clf, configs.generatedModelPath)
        print("Model saved to: ", configs.generatedModelPath)

    def load(self):
        """Load and return the estimator stored at ``configs.generatedModelPath``.

        Terminates the process with exit status 1 when the file is missing.
        """
        # Local import keeps this block self-contained. sys.exit() is the
        # programmatic API; the bare exit() helper is injected by the ``site``
        # module for interactive use and may be unavailable.
        import sys

        if not file.exists(configs.generatedModelPath):
            print("Model not found at: ", configs.generatedModelPath)
            sys.exit(1)
        # NOTE: joblib.load unpickles the file; only load model files that
        # were produced by a trusted run of save().
        return joblib.load(configs.generatedModelPath)
24
+
25
+
26
# Shared module-level instance; serve.py uses it via
# ``from commons.Model import model``.
model = Model()
commons/OpenAIClient.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from commons.Configs import configs
3
+ from commons.File import file
4
+ import openai
5
+ from openai.embeddings_utils import cosine_similarity
6
+ import json
7
+
8
+
9
class OpenAIClient:
    """Wrapper around the OpenAI calls used to build and query the dataset."""

    def __init__(self, debug=False):
        self.debug = debug
        # The openai module keeps the key as module-level state.
        openai.api_key = configs.OPENAI_KEY
        self.embeddingsModel = configs.embeddingsModel

    def buildPrompt(self, name, variables):
        """Load ``<name>.prompt.txt`` and substitute ``{KEY}`` placeholders.

        used by prepareutils.Dataset

        ``variables`` maps placeholder names to replacement values; values are
        converted with ``str()`` so numeric settings can be passed directly.
        """
        promptFilePath = os.path.join(configs.promptsDir, f"{name}.prompt.txt")
        prompt = file.readFile(promptFilePath)
        for key, value in variables.items():
            prompt = prompt.replace(f"{{{key}}}", str(value))
        return prompt

    @staticmethod
    def _parseQuestionAnswers(responseText):
        """Turn a ``(Q) ... (A) ...`` formatted reply into question/answer dicts.

        Newlines are removed first, anything before the first ``(Q)`` marker is
        treated as a header and discarded, and entries without an ``(A)``
        marker are dropped. Returns ``[]`` when the reply has no ``(Q)`` marker.
        """
        flattened = responseText.replace("\n", "")
        _header, marker, body = flattened.partition('(Q)')
        if not marker:
            return []
        rows = []
        for chunk in body.split('(Q)'):
            parts = chunk.split('(A)', 1)
            if len(parts) == 2:
                rows.append({"question": parts[0].strip(),
                             "answer": parts[1].strip()})
        return rows

    def generateSyntheticQuestions(self, prompt, debugSentence=""):
        """Use OpenAI completion API to generate synthetic questions for each sentence

        used by prepareutils.Dataset

        Returns a list of ``{"question": ..., "answer": ...}`` dicts parsed
        from the model reply (possibly empty).
        """
        response = openai.ChatCompletion.create(
            model=configs.chatCompletionModel,
            messages=[{"role": "user", "content": prompt}]
        )
        responseText = response['choices'][0]['message']['content']
        jsonData = self._parseQuestionAnswers(responseText)
        if self.debug:
            print("Sentence: ", debugSentence)
            print("Response text: ", responseText)
            print("jsonData: ", json.dumps(jsonData, indent=4))
        return jsonData

    def generateEmbeddings(self, sentences):
        """Return one embedding vector per entry of ``sentences``.

        used by prepareutils.Embeddings
        """
        response = openai.Embedding.create(
            input=sentences,
            model=self.embeddingsModel,
        )
        embeddings = [x['embedding'] for x in response['data']]
        # Sanity check: the API is expected to return one vector per input.
        assert len(embeddings) == len(sentences)
        return embeddings

    def searchBestEmbeddingIndex(self, embeddedQuestion, embeddingsToSearch):
        """Search for the best embedding index

        used by ask.py

        Returns the index in ``embeddingsToSearch`` with the highest cosine
        similarity to ``embeddedQuestion`` (the first one on ties), or 0 when
        ``embeddingsToSearch`` is empty.
        """
        # Start below any possible similarity so that negative scores are
        # still compared against each other instead of defaulting to index 0.
        maxSimilarity = float("-inf")
        maxSimilarityIndex = 0
        for i, embedding in enumerate(embeddingsToSearch):
            similarity = cosine_similarity(embeddedQuestion, embedding)
            if similarity > maxSimilarity:
                maxSimilarity = similarity
                maxSimilarityIndex = i
        return maxSimilarityIndex
87
+
88
+
89
# Shared module-level instance; other modules use it via
# ``from commons.OpenAIClient import openaiClient``. Creating it at import
# time also assigns ``openai.api_key`` from ``configs.OPENAI_KEY``.
openaiClient = OpenAIClient()
commons/SpacyUtils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from commons.Configs import configs
3
+
4
+
5
class SpacyUtils:
    """Sentence-splitting helpers backed by a lazily loaded spaCy pipeline."""

    def __init__(self, debug=False):
        self.debug = debug

    def splitSentences(self, text):
        """Split text into sentences

        Receives a raw text and returns a list with the text of each detected
        sentence, with double-quote characters removed.
        """
        pipeline = self.spacyLoad()
        sentences = []
        for span in pipeline(text).sents:
            sentences.append(str(span.text).replace('"', ''))
        return sentences

    def spacyLoad(self):
        """Load spacy model

        The pipeline named by ``configs.spacyModel`` is loaded on first use
        and stored on the instance as ``spacyInstance``; later calls return
        that stored object.
        """
        try:
            return self.spacyInstance
        except AttributeError:
            self.spacyInstance = spacy.load(configs.spacyModel)
            return self.spacyInstance
22
+
23
+
24
# Shared module-level instance; prepareutils/Dataset.py uses it via
# ``from commons.SpacyUtils import spacyUtils``.
spacyUtils = SpacyUtils()
prepareutils/Dataset.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ from commons.Configs import configs
3
+ from commons.File import file
4
+ from commons.OpenAIClient import openaiClient
5
+ from commons.SpacyUtils import spacyUtils
6
+
7
+
8
class Dataset:
    """Generates and loads the synthetic question/answer dataset."""

    def __init__(self, debug=False):
        self.debug = debug

    def generateDatasetFromFile(self, inputFile):
        """Build the Q/A dataset from the text in ``inputFile``.

        The text is split into sentences, synthetic questions and answers are
        generated for each non-blank sentence, and the combined rows are
        written to ``configs.generatedDatasetPath``
        (default: io/generated/dataset.json).
        """
        outputFile = configs.generatedDatasetPath
        # allQaRows is a list where each item is a dict with
        # {"question", "answer"} keys
        allQaRows = []
        print("Reading input file: ", inputFile)
        text = file.readFile(inputFile)
        # These placeholders are the same for every sentence, so build them
        # once instead of on each loop iteration.
        # NOTE(review): the PROMPT_PERSON_* attributes are not assigned by the
        # Configs.__init__ visible alongside this module; confirm their source.
        personVariables = {
            'NAME': configs.PROMPT_PERSON_NAME,
            'SOCIALNAME': configs.PROMPT_PERSON_SOCIALNAME,
            'TITLE': configs.PROMPT_PERSON_TITLE,
            'HESHEIT': configs.PROMPT_PERSON_HESHEIT,
            'BIRTHDAY': configs.PROMPT_PERSON_BIRTHDAY,
            'DEATHDAY': configs.PROMPT_PERSON_DEATHDAY,
            'BIRTHPLACE': configs.PROMPT_PERSON_BIRTHPLACE,
            'DEATHPLACE': configs.PROMPT_PERSON_DEATHPLACE,
            'NUMBER_OF_QUESTIONS': configs.PROMPT_PERSON_NUMBER_OF_QUESTIONS,
        }
        # split text into sentences and augment each sentence with synthetic
        # questions and answers
        print("Generating questions and answers for each sentence")
        for sent in tqdm(spacyUtils.splitSentences(text)):
            # A whitespace-only sentence carries nothing to ask about; skip it
            # rather than spend an API call on it.
            if not sent.strip():
                continue
            prompt = openaiClient.buildPrompt(
                "generateQuestionsPerson",
                {**personVariables, 'SENTENCE': sent})
            genq = openaiClient.generateSyntheticQuestions(
                prompt, debugSentence=sent)
            allQaRows.extend(genq)
            # debug
            if self.debug:
                for x in genq:
                    print("Sentence: ", sent)
                    print("Q: ", x['question'])
                    print("A: ", x['answer'])
        # save all the generated questions and answers in the dataset file
        print("Writing dataset to file: ", outputFile)
        file.writeFile(outputFile, allQaRows)

    def loadDataset(self):
        """Return the parsed JSON stored at ``configs.generatedDatasetPath``."""
        inputFilePath = configs.generatedDatasetPath
        return file.readJsonFile(inputFilePath)
54
+
55
+
56
# Shared module-level instance; serve.py uses it via
# ``from prepareutils.Dataset import dataset``.
dataset = Dataset()
prepareutils/Embeddings.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import numpy as np
3
+ from commons.Configs import configs
4
+ from commons.File import file
5
+ from commons.OpenAIClient import openaiClient
6
+
7
+
8
class Embeddings:
    """Generates and loads embeddings for the generated Q/A dataset."""

    def __init__(self, debug=False):
        self.debug = debug

    def generateEmbeddings(self):
        """Embed every question/answer pair of the generated dataset.

        Reads ``configs.generatedDatasetPath``, requests embeddings for each
        row's question and answer, and writes a list of
        ``{'question', 'answer', 'label'}`` dicts to
        ``configs.generatedEmbeddingsPath``
        (default: io/generated/embeddings.json). ``label`` is the row's
        position in the dataset.
        """
        inputFilePath = configs.generatedDatasetPath
        outputFilePath = configs.generatedEmbeddingsPath
        dataset = file.readJsonFile(inputFilePath)
        embeddings = []
        print("")
        # for each question/answer row
        for i, qa in enumerate(tqdm(dataset)):
            sentences = [qa['question'], qa['answer']]
            emb = openaiClient.generateEmbeddings(sentences)
            embeddings.append(
                {'question': emb[0], 'answer': emb[1], 'label': i})
            # Per-row output is debug-only, matching how the other classes
            # gate their diagnostic prints on self.debug.
            if self.debug:
                print("Sentence: ", i, sentences)
        # save all the generated embeddings
        print("Writing embeddings to file: ", outputFilePath)
        file.writeFile(outputFilePath, embeddings)

    def loadEmbeddings(self):
        """Load the stored embeddings as NumPy arrays.

        Returns a tuple ``(questionEmbeddings, answerEmbeddings, labels)``
        with dtypes float32, float32 and int32 respectively.
        """
        inputFilePath = configs.generatedEmbeddingsPath
        embeddings = file.readJsonFile(inputFilePath)
        questionEmbeddings = [x['question'] for x in embeddings]
        answerEmbeddings = [x['answer'] for x in embeddings]
        labels = [x['label'] for x in embeddings]
        # float32 is used rather than float16 (the original author reported
        # issues with float16 on GPU).
        return \
            np.array(questionEmbeddings, dtype=np.float32), \
            np.array(answerEmbeddings, dtype=np.float32), \
            np.array(labels, dtype=np.int32)
42
+
43
+
44
# Shared module-level instance of the Embeddings helper.
embeddings = Embeddings()
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # openai
2
+ openai==0.27.2
3
+ scipy
4
+
5
+ # spacy https://pypi.org/project/spacy/
6
+ setuptools
7
+ wheel
8
+ spacy
9
+
10
+ # model
11
+ scikit-learn
12
+ numpy
13
+
14
+ # demo webserver
15
+ gradio
serve.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from commons.Model import model
3
+ from commons.Configs import configs
4
+ from commons.OpenAIClient import openaiClient
5
+ from prepareutils.Dataset import dataset
6
+ import numpy as np
7
+ import openai
8
+
9
# Load the trained classifier once at import time so every request reuses it.
# model.load() terminates the process when the model file is missing.
clf = model.load()
# Load the generated question/answer rows; predict() indexes into this list
# with the label returned by the classifier.
qaDataset = dataset.loadDataset()
13
+
14
+
15
def predict(question, openaiKey):
    """Return the stored answer the classifier selects for ``question``.

    ``openaiKey`` is written to both ``configs.OPENAI_KEY`` and
    ``openai.api_key`` (module-level state) before the question is embedded.
    The classifier's predicted label is used as an index into ``qaDataset``.
    """
    # set openaiKey
    configs.OPENAI_KEY = openai.api_key = openaiKey
    # embed the single question and classify its embedding
    embedded = openaiClient.generateEmbeddings([question])
    predictedLabel = clf.predict([embedded[0]]).item()
    # look up the answer text for the predicted row
    return qaDataset[predictedLabel]["answer"]
26
+
27
+
28
def randomExamples(numberOfExamples=15):
    """Return up to ``numberOfExamples`` distinct random example questions.

    Each example is a one-element list ``[question]`` taken from
    ``qaDataset``. When the dataset has fewer rows than requested, every row
    is returned once (in random order); an empty dataset yields ``[]``.
    """
    # A permutation yields each index at most once (no duplicate examples)
    # and is well-defined for an empty dataset, unlike randint(0, 0, ...).
    shuffledIndexes = np.random.permutation(len(qaDataset))
    return [[qaDataset[index]["question"]]
            for index in shuffledIndexes[:numberOfExamples]]
36
+
37
+
38
# Build and start the Gradio demo. The two text inputs are passed to
# predict() in order (question, OpenAI key) and its return value is shown in
# the text output. randomExamples() is evaluated once, when this statement
# runs, to populate the example questions.
gr.Interface(
    fn=predict,
    inputs=["text", "text"],
    outputs="text",
    examples=randomExamples(),
).launch()