abdulazeezoj committed
Commit
be9a30f
0 Parent(s):

Add alaroye

.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.docx filter=lfs diff=lfs merge=lfs -text
+ docs/**/* filter=lfs diff=lfs merge=lfs -text
+ alaroye/alaroyedb/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ **/.vscode
+ **/coverage
+ **/.env
+ **/.aws
+ **/.ssh
+ **/.DS_Store
+ **/__pycache__
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ FROM python:3.11.3
+
+ # Set the working directory to /code
+ WORKDIR /code
+
+ # Copy Pipfile and Pipfile.lock
+ COPY Pipfile Pipfile.lock ./
+
+ # Upgrade pip and install pipenv
+ RUN pip install --no-cache-dir --upgrade pip \
+     && pip install --no-cache-dir pipenv
+
+ # Install Python dependencies
+ RUN pipenv install --system --deploy --ignore-pipfile
+
+ # Set up a new user named "alaye" with user ID 1000
+ RUN useradd -m -u 1000 alaye
+
+ # Switch to the "alaye" user
+ USER alaye
+
+ # Set HOME to alaye's home directory
+ ENV HOME=/home/alaye \
+     PATH=/home/alaye/.local/bin:$PATH \
+     PYTHONPATH=$HOME/omdenabot \
+     PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/omdenabot
+
+ # Copy the project
+ COPY --chown=alaye . $HOME/omdenabot
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run entrypoint
+ CMD ["python", "src/web.py"]
Makefile ADDED
@@ -0,0 +1,11 @@
+ include .env
+
+ DOCKER_IMAGE ?= alaroye
+ DOCKER_CONTAINER ?= alaroye
+ DOCKER_TAG ?= latest
+
+ docker-build:
+ 	docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
+
+ docker-run:
+ 	docker run --name $(DOCKER_CONTAINER) -it --rm -p 7860:7860 -e OPENAI_API_KEY=$(OPENAI_API_KEY) $(DOCKER_IMAGE):$(DOCKER_TAG)
Pipfile ADDED
@@ -0,0 +1,24 @@
+ [[source]]
+ url = "https://pypi.org/simple"
+ verify_ssl = true
+ name = "pypi"
+
+ [packages]
+ gtts = "*"
+ speechrecognition = "*"
+ pyaudio = "*"
+ langchain = "*"
+ openai = "*"
+ gradio = "*"
+ chromadb = "*"
+ pydub = "*"
+ tiktoken = "*"
+
+ [dev-packages]
+ flake8 = "*"
+ black = "*"
+ ipykernel = "*"
+
+ [requires]
+ python_version = "3.10"
+ python_full_version = "3.10.10"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
alaroye/__init__.py ADDED
File without changes
alaroye/alaroye.py ADDED
@@ -0,0 +1,363 @@
+ import os
+ import sys
+ import textwrap
+ import pathlib
+ from typing import Any, Dict, List
+
+ import gradio as gr
+ from gtts import gTTS
+ from pydub import AudioSegment
+ from pydub.playback import play
+ from io import BytesIO
+ from dotenv import find_dotenv, load_dotenv
+ from langchain.chains import RetrievalQA
+ from langchain.chains.retrieval_qa.base import BaseRetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.document_loaders import DirectoryLoader, TextLoader
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.llms import OpenAI
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.prompts import PromptTemplate
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ import speech_recognition as sr
+
+
+ # Load environment variables
+ load_dotenv(find_dotenv())
+
+
+ class Alaroye:
+     """
+     A bot that answers questions about Omdena
+     """
+
+     def __init__(self, version: str = "v0.0.0"):
+         """
+         Initialize the Alaroye.
+
+         Parameters
+         ----------
+         version : str (default="v0.0.0")
+             The version of the Alaroye.
+         """
+
+         self.embeddings = OpenAIEmbeddings()  # type: ignore
+         self.llm = OpenAI(temperature=0.1)  # type: ignore
+         self.vector_store: Chroma | None = None
+         self.speech_recognizer = sr.Recognizer()
+         self.retrieval_qa: BaseRetrievalQA | None = None
+         self.persist_directory = os.path.join(os.path.dirname(__file__), "alaroyedb")
+         self.prompt_template = """
+         Use the following pieces of context to answer the question delimited by <<< >>>. If you don't know the answer, \
+         just say `I don't know, rephrase the question or contact omdena support on slack or email ([email protected])` \
+         Don't try to make up an answer.
+
+         {context}
+
+         <<<{question}>>>
+         """
+
+         self.prompt = PromptTemplate(
+             template=self.prompt_template, input_variables=["context", "question"]
+         )
+
+         # Load initialization variables
+         self.version = version
+
+     def train(
+         self, doc_dir: str, chunk: bool = True, chunk_size: int = 1000, chunk_overlap: int = 0
+     ) -> None:
+         """
+         Train the Alaroye.
+
+         Parameters
+         ----------
+         doc_dir : str
+             The directory containing the documents to train the Alaroye on.
+         chunk : bool (default=True)
+             Whether to chunk the documents.
+         chunk_size : int (default=1000)
+             The size of the chunks.
+         chunk_overlap : int (default=0)
+             The overlap between chunks.
+
+         """
+
+         # Get the documents to train on
+         documents = self._get_documents(
+             doc_dir, chunk=chunk, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+         )
+
+         # Train the Alaroye
+         self.vector_store = Chroma.from_documents(
+             documents=documents,
+             embedding=self.embeddings,
+             collection_name=f"alaroye-{self.version}",
+             persist_directory=self.persist_directory,
+         )
+
+         # Persist the vectorstore
+         self.vector_store.persist()
+
+         # Create a retrieval QA chain
+         self.retrieval_qa = RetrievalQA.from_chain_type(
+             llm=self.llm,
+             chain_type="stuff",
+             retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
+             return_source_documents=True,
+             chain_type_kwargs={
+                 "prompt": self.prompt,
+             },
+         )
+
+     def load(self) -> None:
+         """
+         Load the vectorstore.
+         """
+
+         # Load the vectorstore
+         self.vector_store = Chroma(
+             embedding_function=self.embeddings,
+             collection_name=f"alaroye-{self.version}",
+             persist_directory=self.persist_directory,
+         )
+
+         # Create a retrieval QA chain
+         self.retrieval_qa = RetrievalQA.from_chain_type(
+             llm=self.llm,
+             chain_type="stuff",
+             retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
+             return_source_documents=True,
+             chain_type_kwargs={
+                 "prompt": self.prompt,
+             },
+         )
+
+     def ask(self, question: str, verbose: bool = False) -> dict[str, Any]:
+         """
+         Ask the Alaroye a question.
+
+         Parameters
+         ----------
+         question : str
+             The question to ask the Alaroye.
+         verbose : bool (default=False)
+             Whether to print the answer.
+
+         Returns
+         -------
+         dict[str, Any]
+             The answer to the question.
+         """
+
+         # Check if the Alaroye has been trained or loaded
+         if self.vector_store is None or self.retrieval_qa is None:
+             raise ValueError("The Alaroye has not been trained or loaded.")
+
+         # Get the answer
+         answer = self.retrieval_qa(question)
+
+         # Format the answer
+         formatted_answer = self._format_answer(answer, verbose=verbose)
+
+         # Return the formatted answer
+         return formatted_answer
+
+     def _listen(self) -> str:
+         """
+         Listen to the user.
+
+         Returns
+         -------
+         str
+             The user's input.
+         """
+
+         # Initialize the microphone
+         mic = sr.Microphone()
+
+         # Listen to the user
+         with mic as source:
+             print("Calibrating microphone...")
+             self.speech_recognizer.adjust_for_ambient_noise(source, duration=5)
+
+             print("Listening...")
+             try:
+                 audio = self.speech_recognizer.listen(source, timeout=5)
+                 print("Recognizing...")
+
+                 text = self.speech_recognizer.recognize_google(audio)
+
+                 return text
+             except Exception:
+                 bad_response = "Sorry, I didn't catch that. Could you repeat yourself?"
+
+                 return bad_response
+
+     def _speak(self, text: str) -> None:
+         """
+         Speak to the user.
+
+         Parameters
+         ----------
+         text : str
+             The text to speak to the user.
+         """
+
+         # Generate speech using gTTS
+         tts = gTTS(text=text, lang="de")
+
+         # Write the speech to bytes
+         mp3_fp = BytesIO()
+         tts.write_to_fp(mp3_fp)
+
+         # Rewind the buffer and play the speech
+         mp3_fp.seek(0)
+         stream = BytesIO(mp3_fp.read())
+         mp3_fp.close()
+         audio = AudioSegment.from_file(stream, format="mp3")
+         play(audio)
+
+     @staticmethod
+     def _get_documents(
+         doc_dir: str, chunk: bool = True, chunk_size: int = 1000, chunk_overlap: int = 0
+     ) -> List[Document]:
+         """
+         Get the documents to train the Alaroye on.
+
+         Parameters
+         ----------
+         doc_dir : str
+             The directory containing the documents to train the Alaroye on.
+         chunk : bool (default=True)
+             Whether to split the documents into chunks.
+         chunk_size : int (default=1000)
+             The size of each chunk in characters.
+         chunk_overlap : int (default=0)
+             The number of characters to overlap between chunks.
+
+         Returns
+         -------
+         List[Document]
+             The documents to train the Alaroye on.
+         """
+
+         # Load the documents
+         docx_loader = DirectoryLoader(
+             doc_dir,
+             glob="./*.txt",
+             loader_cls=TextLoader,  # type: ignore
+         )
+         documents: List[Document] = docx_loader.load()
+
+         # Split the documents into chunks
+         if chunk:
+             document_chunks = Alaroye._split_documents(documents, chunk_size, chunk_overlap)
+
+             return document_chunks
+
+         return documents
+
+     @staticmethod
+     def _split_documents(
+         documents: List[Document], chunk_size: int, chunk_overlap: int
+     ) -> List[Document]:
+         """
+         Split the documents into chunks.
+
+         Parameters
+         ----------
+         documents : List[Document]
+             The documents to split.
+         chunk_size : int
+             The size of each chunk in characters.
+         chunk_overlap : int
+             The number of characters to overlap between chunks.
+
+         Returns
+         -------
+         List[Document]
+             The chunked documents to train the Alaroye on.
+         """
+
+         # Create a text splitter
+         text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+         # Split the documents into chunks
+         document_chunks: List[Document] = text_splitter.split_documents(documents)
+
+         return document_chunks
+
+     @staticmethod
+     def _wrap_text(text: str, width: int = 80) -> str:
+         """
+         Wrap text to a specified width while preserving newlines.
+
+         Parameters
+         ----------
+         text : str
+             The text to wrap.
+         width : int (default=80)
+             The maximum width of a line in characters.
+
+         Returns
+         -------
+         str
+             The wrapped text.
+         """
+
+         # Split the input text into lines based on newline characters
+         lines = text.strip().split("\n")
+
+         # Wrap each line individually
+         wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+
+         # Join the wrapped lines back together using newline characters
+         wrapped_text = "\n".join(wrapped_lines)
+
+         return wrapped_text
+
+     @staticmethod
+     def _format_answer(response: Dict[str, Any], verbose: bool = False) -> dict[str, Any]:
+         """
+         Parse the response from the Alaroye.
+
+         Parameters
+         ----------
+         response : Dict[str, Any]
+             The response from the Alaroye.
+         verbose : bool (default=False)
+             Whether to print the answer and source documents.
+
+         Returns
+         -------
+         dict[str, Any]
+             The parsed response.
+         """
+
+         # Get the answer from the response
+         answer = response["result"]
+         answer = answer.strip()
+
+         # Get the source documents from the response
+         source_documents: List[str] = [
+             source.metadata["source"] for source in response["source_documents"]
+         ]
+
+         # Extract the source document file names
+         source_documents = [os.path.basename(source) for source in source_documents]
+
+         # Filter out duplicate source documents
+         source_documents = list(set(source_documents))
+
+         # Print the answer and source documents
+         if verbose:
+             print(f"Answer: \n{Alaroye._wrap_text(answer)}")
+             print("\n\nSource documents:")
+             for source_document in source_documents:
+                 print(f" - {source_document}")
+
+         return {
+             "answer": answer,
+             "source_documents": source_documents,
+         }
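A minimal usage sketch of the class above (illustrative, not part of the commit): it assumes `OPENAI_API_KEY` is available via the environment or a `.env` file, and that `doc_dir` points at a folder of `.txt` files matching the loader's glob.

```python
from alaroye.alaroye import Alaroye

# One-off: embed the documents and persist the Chroma store under alaroye/alaroyedb/.
bot = Alaroye(version="v0.0.0")
bot.train(doc_dir="docs/", chunk=True, chunk_size=1000, chunk_overlap=0)

# Later runs: reuse the persisted store instead of re-embedding.
bot = Alaroye(version="v0.0.0")
bot.load()

# ask() returns a dict with the answer text and the deduplicated source file names.
result = bot.ask("What is a top talent project?", verbose=True)
print(result["answer"])
print(result["source_documents"])
```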
alaroye/alaroyedb/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba8f8c7ce3183dcece433278eda253ac414602bbf8fc056492f88fcb0f72dc41
+ size 582
alaroye/alaroyedb/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c63ddf58e8724ec72057ac7ff59a8247bd51e0053e37fadc1c33c72e5cce719
+ size 133228
alaroye/alaroyedb/index/id_to_uuid_676957b6-5b85-4306-bd90-8fbd6a25173a.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c9b3099292e9a7715bec1415f43620f076b4815fd3b462df83cf7302b9a3030
+ size 350
alaroye/alaroyedb/index/index_676957b6-5b85-4306-bd90-8fbd6a25173a.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05dacfe0ea852273fa006eba491d99cce965b33f328da42cb36befcf4aa99948
+ size 63044
alaroye/alaroyedb/index/index_metadata_676957b6-5b85-4306-bd90-8fbd6a25173a.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c6688716b16bb645997c219107f1a6f13c20d5ea26fe4165014c682b08f29b5
+ size 103
alaroye/alaroyedb/index/uuid_to_id_676957b6-5b85-4306-bd90-8fbd6a25173a.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5652d13e72f5f9fbf7d9d28dba4f216c927b0c28cd6722c8061f31d549d6d3e
+ size 386
docs/state_of_the_union.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e70b0de092c9cfe2b6158ab1d24b5f472bc879340d4a199dd6f4bf8b5d5091f
+ size 13111
notebooks/0-alaroye-v0.0.0.ipynb ADDED
@@ -0,0 +1,498 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Alaroye\n",
9
+ "\n",
10
+ "```\n",
11
+ "Version: 0.0.0\n",
12
+ "```"
13
+ ]
14
+ },
15
+ {
16
+ "attachments": {},
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "## Create Alaroye"
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "### Import libraries\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 1,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stdout",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "['/Users/abdulazeezoj/Desktop/Devspace/Play/alaroye', '/Users/abdulazeezoj/Desktop/Devspace/Play/alaroye/notebooks', '/Users/abdulazeezoj/.pyenv/versions/3.10.10/lib/python310.zip', '/Users/abdulazeezoj/.pyenv/versions/3.10.10/lib/python3.10', '/Users/abdulazeezoj/.pyenv/versions/3.10.10/lib/python3.10/lib-dynload', '', '/Users/abdulazeezoj/.local/share/virtualenvs/alaroye-YVTdCuTc/lib/python3.10/site-packages']\n"
41
+ ]
42
+ },
43
+ {
44
+ "name": "stderr",
45
+ "output_type": "stream",
46
+ "text": [
47
+ "/Users/abdulazeezoj/.local/share/virtualenvs/alaroye-YVTdCuTc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
48
+ " from .autonotebook import tqdm as notebook_tqdm\n"
49
+ ]
50
+ }
51
+ ],
52
+ "source": [
53
+ "import os\n",
54
+ "import sys\n",
55
+ "\n",
56
+ "# Add the parent directory to the path\n",
57
+ "sys.path.insert(0, os.path.abspath(\"../\"))\n",
58
+ "print(sys.path)\n",
59
+ "\n",
60
+ "from alaroye.alaroye import Alaroye"
61
+ ]
62
+ },
63
+ {
64
+ "attachments": {},
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Create Alaroye"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 2,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "name": "stderr",
78
+ "output_type": "stream",
79
+ "text": [
80
+ "Created a chunk of size 1773, which is longer than the specified 1000\n",
81
+ "Created a chunk of size 1074, which is longer than the specified 1000\n",
82
+ "Created a chunk of size 2134, which is longer than the specified 1000\n",
83
+ "Created a chunk of size 1013, which is longer than the specified 1000\n"
84
+ ]
85
+ }
86
+ ],
87
+ "source": [
88
+ "alaroye = Alaroye(version=\"v0.0.0\")\n",
89
+ "\n",
90
+ "# Train the Osanyin\n",
91
+ "alaroye.train(doc_dir=\"../docs/\", chunk=True, chunk_size=1000, chunk_overlap=0)"
92
+ ]
93
+ },
94
+ {
95
+ "attachments": {},
96
+ "cell_type": "markdown",
97
+ "metadata": {},
98
+ "source": [
99
+ "### Test Alaroye"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 3,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "name": "stdout",
109
+ "output_type": "stream",
110
+ "text": [
111
+ "Answer: \n",
112
+ "Alles gut so weit. Ich habe ein ganz wichtiges Anliegen mit ihnen zu besprechen.\n",
113
+ "\n",
114
+ "\n",
115
+ "Source documents:\n",
116
+ " - state_of_the_union.txt\n"
117
+ ]
118
+ }
119
+ ],
120
+ "source": [
121
+ "# Test the model\n",
122
+ "query = \"Hallo, wie geht es Ihnen? Gut, sehen sie aus. Geht es ihnen gut?\"\n",
123
+ "response = alaroye.ask(query, verbose=True)\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 4,
129
+ "metadata": {},
130
+ "outputs": [
131
+ {
132
+ "name": "stdout",
133
+ "output_type": "stream",
134
+ "text": [
135
+ "Calibrating microphone...\n",
136
+ "Listening...\n",
137
+ "Recognizing...\n",
138
+ "hello\n"
139
+ ]
140
+ }
141
+ ],
142
+ "source": [
143
+ "# Test the model listening to the microphone\n",
144
+ "query = alaroye._listen()\n",
145
+ "print(query)\n",
146
+ "# response = alaroye.ask(query, verbose=True)"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 5,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "Answer: \n",
159
+ "Omdena's terms and conditions state that applicants must be chosen and verified\n",
160
+ "before they can become collaborators, and that Omdena may collect and use\n",
161
+ "personal information for the purposes of administering the project. If there is\n",
162
+ "a conflict between the terms and conditions and the Code of Conduct, the last\n",
163
+ "appearing in the list will take precedence. Applicants must also represent,\n",
164
+ "warrant and undertake that their institution has authorized their entry into the\n",
165
+ "project, and must cease use of the project website and not participate if it is\n",
166
+ "contrary to their institution's policies. They may also be requested to sign a\n",
167
+ "Code of Conduct if chosen as a project participant.\n",
168
+ "\n",
169
+ "\n",
170
+ "Source documents:\n",
171
+ " - Omdena Terms and Conditions.docx\n"
172
+ ]
173
+ }
174
+ ],
175
+ "source": [
176
+ "# Test the model\n",
177
+ "query = \"Summarize the omdena terms and conditions.\"\n",
178
+ "response = alaroye.ask(query, verbose=True)"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 6,
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "Answer: \n",
191
+ "Omdena Top Talent projects are yet another frontier for people to collaborate on\n",
192
+ "solving AI-related problems. Here Omdena selects a smaller team (2-5 people)\n",
193
+ "from its top talents to work on projects. All members of top talent projects are\n",
194
+ "paid as per the market rates.\n",
195
+ "\n",
196
+ "\n",
197
+ "Source documents:\n",
198
+ " - Omdena Top Talent Projects Guidebook.docx\n"
199
+ ]
200
+ }
201
+ ],
202
+ "source": [
203
+ "# Test the model\n",
204
+ "query = \"What is a top talent project?\"\n",
205
+ "response = alaroye.ask(query, verbose=True)"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 7,
211
+ "metadata": {},
212
+ "outputs": [
213
+ {
214
+ "name": "stdout",
215
+ "output_type": "stream",
216
+ "text": [
217
+ "Answer: \n",
218
+ "To apply at Omdena School, you can visit the website www.omdena.com/school and\n",
219
+ "fill out the application form.\n",
220
+ "\n",
221
+ "\n",
222
+ "Source documents:\n",
223
+ " - Omdena Local Chapter Collaborator Onboarding.docx\n",
224
+ " - Omdena Terms and Conditions.docx\n"
225
+ ]
226
+ }
227
+ ],
228
+ "source": [
229
+ "# Test the model\n",
230
+ "query = \"how to apply at Omdenaschool\"\n",
231
+ "response = alaroye.ask(query, verbose=True)"
232
+ ]
233
+ },
234
+ {
235
+ "attachments": {},
236
+ "cell_type": "markdown",
237
+ "metadata": {},
238
+ "source": [
239
+ "## Load Osanyin"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "metadata": {},
245
+ "source": [
246
+ "### Import libraries\n"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": 1,
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "name": "stderr",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "/Users/abdulazeezoj/.local/share/virtualenvs/omdenabot-m2zzZ4nN/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
259
+ " from .autonotebook import tqdm as notebook_tqdm\n"
260
+ ]
261
+ }
262
+ ],
263
+ "source": [
264
+ "import pathlib\n",
265
+ "import sys\n",
266
+ "\n",
267
+ "# Add the parent directory to the path\n",
268
+ "sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))\n",
269
+ "\n",
270
+ "from osanyin.osanyin import Osanyin"
271
+ ]
272
+ },
273
+ {
274
+ "attachments": {},
275
+ "cell_type": "markdown",
276
+ "metadata": {},
277
+ "source": [
278
+ "### Load Osanyin"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 2,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# Load Osanyin\n",
288
+ "osanyin = Osanyin(version=\"v0.0.0\")\n",
289
+ "\n",
290
+ "# Load the Osanyin\n",
291
+ "osanyin.load()"
292
+ ]
293
+ },
294
+ {
295
+ "attachments": {},
296
+ "cell_type": "markdown",
297
+ "metadata": {},
298
+ "source": [
299
+ "### Test Osanyin"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 3,
305
+ "metadata": {},
306
+ "outputs": [
307
+ {
308
+ "name": "stdout",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "Answer: \n",
312
+ "The publishing process for Omdena writers involves submitting an article idea\n",
313
+ "via a form shared in Slack, getting approval to publish the article from the\n",
314
+ "project, submitting a draft of the article if the quality is good enough\n",
315
+ "according to the guidelines, having the Omdena team edit and suggest\n",
316
+ "improvements, and then submitting the manuscript according to the Omdena JAII\n",
317
+ "Template via the journal online submission link at https://omdena.com/JAII/.\n",
318
+ "\n",
319
+ "\n",
320
+ "Source documents:\n",
321
+ " - Omdena Local Chapter Lead Manual.docx\n",
322
+ " - Omdena Writers Guidelines.docx\n",
323
+ " - Omdena AI Researcher Guidebook.docx\n"
324
+ ]
325
+ }
326
+ ],
327
+ "source": [
328
+ "# Test the model\n",
329
+ "query = \"What is the publishing process for omdena writers?\"\n",
330
+ "response = osanyin.ask(query, verbose=True)"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": 4,
336
+ "metadata": {},
337
+ "outputs": [
338
+ {
339
+ "name": "stdout",
340
+ "output_type": "stream",
341
+ "text": [
342
+ "Answer: \n",
343
+ "Rudradeb Mitra is the Founder & CEO of Omdena.\n",
344
+ "\n",
345
+ "\n",
346
+ "Source documents:\n",
347
+ " - Omdena Local Chapter Collaborator Onboarding.docx\n",
348
+ " - Omdena Product Manager QA.docx\n"
349
+ ]
350
+ }
351
+ ],
352
+ "source": [
353
+ "# Test the model\n",
354
+ "query = \"Who is the CEO of omdena?\"\n",
355
+ "response = osanyin.ask(query, verbose=True)"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 5,
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "name": "stdout",
365
+ "output_type": "stream",
366
+ "text": [
367
+ "Answer: \n",
368
+ "Omdena's most unique feature is its focus on collaboration, compassion,\n",
369
+ "curiosity, and consciousness. This approach creates empowerment, builds trust,\n",
370
+ "gives access to data, generates diverse opinions, and spurs innovation, while\n",
371
+ "also making the solutions more ethical.\n",
372
+ "\n",
373
+ "\n",
374
+ "Source documents:\n",
375
+ " - Omdena Local Chapter Collaborator Onboarding.docx\n",
376
+ " - Omdena Local Chapter Lead Manual.docx\n"
377
+ ]
378
+ }
379
+ ],
380
+ "source": [
381
+ "# Test the model\n",
382
+ "query = \"What do you think is MOST unique about Omdena compared to other platforms which build AI (or other software) solutions? Maximum 3 points.\"\n",
383
+ "response = osanyin.ask(query, verbose=True)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 6,
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "name": "stdout",
393
+ "output_type": "stream",
394
+ "text": [
395
+ "Answer: \n",
396
+ "Omdena is a global community of collaborators working together to solve social\n",
397
+ "and environmental issues through technology. We provide a platform for people\n",
398
+ "with the right skills and motivation to come together, learn, share and build\n",
399
+ "solutions. Our approach is to create a community-first innovation model where\n",
400
+ "members feel like a family.\n",
401
+ "\n",
402
+ "\n",
403
+ "Source documents:\n",
404
+ " - Omdena Local Chapter Collaborator Onboarding.docx\n",
405
+ " - Omdena Product Manager QA.docx\n"
406
+ ]
407
+ }
408
+ ],
409
+ "source": [
410
+ "# Test the model\n",
411
+ "query = \"Please write in maximum 3 sentences how you will explain Omdena to someone who has no idea about Omdena.\"\n",
412
+ "response = osanyin.ask(query, verbose=True)"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 7,
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "Answer: \n",
425
+ "I don't know, rephrase the question or contact Omdena support on slack or email\n",
426
+ "([email protected]).\n",
427
+ "\n",
428
+ "\n",
429
+ "Source documents:\n",
430
+ " - Omdena Product Manager Handbook.docx\n"
431
+ ]
432
+ }
433
+ ],
434
+ "source": [
435
+ "# Test the model\n",
436
+ "query = \"What is google?\"\n",
437
+ "response = osanyin.ask(query, verbose=True)"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 8,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "Answer: \n",
450
+ "Machine learning is a subset of artificial intelligence (AI) that enables\n",
451
+ "computers to learn from data and experiences without being explicitly\n",
452
+ "programmed. It uses algorithms to find patterns in data and make decisions with\n",
453
+ "minimal human intervention.\n",
454
+ "\n",
455
+ "\n",
456
+ "Source documents:\n",
457
+ " - Omdena Local Chapter Lead Manual.docx\n",
458
+ " - Omdena AI Researcher Guidebook.docx\n"
459
+ ]
460
+ }
461
+ ],
462
+ "source": [
463
+ "# Test the model\n",
464
+ "query = \"What is machine learning?\"\n",
465
+ "response = osanyin.ask(query, verbose=True)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": null,
471
+ "metadata": {},
472
+ "outputs": [],
473
+ "source": []
474
+ }
475
+ ],
476
+ "metadata": {
477
+ "kernelspec": {
478
+ "display_name": "omdenabot-m2zzZ4nN",
479
+ "language": "python",
480
+ "name": "python3"
481
+ },
482
+ "language_info": {
483
+ "codemirror_mode": {
484
+ "name": "ipython",
485
+ "version": 3
486
+ },
487
+ "file_extension": ".py",
488
+ "mimetype": "text/x-python",
489
+ "name": "python",
490
+ "nbconvert_exporter": "python",
491
+ "pygments_lexer": "ipython3",
492
+ "version": "3.10.10"
493
+ },
494
+ "orig_nbformat": 4
495
+ },
496
+ "nbformat": 4,
497
+ "nbformat_minor": 2
498
+ }
src/cli.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ import sys
+ import textwrap
+ import pathlib
+ import typer
+
+
+ # Add Alaroye to the path
+ sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
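The CLI above is only a stub that wires up the import path. A hypothetical continuation (not in the commit) could expose `Alaroye.ask` as a single Typer command; the `app` object, the `ask` command, and its options below are assumptions for illustration.

```python
# Hypothetical continuation of src/cli.py: wrap Alaroye.ask in a Typer command,
# assuming the persisted Chroma store already exists (i.e. train() was run once).
import pathlib
import sys

import typer

sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))

from alaroye.alaroye import Alaroye  # noqa: E402

app = typer.Typer()


@app.command()
def ask(question: str, version: str = "v0.0.0", verbose: bool = False) -> None:
    """Ask Alaroye a question from the command line."""
    bot = Alaroye(version=version)
    bot.load()  # reuse the persisted vectorstore
    result = bot.ask(question, verbose=verbose)
    if not verbose:
        typer.echo(result["answer"])


if __name__ == "__main__":
    app()
```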
src/web.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ import sys
+ import pathlib
+ import gradio as gr
+
+ # Add Alaroye to the path
+ sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
+
+ from alaroye.alaroye import Alaroye
+
+
+ # Initialize the Alaroye bot
+ alaroye = Alaroye(version="v0.0.0")
+
+ # Load the vectorstore
+ alaroye.load()
+
+
+ def add_text(history, text):
+     history = history + [(text, None)]
+     return history, gr.update(value="", interactive=False)
+
+
+ def bot(history):
+     # Generate a response
+     response = alaroye.ask(history[-1][0])
+
+     # Get the answer
+     answer = response.get(
+         "answer",
+         "Sorry, I don't know that.",
+     )
+
+     # Check if the answer is out of context
+
+     # Update the history
+     history[-1][1] = answer
+
+     return history
+
+
+ with gr.Blocks(title="OmdenaBot") as demo:
+     chatbot = gr.Chatbot([], elem_id="chatbot", label="Osanyin").style(height=750)
+
+     with gr.Row():
+         with gr.Column(scale=0.85):
+             txt = gr.Textbox(
+                 show_label=False,
+                 placeholder="Enter text and press enter, or upload an image",
+             ).style(container=False)
+         with gr.Column(scale=0.15, min_width=0):
+             btn = gr.Button(value="Send")
+
+     # Button click
+     btn_msg = btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     btn_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+
+     # Textbox enter
+     txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+
+
+ if __name__ == "__main__":
+     demo.launch()
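As a rough smoke test of the two callbacks, one could exercise `add_text` and `bot` directly before launching the UI. The snippet below is a sketch, not part of the commit, and assumes it runs inside `src/web.py` after `alaroye.load()` (for example under the `__main__` guard), with a valid `OPENAI_API_KEY`.

```python
# Hypothetical quick check: drive the chat callbacks without the Gradio UI.
history, _update = add_text([], "What is a top talent project?")
history = [list(turn) for turn in history]  # bot() assigns into the last turn
history = bot(history)
print(history[-1][1])  # the generated answer
```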