khointn committed on
Commit
b1fe073
1 Parent(s): 6451cea

Upload folder using huggingface_hub

.gitignore CHANGED
@@ -178,3 +178,4 @@ coverage_report/
 local_data/
 models/
 .DS_Store
+/app/ui/multimodalchatbot
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: discord-bot
-app_file: app/main.py
+app_file: app/__main__.py
 sdk: gradio
-sdk_version: 4.33.0
+sdk_version: 4.26.0
 ---
 # Capstone Project
 
@@ -40,6 +40,10 @@ Download embedding and(or) LLM models
 ```shell
 bash prestart.sh
 ```
+Download web images and create indices
+```shell
+bash index.sh
+```
 
 ### Install `pre-commit` hooks
 
app/_config.py CHANGED
@@ -15,8 +15,9 @@ class Settings(BaseSettings):
 
     WEAVIATE_CLIENT_URL: str = "http://localhost:8080"
 
-    LLM_MODE: Literal["openai", "mock", "local"] = "mock"
-    EMBEDDING_MODE: Literal["openai", "mock", "local"] = "mock"
+    LLM_MODE: Literal["openai", "mock", "local"] = "local"
+    EMBEDDING_MODE: Literal["openai", "mock", "local"] = "local"
+    IMG_DATASET: Literal["growstuff"] = "growstuff"
 
     LOCAL_DATA_FOLDER: str = "local_data/test"
 
@@ -44,11 +45,6 @@ class Settings(BaseSettings):
     IS_UI_ENABLED: bool = True
     UI_PATH: str = "/"
 
-    # Rerank
-    IS_RERANK_ENABLED: bool = True
-    RERANK_TOP_N: int = 3
-    RERANK_MODEL_NAME: str = "cross-encoder/ms-marco-MiniLM-L-2-v2"
-
     class Config:
         case_sensitive = True
         env_file_encoding = "utf-8"
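Note: this commit flips the default `LLM_MODE`/`EMBEDDING_MODE` to `local` and introduces `IMG_DATASET`. A minimal sketch of overriding these at runtime, assuming `Settings` behaves like a standard pydantic `BaseSettings` (reading matching environment variables before falling back to the defaults above):

```python
# Sketch, not from the repo: set the env vars before the settings module is
# first imported, since app._config instantiates Settings() at import time.
import os

os.environ["LLM_MODE"] = "mock"        # names are case-sensitive, per Config
os.environ["EMBEDDING_MODE"] = "mock"

from app._config import settings

print(settings.LLM_MODE)      # -> "mock" (env var wins over the "local" default)
print(settings.IMG_DATASET)   # -> "growstuff", the only allowed literal value
```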
app/components/llm/component.py CHANGED
@@ -42,7 +42,7 @@ class LLMComponent:
            # set to at least 1 to use GPU
            # set to -1 for all gpu
            # set to 0 for cpu
-           model_kwargs={"n_gpu_layers": 0},
+           model_kwargs={"n_gpu_layers": -1},
            # transform inputs into Llama2 format
            messages_to_prompt=messages_to_prompt,
            completion_to_prompt=completion_to_prompt,
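For context on the `n_gpu_layers` flip: in llama-cpp-python, `0` keeps every layer on the CPU and `-1` offloads all layers to the GPU. A hedged sketch of how the wrapper above is typically constructed with llama-index 0.9.x (the model path and context size here are assumptions, not from this repo):

```python
from llama_index.llms import LlamaCPP

llm = LlamaCPP(
    model_path="models/model.gguf",      # hypothetical local GGUF file
    context_window=4096,                 # assumed; not shown in this diff
    model_kwargs={"n_gpu_layers": -1},   # -1 = offload all layers, as this commit sets
)
```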
app/main.py CHANGED
@@ -1,7 +1,6 @@
 import logging
 
 from fastapi import FastAPI
-import gradio as gr
 
 from app._config import settings
 from app.components.embedding.component import EmbeddingComponent
@@ -37,6 +36,3 @@ if settings.IS_UI_ENABLED:
 
     ui = PrivateGptUi(ingest_service, chat_service)
     ui.mount_in_app(app, settings.UI_PATH)
-
-    io = gr.Interface(lambda x: "Hello, " + x + "!", "textbox", "textbox")
-    app = gr.mount_gradio_app(app, io, settings.UI_PATH)
app/server/chat/service.py CHANGED
@@ -3,11 +3,9 @@ from dataclasses import dataclass
 from llama_index import ServiceContext, StorageContext, VectorStoreIndex
 from llama_index.chat_engine import ContextChatEngine
 from llama_index.chat_engine.types import BaseChatEngine
-from llama_index.core.postprocessor import SentenceTransformerRerank
 from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
 from llama_index.llms import ChatMessage, MessageRole
 
-from app._config import settings
 from app.components.embedding.component import EmbeddingComponent
 from app.components.llm.component import LLMComponent
 from app.components.node_store.component import NodeStoreComponent
@@ -79,20 +77,13 @@ class ChatService:
             index=self.index
         )
 
-        node_postprocessors = [
-            MetadataReplacementPostProcessor(target_metadata_key="window")
-        ]
-        if settings.IS_RERANK_ENABLED:
-            rerank = SentenceTransformerRerank(
-                top_n=settings.RERANK_TOP_N, model=settings.RERANK_MODEL_NAME
-            )
-            node_postprocessors.append(rerank)
-
         return ContextChatEngine.from_defaults(
             system_prompt=system_prompt,
             retriever=vector_index_retriever,
             service_context=self.service_context,
-            node_postprocessors=node_postprocessors,
+            node_postprocessors=[
+                MetadataReplacementPostProcessor(target_metadata_key="window"),
+            ],
         )
 
     def chat(self, messages: list[ChatMessage]):
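The rerank path is dropped here, leaving only `MetadataReplacementPostProcessor`. As a minimal illustration of what that postprocessor does (a sketch assuming llama-index 0.9.x APIs): it swaps each retrieved node's text for the value stored under the target metadata key, which is how sentence-window retrieval expands a matched sentence into its surrounding context.

```python
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.schema import NodeWithScore, TextNode

# Hypothetical node shaped like the ones the sentence-window parser produces.
node = TextNode(
    text="matched sentence",
    metadata={"window": "matched sentence plus its surrounding sentences"},
)
postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
result = postprocessor.postprocess_nodes([NodeWithScore(node=node, score=1.0)])
print(result[0].node.text)  # -> "matched sentence plus its surrounding sentences"
```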
app/ui/ui.py CHANGED
@@ -1,14 +1,18 @@
 """This file should be imported only and only if you want to run the UI locally."""
+
 import itertools
 import logging
 from pathlib import Path
+import subprocess
 from typing import Any
+import os
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 import gradio as gr
 from fastapi import FastAPI
 from gradio.themes.utils.colors import slate
-from llama_index.llms import ChatMessage, MessageRole
 
+from llama_index.llms import MessageRole, ChatMessage
 from app._config import settings
 from app.components.embedding.component import EmbeddingComponent
 from app.components.llm.component import LLMComponent
@@ -18,6 +22,7 @@ from app.enums import PROJECT_ROOT_PATH
 from app.server.chat.service import ChatService
 from app.server.ingest.service import IngestService
 from app.ui.schemas import Source
+from app.paths import local_data_path
 
 logger = logging.getLogger(__name__)
 
@@ -28,6 +33,9 @@ UI_TAB_TITLE = "Agriculture Chatbot"
 
 SOURCES_SEPARATOR = "\n\n Sources: \n"
 
+model_name = "VietAI/envit5-translation"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
 class PrivateGptUi:
     def __init__(
@@ -40,11 +48,43 @@ class PrivateGptUi:
 
         # Cache the UI blocks
         self._ui_block = None
-
         # Initialize system prompt
         self._system_prompt = self._get_default_system_prompt()
 
-    def _chat(self, message: str, history: list[list[str]], *_: Any) -> Any:
+    def _chat(
+        self,
+        message: str,
+        history: list[list[str]],
+        upload_button: Any,
+        system_prompt_input: Any,
+        # show_image: bool,
+    ) -> Any:
+        # logger.info(f"Show image = {show_image}")
+        if "#ảnh" in message:
+            message = message.replace("#ảnh", "")
+            vi_message = "vi: " + message
+            outputs = model.generate(tokenizer([vi_message], return_tensors="pt", padding=True).input_ids, max_length=512)
+            en_message = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].replace("en:", "")
+            command = f"""
+            cd {local_data_path}
+            clip-retrieval filter --query "{en_message}" --output_folder "retrieved_folder" --indice_folder "index_folder" --num_results 1
+            """
+            logger.info(command)
+            subprocess.run(command, shell=True, check=True)
+
+            folder_path = f"{local_data_path}/retrieved_folder"
+            files = os.listdir(folder_path)
+            # Sort images by most recently retrieved; keep older images so they still show in chat history
+            files.sort(
+                key=lambda x: os.path.getctime(os.path.join(folder_path, x)),
+                reverse=True,
+            )
+            newest_image = files[0]
+            logger.info(f"Retrieved image {newest_image}")
+
+            return (os.path.relpath(f"{folder_path}/{newest_image}", PROJECT_ROOT_PATH),)
+
+
         def build_history() -> list[ChatMessage]:
             history_messages: list[ChatMessage] = list(
                 itertools.chain(
@@ -53,7 +93,11 @@ class PrivateGptUi:
                            ChatMessage(content=interaction[0], role=MessageRole.USER),
                            ChatMessage(
                                # Remove the Sources information from the history content
-                               content=interaction[1].split(SOURCES_SEPARATOR)[0],
+                               content=(
+                                   "[Image Output]"
+                                   if isinstance(interaction[1], tuple)
+                                   else (interaction[1]).split(SOURCES_SEPARATOR)[0]
+                               ),
                                role=MessageRole.ASSISTANT,
                            ),
                        ]
@@ -142,6 +186,12 @@ class PrivateGptUi:
 
         with gr.Row(equal_height=False):
             with gr.Column(scale=3):
+                # image_checkbox = gr.Checkbox(
+                #     label="Show Image",
+                #     info="Do you want to output a relevant image?",
+                #     value=False,
+                #     interactive=True,
+                # )
                 upload_button = gr.components.UploadButton(
                     "Upload File(s)",
                     type="filepath",
@@ -172,7 +222,6 @@ class PrivateGptUi:
                     interactive=True,
                     render=False,
                 )
-
                 # On blur, set system prompt to use in queries
                 system_prompt_input.blur(
                     self._set_system_prompt,
@@ -192,7 +241,11 @@ class PrivateGptUi:
                         AVATAR_BOT,
                     ),
                 ),
-                additional_inputs=[upload_button, system_prompt_input],
+                additional_inputs=[
+                    upload_button,
+                    system_prompt_input,
+                    # image_checkbox,
+                ],
             )
         return blocks
 
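The `#ảnh` ("#image") command added to `_chat` above translates the Vietnamese query to English before image retrieval. A standalone sketch of just that translation step, using the same model the commit loads: envit5-translation expects a `vi: ` prefix on its input and emits an `en: ` prefix on its output (the example query is illustrative, not from the repo).

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "VietAI/envit5-translation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# "cây cà chua bị vàng lá" = "tomato plant with yellowing leaves"
inputs = tokenizer(["vi: cây cà chua bị vàng lá"], return_tensors="pt", padding=True)
outputs = model.generate(inputs.input_ids, max_length=512)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
# -> "en: ..." — _chat strips the "en:" prefix before querying clip-retrieval
```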
imgs.py ADDED
@@ -0,0 +1,29 @@
+from app._config import settings
+import logging
+import requests
+
+logger = logging.getLogger(__name__)
+match settings.IMG_DATASET:
+    case "growstuff":
+        global urls, keyword
+        urls = [
+            "https://www.growstuff.org/harvests.json",
+            "https://www.growstuff.org/crops.json",
+            "https://www.growstuff.org/seeds.json"
+        ]
+        keyword = "thumbnail_url"
+
+thumbnail_urls = set()
+
+for url in urls:
+    response = requests.get(url)
+    if response.status_code == 200:
+        data = response.json()['query']
+        thumbnail_urls.update(item.get(keyword) for item in data if item and item.get(keyword))
+    else:
+        logger.info(f"Failed to retrieve data from {url}.")
+thumbnail_urls = list(thumbnail_urls)
+with open(f'{settings.LOCAL_DATA_FOLDER}/myimglist.txt', 'w') as file:
+    for url in thumbnail_urls:
+        file.write(url + '\n')
+logger.info(f"Retrieved {len(thumbnail_urls)} image urls and wrote them to {settings.LOCAL_DATA_FOLDER}/myimglist.txt")
index.sh ADDED
@@ -0,0 +1,22 @@
+#! /usr/bin/env bash
+
+# pull image urls from the web
+python imgs.py
+local_data_folder=$(python -c "from app._config import settings; print(settings.LOCAL_DATA_FOLDER)")
+
+cd "$local_data_folder"
+# remove the folders if they already exist
+rm -rf embeddings_folder
+rm -rf image_folder
+rm -rf index_folder
+rm -rf retrieved_folder
+
+echo "download image urls into image folder"
+img2dataset --url_list=myimglist.txt --output_folder=image_folder --thread_count=64 --image_size=256
+
+echo "create embedding folder"
+# change --num_prepro_workers > 0 to enable multiprocessing
+clip-retrieval inference --input_dataset image_folder --output_folder embeddings_folder --enable_text False --num_prepro_workers 0
+
+echo "create indices from embedding folder"
+clip-retrieval index --embeddings_folder embeddings_folder --index_folder index_folder
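For completeness, a hedged Python equivalent of the pipeline above, mirroring how `ui.py` already shells out to clip-retrieval; the commands and folder names are taken verbatim from `index.sh`:

```python
import subprocess
from app._config import settings

steps = [
    "img2dataset --url_list=myimglist.txt --output_folder=image_folder --thread_count=64 --image_size=256",
    "clip-retrieval inference --input_dataset image_folder --output_folder embeddings_folder --enable_text False --num_prepro_workers 0",
    "clip-retrieval index --embeddings_folder embeddings_folder --index_folder index_folder",
]
for step in steps:
    # run each stage inside the local data folder, as index.sh does via `cd`
    subprocess.run(step, shell=True, check=True, cwd=settings.LOCAL_DATA_FOLDER)
```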
pyproject.toml CHANGED
@@ -4,7 +4,10 @@ version = "0.1.0"
 description = ""
 authors = ["PhucVu <[email protected]>"]
 readme = "README.md"
-
+packages = [
+    { include = "app" },
+    { include = "app/**/*.py" },
+]
 [tool.poetry.dependencies]
 python = "^3.10"
 llama-index = "^0.9.22"
@@ -15,15 +18,14 @@ uvicorn = "^0.25.0"
 pydantic = "^2.5.3"
 gradio = "^4.12.0"
 
-# reranker
-torch = {version="^2.3.0", optional=true}
-sentence-transformers = {version="^2.7.0", optional=true}
-
 [tool.poetry.group.local]
 optional = true
 [tool.poetry.group.local.dependencies]
+torch = "1.13.1"
+clip-retrieval = "^2.44.0"
+img2dataset = "^1.44.1"
+ipython = "^8.20.0"
 transformers = "^4.36.2"
-torch = "^2.1.2"
 llama-cpp-python = "^0.2.29"
 
 [build-system]