added support for HF TGI
Browse files
- .env.example +1 -0
- .gitignore +2 -0
- app_modules/llm_loader.py +14 -0
- requirements.txt +2 -1
- tgi.sh +9 -0
.env.example
CHANGED
@@ -6,6 +6,7 @@ LLM_MODEL_TYPE=huggingface
|
|
6 |
# LLM_MODEL_TYPE=mosaicml
|
7 |
# LLM_MODEL_TYPE=stablelm
|
8 |
# LLM_MODEL_TYPE=openllm
|
|
|
9 |
|
10 |
OPENLLM_SERVER_URL=http://localhost:64300
|
11 |
|
|
|
6 |
# LLM_MODEL_TYPE=mosaicml
|
7 |
# LLM_MODEL_TYPE=stablelm
|
8 |
# LLM_MODEL_TYPE=openllm
|
9 |
+
# LLM_MODEL_TYPE=hftgi
|
10 |
|
11 |
OPENLLM_SERVER_URL=http://localhost:64300
|
12 |
|
.gitignore
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
pdfs
|
2 |
.vscode/
|
|
|
|
|
3 |
|
4 |
# Byte-compiled / optimized / DLL files
|
5 |
__pycache__/
|
|
|
1 |
pdfs
|
2 |
.vscode/
|
3 |
+
data/version.txt
|
4 |
+
data/models*
|
5 |
|
6 |
# Byte-compiled / optimized / DLL files
|
7 |
__pycache__/
|
app_modules/llm_loader.py
CHANGED
@@ -5,6 +5,7 @@ from queue import Queue
|
|
5 |
from typing import Any, Optional
|
6 |
|
7 |
import torch
|
|
|
8 |
from langchain.callbacks.base import BaseCallbackHandler
|
9 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
10 |
from langchain.chat_models import ChatOpenAI
|
@@ -188,6 +189,19 @@ class LLMLoader:
|
|
188 |
verbose=True,
|
189 |
use_mlock=True,
|
190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
elif self.llm_model_type.startswith("huggingface"):
|
192 |
MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
|
193 |
print(f" loading model: {MODEL_NAME_OR_PATH}")
|
|
|
5 |
from typing import Any, Optional
|
6 |
|
7 |
import torch
|
8 |
+
from langchain import HuggingFaceTextGenInference
|
9 |
from langchain.callbacks.base import BaseCallbackHandler
|
10 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
11 |
from langchain.chat_models import ChatOpenAI
|
|
|
189 |
verbose=True,
|
190 |
use_mlock=True,
|
191 |
)
|
192 |
+
elif self.llm_model_type == "hftgi":
|
193 |
+
HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
|
194 |
+
self.llm = HuggingFaceTextGenInference(
|
195 |
+
inference_server_url=HFTGI_SERVER_URL,
|
196 |
+
max_new_tokens=self.max_tokens_limit / 2,
|
197 |
+
top_k=10,
|
198 |
+
top_p=0.95,
|
199 |
+
typical_p=0.95,
|
200 |
+
temperature=0.01,
|
201 |
+
repetition_penalty=1.03,
|
202 |
+
callbacks=callbacks,
|
203 |
+
streaming=True,
|
204 |
+
)
|
205 |
elif self.llm_model_type.startswith("huggingface"):
|
206 |
MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
|
207 |
print(f" loading model: {MODEL_NAME_OR_PATH}")
|
requirements.txt
CHANGED
@@ -34,4 +34,5 @@ pypdf
|
|
34 |
python-telegram-bot
|
35 |
transformers_stream_generator
|
36 |
openllm
|
37 |
-
openllm[llama]
|
|
|
|
34 |
python-telegram-bot
|
35 |
transformers_stream_generator
|
36 |
openllm
|
37 |
+
openllm[llama]
|
38 |
+
text_generation
|
tgi.sh
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/sh
# Start a Hugging Face Text Generation Inference (TGI) container serving the
# model named in HUGGINGFACE_MODEL_NAME_OR_PATH.
#
# Reads HUGGINGFACE_AUTH_TOKEN from the caller's environment and forwards it
# to the container as HUGGING_FACE_HUB_TOKEN.

export HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"

echo "Running $HUGGINGFACE_MODEL_NAME_OR_PATH with TGI"

# share a volume with the Docker container to avoid downloading weights every run
volume="$PWD/data"

# All expansions are quoted so that a working directory (or any value)
# containing spaces or glob characters is passed to docker as one argument.
# Container port 80 is published on host port 8081.
docker run \
    -e HUGGING_FACE_HUB_TOKEN="$HUGGINGFACE_AUTH_TOKEN" \
    --shm-size 1g \
    -p 8081:80 \
    -v "$volume:/data" \
    ghcr.io/huggingface/text-generation-inference:1.0.0 \
    --model-id "$HUGGINGFACE_MODEL_NAME_OR_PATH"
|