inflaton committed
Commit 6b398e8 · 1 Parent(s): 5cc0091

added support for HF TGI
Files changed (5):
  1. .env.example +1 -0
  2. .gitignore +2 -0
  3. app_modules/llm_loader.py +14 -0
  4. requirements.txt +2 -1
  5. tgi.sh +9 -0
.env.example CHANGED
@@ -6,6 +6,7 @@ LLM_MODEL_TYPE=huggingface
 # LLM_MODEL_TYPE=mosaicml
 # LLM_MODEL_TYPE=stablelm
 # LLM_MODEL_TYPE=openllm
+# LLM_MODEL_TYPE=hftgi
 
 OPENLLM_SERVER_URL=http://localhost:64300
 
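Note that the new loader branch (app_modules/llm_loader.py below) also reads HFTGI_SERVER_URL from the environment, which this commit does not add to .env.example. A working .env for the hftgi path would presumably look like the following two lines, where the URL matches the 8081:80 port mapping in tgi.sh and both values are illustrative:

LLM_MODEL_TYPE=hftgi
HFTGI_SERVER_URL=http://localhost:8081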
.gitignore CHANGED
@@ -1,5 +1,7 @@
 pdfs
 .vscode/
+data/version.txt
+data/models*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
app_modules/llm_loader.py CHANGED
@@ -5,6 +5,7 @@ from queue import Queue
 from typing import Any, Optional
 
 import torch
+from langchain import HuggingFaceTextGenInference
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.chat_models import ChatOpenAI
@@ -188,6 +189,19 @@ class LLMLoader:
                 verbose=True,
                 use_mlock=True,
             )
+        elif self.llm_model_type == "hftgi":
+            HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
+            self.llm = HuggingFaceTextGenInference(
+                inference_server_url=HFTGI_SERVER_URL,
+                max_new_tokens=self.max_tokens_limit / 2,
+                top_k=10,
+                top_p=0.95,
+                typical_p=0.95,
+                temperature=0.01,
+                repetition_penalty=1.03,
+                callbacks=callbacks,
+                streaming=True,
+            )
         elif self.llm_model_type.startswith("huggingface"):
             MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
             print(f" loading model: {MODEL_NAME_OR_PATH}")
requirements.txt CHANGED
@@ -34,4 +34,5 @@ pypdf
 python-telegram-bot
 transformers_stream_generator
 openllm
-openllm[llama]
+openllm[llama]
+text_generation
tgi.sh ADDED
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+export HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
+
+echo Running $HUGGINGFACE_MODEL_NAME_OR_PATH with TGI
+
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -e HUGGING_FACE_HUB_TOKEN=$HUGGINGFACE_AUTH_TOKEN --shm-size 1g -p 8081:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.0 --model-id $HUGGINGFACE_MODEL_NAME_OR_PATH
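Once the container is up, the server can be smoke-tested directly with the text_generation client that this commit adds to requirements.txt. A minimal check, assuming the 8081:80 port mapping above (the prompt and token budget are illustrative):

# Quick smoke test for the TGI container started by tgi.sh, using the
# text_generation client added in requirements.txt. The URL assumes the
# 8081:80 port mapping above; prompt and max_new_tokens are illustrative.
from text_generation import Client

client = Client("http://localhost:8081")
response = client.generate("What is Deep Learning?", max_new_tokens=64)
print(response.generated_text)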