import streamlit as st
from llama_cpp import Llama
import warnings
warnings.filterwarnings(action='ignore')
import datetime
import random
import string
import tiktoken
import os
from huggingface_hub import hf_hub_download
# Tokenizer used only for counting tokens in the prompt and in the reply
encoding = tiktoken.get_encoding("r50k_base")
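# Illustrative token count (note: r50k_base is a GPT-3-era tokenizer, not the
# model's own, so counts for a Mistral-family model are approximate):
#   n_tokens = len(encoding.encode("How may I help you today?"))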
nCTX = 2048
sTOPS = ['</s>']
modelname = "Lite-Mistral-150M-v2-Instruct"
# Set the webpage title
st.set_page_config(
    page_title=f"Your LocalGPT ✨ with {modelname}",
    page_icon="🌟",
    layout="wide")
if "hf_model" not in st.session_state:
st.session_state.hf_model = "Lite-Mistral-150M-v2-Instruct"
# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []
if "repeat" not in st.session_state:
st.session_state.repeat = 1.35
if "temperature" not in st.session_state:
st.session_state.temperature = 0.1
if "maxlength" not in st.session_state:
st.session_state.maxlength = 500
if "speed" not in st.session_state:
st.session_state.speed = 0.0
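# NOTE: st.session_state survives Streamlit's rerun model. Every widget
# interaction re-executes this whole script top to bottom, so the guards
# above set the defaults only once per browser session.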
if "modelfile" not in st.session_state:
modelfile = hf_hub_download(
repo_id=os.environ.get("REPO_ID", "OuteAI/Lite-Mistral-150M-v2-Instruct-GGUF"),
filename=os.environ.get("MODEL_FILE", "Lite-Mistral-150M-v2-Instruct-Q8_0.gguf"),
)
st.session_state.modelfile = modelfile
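# hf_hub_download returns a path inside the local Hugging Face cache and
# skips the download if the file is already present. REPO_ID / MODEL_FILE
# are optional environment overrides, e.g. (shell, script name illustrative):
#   MODEL_FILE=<another .gguf from the same repo> streamlit run app.py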
def writehistory(filename, text):
    # Append one line to the chat log; the with-block closes the file.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def genRANstring(n):
    """
    n = int number of characters to randomize
    """
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=n))
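# Illustrative: genRANstring(5) might return 'K7Q2D'; it is used below to
# give each session a unique log file name.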
@st.cache_resource
def create_chat():
    # Load the GGUF model once. Generation parameters (temperature,
    # repeat_penalty, stop, max_tokens) are passed per call to
    # create_chat_completion below, so only model/context settings
    # belong in the constructor.
    client = Llama(
        model_path=st.session_state.modelfile,
        #n_gpu_layers=0,  # uncomment to force CPU-only explicitly
        n_ctx=nCTX,
        verbose=False,
    )
    print(f'loading {modelname} with LlamaCPP...')
    return client
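# @st.cache_resource keeps a single Llama instance alive across reruns and
# sessions, so the GGUF file is loaded from disk only once instead of on
# every widget interaction.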
# Create the session states
if "logfilename" not in st.session_state:
    ## Logger file
    logfile = f'{genRANstring(5)}_log.txt'
    st.session_state.logfilename = logfile
    # Write the first two entries into the history log
    writehistory(st.session_state.logfilename, f'{str(datetime.datetime.now())}\n\nYour own LocalGPT with 🌟 {modelname}\n---\n🧠🫡: You are a helpful assistant.')
    writehistory(st.session_state.logfilename, '😎: How may I help you today?')
# AVATARS: a local image path or a single emoji, e.g. "🧑‍💻", "🤖", "🦖";
# shortcodes are not supported.
av_us = 'user.png'
av_ass = 'assistant3002.png'
### START STREAMLIT UI
# Create a header element
mytitle = '# 🐳 OuteAI Local GPT'
st.markdown(mytitle, unsafe_allow_html=True)
st.markdown(f'> *🌟 {modelname} with a {nCTX}-token context window*')
st.markdown('---')
# CREATE THE SIDEBAR
with st.sidebar:
    st.image('logo300.png', use_column_width=True)
    st.session_state.temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.1, step=0.02)
    st.session_state.maxlength = st.slider('Length reply:', min_value=150, max_value=2000, value=500, step=50)
    st.session_state.repeat = st.slider('Repeat Penalty:', min_value=0.0, max_value=2.0, value=1.35, step=0.01)
    st.markdown(f"**Logfile**: {st.session_state.logfilename}")
    statspeed = st.markdown(f'💫 speed: {st.session_state.speed} t/s')
    btnClear = st.button("Clear History", type="primary", use_container_width=True)
# Reset the conversation when the Clear History button is pressed;
# this runs before the history is redrawn below.
if btnClear:
    st.session_state.messages = []
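# The sidebar sliders write straight into st.session_state; their values are
# read again at generation time (create_chat_completion below), so changes
# apply to the next message without reloading the model.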
llm = create_chat()
# Display chat messages from history on app rerun
for message in st.session_state.messages:
    if message["role"] == "user":
        with st.chat_message(message["role"], avatar=av_us):
            st.markdown(message["content"])
    else:
        with st.chat_message(message["role"], avatar=av_ass):
            st.markdown(message["content"])
# Accept user input
if myprompt := st.chat_input("What is an AI model?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": myprompt})
    # Display user message in chat message container
    with st.chat_message("user", avatar=av_us):
        st.markdown(myprompt)
        usertext = f"user: {myprompt}"
        writehistory(st.session_state.logfilename, usertext)
    # Display assistant response in chat message container
    with st.chat_message("assistant", avatar=av_ass):
        message_placeholder = st.empty()
        with st.spinner("Thinking..."):
            start = datetime.datetime.now()
            # Only the latest user turn is sent to the model: with a
            # 2048-token context window there is little room for history.
            conv_messages = [st.session_state.messages[-1]]
            full_response = ""
            for chunk in llm.create_chat_completion(
                    messages=conv_messages,
                    temperature=st.session_state.temperature,
                    repeat_penalty=st.session_state.repeat,
                    stop=sTOPS,
                    max_tokens=st.session_state.maxlength,
                    stream=True):
                # Streamed chunks use the OpenAI-style delta format; the
                # first and last chunks carry no "content" key.
                if chunk["choices"][0]["delta"].get("content"):
                    full_response += chunk["choices"][0]["delta"]["content"]
                    message_placeholder.markdown(full_response + "🐳")
                    # Live tokens-per-second estimate while streaming
                    delta = datetime.datetime.now() - start
                    totalseconds = delta.total_seconds()
                    prompttokens = len(encoding.encode(myprompt))
                    assistanttokens = len(encoding.encode(full_response))
                    totaltokens = prompttokens + assistanttokens
                    st.session_state.speed = totaltokens / totalseconds
                    statspeed.markdown(f'💫 speed: {st.session_state.speed:.2f} t/s')
            # Final statistics for the completed reply
            delta = datetime.datetime.now() - start
            totalseconds = delta.total_seconds()
            prompttokens = len(encoding.encode(myprompt))
            assistanttokens = len(encoding.encode(full_response))
            totaltokens = prompttokens + assistanttokens
            st.session_state.speed = totaltokens / totalseconds
            statspeed.markdown(f'💫 speed: {st.session_state.speed:.2f} t/s')
            toregister = full_response + f"""
```
🧾 prompt tokens: {prompttokens}
📈 generated tokens: {assistanttokens}
⏳ generation time: {delta}
💫 speed: {st.session_state.speed:.2f} t/s
```"""
            message_placeholder.markdown(toregister)
            asstext = f"assistant: {toregister}"
            writehistory(st.session_state.logfilename, asstext)
    st.session_state.messages.append({"role": "assistant", "content": toregister})