# Dexy Chat Assistant — Streamlit app (deployed as a Hugging Face Space)
# pip install transformers streamlit
"""Streamlit chat UI backed by the SmolLM-135M-Instruct causal LM.

Keeps the running conversation in ``st.session_state.messages`` as a list of
``{"role": ..., "content": ...}`` dicts, re-templates the full history on each
turn, and generates a short assistant reply.
"""
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "HuggingFaceTB/SmolLM-135M-Instruct"
device = "cpu"  # use "cuda" for GPU, "cpu" for CPU


@st.cache_resource  # Streamlit reruns this script on every interaction; cache so the model loads once per server process
def load_model():
    """Load and return (tokenizer, model) for the checkpoint, moved to `device`."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # For multiple GPUs: install accelerate and use
    # AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
    model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
    return tokenizer, model


tokenizer, model = load_model()

st.title("Dexy Chat Assistant")

# Initialize session state for chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Text inputs for user
user_name = st.text_input("Your name please?: ", key="user_name")
user_input = st.text_input("Enter your message:", key="user_input")

if st.button("Send"):
    if user_input:
        # Add user message to history
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Render the whole history through the chat template;
        # add_generation_prompt appends the assistant header so the model
        # knows it should produce a reply.
        input_text = tokenizer.apply_chat_template(
            st.session_state.messages, tokenize=False, add_generation_prompt=True
        )
        encoded = tokenizer(input_text, return_tensors="pt", padding=True)
        inputs = encoded.input_ids.to(device)
        attention_mask = encoded.attention_mask.to(device)
        outputs = model.generate(
            inputs,
            attention_mask=attention_mask,
            max_new_tokens=50,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
        )
        # Decode only the newly generated tokens (everything after the prompt)
        # and drop special tokens, so the stored reply is clean text. Storing
        # the raw templated output would get re-templated on the next turn.
        response = tokenizer.decode(
            outputs[0][inputs.shape[1]:], skip_special_tokens=True
        ).strip()

        # Add assistant's response to history
        st.session_state.messages.append({"role": "assistant", "content": response})

# Display full chat history (history now holds clean text, no template markers)
for msg in st.session_state.messages:
    if msg["role"] == "user":
        st.write(f"{user_name}: {msg['content']}")
    else:
        st.write(f"Dexy: {msg['content']}")