Spestly committed (verified)
Commit 8653d1f · 1 parent: 2b609ab

Update app.py

Files changed (1):
  app.py (+34 -16)
app.py CHANGED
@@ -6,9 +6,11 @@ from huggingface_hub import login
  import re
  import os
 
+ # Load Hugging Face token
  HF_TOKEN = os.getenv("HF_TOKEN")
  login(token=HF_TOKEN)
 
+ # Define models
  MODELS = {
      "athena-1": {
          "name": "🦁 Atlas-Flash",
@@ -20,9 +22,9 @@ MODELS = {
      },
  }
 
-
- USER_PFP = "user.png"
- AI_PFP = "ai_pfp.png"
+ # Profile pictures
+ USER_PFP = "user.png"  # Hugging Face user avatar
+ AI_PFP = "ai_pfp.png"  # Replace with the path to your AI's image or a URL
 
  class AtlasInferenceApp:
      def __init__(self):
@@ -59,17 +61,17 @@ class AtlasInferenceApp:
 
          model_path = MODELS[model_key]["sizes"][model_size]
 
-
+         # Load Qwen-compatible tokenizer and model
          tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
          model = AutoModelForCausalLM.from_pretrained(
              model_path,
-             device_map="cpu",
-             torch_dtype=torch.float32,
+             device_map="auto",  # Use GPU if available
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
              trust_remote_code=True,
              low_cpu_mem_usage=True
          )
 
-
+         # Update session state
          st.session_state.current_model.update({
              "tokenizer": tokenizer,
              "model": model,
@@ -87,7 +89,7 @@ class AtlasInferenceApp:
              return "⚠️ Please select and load a model first"
 
          try:
-
+             # Add a system instruction to guide the model's behavior
              system_instruction = "You are Atlas, a helpful AI assistant trained to help the user. You are a Deepseek R1 fine-tune."
              prompt = f"{system_instruction}\n\n### Instruction:\n{message}\n\n### Response:"
 
@@ -99,8 +101,8 @@
                  padding=True
              )
 
-
-             response_container = st.empty()
+             # Generate response with streaming
+             response_container = st.empty()  # Placeholder for streaming text
              full_response = ""
              with torch.no_grad():
                  for chunk in st.session_state.current_model["model"].generate(
@@ -113,13 +115,27 @@
                      do_sample=True,
                      pad_token_id=st.session_state.current_model["tokenizer"].pad_token_id,
                      eos_token_id=st.session_state.current_model["tokenizer"].eos_token_id,
-                     streamer=None,  # Use a custom streamer for real-time updates
                  ):
-                     chunk_text = st.session_state.current_model["tokenizer"].decode(chunk, skip_special_tokens=True)
-                     full_response += chunk_text
-                     response_container.markdown(full_response)
-
-             return full_response.split("### Response:")[-1].strip()
+                     # Decode the chunk and update the response
+                     try:
+                         chunk_text = st.session_state.current_model["tokenizer"].decode(chunk, skip_special_tokens=True)
+
+                         # Remove the prompt from the response
+                         if prompt in chunk_text:
+                             chunk_text = chunk_text.replace(prompt, "").strip()
+
+                         full_response += chunk_text
+                         response_container.markdown(full_response)
+                     except Exception as decode_error:
+                         st.error(f"⚠️ Token Decoding Error: {str(decode_error)}")
+                         break
+
+                     # Stop if the response is too long or incomplete
+                     if len(full_response) >= max_tokens * 4:  # Approximate token-to-character ratio
+                         st.warning("⚠️ Response truncated due to length limit.")
+                         break
+
+             return full_response.strip()  # Return the cleaned response
          except Exception as e:
              return f"⚠️ Generation Error: {str(e)}"
          finally:
@@ -159,6 +175,7 @@ class AtlasInferenceApp:
 
          st.markdown("*⚠️ CAUTION: Atlas is an experimental model and this is just a preview. Responses may not be expected. Please double-check sensitive information!*")
 
+         # Display chat history
          for message in st.session_state.chat_history:
              with st.chat_message(
                  message["role"],
@@ -166,6 +183,7 @@ class AtlasInferenceApp:
              ):
                  st.markdown(message["content"])
 
+         # Input box for user messages
          if prompt := st.chat_input("Message Atlas..."):
              st.session_state.chat_history.append({"role": "user", "content": prompt})
              with st.chat_message("user", avatar=USER_PFP):
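
For reference: the comment removed in this commit ("Use a custom streamer for real-time updates") points at the streaming helper that transformers ships, TextIteratorStreamer, which yields decoded text as tokens are generated. The sketch below is illustrative only and is not part of this commit; the model id, prompt, and generation settings are assumptions.

# Illustrative sketch only: real-time token streaming with TextIteratorStreamer.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_path = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model id, not from the commit
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

prompt = "### Instruction:\nSay hello.\n\n### Response:"  # assumed example prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The streamer decodes tokens as generate() emits them and yields text chunks.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs in a worker thread while the
# main thread consumes the streamer (in the app, into a st.empty() placeholder).
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, max_new_tokens=128, do_sample=True, streamer=streamer),
)
thread.start()

full_response = ""
for chunk_text in streamer:
    full_response += chunk_text
    print(chunk_text, end="", flush=True)  # app equivalent: response_container.markdown(full_response)
thread.join()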