Update app.py
app.py
CHANGED
@@ -16,11 +16,12 @@ from langchain_community.llms import OpenAI
 from PIL import Image
 from decord import VideoReader, cpu
 import requests
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
 
-# Load MiniCPM-V-2_6 with 4-bit quantization
 text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                        device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
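Review note: the two new imports feed the checkpoint load added in the next hunk. Roughly (a sketch, not app code; the variable names are illustrative):

# hf_hub_download fetches a single file from a Hub repo and returns its local path;
# safetensors' load_file reads that file into a plain {name: tensor} state dict.
path = hf_hub_download("ByteDance/SDXL-Lightning", "sdxl_lightning_4step_unet.safetensors")
state_dict = load_file(path)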
@@ -28,9 +29,15 @@ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_co
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
-
-
-
+# Corrected image model and pipeline setup
+base = "stabilityai/stable-diffusion-xl-base-1.0"
+repo = "ByteDance/SDXL-Lightning"
+ckpt = "sdxl_lightning_4step_unet.safetensors"
+
+unet = UNet2DConditionModel.from_config(base, subfolder="unet").to("cuda", torch.float16)
+unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device="cuda"))
+image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16").to("cuda")
+image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
 
 # Initialize voice-only mode
 def play_voice_output(response):
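Review note: SDXL-Lightning is distilled for few-step sampling, which is why the scheduler is switched to trailing timestep spacing here. A minimal usage sketch for the 4-step UNet wired up above (prompt and filename are illustrative, not from this app):

# Lightning checkpoints expect very few steps and no classifier-free guidance.
image = image_pipe("a watercolor fox", num_inference_steps=4, guidance_scale=0).images[0]
image.save("output.png")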
@@ -177,7 +184,7 @@ def initialize_tools():
 def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
     text_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
-
+    unet.to("cuda", torch.float16)
     image_pipe.to("cuda")
     response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
     if voice_only:
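Review note: main_interface moves text_model, tts_model, unet, and image_pipe onto the GPU on every call and never moves them back, so all of them stay resident in VRAM together. If that becomes a problem, a hedged alternative (assuming a recent diffusers release with accelerate installed) is to let the pipeline shuttle its components on demand instead of a blanket .to("cuda"):

# Trades some latency for VRAM: components are moved to the GPU only while they run.
image_pipe.enable_model_cpu_offload()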