prithivMLmods committed
Commit 0b1f5f8 · verified · 1 Parent(s): 0c4954d

Delete app.py

Files changed (1)
app.py +0 -106
app.py DELETED
@@ -1,106 +0,0 @@
- import gradio as gr
- import torch
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TextIteratorStreamer,
- )
- import os
- from threading import Thread
- import spaces
- import time
- import subprocess
-
- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )
-
- token = os.environ["HF_TOKEN"]
-
-
- model = AutoModelForCausalLM.from_pretrained(
-     "rdrede/mbn",
-     token=token,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16
- )
- tok = AutoTokenizer.from_pretrained("rdrede/mbn", token=token)
- terminators = [
-     tok.eos_token_id,
- ]
-
- if torch.cuda.is_available():
-     device = torch.device("cuda")
-     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
- else:
-     device = torch.device("cpu")
-     print("Using CPU")
-
- model = model.to(device)
- # Dispatch Errors
-
-
- @spaces.GPU(duration=60)
- def chat(message, history, temperature, do_sample, max_tokens):
-     chat = []
-     for item in history:
-         chat.append({"role": "user", "content": item[0]})
-         if item[1] is not None:
-             chat.append({"role": "assistant", "content": item[1]})
-     chat.append({"role": "user", "content": message})
-     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-     model_inputs = tok([messages], return_tensors="pt").to(device)
-     streamer = TextIteratorStreamer(
-         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-     )
-     generate_kwargs = dict(
-         model_inputs,
-         streamer=streamer,
-         max_new_tokens=max_tokens,
-         do_sample=True,
-         temperature=temperature,
-         eos_token_id=terminators,
-     )
-
-     if temperature == 0:
-         generate_kwargs["do_sample"] = False
-
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     partial_text = ""
-     for new_text in streamer:
-         partial_text += new_text
-         yield partial_text
-
-     yield partial_text
-
-
- demo = gr.ChatInterface(
-     fn=chat,
-     examples=[["Write me a poem about Machine Learning."]],
-     # multimodal=False,
-     additional_inputs_accordion=gr.Accordion(
-         label="⚙️ Parameters", open=False, render=False
-     ),
-     additional_inputs=[
-         gr.Slider(
-             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
-         ),
-         gr.Checkbox(label="Sampling", value=True),
-         gr.Slider(
-             minimum=128,
-             maximum=4096,
-             step=1,
-             value=512,
-             label="Max new tokens",
-             render=False,
-         ),
-     ],
-     stop_btn="Stop Generation",
-     title="Chat With LLMs",
-     description="Now Running [microsoft/phi-4](https://huggingface.co/microsoft/phi-4)",
- )
- demo.launch()