nikravan committed
Commit 355b7d6 (verified)
1 Parent(s): e03b8e1

Update app.py

Files changed (1):
  1. app.py +68 -26
app.py CHANGED
@@ -1,35 +1,59 @@
  import torch
  import gradio as gr
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
- from threading import Thread

- # Model settings
- MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
- CHAT_TEMPLATE = "Auto"
  MODEL_NAME = MODEL_ID.split("/")[-1]
  CONTEXT_LENGTH = 16000

- COLOR = "blue"
- EMOJI = "🤖"
- DESCRIPTION = f"This is the {MODEL_NAME} model designed for testing thinking for general AI tasks."
-
-
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     device_map="auto",
-     quantization_config=quantization_config,
- ).to(device)

  def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-
      if CHAT_TEMPLATE == "Auto":
          stop_tokens = [tokenizer.eos_token_id]
          instruction = system_prompt + "\n\n"
@@ -50,19 +74,19 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
          instruction += f' {message} [/INST]'
      else:
          raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
-

      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
      enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-     input_ids, attention_mask = enc.input_ids.to(device), enc.attention_mask.to(device)

      if input_ids.shape[1] > CONTEXT_LENGTH:
          input_ids = input_ids[:, -CONTEXT_LENGTH:]
          attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

      generate_kwargs = dict(
-         input_ids=input_ids,
-         attention_mask=attention_mask,
          streamer=streamer,
          do_sample=True,
          temperature=temperature,
@@ -81,10 +105,28 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
          yield "".join(outputs)


  gr.ChatInterface(
      predict,
      title=EMOJI + " " + MODEL_NAME,
      description=DESCRIPTION,
      additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
      additional_inputs=[
          gr.Textbox("You are a code assistant.", label="System prompt"),
@@ -95,4 +137,4 @@ gr.ChatInterface(
          gr.Slider(0, 1, 0.95, label="Top P sampling"),
      ],
      theme=gr.themes.Soft(primary_hue=COLOR),
- ).queue().launch()
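Note: in both versions the hunks elide the lines between generate_kwargs and the final yield "".join(outputs). The usual TextIteratorStreamer pattern those lines follow is to run model.generate on a worker thread and yield the accumulated text as decoded chunks arrive. A minimal sketch of that pattern, with every name assumed rather than copied from the elided hunk:

from threading import Thread

def stream_generate(model, streamer, generate_kwargs):
    # Run generation in the background so the streamer can be consumed
    # while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
    outputs = []
    for chunk in streamer:        # blocks until the next decoded chunk
        outputs.append(chunk)
        yield "".join(outputs)    # same shape as the file's final yield
    thread.join()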
 
+
+ import json
+ import subprocess
+ from threading import Thread
+
  import torch
+ import spaces
  import gradio as gr
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+ MODEL_ID = "nikravan/Marco_o1_q4"
+ CHAT_TEMPLATE = "ChatML"
  MODEL_NAME = MODEL_ID.split("/")[-1]
  CONTEXT_LENGTH = 16000

+ # Setting values directly for the variables
+ COLOR = "blue"  # default interface color
+ EMOJI = "🤖"  # default emoji for the model
+ DESCRIPTION = f"This is the {MODEL_NAME} model designed for testing thinking for general AI tasks."  # default description

+ latex_delimiters_set = [
+     {"left": "\\(", "right": "\\)", "display": False},
+     {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
+     {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
+     {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
+     {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
+     {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
+     {"left": "\\[", "right": "\\]", "display": True},
+ ]


+ @spaces.GPU()
  def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+     # Format history with the selected chat template
      if CHAT_TEMPLATE == "Auto":
          stop_tokens = [tokenizer.eos_token_id]
          instruction = system_prompt + "\n\n"
 
          instruction += f' {message} [/INST]'
      else:
          raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
+     print(instruction)

      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
      enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+     input_ids, attention_mask = enc.input_ids, enc.attention_mask

      if input_ids.shape[1] > CONTEXT_LENGTH:
          input_ids = input_ids[:, -CONTEXT_LENGTH:]
          attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

      generate_kwargs = dict(
+         input_ids=input_ids.to(device),
+         attention_mask=attention_mask.to(device),
          streamer=streamer,
          do_sample=True,
          temperature=temperature,
 
          yield "".join(outputs)


+ # Load model
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="auto",
+     quantization_config=quantization_config,
+     attn_implementation="flash_attention_2",
+ )
+
+ # Create Gradio interface
  gr.ChatInterface(
      predict,
      title=EMOJI + " " + MODEL_NAME,
      description=DESCRIPTION,
+
+
+
      additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
      additional_inputs=[
          gr.Textbox("You are a code assistant.", label="System prompt"),
 
          gr.Slider(0, 1, 0.95, label="Top P sampling"),
      ],
      theme=gr.themes.Soft(primary_hue=COLOR),
+ ).queue().launch()
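A detail the visible hunks leave open: the new version defines latex_delimiters_set but no shown line passes it to the interface. In Gradio, a delimiter list of this shape is normally given to gr.Chatbot via its latex_delimiters argument, and gr.ChatInterface accepts a preconfigured chatbot through its chatbot parameter. A minimal sketch of that wiring, assuming (not confirmed by the diff) that the elided lines do something equivalent:

import gradio as gr

# Hypothetical wiring: render \( ... \) and the listed environments as LaTeX
# in the chat window. latex_delimiters_set is the list defined above.
chatbot = gr.Chatbot(latex_delimiters=latex_delimiters_set)

gr.ChatInterface(
    predict,
    chatbot=chatbot,
    title=EMOJI + " " + MODEL_NAME,
    description=DESCRIPTION,
).queue().launch()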