pngwn HF staff committed on
Commit
20f53f9
·
verified ·
1 Parent(s): e164420

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -33
app.py CHANGED
@@ -25,6 +25,7 @@ try:
25
  n_gpu_layers=-1, # change n_gpu_layers if you have more or less VRAM
26
  verbose=True
27
  )
 
28
  print(f"START: AFTER LLAMA-CPP SETUP -- {time.time() - start_load_time}s")
29
 
30
  except Exception as e:
@@ -38,43 +39,31 @@ def generate_text(
38
  max_tokens,
39
  temperature,
40
  top_p,
41
- ):
42
- temp = ""
43
- input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
44
- for interaction in history:
45
- input_prompt = (
46
- input_prompt
47
- + str(interaction[0])
48
- + " [/INST] "
49
- + str(interaction[1])
50
- + " </s><s> [INST] "
51
- )
52
 
53
- input_prompt = input_prompt + str(message) + " [/INST] "
 
 
 
 
54
 
55
- output = llm(
56
- input_prompt,
 
 
 
 
 
57
  temperature=temperature,
58
  top_p=top_p,
59
- top_k=40,
60
- repeat_penalty=1.1,
61
- max_tokens=max_tokens,
62
- stop=[
63
- "<|prompter|>",
64
- "<|endoftext|>",
65
- "<|endoftext|> \n",
66
- "ASSISTANT:",
67
- "USER:",
68
- "SYSTEM:",
69
- ],
70
- stream=True,
71
- )
72
- for out in output:
73
- stream = copy.deepcopy(out)
74
- temp += stream["choices"][0]["text"]
75
- yield temp
76
-
77
-
78
  demo = gr.ChatInterface(
79
  generate_text,
80
  title="llama-cpp-python on GPU",
 
25
  n_gpu_layers=-1, # change n_gpu_layers if you have more or less VRAM
26
  verbose=True
27
  )
28
+
29
  print(f"START: AFTER LLAMA-CPP SETUP -- {time.time() - start_load_time}s")
30
 
31
  except Exception as e:
 
39
  max_tokens,
40
  temperature,
41
  top_p,
42
+ ):
43
+ messages = [{"role": "system", "content": system_message}]
 
 
 
 
 
 
 
 
 
44
 
45
+ for val in history:
46
+ if val[0]:
47
+ messages.append({"role": "user", "content": val[0]})
48
+ if val[1]:
49
+ messages.append({"role": "assistant", "content": val[1]})
50
 
51
+ messages.append({"role": "user", "content": message})
52
+
53
+ response = ""
54
+
55
+ for chunk in llm.create_chat_completion(
56
+ stream=True,
57
+ max_tokens=max_tokens,
58
  temperature=temperature,
59
  top_p=top_p,
60
+ messages=messages,
61
+ ):
62
+ part = chunk["choices"][0]["delta"].get("content", None)
63
+ if part:
64
+ response += part
65
+ yield response
66
+
 
 
 
 
 
 
 
 
 
 
 
 
67
  demo = gr.ChatInterface(
68
  generate_text,
69
  title="llama-cpp-python on GPU",