vonewman committed
Commit 46a3c34
Parent: 9a156bf

Update app.py

Files changed (1): app.py (+35 −54)
app.py CHANGED
@@ -1,63 +1,44 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
-from threading import Thread
-import spaces
-
-finetuned_model = "CONCREE/adia-llm"
-
-# Load the model
-model = AutoModelForCausalLM.from_pretrained(
-    finetuned_model,
-    device_map="auto",
-    trust_remote_code=True,
+from openai import OpenAI
+
+BASE_URL = "https://kks679fhv1td67-8000.proxy.runpod.net/v1"
+API_KEY = "SOMEHOW"
+
+# Create an OpenAI client to interact with the API server
+client = OpenAI(
+    base_url=BASE_URL,
+    api_key=API_KEY
 )
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(finetuned_model,
-                                          trust_remote_code=True,
-                                          padding=True,
-                                          truncation=True)
-
-class StopOnTokens(StoppingCriteria):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        stop_ids = [29, 0]
-        for stop_id in stop_ids:
-            if input_ids[0][-1] == stop_id:
-                return True
-        return False
-
-@spaces.GPU
+
 def predict(message, history):
-    history_transformer_format = history + [[message, ""]]
-    stop = StopOnTokens()
-
-    messages = "".join(["".join(["\n[INST]:"+item[0], "\n[/INST]:"+item[1]]) for item in history_transformer_format])
-
-    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")
-    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=1024,
-        num_beams=1,
-        stopping_criteria=StoppingCriteriaList([stop])
+    # Convert chat history to OpenAI format
+    history_openai_format = [{
+        "role": "system",
+        "content": "Tu es un excellent assistant IA nommé Adia, développé par CONCREE pour accompagner les entrepreneurs Africains."
+    }]
+    for human, assistant in history:
+        history_openai_format.append({"role": "user", "content": human})
+        history_openai_format.append({
+            "role": "assistant",
+            "content": assistant
+        })
+    history_openai_format.append({"role": "user", "content": message})
+
+    # Create a chat completion request and send it to the API server
+    stream = client.chat.completions.create(
+        model="CONCREE/meta-adia-llm-instruct",  # Model name to use
+        messages=history_openai_format,          # Chat history
+        temperature=0.1,                         # Temperature for text generation
+        stream=True,                             # Stream response
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    partial_message = ""
-    start_flag = True  # Flag to ignore initial newline
 
-    for new_token in streamer:
-        if start_flag and new_token == '\n':
-            continue
-        start_flag = False
-        partial_message += new_token
+    # Read and return generated text from response stream
+    partial_message = ""
+    for chunk in stream:
+        partial_message += (chunk.choices[0].delta.content or "")
         yield partial_message
 
 
-demo = gr.ChatInterface(predict).launch()
-
-
-if __name__ == "__main__":
-    demo.launch()
+# Create and launch a chat interface with Gradio
+gr.ChatInterface(predict).queue().launch()
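
The rewritten app.py assumes BASE_URL fronts an OpenAI-compatible server (the RunPod proxy URL and /v1 suffix suggest something like vLLM's OpenAI-compatible endpoint, though the diff does not say which server is used). A minimal sketch for sanity-checking the endpoint before launching the Space, using only the URL, key, and model name taken from the diff:

from openai import OpenAI

# Sanity check: list the models the endpoint serves. Assumes the server
# implements the standard OpenAI /v1/models route; URL and key are copied
# from app.py (the key is a placeholder, not a working credential).
client = OpenAI(
    base_url="https://kks679fhv1td67-8000.proxy.runpod.net/v1",
    api_key="SOMEHOW",
)

for model in client.models.list().data:
    print(model.id)  # "CONCREE/meta-adia-llm-instruct" should appear here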
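
And a sketch of the same streaming round trip predict() performs, runnable outside Gradio. The endpoint, model name, and system prompt are copied from the diff; the user message is illustrative only:

from openai import OpenAI

client = OpenAI(
    base_url="https://kks679fhv1td67-8000.proxy.runpod.net/v1",
    api_key="SOMEHOW",
)

# Stream a single-turn completion and print tokens as they arrive.
stream = client.chat.completions.create(
    model="CONCREE/meta-adia-llm-instruct",
    messages=[
        {"role": "system", "content": "Tu es un excellent assistant IA nommé Adia, développé par CONCREE pour accompagner les entrepreneurs Africains."},
        {"role": "user", "content": "Présente-toi en une phrase."},  # illustrative message
    ],
    temperature=0.1,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()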