rphrp1985 committed
Commit 84d8cd6 · verified · 1 Parent(s): 9434810

Update app.py

Files changed (1)
  1. app.py +40 -13
app.py CHANGED
@@ -17,18 +17,24 @@ print('token = ',token)
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_id = "mistralai/Mistral-7B-v0.3"
+# model_id = "mistralai/Mistral-7B-v0.3"
 
-model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
+from airllm import AirLLMLlama2
 
-tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
+MAX_LENGTH = 128
+# could use hugging face model repo id:
+model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct")
 
-model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
-        # attn_implementation="flash_attention_2",
-        # low_cpu_mem_usage=True,
-        device_map="auto"
-        )
+
+# tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
+
+# model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
+#         # attn_implementation="flash_attention_2",
+#         # low_cpu_mem_usage=True,
+#         device_map="auto"
+#         )
 
 
 
@@ -41,6 +47,27 @@ def respond(
     temperature,
     top_p,
 ):
+    input_text = [
+        'What is the capital of United States?',
+    ]
+
+    input_tokens = model.tokenizer(input_text,
+        return_tensors="pt",
+        return_attention_mask=False,
+        truncation=True,
+        max_length=MAX_LENGTH,
+        padding=True)
+
+    generation_output = model.generate(
+        input_tokens['input_ids'].cuda(),
+        max_new_tokens=20,
+        use_cache=True,
+        return_dict_in_generate=True)
+
+    output = model.tokenizer.decode(generation_output.sequences[0])
+
+    print(output)
+    yield output
 
 
     messages = [
@@ -49,13 +76,13 @@ def respond(
         {"role": "user", "content": "Do you have mayonnaise recipes?"}
     ]
 
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+    # inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
 
-    outputs = model.generate(inputs, max_new_tokens=2000)
-    gen_text=tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # outputs = model.generate(inputs, max_new_tokens=2000)
+    # gen_text=tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    print(gen_text)
-    yield gen_text
+    # print(gen_text)
+    # yield gen_text
     # for val in history:
     #     if val[0]:
     #         messages.append({"role": "user", "content": val[0]})
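
For readers unfamiliar with AirLLM, the lines added inside respond() follow the library's usual layer-by-layer inference pattern. Below is a minimal, self-contained sketch of that flow, assuming the airllm package is installed and a CUDA device is available; the model repo id, MAX_LENGTH, and the generation arguments are copied from the diff above, so this is a reading aid rather than the definitive app code.

# Sketch of the AirLLM generation path this commit switches to (assumptions:
# airllm installed, CUDA GPU available; values taken from the diff above).
from airllm import AirLLMLlama2

MAX_LENGTH = 128

# AirLLM loads the 70B checkpoint layer by layer, trading speed for the
# ability to run on a single GPU with limited VRAM.
model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct")

input_text = ["What is the capital of United States?"]

# The underlying Hugging Face tokenizer is exposed as model.tokenizer.
input_tokens = model.tokenizer(
    input_text,
    return_tensors="pt",
    return_attention_mask=False,
    truncation=True,
    max_length=MAX_LENGTH,
    padding=True,
)

# generate() mirrors the transformers API; with return_dict_in_generate=True
# the token ids come back in generation_output.sequences.
generation_output = model.generate(
    input_tokens["input_ids"].cuda(),
    max_new_tokens=20,
    use_cache=True,
    return_dict_in_generate=True,
)

print(model.tokenizer.decode(generation_output.sequences[0]))

Note that, as committed, respond() no longer uses its message, history, temperature, or top_p arguments: it always tokenizes the hardcoded prompt and yields at most 20 newly generated tokens, while the previous chat-template path is left commented out.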