This pull request upgrades the app from Llama 3 8B to Llama 3.2 11B.

#2
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. Example1.webp +0 -0
  3. Example2.png +3 -0
  4. README.md +2 -4
  5. app.py +89 -132
  6. requirements.txt +2 -8
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Example2.png filter=lfs diff=lfs merge=lfs -text
Example1.webp ADDED
Example2.png ADDED

Git LFS Details

  • SHA256: 7839e93dd753e5356176bf70d38c43bc56355099d8891ead7aaa342029369268
  • Pointer size: 132 Bytes
  • Size of remote file: 2.04 MB
README.md CHANGED
@@ -4,12 +4,10 @@ emoji: 🌘w🌖
4
  colorFrom: yellow
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.7.1
8
  app_file: app.py
9
  pinned: true
10
  short_description: A retrieval system with chatbot integration
11
- thumbnail: >-
12
- https://cdn-uploads.huggingface.co/production/uploads/6527e89a8808d80ccff88b7a/XVgtQiizeFHIUUj1huwdv.png
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: yellow
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.2.0
8
  app_file: app.py
9
  pinned: true
10
  short_description: A retrieval system with chatbot integration
 
 
11
  ---
12
 
13
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,140 +1,97 @@
1
- import gradio as gr
2
- from datasets import load_dataset
3
-
4
- import os
5
- import spaces
6
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
7
  import torch
8
  from threading import Thread
9
- from sentence_transformers import SentenceTransformer
10
- from datasets import load_dataset
11
  import time
12
-
13
- token = os.environ["HF_TOKEN"]
14
- ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
15
-
16
- dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
17
-
18
- data = dataset["train"]
19
- data = data.add_faiss_index("embeddings") # column name that has the embeddings of the dataset
20
-
21
-
22
- model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
23
-
24
- # use quantization to lower GPU usage
25
- bnb_config = BitsAndBytesConfig(
26
- load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
27
- )
28
-
29
- tokenizer = AutoTokenizer.from_pretrained(model_id,token=token)
30
- model = AutoModelForCausalLM.from_pretrained(
31
- model_id,
32
- torch_dtype=torch.bfloat16,
33
- device_map="auto",
34
- quantization_config=bnb_config,
35
- token=token
36
- )
37
- terminators = [
38
- tokenizer.eos_token_id,
39
- tokenizer.convert_tokens_to_ids("<|eot_id|>")
40
- ]
41
-
42
- SYS_PROMPT = """You are an assistant for answering questions.
43
- You are given the extracted parts of a long document and a question. Provide a conversational answer.
44
- If you don't know the answer, just say "I do not know." Don't make up an answer."""
45
-
46
-
47
-
48
- def search(query: str, k: int = 3 ):
49
- """a function that embeds a new query and returns the most probable results"""
50
- embedded_query = ST.encode(query) # embed new query
51
- scores, retrieved_examples = data.get_nearest_examples( # retrieve results
52
- "embeddings", embedded_query, # compare our new embedded query with the dataset embeddings
53
- k=k # get only top k results
54
- )
55
- return scores, retrieved_examples
56
-
57
- def format_prompt(prompt,retrieved_documents,k):
58
- """using the retrieved documents we will prompt the model to generate our responses"""
59
- PROMPT = f"Question:{prompt}\nContext:"
60
- for idx in range(k) :
61
- PROMPT+= f"{retrieved_documents['text'][idx]}\n"
62
- return PROMPT
63
 
64
 
65
  @spaces.GPU
66
- def talk(prompt,history=[]):
67
- k = 1 # number of retrieved documents
68
- scores , retrieved_documents = search(prompt, k)
69
- formatted_prompt = format_prompt(prompt,retrieved_documents,k)
70
- formatted_prompt = formatted_prompt[:2000] # to avoid GPU OOM
71
- messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
72
- # tell the model to generate
73
- input_ids = tokenizer.apply_chat_template(
74
- messages,
75
- add_generation_prompt=True,
76
- return_tensors="pt"
77
- ).to(model.device)
78
- outputs = model.generate(
79
- input_ids,
80
- max_new_tokens=1024,
81
- eos_token_id=terminators,
82
- do_sample=True,
83
- temperature=0.6,
84
- top_p=0.9,
85
- )
86
- streamer = TextIteratorStreamer(
87
- tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
88
- )
89
- generate_kwargs = dict(
90
- input_ids= input_ids,
91
- streamer=streamer,
92
- max_new_tokens=1024,
93
- do_sample=True,
94
- top_p=0.95,
95
- temperature=0.75,
96
- eos_token_id=terminators,
97
- )
98
- t = Thread(target=model.generate, kwargs=generate_kwargs)
99
- t.start()
100
-
101
- outputs = []
102
- for text in streamer:
103
- outputs.append(text)
104
- yield "".join(outputs)
105
-
106
-
107
- TITLE = "# RAG"
108
-
109
- DESCRIPTION = """
110
- A rag pipeline with a chatbot feature
111
-
112
- Resources used to build this project :
113
-
114
- * embedding model : https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
115
- * dataset : https://huggingface.co/datasets/not-lain/wikipedia
116
- * faiss docs : https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Dataset.add_faiss_index
117
- * chatbot : https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
118
- * Full documentation : https://huggingface.co/blog/not-lain/rag-chatbot-using-llama3
119
- """
120
-
121
 
122
- demo = gr.ChatInterface(
123
- fn=talk,
124
- chatbot=gr.Chatbot(
125
- show_label=True,
126
- show_share_button=True,
127
- show_copy_button=True,
128
- layout="bubble",
129
- bubble_full_width=False,
130
- ),
131
- theme="Soft",
132
- type="tuples",
133
- examples=["what's anarchy ? "],
134
- title=TITLE,
135
- description=DESCRIPTION,
136
- autofocus=False,
137
- autoscroll = False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- )
140
- demo.launch(debug=True)
 
1
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
2
+ from PIL import Image
3
+ import requests
 
 
 
4
  import torch
5
  from threading import Thread
6
+ import gradio as gr
7
+ from gradio import FileData
8
  import time
9
+ import spaces
10
+ import re
11
+ ckpt = "Xkev/Llama-3.2V-11B-cot"
12
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt,
13
+ torch_dtype=torch.bfloat16).to("cuda")
14
+ processor = AutoProcessor.from_pretrained(ckpt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  @spaces.GPU
18
+ def bot_streaming(message, history, max_new_tokens=250):
19
+
20
+ txt = message["text"]
21
+ ext_buffer = f"{txt}"
22
+
23
+ messages= []
24
+ images = []
25
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ for i, msg in enumerate(history):
28
+ if isinstance(msg[0], tuple):
29
+ messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
30
+ messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
31
+ images.append(Image.open(msg[0][0]).convert("RGB"))
32
+ elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
33
+ # messages are already handled
34
+ pass
35
+ elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
36
+ messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
37
+ messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
38
+
39
+ # add current message
40
+ if len(message["files"]) == 1:
41
+
42
+ if isinstance(message["files"][0], str): # examples
43
+ image = Image.open(message["files"][0]).convert("RGB")
44
+ else: # regular input
45
+ image = Image.open(message["files"][0]["path"]).convert("RGB")
46
+ images.append(image)
47
+ messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
48
+ else:
49
+ messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
50
+
51
+
52
+ texts = processor.apply_chat_template(messages, add_generation_prompt=True)
53
+
54
+ if images == []:
55
+ inputs = processor(text=texts, return_tensors="pt").to("cuda")
56
+ else:
57
+ inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
58
+
59
+ streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
60
+
61
+ generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.6, top_p=0.9)
62
+ generated_text = ""
63
+
64
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
65
+ thread.start()
66
+ buffer = ""
67
+
68
+ for new_text in streamer:
69
+ buffer += new_text
70
+ generated_text_without_prompt = buffer
71
+ time.sleep(0.01)
72
+
73
+ buffer = re.sub(r"<(\w+)>", r"\<\1\>", buffer)
74
+ buffer = re.sub(r"</(\w+)>", r"\</\1\>", buffer)
75
+
76
+ yield buffer
77
+
78
+
79
+ demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA-CoT",
80
+ textbox=gr.MultimodalTextbox(),
81
+ additional_inputs = [gr.Slider(
82
+ minimum=512,
83
+ maximum=1024,
84
+ value=512,
85
+ step=1,
86
+ label="Maximum number of new tokens to generate",
87
+ )
88
+ ],
89
+ examples=[[{"text": "What is on the flower?", "files": ["./Example1.webp"]},512],
90
+ [{"text": "How to make this pastry?", "files": ["./Example2.png"]},512]],
91
+ cache_examples=False,
92
+ description="Upload an image, and start chatting about it. To learn more about LLaVA-CoT, visit [our GitHub page](https://github.com/PKU-YuanGroup/LLaVA-CoT).",
93
+ stop_btn="Stop Generation",
94
+ fill_height=True,
95
+ multimodal=True)
96
 
97
+ demo.launch(debug=True)
 
requirements.txt CHANGED
@@ -1,9 +1,3 @@
1
- numpy<2
2
- torch>=2
3
  spaces
4
- transformers
5
- sentence-transformers
6
- faiss-gpu
7
- datasets
8
- accelerate
9
- bitsandbytes
 
1
+ torch
 
2
  spaces
3
+ git+https://github.com/huggingface/transformers.git