File size: 6,162 Bytes
cdba444
 
 
 
 
 
 
 
 
 
abda236
0712d49
cdba444
 
 
abda236
0712d49
cdba444
 
 
 
 
 
 
 
 
 
 
 
 
 
91d6df8
cdba444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8f3522
 
 
af92ed6
 
7cdd3ad
b8f3522
 
cdba444
 
 
 
 
 
 
b8f3522
 
cdba444
 
 
 
 
 
 
91d6df8
cdba444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab61a2b
cdba444
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# import the required libraries
import gradio as gr
import json
from llmlingua import PromptCompressor
import tiktoken

# Load the pre-trained LLMLingua-2 compressors once at import time; `compress`
# looks them up by the short names used as keys here. Both are pinned to CPU.
compressors = {
    "xlm-roberta-large": PromptCompressor(
        model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
        use_llmlingua2=True,
        device_map="cpu"
    ),
    "mbert-base": PromptCompressor(
        model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
        use_llmlingua2=True,
        device_map="cpu"
    )
}
# GPT-4 encoding; in this file it is only used to count tokens for display.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Read the bundled examples. The encoding is stated explicitly so decoding the
# JSON file does not depend on the locale of the host running the demo.
with open('data/examples_MeetingBank.json', 'r', encoding='utf-8') as f:
    # List of examples; each one is indexed below by the keys "original_prompt"
    # and "QA_pairs" (per the original note: the prompt is a str and the QA
    # pairs are a list of [question, answer] string pairs).
    examples = json.load(f)
original_prompt_list = [[s["original_prompt"]] for s in examples]
qa_list = [s["QA_pairs"] for s in examples]

def compress(original_prompt, compression_rate, base_model="xlm-roberta-large", force_tokens=('\n',), chunk_end_tokens=('.', '\n')):
    """Compress a prompt with LLMLingua-2 and report which words were kept.

    Args:
        original_prompt: Text to compress.
        compression_rate: Passed to the compressor as ``rate``.
        base_model: Key into the module-level ``compressors`` dict. Unknown
            keys fall back to the ``"mbert-base"`` compressor.
        force_tokens: Tokens passed to the compressor as ``force_tokens``.
            The UI sends a newline as the two-character string ``'\\\\n'``;
            every such placeholder is translated to a real newline.
        chunk_end_tokens: Tokens passed to the compressor as
            ``chunk_end_tokens``.

    Returns:
        A 3-tuple ``(compressed_prompt, preserved_tokens, n_word_compressed)``:
        the compressed text, a list of ``(word, '+')`` / ``(word, None)``
        pairs (``'+'`` marks a word labeled ``'1'`` by the compressor), and
        the token count of the compressed text according to the module-level
        ``tokenizer``.
    """
    # Nothing to compress: return empty outputs instead of failing while
    # parsing an empty label string further down.
    if not original_prompt or not original_prompt.strip():
        return "", [], 0

    # Work on fresh lists so neither the defaults nor the caller's sequences
    # are mutated, and translate the UI's escaped newline placeholder(s).
    force_tokens = ['\n' if tok == '\\n' else tok for tok in force_tokens]
    chunk_end_tokens = list(chunk_end_tokens)

    compressor = compressors.get(base_model, compressors["mbert-base"])
    results = compressor.compress_prompt_llmlingua2(
        original_prompt,
        rate=compression_rate,
        force_tokens=force_tokens,
        chunk_end_tokens=chunk_end_tokens,
        return_word_label=True,
        drop_consecutive=True,
    )

    compressed_prompt = results["compressed_prompt"]
    n_word_compressed = len(tokenizer.encode(compressed_prompt))

    # The labeled prompt is parsed as "<word><label_sep><label>" entries
    # joined by word_sep.
    word_sep = "\t\t|\t\t"
    label_sep = " "
    preserved_tokens = []
    for entry in results["fn_labeled_original_prompt"].split(word_sep):
        # Split on the LAST separator so the label is always the final field;
        # entries without a separator (e.g. empty ones) are skipped rather
        # than raising on tuple unpacking.
        word, sep, label = entry.rpartition(label_sep)
        if not sep:
            continue
        preserved_tokens.append((word, '+' if label == '1' else None))

    return compressed_prompt, preserved_tokens, n_word_compressed


# Static text and styling for the demo page.

# Browser tab title passed to gr.Blocks.
title = "LLMLingua-2"

# Markdown rendered at the top of the page (paper title, authors, links).
header = """# LLMLingua-2: Efficient and Faithful Task-Agnostic Prompt Compression via Data Distillation
            _Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor Ruehle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, Dongmei Zhang_<br/>
            [[project page]](https://llmlingua.com/llmlingua2.html) [[paper]](https://arxiv.org/abs/2403.12968) [[code]](https://github.com/microsoft/LLMLingua)<br/>
            <br/>
            💁‍♂️ This demo is deployed with HF "[CPU basic](https://huggingface.co/docs/hub/spaces-gpus)", the latency is expected to be longer.
        """

# NOTE(review): `theme` is not referenced by the gr.Blocks(...) call visible in
# this file -- confirm whether it was meant to be passed as `theme=theme`.
theme = "soft"
# NOTE(review): these selectors target an element with id "anno-img"; no
# component in the visible file sets that elem_id -- confirm it is still needed.
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
            #anno-img .mask.active {opacity: 0.7}"""

# Default text pre-filled in the "Original Prompt" box.
original_prompt_text = """John: So, um, I've been thinking about the project, you know, and I believe we need to, uh, make some changes. I mean, we want the project to succeed, right? So, like, I think we should consider maybe revising the timeline.
Sarah: I totally agree, John. I mean, we have to be realistic, you know. The timeline is, like, too tight. You know what I mean? We should definitely extend it.
"""

# Build the demo page: prompt boxes on the left, compression settings on the
# right, then the compress button, a details accordion, QA pairs and examples.
with gr.Blocks(title=title, css=css) as app:
    gr.Markdown(header)
    with gr.Row():
        with gr.Column(scale=3):
            original_prompt = gr.Textbox(value=original_prompt_text, label="Original Prompt", lines=10, max_lines=10, interactive=True)
            compressed_prompt = gr.Textbox(value='', label="Compressed Prompt", lines=10, max_lines=10, interactive=False)

        with gr.Column(scale=1):
            base_model = gr.Radio(["mbert-base", "xlm-roberta-large"], label="Base Model", value="mbert-base", interactive=True)
            # The newline choice is shown as the two-character text "\n";
            # `compress` translates it back to a real newline.
            force_tokens = gr.Dropdown(['\\n', '.', '!', '?', ','],
                                       label="Tokens to Preserve",
                                       value=['\\n', '.', '!', '?', ','],
                                       multiselect=True,
                                       interactive=True)
            compression_rate = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.7, label="Compression rate", info="after compr. / before compr.", interactive=True)
            n_word_original = gr.Textbox(lines=1, label="Original (GPT-4 Tokens)", interactive=False, value=len(tokenizer.encode(original_prompt_text)))
            n_word_compressed = gr.Textbox(lines=1, label="Compressed (GPT-4 Tokens)", interactive=False)
    button = gr.Button("⚡Click to Compress")
    with gr.Accordion(label="Compression Details", open=False):
        diff_text = gr.HighlightedText(label="Diff", combine_adjacent=False, show_legend=True, color_map={"+": "green"})

    # Whenever the prompt text changes: refresh its token count, and clear the
    # previous compression outputs so stale results are not shown.
    original_prompt.change(lambda x: len(tokenizer.encode(x)), inputs=[original_prompt], outputs=[n_word_original])
    original_prompt.change(lambda x: ("", "", []), inputs=[original_prompt], outputs=[compressed_prompt, n_word_compressed, diff_text])

    # Output order matches the tuple returned by `compress`.
    button.click(fn=compress,
                 inputs=[original_prompt, compression_rate, base_model, force_tokens],
                 outputs=[compressed_prompt, diff_text, n_word_compressed])

    qa_pairs = gr.DataFrame(label="GPT-4 generated QA pairs related to the original prompt:", headers=["Question", "Answer"], interactive=True,
                            value=[["Summarize the conversation.","John suggests making changes to the project, specifically revising the timeline to ensure its success. Sarah agrees with John, acknowledging that the current timeline is too tight and supports the idea of extending it."]])

    gr.Markdown("## Examples (click to select)")
    dataset = gr.Dataset(label="MeetingBank",
                         components=[gr.Textbox(visible=False, max_lines=3)],
                         samples=original_prompt_list,
                         type="index")

    # The callback uses the received value as an index into `examples` and
    # loads that example's prompt and QA pairs into the page.
    dataset.select(fn=lambda idx: (examples[idx]["original_prompt"], examples[idx]["QA_pairs"]),
                   inputs=[dataset],
                   outputs=[original_prompt, qa_pairs])

# Enable request queuing, then start the server.
app.queue(max_size=10, api_open=False).launch(show_api=False)