File size: 6,076 Bytes
10be4e3
 
 
08d9321
30e2235
 
10be4e3
 
08d9321
 
30e2235
 
 
 
 
 
10be4e3
 
 
 
 
 
 
 
 
1855b51
10be4e3
 
 
1a2fbfc
 
 
 
11be492
1a2fbfc
10be4e3
 
d6cb72b
30e2235
10be4e3
 
b6e279e
1a2fbfc
d6cb72b
10be4e3
d6cb72b
10be4e3
 
1a2fbfc
d6cb72b
10be4e3
e98dc14
10be4e3
b6e279e
527d813
f66d966
 
 
392cbe8
f66d966
 
10be4e3
 
 
f66d966
546fd6f
10be4e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6cb72b
 
10be4e3
 
d6cb72b
 
 
 
 
 
 
 
 
 
10be4e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
from itertools import chain

import gradio as gr
from datasets import load_from_disk
from nltk.util import everygrams
from pyserini.analysis import JWhiteSpaceAnalyzer
from pyserini.search.lucene import LuceneSearcher

# Open the prebuilt Lucene index from the local "index" directory.  The
# documents were pre-tokenized into character n-grams before indexing, so a
# plain whitespace analyzer must be used at query time (no extra analysis).
searcher = LuceneSearcher("index")
searcher.set_analyzer(JWhiteSpaceAnalyzer())

def tokenize_word(word, min_len=2, max_len=4):
    """Return all character n-grams of *word* with lengths min_len..max_len.

    Each n-gram tuple from ``everygrams`` is joined back into a plain string
    so the results can later be whitespace-joined into a token stream.
    """
    # everygrams already yields lazily; wrapping it in list() before the
    # comprehension only built a throwaway intermediate list.
    return [''.join(ngram) for ngram in everygrams(word, min_len=min_len, max_len=max_len)]

def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Tokenize *sentence* into a whitespace-separated string of character
    n-grams (lengths min_len..max_len) of each whitespace-delimited word.

    This is the query/document form expected by the whitespace-analyzed index.
    """
    # chain.from_iterable over a generator avoids building the intermediate
    # list of per-word token lists that chain(*[...]) required.
    return " ".join(
        chain.from_iterable(
            tokenize_word(word, min_len=min_len, max_len=max_len)
            for word in sentence.split()
        )
    )

# Dataset whose row indices correspond to the Lucene docids stored in the index.
ds = load_from_disk("data")
NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS. 
RESULTS_PER_PAGE = 5 

# Dataset column names: the document text and its identifier.
TEXT_FIELD = "content"
METADATA_FIELD = "docid"

def result_html(result, meta):
    """Render one search hit as an HTML snippet.

    Shows a docid header followed by a collapsible <details> element whose
    summary is the first 250 characters of the hit text.
    """
    header = f"<div style=\"color:#2a5cb3;font-weight: 500\"><u>docid: {meta}</u></div><br>"
    preview, remainder = result[:250], result[250:]
    body = f"<div><details><summary>{preview}...</summary><p>{remainder}</p></details></div><br><hr><br>"
    return header + body

def format_results(results, query):
    """Bold each query word in the hit texts and join the rendered hits.

    Highlights all query words in a single regex pass per document.  The
    original sequential ``str.replace`` loop could match a later query word
    inside the HTML tags inserted for an earlier word (e.g. the query word
    "b" matching the literal "<b style=..." tag), corrupting the markup.
    Longer words are tried first so a word that contains another is
    highlighted whole.
    """
    texts = results[TEXT_FIELD]
    words = query.split()
    if words:
        pattern = re.compile(
            "|".join(re.escape(w) for w in sorted(words, key=len, reverse=True))
        )
        texts = [
            pattern.sub(
                lambda m: f"<b style=\"color:#2a5cb3;font-weight: 700\">{m.group(0)}</b>",
                text,
            )
            for text in texts
        ]
    return "\n".join(result_html(t, m) for t, m in zip(texts, results[METADATA_FIELD]))
    
def page_0(query):
    """Run a fresh search and render the first page of results.

    Returns (results_html, [hit_ids], pagination-visibility update, query)
    so Gradio can stash the full hit-id list in the hidden Dataframe for
    the per-page callbacks to reuse.
    """
    untokenized_query = query
    # The index was built over (2,4)-gram tokens, so the query must be
    # tokenized the same way; keep the raw query for highlighting.
    query = tokenize_sentence(query)
    hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
    # Lucene docids were stored as integer row indices into `ds`.
    ix = [int(hit.docid) for hit in hits]
    results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True)
    results = format_results(results, untokenized_query)
    # Wrap ix in a list: the Dataframe component holds a 2-D array.
    return results, [ix], gr.update(visible=True), untokenized_query

def page_i(i, ix, query):
    """Render page *i* of a previously executed search.

    *ix* is the hit-id list as stored in the hidden Dataframe (a 2-D array
    with a single row); it is passed back out unchanged so later page
    clicks keep working.
    """
    # Unwrap the single-row 2-D array the Dataframe component stores.
    hit_ids = ix[0]
    page = ds.select(hit_ids).shard(num_shards=NUM_PAGES, index=i, contiguous=True)
    html = format_results(page, query)
    return html, [hit_ids], query
    
# ---------------------------------------------------------------------------
# UI layout and event wiring.
# NOTE(review): this uses legacy Gradio 3.x APIs (.style(), gr.Box,
# Row(scale=...)) that were removed in Gradio 4 — presumably the deployment
# pins an old gradio version; confirm before upgrading.
with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
    # Title row.
    with gr.Row():
        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
    # Description row; the empty scale-1 side columns act as margins.
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=15):
            gr.Markdown("""<div style="text-align: justify"> This search tool was used to validate tokenization scheme for code retrieval for the BigCode project. We indexed the 🎅 <a href="https://huggingface.co./bigcode/santacoder">Santacoder</a> training dataset (Python, Java, and JavaScript) and use a (2,4)-gram tokenizer to build the index. This is the same tokenization scheme that ended up being used to power the ⭐ <a href="https://huggingface.co./spaces/bigcode/search">StarCoder search tool</a>.</div>""")
        with gr.Column(scale=1):
            pass        
    # Search row: the hidden Dataframe is cross-callback storage for the
    # hit-id list produced by page_0 and consumed by page_i.
    with gr.Row():
        with gr.Column(scale=1):
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)      
        with gr.Column(scale=15):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="Query")
        with gr.Column(scale=1):
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):    
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass
                
    # Results area plus a fixed bank of pagination buttons, hidden until the
    # first search (Gradio can't create components dynamically — see the
    # NUM_PAGES note above).
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            with gr.Row(visible=False) as pagination:
                # left = gr.Button(value="◀", elem_id="b", visible=False).style(full_width=True)
                page_1 = gr.Button(value="1", elem_id="b").style(full_width=True)
                page_2 = gr.Button(value="2", elem_id="b").style(full_width=True)
                page_3 = gr.Button(value="3", elem_id="b").style(full_width=True)
                page_4 = gr.Button(value="4", elem_id="b").style(full_width=True)
                page_5 = gr.Button(value="5", elem_id="b").style(full_width=True)
                page_6 = gr.Button(value="6", elem_id="b").style(full_width=True)
                page_7 = gr.Button(value="7", elem_id="b").style(full_width=True)
                page_8 = gr.Button(value="8", elem_id="b").style(full_width=True)
                page_9 = gr.Button(value="9", elem_id="b").style(full_width=True)
                page_10 = gr.Button(value="10", elem_id="b").style(full_width=True)
                # right = gr.Button(value="▶", elem_id="b", visible=False).style(full_width=True)
        with gr.Column(scale=1):
            pass
    # Both Enter in the textbox and the button trigger a fresh page-0 search.
    query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    # Hidden constant Number components 0..NUM_PAGES-1 feed the page index
    # into page_i, since Gradio callbacks can only read values from components.
    with gr.Box(visible=False):
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
    page_1.click(fn=page_i, inputs=[nums[0], result_list, query], outputs=[c, result_list, query])
    page_2.click(fn=page_i, inputs=[nums[1], result_list, query], outputs=[c, result_list, query])
    page_3.click(fn=page_i, inputs=[nums[2], result_list, query], outputs=[c, result_list, query])
    page_4.click(fn=page_i, inputs=[nums[3], result_list, query], outputs=[c, result_list, query])
    page_5.click(fn=page_i, inputs=[nums[4], result_list, query], outputs=[c, result_list, query])
    page_6.click(fn=page_i, inputs=[nums[5], result_list, query], outputs=[c, result_list, query])
    page_7.click(fn=page_i, inputs=[nums[6], result_list, query], outputs=[c, result_list, query])
    page_8.click(fn=page_i, inputs=[nums[7], result_list, query], outputs=[c, result_list, query])
    page_9.click(fn=page_i, inputs=[nums[8], result_list, query], outputs=[c, result_list, query])
    page_10.click(fn=page_i, inputs=[nums[9], result_list, query], outputs=[c, result_list, query])
demo.launch(enable_queue=True, debug=True)