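"""Code-search demo for the BigCode project.

Indexes the Santacoder training dataset (Python, Java, and JavaScript) with
Pyserini/Lucene and retrieves over (2,4)-gram tokenized code. The same
tokenization scheme later powered the StarCoder search tool.
"""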
import gradio as gr
from datasets import load_from_disk
from pyserini.search.lucene import LuceneSearcher
from pyserini.analysis import JWhiteSpaceAnalyzer
from itertools import chain
from nltk.util import everygrams
# The Lucene index is built offline; queries are pre-tokenized into character
# n-grams below, so Lucene itself only needs to split on whitespace.
searcher = LuceneSearcher("index")
searcher.set_analyzer(JWhiteSpaceAnalyzer())


def tokenize_word(word, min_len=2, max_len=4):
    """Return all 2- to 4-character n-grams of a single token."""
    return ["".join(ngram) for ngram in everygrams(word, min_len=min_len, max_len=max_len)]


def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Whitespace-split a query, n-gram each token, and rejoin with spaces."""
    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))
ds = load_from_disk("data")
NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
RESULTS_PER_PAGE = 5
TEXT_FIELD = "content"
METADATA_FIELD = "docid"
def result_html(result, meta):
    """Render one hit: a docid header plus a collapsible text snippet."""
    return (
        f"<div style=\"color:#2a5cb3;font-weight: 500\"><u>docid: {meta}</u></div><br>"
        f"<div><details><summary>{result[:250]}...</summary><p>{result[250:]}</p></details></div><br><hr><br>"
    )
def format_results(results, query):
    """Bold every literal occurrence of each query word in the hit texts.

    Highlighting is a naive substring replace, so it also matches inside
    longer identifiers.
    """
    text_content = results[TEXT_FIELD]
    query_words = query.split()
    for word in query_words:
        text_content = [
            text.replace(word, f"<b style=\"color:#2a5cb3;font-weight: 700\">{word}</b>")
            for text in text_content
        ]
    return "\n".join(result_html(result, meta) for result, meta in zip(text_content, results[METADATA_FIELD]))
def page_0(query):
    """Run the search once and render page 0 of the results."""
    untokenized_query = query  # keep the raw query for highlighting
    query = tokenize_sentence(query)
    # Fetch every page's worth of hits up front; page_i only re-slices them.
    hits = searcher.search(query, k=NUM_PAGES * RESULTS_PER_PAGE)
    ix = [int(hit.docid) for hit in hits]
    results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True)
    results = format_results(results, untokenized_query)
    # ix is wrapped in a list so it round-trips through the hidden Dataframe.
    return results, [ix], gr.update(visible=True), untokenized_query
def page_i(i, ix, query):
    """Re-slice the cached hit indices to render page i (no new search)."""
    ix = ix[0]  # unwrap the single row stored in the hidden Dataframe
    results = ds.select(ix).shard(num_shards=NUM_PAGES, index=i, contiguous=True)
    results = format_results(results, query)
    return results, [ix], query
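# Pagination dataflow: page_0 searches once for NUM_PAGES * RESULTS_PER_PAGE
# hits and stashes the hit indices in a hidden Dataframe; each numbered page
# button calls page_i, which re-selects its shard of those indices from the
# on-disk dataset instead of querying Lucene again.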
with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo:  # border:white;box-shadow:none;
    with gr.Row():
        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=15):
            gr.Markdown("""<div style="text-align: justify"> This search tool was used to validate the tokenization scheme for code retrieval for the BigCode project. We indexed the 🎅 <a href="https://huggingface.co./bigcode/santacoder">Santacoder</a> training dataset (Python, Java, and JavaScript) and used a (2,4)-gram tokenizer to build the index. This is the same tokenization scheme that ended up powering the ⭐ <a href="https://huggingface.co./spaces/bigcode/search">StarCoder search tool</a>.</div>""")
        with gr.Column(scale=1):
            pass
    with gr.Row():
        with gr.Column(scale=1):
            # Hidden state: one row holding the docids of all fetched hits.
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)
        with gr.Column(scale=15):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="Query")
        with gr.Column(scale=1):
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            with gr.Row(visible=False) as pagination:
                # left = gr.Button(value="◀", elem_id="b", visible=False).style(full_width=True)
                page_1 = gr.Button(value="1", elem_id="b").style(full_width=True)
                page_2 = gr.Button(value="2", elem_id="b").style(full_width=True)
                page_3 = gr.Button(value="3", elem_id="b").style(full_width=True)
                page_4 = gr.Button(value="4", elem_id="b").style(full_width=True)
                page_5 = gr.Button(value="5", elem_id="b").style(full_width=True)
                page_6 = gr.Button(value="6", elem_id="b").style(full_width=True)
                page_7 = gr.Button(value="7", elem_id="b").style(full_width=True)
                page_8 = gr.Button(value="8", elem_id="b").style(full_width=True)
                page_9 = gr.Button(value="9", elem_id="b").style(full_width=True)
                page_10 = gr.Button(value="10", elem_id="b").style(full_width=True)
                # right = gr.Button(value="▶", elem_id="b", visible=False).style(full_width=True)
        with gr.Column(scale=1):
            pass
    query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    # In Gradio 3, every callback input must be a component, so the constant
    # page indices are materialized as hidden Number components.
    with gr.Box(visible=False):
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
    page_1.click(fn=page_i, inputs=[nums[0], result_list, query], outputs=[c, result_list, query])
    page_2.click(fn=page_i, inputs=[nums[1], result_list, query], outputs=[c, result_list, query])
    page_3.click(fn=page_i, inputs=[nums[2], result_list, query], outputs=[c, result_list, query])
    page_4.click(fn=page_i, inputs=[nums[3], result_list, query], outputs=[c, result_list, query])
    page_5.click(fn=page_i, inputs=[nums[4], result_list, query], outputs=[c, result_list, query])
    page_6.click(fn=page_i, inputs=[nums[5], result_list, query], outputs=[c, result_list, query])
    page_7.click(fn=page_i, inputs=[nums[6], result_list, query], outputs=[c, result_list, query])
    page_8.click(fn=page_i, inputs=[nums[7], result_list, query], outputs=[c, result_list, query])
    page_9.click(fn=page_i, inputs=[nums[8], result_list, query], outputs=[c, result_list, query])
    page_10.click(fn=page_i, inputs=[nums[9], result_list, query], outputs=[c, result_list, query])
demo.launch(enable_queue=True, debug=True)
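# Assumes a prebuilt Lucene index in ./index and the saved HF dataset in
# ./data, and targets Gradio 3.x (.style(), gr.Box, and enable_queue were
# removed in Gradio 4).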