# search_paper / app.py
# jialicheng's picture
# Update app.py
# ea776c6 verified
import gradio as gr
from huggingface_hub import InferenceClient
import re
import datetime
from urllib import request
from lxml import etree
# Base URL of the site hosting each venue's papers; it is prepended to the
# site-relative paper links scraped from that venue's listing page.
url_prefix_mapping = {
    'acl': 'https://aclanthology.org',
    'emnlp': 'https://aclanthology.org',
    'naacl': 'https://aclanthology.org',
    'tacl': 'https://aclanthology.org',
    'nips': 'https://papers.nips.cc',
    # ICML proceedings are published by PMLR, not on the NeurIPS site.
    'icml': 'https://proceedings.mlr.press',
    'iclr': 'https://iclr.cc',
}
# Maps (venue, year) to the PMLR proceedings volume id used in
# https://proceedings.mlr.press/<volume>. Each ICML edition has its own
# volume, so no two years may share an id.
mlr_mapping = {
    ('icml', 2015): 'v37',
    ('icml', 2016): 'v48',
    ('icml', 2017): 'v70',
    ('icml', 2018): 'v80',
    ('icml', 2019): 'v97',
    ('icml', 2020): 'v119',
    ('icml', 2021): 'v139',
    ('icml', 2022): 'v162',
    ('icml', 2023): 'v202',
    ('icml', 2024): 'v235',
    ('icml', 2025): 'v267',
}
def get_paper_home(venue, year):
    """Return the URL of the page listing the papers of ``venue`` in ``year``.

    Args:
        venue: Lower-case venue key. Handled keys are 'acl', 'emnlp',
            'naacl', 'nips', 'icml' and 'iclr'.
        year: Publication year, interpolated into the URL (or, for ICML,
            used to look up the proceedings volume in ``mlr_mapping``).

    Returns:
        The listing URL as a string, or ``None`` when the venue is not
        handled here or when no ICML volume is known for ``year``.
    """
    if venue in ('acl', 'emnlp', 'naacl'):
        return f'https://aclanthology.org/events/{venue}-{year}'
    if venue == 'nips':
        return f'https://papers.{venue}.cc/paper_files/paper/{year}'
    if venue == 'icml':
        # An unmapped year must not raise: callers treat a missing listing
        # page as "nothing to search" and move on.
        volume = mlr_mapping.get((venue, year))
        if volume is None:
            return None
        return f'https://proceedings.mlr.press/{volume}'
    if venue == 'iclr':
        return f'https://iclr.cc/Downloads/{year}'
    return None
def check_keywords(ele, keywords):
    """Return True if any keyword pattern matches the text of ``ele``.

    Args:
        ele: Element exposing ``itertext()``; the text of the element and
            all its descendants is concatenated and searched.
        keywords: Iterable of regular-expression patterns (plain words work
            too). Matching is case-insensitive.

    Returns:
        True as soon as one pattern matches, otherwise False (also when
        ``keywords`` is empty).
    """
    text = ''.join(ele.itertext()).lower()
    for pattern in keywords:
        try:
            # The text is lower-cased, so the pattern must be matched
            # case-insensitively or upper-case keywords could never hit.
            found = re.search(pattern, text, re.IGNORECASE)
        except re.error:
            # Not a valid regex (e.g. an unbalanced parenthesis): treat the
            # keyword as a literal substring instead of aborting the search.
            found = pattern.lower() in text
        if found:
            return True
    return False
def check_keywords_icml(ele, keywords):
    """Return True if any keyword pattern matches the title inside ``ele``.

    The title is read from the first descendant ``<p class="title">`` of
    ``ele``.

    Args:
        ele: Element wrapping one paper entry; must support ``find()``.
        keywords: Iterable of regular-expression patterns (plain words work
            too). Matching is case-insensitive.

    Returns:
        True as soon as one pattern matches the title, otherwise False
        (also when the entry has no title element or ``keywords`` is empty).
    """
    title = ele.find('.//p[@class="title"]')
    if title is None:
        # Entry without a title paragraph: nothing to match against.
        return False
    text = ''.join(title.itertext()).lower()
    for pattern in keywords:
        try:
            # The text is lower-cased, so the pattern must be matched
            # case-insensitively or upper-case keywords could never hit.
            found = re.search(pattern, text, re.IGNORECASE)
        except re.error:
            # Not a valid regex: fall back to a literal substring test
            # instead of aborting the search.
            found = pattern.lower() in text
        if found:
            return True
    return False
def search(keywords, venues, min_year, max_year):
    """Scrape venue listing pages and collect links of papers matching keywords.

    Args:
        keywords: Comma-separated keyword patterns; each one is stripped and
            empty entries are ignored.
        venues: Selected venue-group labels ("NeurIPS/ICLR/ICML", "*ACL",
            "CVPR/ECCV/ICCV").
        min_year: First year to search (inclusive).
        max_year: Last year to search (inclusive).

    Returns:
        A list of single-element rows ``[paper_url]``, one per matching
        paper. Empty when no usable keyword or no supported venue is given.
        Listing pages that cannot be located, fetched or parsed are skipped.
    """
    from urllib.parse import urljoin

    patterns = [part.strip() for part in (keywords or "").split(",")]
    patterns = [part for part in patterns if part]
    if not patterns:
        # An empty pattern would match every title on every page.
        return []

    venues = venues or []
    # Sliders may deliver floats; the upper bound is part of the selection.
    year_range = range(int(min_year), int(max_year) + 1)

    search_venues = []
    if "NeurIPS/ICLR/ICML" in venues:
        search_venues.extend(['nips', 'iclr', 'icml'])
    if "*ACL" in venues:
        search_venues.extend(['acl', 'emnlp', 'naacl', 'tacl'])
    if "CVPR/ECCV/ICCV" in venues:
        # No scraper exists for the vision venues, so this option adds no
        # venues rather than repeating the ML venues under a CV label.
        print("CVPR/ECCV/ICCV search is not supported yet; skipping.")

    # XPath selecting one element per paper on each venue's listing page.
    acl_xpath = ".//a[@class='align-middle']"
    paper_xpath = {
        'acl': acl_xpath,
        'emnlp': acl_xpath,
        'naacl': acl_xpath,
        'tacl': acl_xpath,
        'iclr': ".//a[@class='Poster']",
        'nips': ".//a[@title='paper title']",
        'icml': ".//div[@class='paper']",
    }

    results = []
    for venue in search_venues:
        for year in year_range:
            print(venue, year)
            try:
                paper_home = get_paper_home(venue, year)
            except KeyError:
                # No proceedings volume known for this venue/year.
                paper_home = None
            if not paper_home:
                continue

            # Best effort per page: a missing or malformed listing must not
            # abort the whole search.
            try:
                with request.urlopen(paper_home, timeout=30) as response:
                    html = response.read().decode(errors='replace')
                tree = etree.fromstring(html, etree.HTMLParser())
            except Exception as exc:
                print(f"Skipping {paper_home}: {exc}")
                continue
            if tree is None:
                continue

            elements = tree.findall(paper_xpath[venue])
            for element in elements:
                if venue == 'icml':
                    if not check_keywords_icml(element, patterns):
                        continue
                    # The paper link is the first anchor of the links paragraph.
                    links = element.find('.//p[@class="links"]')
                    anchor = links.find('a') if links is not None else None
                    paper_url = anchor.get('href') if anchor is not None else None
                else:
                    if not check_keywords(element, patterns):
                        continue
                    href = element.get('href')
                    paper_url = urljoin(url_prefix_mapping[venue], href) if href else None
                if paper_url:
                    results.append([paper_url])
            print(len(elements))
            print()
    return results
"""
For information on how to customize the Blocks layout, peruse the gradio docs: https://www.gradio.app/docs/gradio/blocks
"""
# Calendar year at import time (local clock); used as the upper bound of the year sliders.
current_year = datetime.date.today().year
# demo = gr.Interface(
# search,
# inputs=[
# gr.Textbox(lines=2, placeholder="Keywords of the paper title. Supports ReGex."),
# gr.CheckboxGroup(["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"], label="Choose Venues to Search", value=["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"]),
# gr.Slider(minimum=2020, maximum=current_year, value=[2020, current_year], label="Year Range", step=1)
# ],
# outputs=gr.DataFrame(headers=["Paper Link", "Title", "Authors"])
# )
def test_search(keywords, venues, min_year, max_year):
return [["https://example.com"], ["https://anotherexample.com"]]
# Search UI: controls in a narrow left column, the result table in a wide
# right column, and a button that runs `search` on the current inputs.
with gr.Blocks() as demo:
    with gr.Row():
        # Left: all search controls.
        with gr.Column(scale=1):
            # Free-text keywords, comma separated.
            textbox = gr.Textbox(
                lines=2,
                placeholder="Enter keywords, separated by commas...",
                label="Enter comma-separated keywords",
            )

            # Venue groups to include; every group is ticked by default.
            checkbox = gr.CheckboxGroup(
                choices=["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"],
                value=["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"],
                type="value",
                label="Choose Venues to Search",
            )

            # Two independent sliders bound the publication years.
            min_year_slider = gr.Slider(
                label="Select Min Year",
                minimum=2015,
                maximum=current_year,
                step=1,
                value=2020,
            )
            max_year_slider = gr.Slider(
                label="Select Max Year",
                minimum=2015,
                maximum=current_year,
                step=1,
                value=current_year,
            )

            submit_button = gr.Button(value="Search")

        # Right: one-column table of matching paper links.
        with gr.Column(scale=3):
            output_table = gr.DataFrame(
                label="Results",
                headers=["Paper Link"],
            )

    # Clicking the button feeds the four inputs to `search` and shows its rows.
    submit_button.click(
        fn=search,
        inputs=[textbox, checkbox, min_year_slider, max_year_slider],
        outputs=output_table,
    )

if __name__ == "__main__":
    demo.launch()