# Hugging Face Space (status when captured: Sleeping)
import re
from urllib import request

import gradio as gr
from huggingface_hub import InferenceClient
from lxml import etree
# Site root, keyed by venue, that gets prepended to the relative paper links
# scraped from that venue's listing page.
# NOTE(review): 'icml' shares the NeurIPS host here although ICML proceedings
# are listed on proceedings.mlr.press — confirm whether this entry is intended.
url_prefix_mapping = {
    **dict.fromkeys(('acl', 'emnlp', 'naacl', 'tacl'), 'https://aclanthology.org'),
    **dict.fromkeys(('nips', 'icml'), 'https://papers.nips.cc'),
    'iclr': 'https://iclr.cc',
}
# PMLR volume identifiers of the ICML proceedings, keyed by (venue, year).
# The identifier is the path segment appended to https://proceedings.mlr.press/.
mlr_mapping = {
    ('icml', 2020): 'v119',
    ('icml', 2021): 'v139',
    ('icml', 2022): 'v162',
    ('icml', 2023): 'v202',
    # ICML 2024 is PMLR volume 235; the former value 'v139' duplicated the
    # 2021 volume, so 2024 searches silently returned 2021 papers.
    ('icml', 2024): 'v235',
}
def get_paper_home(venue, year):
    """Return the URL of the page that lists a venue's papers for one year.

    Args:
        venue: Lower-case venue key; 'acl', 'emnlp', 'naacl', 'nips', 'icml'
            and 'iclr' are handled.
        year: Publication year. It is interpolated into the URL, except for
            ICML where (venue, year) is looked up in ``mlr_mapping``.

    Returns:
        The listing-page URL, or None when the venue is not handled here
        (for example 'tacl') or no PMLR volume is recorded for the ICML year.
    """
    if venue in ('acl', 'emnlp', 'naacl'):
        return f'https://aclanthology.org/events/{venue}-{year}'
    if venue == 'nips':
        return f'https://papers.{venue}.cc/paper_files/paper/{year}'
    if venue == 'icml':
        # Use .get() rather than indexing: a year without a recorded volume
        # should be skipped like any other unavailable page instead of
        # raising KeyError and aborting the whole search.
        volume = mlr_mapping.get((venue, year))
        if volume is None:
            return None
        return f'https://proceedings.mlr.press/{volume}'
    if venue == 'iclr':
        return f'https://iclr.cc/Downloads/{year}'
    # Unsupported venue: made explicit instead of falling off the end.
    return None
# Years scraped when search() is called without an explicit ``years`` argument.
# NOTE(review): chosen to match the years listed in mlr_mapping — confirm.
_DEFAULT_YEARS = (2020, 2021, 2022, 2023, 2024)

# XPath that selects one element per paper on each venue's listing page.
# Every ACL Anthology venue is listed explicitly so that none of them depends
# on a value left over from a previous loop iteration.
_ACL_PAPER_XPATH = ".//a[@class='align-middle']"
_PAPER_XPATHS = {
    'acl': _ACL_PAPER_XPATH,
    'emnlp': _ACL_PAPER_XPATH,
    'naacl': _ACL_PAPER_XPATH,
    'tacl': _ACL_PAPER_XPATH,
    'iclr': ".//a[@class='Poster']",
    'nips': ".//a[@title='paper title']",
    'icml': ".//div[@class='paper']",
}


def _compile_keywords(keywords):
    """Normalise keyword input into a list of case-insensitive regexes.

    A string is split into lines and each non-empty line becomes one pattern
    (iterating the string directly would test every single character as its
    own pattern). Any other iterable is taken as a sequence of patterns,
    either strings or already compiled. A pattern that is not valid regex is
    matched literally instead of raising.
    """
    if keywords is None:
        return []
    if isinstance(keywords, str):
        keywords = keywords.splitlines()
    patterns = []
    for raw in keywords:
        if isinstance(raw, re.Pattern):
            patterns.append(raw)
            continue
        raw = raw.strip()
        if not raw:
            continue
        try:
            patterns.append(re.compile(raw, re.IGNORECASE))
        except re.error:
            patterns.append(re.compile(re.escape(raw), re.IGNORECASE))
    return patterns


def _title_matches(title, keywords):
    """Return True when any keyword pattern is found in ``title``."""
    return any(pattern.search(title) for pattern in _compile_keywords(keywords))


def check_key_words(ele, keywords=None):
    """Return True when the text of ``ele`` matches any keyword pattern.

    Args:
        ele: Element whose full text content (including nested tags) is the
            paper title.
        keywords: A string with one regex per line, or an iterable of regex
            strings / compiled patterns. With no patterns the result is False.
    """
    return _title_matches(''.join(ele.itertext()), keywords)


def check_key_words_icml(ele, keywords=None):
    """Return True when the title of an ICML paper entry matches any keyword.

    The title is read from the ``<p class="title">`` descendant of ``ele``;
    an entry without such a node never matches. ``keywords`` is interpreted
    as in ``check_key_words``.
    """
    title = ele.find('.//p[@class="title"]')
    if title is None:
        return False
    return _title_matches(''.join(title.itertext()), keywords)


def _fetch_tree(url):
    """Download ``url`` and parse it as HTML; return None on any failure."""
    try:
        with request.urlopen(url) as response:
            html = response.read().decode()
        return etree.fromstring(html, etree.HTMLParser())
    except Exception as err:
        # Best-effort scraping: a page that is missing or unreadable is
        # reported and skipped so that the remaining pages are still searched.
        print(f'skipping {url}: {err}')
        return None


def _paper_url(venue, ele, url_prefix):
    """Return the link of one matched paper element, or None if it has none."""
    if venue == 'icml':
        # ICML entries are <div class="paper"> blocks; the link is the href
        # of the first <a> inside their <p class="links"> node.
        links = ele.find('.//p[@class="links"]')
        anchor = links.find('a') if links is not None else None
        return anchor.get('href') if anchor is not None else None
    # All other venues select the <a> element itself, so the href is read
    # from the element directly and prefixed with the site root.
    href = ele.get('href')
    return url_prefix + href if href else None


def search(keywords, venues, years=None):
    """Collect links of papers whose title matches the given keywords.

    Args:
        keywords: A string with one regex per line, or an iterable of regex
            strings. Matching is case-insensitive; a title matches when any
            pattern is found in it.
        venues: Selected venue groups; "NeurIPS/ICLR/ICML" and "*ACL" are
            searched.
        years: Years to scan; defaults to ``_DEFAULT_YEARS``.

    Returns:
        A flat list of paper URLs, in the order the pages were scanned.
    """
    patterns = _compile_keywords(keywords)
    if years is None:
        years = _DEFAULT_YEARS
    venues = venues or []

    search_venues = []
    if "NeurIPS/ICLR/ICML" in venues:
        search_venues.extend(['nips', 'iclr', 'icml'])
    if "*ACL" in venues:
        search_venues.extend(['acl', 'emnlp', 'naacl', 'tacl'])
    if "CVPR/ECCV/ICCV" in venues:
        # No listing URL or XPath exists for these venues. This branch used
        # to add the NeurIPS/ICLR/ICML venues a second time, which produced
        # duplicate or unrelated results.
        print('CVPR/ECCV/ICCV search is not supported yet; skipping.')

    results = []
    if not patterns:
        # Nothing to match against, so no page needs to be downloaded.
        return results

    for venue in search_venues:
        paper_xpath = _PAPER_XPATHS[venue]
        url_prefix = url_prefix_mapping[venue]
        matches = check_key_words_icml if venue == 'icml' else check_key_words
        for year in years:
            print(venue, year)
            try:
                paper_home = get_paper_home(venue, year)
            except KeyError:
                # No proceedings volume recorded for this (venue, year).
                continue
            if paper_home is None:
                continue
            tree = _fetch_tree(paper_home)
            if tree is None:
                continue
            elements = [e for e in tree.findall(paper_xpath) if matches(e, patterns)]
            urls = [_paper_url(venue, e, url_prefix) for e in elements]
            results.extend(u for u in urls if u)
            print(len(elements))
            print()
    return results
""" | |
For information on how to customize the Interface, see the Gradio docs: https://www.gradio.app/docs/gradio/interface
""" | |
def _search_rows(keywords, venues):
    """Run search() and wrap each URL in its own row.

    search() returns a flat list of URL strings, whereas the DataFrame output
    below is populated from rows (a list of lists), one cell per header.
    """
    return [[url] for url in search(keywords, venues)]


# Web UI: a keyword box and a venue selector in, a one-column table of paper
# links out.
demo = gr.Interface(
    _search_rows,
    inputs=[
        gr.Textbox(lines=2, placeholder="Keywords of the paper title. Supports ReGex."),
        gr.CheckboxGroup(
            ["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"],
            label="Choose Venues to Search",
            value=["NeurIPS/ICLR/ICML", "*ACL", "CVPR/ECCV/ICCV"],
        ),
    ],
    outputs=gr.DataFrame(headers=["Paper Link", ]),  # "Title", "Authors"
)

if __name__ == "__main__":
    demo.launch()