Pranjal2041 committed
Commit d4218cc
1 Parent(s): 2034b44

Fix Issues

Files changed (2)
  1. app.py +10 -4
  2. fetch_prod.py +10 -0
app.py CHANGED
@@ -63,8 +63,12 @@ def classify(text, is_unseen):
 def scrape_click(url):
     out = scraper.get_product(url)
     if isinstance(out, str):
-        print('Error Occured', out)
-        return
+        if out == 'Invalid URL':
+            gr.Error("Please enter a valid Amazon URL")
+        else:
+            gr.Error("Error Occured. Check the URL or try again later.")
+        print('Error Occured', out)
+        return
 
     text = out['description']
     if text not in cache:
@@ -94,7 +98,7 @@ with gr.Blocks(css="#warning {height: 100%}") as demo:
     description = "<p style='font-size: 14px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://github.com/Pranjal2041' style='text-decoration:none' target='_blank'>Pranjal Aggarwal, </a> <a href='' style='text-decoration:none' target='_blank'>Ameet Deshpande, </a> <a href='' style='text-decoration:none' target='_blank'>Karthik Narasimhan </a> </p>" \
         + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://sites.google.com/view/semsup-xc/home' target='_blank'>Project Page</a> | <a href='https://arxiv.org/abs/' target='_blank'>Arxiv</a> | <a href='https://github.com/princeton-nlp/SemSup-XC' target='_blank'>Github Repo</a></p>" \
         + "<p style='text-align: center; margin: 5px; font-size: 14px; font-weight: w300;'> \
-        Extreme classification (XC) considers the scenario of predicting over a very large number of classes (thousands to millions), with real-world applications including serving search engine results, e-commerce product tagging, and news article classification. The zero-shot version of this task involves the addition of new categories at test time, requiring models to generalize to novel classes without additional training data (e.g. one may add a new class “fidget spinner” for e-commerce product tagging). In this paper, we develop SEMSUP-XC, a model that achieves state-of-the-art zero-shot (ZS) and few-shot (FS) performance on three extreme classification benchmarks spanning the domains of law, e-commerce, and Wikipedia. SEMSUP-XC builds upon the recently proposed framework of semantic supervision that uses semantic label descriptions to represent and generalize to classes (e.g., “fidget spinner” described as “A popular spinning toy intended as a stress reliever”). Specifically, we use a combination of contrastive learning, a hybrid lexico-semantic similarity module and automated description collection to train SEMSUP-XC efficiently over extremely large class spaces. SEMSUP-XC significantly outperforms baselines and state-of-the-art models on all three datasets, by up to 6-10 precision@1 points on zero-shot classification and >10 precision points on few-shot classification, with similar gains for recall@10 (3 for zero-shot and 2 for few-shot). Our ablation studies show the relative importance of various components and conclude the combined importance of the proposed architecture and automatically scraped descriptions with improvements up to 33 precision@1 points. Furthermore, qualitative analyses demonstrate SEMSUP-XC's better understanding of label space than other state-of-the-art models. \
+        Extreme classification (XC) considers the scenario of predicting over a very large number of classes (thousands to millions), with real-world applications including serving search engine results, e-commerce product tagging, and news article classification. A real-life requirement in this domain is to predict from labels unseen during training(Zero-Shot), however there have been very little success in this domain. To this end, we propose SemSup-XC, a model that achieves state-of-the-art zero-shot (ZS) and few-shot (FS) performance on three extreme classification benchmarks spanning various domains. Instead of treating labels as class ids, our model learns from diverse descriptions of them, thereby attaining a more better understanding of the label space, evident from qualitative and quantitative results. \
         </p>" \
         # gr.HTML(description)
     gr.Markdown(description)
@@ -117,7 +121,8 @@ with gr.Blocks(css="#warning {height: 100%}") as demo:
     <br>
     <br>
     Our model was trained on over 1 million product descriptions from Amazon on 6500 different categories.
-    SemSup-XC can generalize to unseen labels.
+    SemSup-XC can generalize to both seen and unseen labels.
+    You can either use already available examples or enter your own text to classify.
     You can also fetch product descriptions by simply entering the product link, and classify categories on both seen and unseen labels.
     """
     )
@@ -182,6 +187,7 @@ with gr.Blocks(css="#warning {height: 100%}") as demo:
 
     # classify_btn.click(lambda value, is_unseen: gr.update(value = classify(value, is_unseen == 'Unseen Labels'), visible = True), inputs = [text_box, radio_btn], outputs=classified_labels_text)
     classify_btn.click(lambda value, is_unseen: gr.update(value = format_labels_html(classify(value, is_unseen == 'Unseen Labels'), desc_is_visible = descriptions_visible), visible = True), inputs = [text_box, radio_btn], outputs=label_html)
+    classify_btn.click(lambda x: gr.update(visible=True), inputs = classify_btn, outputs = label_html)
 
     random_example_btn.click(lambda value: gr.update(value = get_random_example()), inputs= random_example_btn, outputs=text_box)
     random_example_btn.click(lambda value: (gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)), inputs = random_example_btn, outputs=[label_html, gold_labels, toggle_descriptions])
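For readers less familiar with Gradio's event model, the sketch below (not code from this repository; fetch_description, url_box, out_box, and fetch_btn are hypothetical names) shows the pattern this hunk wires up: a button click calls a handler, gr.update(...) returned from that handler fills and un-hides an output component, and errors are surfaced to the user by raising gr.Error, which Gradio treats as an exception type rather than a plain function call.

import gradio as gr

def fetch_description(url: str) -> str:
    # Hypothetical handler: reject obviously bad input before doing any work.
    # gr.Error is an exception type; raising it makes Gradio display the
    # message to the user, while merely constructing it has no visible effect.
    if 'amazon' not in url:
        raise gr.Error("Please enter a valid Amazon URL")
    return f"Fetched description for {url}"

with gr.Blocks() as demo:
    url_box = gr.Textbox(label="Product URL")
    out_box = gr.Textbox(label="Description", visible=False)
    fetch_btn = gr.Button("Fetch")
    # One click both fills the hidden textbox and makes it visible,
    # mirroring the gr.update(value=..., visible=True) calls in the diff above.
    fetch_btn.click(
        lambda u: gr.update(value=fetch_description(u), visible=True),
        inputs=url_box,
        outputs=out_box,
    )

if __name__ == '__main__':
    demo.launch()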
fetch_prod.py CHANGED
@@ -10,7 +10,17 @@ class Scraper:
     def __init__(self):
         ...
 
+    def sanity_url(self, url : str) -> bool:
+        if url.find('amazon')==-1:
+            return False
+        if url.find('product')==-1:
+            return False
+        return True
+
     def get_product(self, url : str) -> Dict:
+        if not self.sanity_url(url):
+            return 'Invalid URL'
+
         webpage = requests.get(url, headers=FakeHttpHeader().as_header_dict())
         f = open('webpage_out.html','w')
         f.write(webpage.content.decode())
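The sanity_url check above uses plain substring matching, and get_product signals failure by returning a string, which app.py's scrape_click detects with isinstance(out, str). As an illustrative alternative only (an assumption, not code from the repository), a stricter validator could parse the URL before any request is made:

from urllib.parse import urlparse

def sanity_url(url: str) -> bool:
    # Hypothetical stricter variant of the substring check in the diff above:
    # require an http(s) scheme, an amazon.* host, and a product-style path.
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        return False
    if 'amazon.' not in parsed.netloc.lower():
        return False
    return '/dp/' in parsed.path or '/gp/product/' in parsed.path

# Illustrative usage:
#   sanity_url('https://www.amazon.com/dp/B000000000')   -> True
#   sanity_url('https://example.com/product/123')        -> False

Either way, keeping the "return a string on failure" convention means the caller's isinstance(out, str) branch in scrape_click continues to work unchanged.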