import os
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import streamlit as st
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Load Hugging Face token from environment
HF_TOKEN = os.getenv("HF_TOKEN")
model_path = "dejanseo/DEJAN-Taxonomy-Classifier"
# Load the model and tokenizer using the token
tokenizer = DebertaV2Tokenizer.from_pretrained(model_path, use_auth_token=HF_TOKEN)
model = DebertaV2ForSequenceClassification.from_pretrained(model_path, use_auth_token=HF_TOKEN)
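# Note: recent transformers releases prefer `token=` over the deprecated `use_auth_token=` argument.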
# LABEL_MAPPING (from Google taxonomy ID to model output index) and corresponding category names
LABEL_MAPPING = {
    1: 0, 8: 1, 111: 2, 141: 3, 166: 4, 222: 5, 412: 6, 436: 7,
    469: 8, 536: 9, 537: 10, 632: 11, 772: 12, 783: 13, 888: 14,
    922: 15, 988: 16, 1239: 17, 2092: 18, 5181: 19, 5605: 20
}
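# Human-readable names for each Google taxonomy ID above.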
CATEGORY_NAMES = {
1: "Animals & Pet Supplies",
8: "Arts & Entertainment",
111: "Apparel & Accessories",
141: "Baby & Toddler",
166: "Books & Magazines",
222: "Business & Industrial",
412: "Cameras & Optics",
436: "Cars & Vehicles",
469: "Computers & Electronics",
536: "Food & Beverages",
537: "Furniture",
632: "Hardware",
772: "Health & Beauty",
783: "Home & Garden",
888: "Luggage & Bags",
922: "Media",
988: "Sporting Goods",
1239: "Software",
2092: "Sports & Outdoors",
5181: "Toys & Games",
5605: "Travel & Tourism"
}
# Reverse mapping for model output index to text label
INDEX_TO_CATEGORY = {v: f"[{k}] {CATEGORY_NAMES[k]}" for k, v in LABEL_MAPPING.items()}
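# Example: INDEX_TO_CATEGORY[0] == "[1] Animals & Pet Supplies"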
# Set Streamlit app title
st.title("Google Taxonomy Classifier by DEJAN")
st.write("Enter text in the input box, and the model will classify it into one of the 21 top level categories. This demo showcases early model capability while the full 5000+ label model is undergoing extensive training.")
# Input text box
input_text = st.text_area("Enter text for classification:")
# Inference function
def classify_text(text):
    if not text.strip():
        return None
    # Tokenize and encode input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    # Convert logits to probabilities using softmax
    probabilities = F.softmax(logits, dim=-1).squeeze().tolist()
    return probabilities
# Display results when text is entered
if st.button("Classify"):
    if input_text.strip():
        st.write("Processing...")
        # Classify the input text
        probabilities = classify_text(input_text)
        if probabilities:
            # Map probabilities to categories
            mapped_probs = {INDEX_TO_CATEGORY[idx]: prob for idx, prob in enumerate(probabilities)}
            # Sort categories by probability in descending order
            sorted_categories = sorted(mapped_probs.items(), key=lambda x: x[1], reverse=True)
            categories = [item[0] for item in sorted_categories]
            values = [item[1] for item in sorted_categories]
            # Create horizontal bar chart
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.barh(categories, values)
            ax.set_xlabel("Probability")
            ax.set_ylabel("Category")
            ax.set_title("Classification Probabilities")
            ax.invert_yaxis()  # Ensure highest probability is at the top
            ax.set_xlim(0, 1)  # Set the x-axis range to 0-1 for probabilities
            st.pyplot(fig)
        else:
            st.error("Could not classify the text. Please try again.")
    else:
        st.warning("Please enter some text for classification.")
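# To run locally: `streamlit run app.py` (with HF_TOKEN set in the environment).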