# Hugging Face Spaces page banner captured with this source ("Spaces: Sleeping").
import functools
import json as js
import os
import re
from typing import List, Tuple

import fasttext
import gradio as gr
import joblib
import omikuji
from huggingface_hub import snapshot_download

from install_packages import download_model
# ---------------------------------------------------------------------------
# Start-up work: fetch model artefacts and load the shared, read-only resources
# (`lang_model`, `id2label`) used by the functions below.
# ---------------------------------------------------------------------------

# fastText language-identification model. `download_model` comes from the
# project's install_packages module; presumably it stores the file under the
# name given as second argument, which is the path loaded further down.
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')

# Download the model files from Hugging Face. Each repository is mirrored into
# a local directory named after its repo id; find_model() reads the files from
# exactly these directories.
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # exist_ok=True replaces a separate os.path.exists() check, which is racy
    # and needlessly verbose.
    os.makedirs(repo_id, exist_ok=True)
    model_dir = snapshot_download(repo_id=repo_id, local_dir=repo_id)

lang_model = fasttext.load_model('lid.176.bin')

# Read the label table as UTF-8 explicitly: JSON is UTF-8 by specification,
# whereas open() without an encoding falls back to the platform default and
# can corrupt or reject non-ASCII label text.
with open('./id2label.json', 'r', encoding='utf-8') as f:
    id2label = js.load(f)
def map_language(language: str) -> str:
    """Return the English display name for a language code.

    Args:
        language: Language code such as ``'de'``.

    Returns:
        ``'German'``, ``'Italian'`` or ``'French'`` for ``'de'``, ``'it'`` and
        ``'fr'`` respectively. Any other value is returned unchanged.
    """
    display_names = {'de': 'German', 'it': 'Italian', 'fr': 'French'}
    # Single lookup with the input itself as the fallback value.
    return display_names.get(language, language)
@functools.lru_cache(maxsize=None)
def find_model(language: str):
    """Load the vectorizer and Omikuji model for a language code.

    Results are memoised per language code, so the files are read from disk
    only on the first request for a language instead of on every call. The
    loaded objects then stay in memory for the lifetime of the process.

    Args:
        language: Language code as produced by ``predict_lang``.

    Returns:
        A ``(vectorizer, model)`` tuple. Both entries are ``None`` when the
        language is not one of ``'de'``, ``'fr'`` or ``'it'``.
    """
    if language not in ('de', 'fr', 'it'):
        return None, None

    model_root = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code -- only point this at repositories you trust.
    vectorizer = joblib.load(f'{model_root}/vectorizer')
    model = omikuji.Model.load(f'{model_root}/omikuji-model')
    return vectorizer, model
def predict_lang(text: str) -> str:
    """Detect the language of *text* with the module-level fastText model.

    Args:
        text: The document to classify; it may span several lines.

    Returns:
        The top predicted label with its ``__label__`` prefix removed,
        e.g. ``'de'``.
    """
    # fastText cannot process text containing line breaks. Replace each one
    # with a space instead of deleting it, so the last word of one line is
    # not fused with the first word of the next.
    single_line = text.replace('\n', ' ')
    predictions = lang_model.predict(single_line, k=1)  # k=1: best match only
    top_label = predictions[0][0]
    return top_label.replace('__label__', '')
def predict_topic(text: str, top_k: int = 1000) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topic labels and scores for a document.

    The language of *text* is detected first; the vectorizer and model for
    that language then produce the predictions.

    Args:
        text: The document to classify.
        top_k: Maximum number of predictions requested from the model.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        A ``(results, language)`` tuple. ``results`` is a list of
        ``(label, score)`` pairs, with labels looked up in ``id2label``; it is
        empty when no model exists for the detected language or the text
        yields no features. ``language`` is the detected language passed
        through ``map_language``.

    Raises:
        KeyError: If the model returns an id that is missing from ``id2label``.
    """
    language_code = predict_lang(text)
    vectorizer, model = find_model(language_code)
    language = map_language(language_code)

    results = []
    if vectorizer is None:
        # No model available for the detected language.
        return results, language

    # Presumably `transform` returns a SciPy sparse matrix with one row per
    # input text -- the code relies on `.nnz` and `.nonzero()` of each row.
    for row in vectorizer.transform([text]):
        if row.nnz == 0:  # All-zero vector: nothing to predict from
            continue
        # The model takes (feature index, value) pairs for the non-zero entries.
        feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
        results.extend(
            (id2label[str(subj_id)], score)
            for subj_id, score in model.predict(feature_values, top_k=top_k)
        )
    return results, language
def topic_modeling(text: str, threshold: float) -> Tuple[List[Tuple[str, float]], str]:
    """Return the topics of *text* whose score reaches *threshold*.

    This is the callback wired to the Submit button; its two return values
    fill the results table and the detected-language field.

    Args:
        text: The document to classify.
        threshold: Minimum score (inclusive) a topic needs to be kept.

    Returns:
        A ``(topics, language)`` tuple. ``topics`` is a list of
        ``(label, score)`` pairs and is empty when nothing was predicted or
        the language is not German, French or Italian. ``language`` is the
        language reported by ``predict_topic``.
    """
    topics, language = predict_topic(text)
    if not topics or language not in ('German', 'French', 'Italian'):
        return [], language
    return [topic for topic in topics if topic[1] >= threshold], language
# User interface: the left column holds the inputs, the right column shows the
# predicted topics as a table.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=10, placeholder="Enter a document")
            submit_button = gr.Button("Submit")
            threshold_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                label="Score Threshold",
                value=0.0,
            )
            # Read-only field filled in by the callback.
            language_text = gr.Textbox(
                lines=1,
                placeholder="Detected language will be shown here...",
                interactive=False,
                label="Detected Language",
            )
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])

    # The two return values of topic_modeling map onto the two outputs, in order.
    submit_button.click(
        topic_modeling,
        inputs=[input_text, threshold_slider],
        outputs=[output_data, language_text],
    )

# Launch the app
iface.launch(share=True)