Spaces:

nktssk
/

itis

Running

File size: 5,668 Bytes

import os
import re
import json
import torch
import requests
import unicodedata
import soundfile as sf
import pymorphy2

import gradio as gr
import wikipediaapi
from PIL import Image
from transformers import pipeline, CLIPProcessor, CLIPModel

# Shared morphological analyzer, created once at import time; it is used
# further down to parse and inflect the words of spelled-out numbers.
morph = pymorphy2.MorphAnalyzer()

def load_attractions_json(url, timeout=30):
    """Download a JSON document and return the decoded Python object.

    Args:
        url: Address of the JSON resource.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        Whatever ``json.loads`` produces for the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return json.loads(response.text)

# Location of the landmark list; it is downloaded once, at import time.
url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
# Candidate landmark titles. They are fed to CLIP as text prompts and indexed
# by the best-matching position (presumably a JSON list of strings -- verify
# against the file's actual content).
landmark_titles = load_attractions_json(url)

def clean_text(text):
    """Normalize a Wikipedia summary before summarization and speech synthesis.

    Steps, in order:

    1. Drop IPA transcriptions ("МФА: [...]") and every other bracketed span
       (for example footnote markers).
    2. Strip combining marks such as stress accents and Latin diacritics,
       while keeping precomposed Cyrillic letters ("й", "ё", ...) intact.
    3. Remove every character that is not a word character, whitespace or
       one of ``. , ! ? -``.
    4. Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: Raw text.

    Returns:
        The cleaned text.
    """
    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
    text = re.sub(r'\[.*?\]', '', text)

    def strip_marks(ch):
        # Stand-alone combining marks (e.g. a stress accent) are dropped.
        if unicodedata.category(ch) == 'Mn':
            return ''
        # Decomposing a Cyrillic letter would turn "й" into "и" and "ё"
        # into "е", which changes the word, so these are left untouched.
        if unicodedata.name(ch, '').startswith('CYRILLIC'):
            return ch
        return ''.join(
            part
            for part in unicodedata.normalize('NFD', ch)
            if unicodedata.category(part) != 'Mn'
        )

    # Compose first so letters like "й" are single code points that the
    # Cyrillic check above can recognize.
    text = unicodedata.normalize('NFC', text)
    text = ''.join(strip_marks(ch) for ch in text)
    text = unicodedata.normalize('NFC', text)
    # Remove unwanted characters BEFORE collapsing whitespace; otherwise a
    # removed character surrounded by spaces leaves a double space behind.
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Simplified detection of the grammatical case from a preposition.
def get_case_for_preposition(prep):
    """Return a short case label for the Russian preposition *prep*.

    The lookup is case-insensitive. Known prepositions map to 'loc', 'dat',
    'ins' or 'gen'; anything else yields 'nom'.

    NOTE(review): these labels are this module's own shorthand, not
    OpenCorpora grammeme names ('loct', 'datv', ...); a consumer that talks
    to pymorphy2 has to translate them.
    """
    prepositions_by_case = (
        ('loc', ('в', 'на', 'о', 'об', 'обо')),
        ('dat', ('к',)),
        ('ins', ('с', 'со', 'над', 'под')),
        ('gen', ('из', 'от', 'у', 'до', 'для')),
    )
    needle = prep.lower()
    for case_label, prepositions in prepositions_by_case:
        if needle in prepositions:
            return case_label
    return 'nom'

# Short case labels (as returned by get_case_for_preposition) mapped to the
# OpenCorpora grammemes that pymorphy2 expects when inflecting.
_CASE_GRAMMEMES = {
    'nom': 'nomn',
    'gen': 'gent',
    'dat': 'datv',
    'acc': 'accs',
    'ins': 'ablt',
    'loc': 'loct',
}
_KNOWN_GRAMMEMES = frozenset(_CASE_GRAMMEMES.values())


def _spell_number(token, case):
    """Return the digit string *token* as Russian words in the given case."""
    # Imported lazily: text without numbers never needs num2words.
    from num2words import num2words

    value = float(token) if '.' in token else int(token)
    words = num2words(value, lang='ru').replace('-', ' ').split()

    # Accept both the short labels and ready-made OpenCorpora grammemes.
    grammeme = _CASE_GRAMMEMES.get(case, case)
    if grammeme == 'nomn' or grammeme not in _KNOWN_GRAMMEMES:
        # num2words already produces the nominative form; an unknown case
        # label is treated the same way instead of failing inside pymorphy2.
        return ' '.join(words)

    declined = []
    for word in words:
        parses = morph.parse(word)
        best = parses[0] if parses else None
        form = None
        # Only a word whose tag carries a case can be declined.
        if best is not None and best.tag.case:
            form = best.inflect({grammeme})
        # inflect() returns None when the requested form does not exist.
        declined.append(form.word if form else word)
    return ' '.join(declined)


def replace_numbers_with_text_in_context(text):
    """Spell out bare numeric tokens of *text* in Russian words.

    The text is split on whitespace. A token consisting only of digits
    (optionally with one decimal point) is replaced by its spelled-out form;
    when a token precedes it, that token is treated as a preposition and
    decides the grammatical case of the number. All other tokens are kept
    unchanged, and the result is re-joined with single spaces.

    Args:
        text: Input text.

    Returns:
        The text with numeric tokens replaced by words.
    """
    tokens = text.split()
    result = []
    for i, token in enumerate(tokens):
        if re.fullmatch(r'\d+(\.\d+)?', token):
            case = get_case_for_preposition(tokens[i - 1]) if i > 0 else 'nom'
            result.append(_spell_number(token, case))
        else:
            result.append(token)
    return ' '.join(result)

# Summarization and English-to-Russian translation pipelines, loaded once at
# import time.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
# Wikipedia client for the "en" edition. "Nikita" is the first positional
# argument (presumably the user agent -- verify against the installed
# wikipediaapi version).
wiki = wikipediaapi.Wikipedia("Nikita", "en")

# CLIP model and its matching preprocessor, used to match photos to titles.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Embed every landmark title once and L2-normalize the vectors, so that
# recognition only has to embed the incoming image.
text_inputs = clip_processor(text=landmark_titles, images=None, return_tensors="pt", padding=True)
with torch.no_grad():
    text_embeds = clip_model.get_text_features(**text_inputs)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

# Silero text-to-speech configuration.
language = 'ru'
model_id = 'v3_1_ru'
sample_rate = 48000
speaker = 'eugene'
# NOTE: the hub entry point receives the model id through its `speaker`
# keyword; the voice name above is only passed later, to apply_tts. The
# second value returned by the hub call is discarded.
silero_model, _ = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=model_id
)

def text_to_speech(text, out_path="speech.wav"):
    """Synthesize *text* with the Silero model and save it as an audio file.

    Numbers in the text are spelled out first, then the audio is generated
    with the module-level speaker and sample rate and written to *out_path*.

    Args:
        text: Text to speak.
        out_path: Destination file; an existing file is overwritten.

    Returns:
        ``out_path``.
    """
    spoken_text = replace_numbers_with_text_in_context(text)
    waveform = silero_model.apply_tts(
        text=spoken_text,
        speaker=speaker,
        sample_rate=sample_rate,
    )
    sf.write(out_path, waveform, sample_rate)
    return out_path

def fetch_wikipedia_summary(landmark):
    """Return the cleaned Wikipedia summary for the page titled *landmark*.

    Returns the sentinel string "Found error!" when the page does not exist.
    """
    article = wiki.page(landmark)
    if not article.exists():
        return "Found error!"
    return clean_text(article.summary)

def recognize_landmark_clip(image):
    """Pick the landmark title whose CLIP text embedding best matches *image*.

    Args:
        image: A PIL image, or anything ``Image.fromarray`` accepts.

    Returns:
        A ``(title, score)`` tuple: the best-matching entry of
        ``landmark_titles`` and its similarity as a Python float. Both
        embeddings are L2-normalized, so the score is a cosine similarity.
    """
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
    pixel_inputs = clip_processor(images=pil_image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**pixel_inputs)
        features = features / features.norm(p=2, dim=-1, keepdim=True)
    # One similarity value per landmark title.
    scores = (features @ text_embeds.T).squeeze(0)
    winner = scores.argmax().item()
    return landmark_titles[winner], scores[winner].item()

def process_landmark(landmark):
    """Turn a landmark title into a spoken Russian summary.

    Pipeline: Wikipedia summary -> summarization -> translation to Russian ->
    text-to-speech.

    Args:
        landmark: Wikipedia page title.

    Returns:
        Path of the generated audio file, or ``None`` when no usable
        Wikipedia text is available for the title.
    """
    txt = fetch_wikipedia_summary(landmark)
    # "Found error!" is the missing-page sentinel; an empty summary gives
    # the summarizer nothing to work with either.
    if not txt or txt == "Found error!":
        return None
    # truncation=True keeps long article leads within the model's input
    # limit instead of failing on over-length input.
    sm = summarizer(txt, min_length=20, max_length=210, truncation=True)[0]["summary_text"]
    tr = translator(sm, max_length=1000)[0]["translation_text"]
    return text_to_speech(tr)

def process_image_clip(image):
    """Recognize the landmark on *image* and return its spoken summary.

    Args:
        image: Uploaded picture, or ``None`` when nothing was uploaded.

    Returns:
        Path of the generated audio file, or ``None`` when there is no image
        or no summary could be produced.
    """
    if image is None:
        # Nothing was uploaded, so there is nothing to recognize.
        return None
    # The similarity score is not needed here.
    recognized, _score = recognize_landmark_clip(image)
    return process_landmark(recognized)

def process_text_clip(landmark):
    """Return the spoken summary for a landmark title typed by the user.

    Args:
        landmark: Title text; surrounding whitespace is ignored.

    Returns:
        Path of the generated audio file, or ``None`` when the title is
        empty or no summary could be produced.
    """
    title = (landmark or "").strip()
    if not title:
        # An empty title cannot name a page; skip the lookup entirely.
        return None
    return process_landmark(title)

# Gradio UI: a single tab with an image upload, a title textbox and one
# shared audio output.
with gr.Blocks() as demo:
    gr.Markdown("## Помощь туристу")
    with gr.Tabs():
        with gr.Tab("CLIP + Sum + Translate + T2S"):
            with gr.Row():
                # Two alternative inputs: a photo (requested as a PIL image)
                # or a landmark title typed by hand.
                image_input = gr.Image(label="Загрузите фото", type="pil")
                text_input = gr.Textbox(label="Или введите название")
            # Both buttons deliver their result to this one audio player.
            audio_output = gr.Audio(label="Результат")
            with gr.Row():
                btn_img = gr.Button("Распознать и перевести")
                btn_txt = gr.Button("Поиск по названию")
            # Wire each button to its handler.
            btn_img.click(fn=process_image_clip, inputs=image_input, outputs=audio_output)
            btn_txt.click(fn=process_text_clip, inputs=text_input, outputs=audio_output)

# Start the app as soon as the module is executed.
demo.launch(debug=True)