import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
import requests

# Download the NLTK resources needed for tokenization and POS tagging.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')


def filter_pos(text):
    """Keep only content words: nouns (N*), verbs (V*), adjectives (J*), and adverbs (R*)."""
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return ' '.join(
        word for word, tag in tagged_words
        if tag.startswith(('N', 'V', 'J', 'R'))
    )


def get_embedding(text):
    """Embed the POS-filtered text by mean-pooling BERT's last hidden states."""
    filtered_text = filter_pos(text)
    tokens = tokenizer(filtered_text, return_tensors='pt', padding=True,
                       truncation=True, max_length=128)
    with torch.no_grad():
        output = model(**tokens)
    # Simple mean pooling over the token dimension (includes special tokens).
    return output.last_hidden_state.mean(1)


def calculate_similarity(text1, text2):
    """Return the cosine similarity of the two texts as a percentage string."""
    embed1 = get_embedding(text1)
    embed2 = get_embedding(text2)
    cos = torch.nn.CosineSimilarity(dim=1)
    similarity = cos(embed1, embed2)
    return f"{similarity.item():.2%} Similarity"


def report_issue(text1, text2, similarity):
    """Submit the two texts and the computed similarity to a Google Form."""
    url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
    data = {
        'entry.1041881480': text1,
        'entry.1520964719': text2,
        'entry.2094809206': similarity
    }
    response = requests.post(url, data=data, timeout=10)
    if response.status_code == 200:
        return "Report sent successfully!"
    else:
        return "Failed to send report."


def extract_chunks(text):
    # Define grammar for chunking.
    # NOTE: the original grammar was truncated after "NP: {"; the pattern below
    # (optional determiner, adjectives, then nouns) is an assumed placeholder,
    # not necessarily the original rule.
    grammar = r"""
        NP: {<DT>?<JJ>*<NN.*>+}
    """
    parser = RegexpParser(grammar)
    tagged = pos_tag(word_tokenize(text))
    tree = parser.parse(tagged)
    # Collect the words of each NP subtree as a phrase string.
    return [
        ' '.join(word for word, _ in subtree.leaves())
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')
    ]
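
# ---------------------------------------------------------------------------
# Usage sketch (assumption): the excerpt is truncated before any Gradio UI is
# defined, so the wiring below is a hypothetical minimal example of exposing
# calculate_similarity through gr.Interface, not the app's actual interface.
# ---------------------------------------------------------------------------
demo = gr.Interface(
    fn=calculate_similarity,
    inputs=[gr.Textbox(label="Text 1"), gr.Textbox(label="Text 2")],
    outputs=gr.Textbox(label="Similarity"),
    title="Sentence Similarity",
)

if __name__ == "__main__":
    demo.launch()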