import gradio as gr
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
from sentence_transformers import SentenceTransformer, util
import random
import re
import nltk
from nltk.tokenize import sent_tokenize
import warnings
from transformers import logging
import os
import tensorflow as tf
import requests

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore")
logging.set_verbosity_error()
tf.get_logger().setLevel('ERROR')
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases also need this resource for sent_tokenize
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # read the key from the environment rather than hard-coding a secret
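# TextEnhancer wires together three models: a T5 paraphraser (prithivida/parrot_paraphraser_on_T5),
# a grammar-correction pipeline (Grammarly/coedit-large), and a sentence-similarity model
# (paraphrase-MiniLM-L6-v2) used to filter out paraphrases that drift from the original meaning.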
class TextEnhancer:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(self.device)
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device)
        print("paraphraser loaded")
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1
        )
        print("grammar check loaded")
        self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
        print("semantics model loaded")
    def _evaluate_with_groq(self, passage=""):
        if not passage:
            raise ValueError("Input passage cannot be empty.")
        # Groq API setup
        headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",  # set the GROQ_API_KEY environment variable before launching
            "Content-Type": "application/json"
        }
        payload = {
            "model": "llama3-70b-8192",
            "messages": [
                {
                    "role": "system",
                    "content": "Paraphrase this sentence to better suit it as an introductory sentence to a student's Statement of Purpose. Ensure that the vocabulary and grammar are up to par. ONLY return the raw paraphrased sentence and nothing else. If it is an empty string, return an empty string."
                },
                {
                    "role": "user",
                    "content": f"Here is the passage: {passage}"
                }
            ],
            "temperature": 1.0,
            "max_tokens": 8192
        }
        # Send the request to the Groq API
        print("Sending request to Groq API...")
        response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers, timeout=60)
        print("Response received.")
        # Handle the response
        if response.status_code == 200:
            data = response.json()
            try:
                segmented_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                print("sentence paraphrase processed successfully.")
                print(segmented_text)
                return segmented_text
            except (IndexError, KeyError) as e:
                raise ValueError(f"Unexpected response structure from Groq API. Error: {str(e)}")
        else:
            raise ValueError(f"Groq API error: {response.status_code}, {response.text}")
    def _correct_formatting(self, sentence):
        cleaned_sentence = re.sub(r'([.,!?])\1+', r'\1', sentence)
        cleaned_sentence = cleaned_sentence.strip()
        return cleaned_sentence
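    # Per-sentence pipeline: generate beam-search paraphrases, keep those whose cosine
    # similarity to the original is at least min_similarity, optionally rewrite the first
    # and last sentences via Groq, then apply grammar correction and light "humanizing".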
    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
        sent = 0
        enhanced_sentences = []
        sentences = sent_tokenize(text)
        total_words = sum(len(sentence.split()) for sentence in sentences)
        print(f"input words: {total_words}")
        for sentence in sentences:
            if not sentence.strip():
                continue
            sent += 1
            inputs = self.paraphrase_tokenizer(
                f"paraphrase: {sentence}",
                return_tensors="pt",
                padding=True,
                max_length=150,
                truncation=True
            ).to(self.device)
            outputs = self.paraphrase_model.generate(
                **inputs,
                max_length=len(sentence.split()) + 20,
                num_return_sequences=max_variations,
                num_beams=max_variations,
                temperature=0.7
            )
            paraphrases = [
                self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
                for output in outputs
            ]
            sentence_embedding = self.similarity_model.encode(sentence)
            paraphrase_embeddings = self.similarity_model.encode(paraphrases)
            similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
            valid_paraphrases = [
                para for para, sim in zip(paraphrases, similarities[0])
                if sim >= min_similarity
            ]
            # Only the first and last sentences get the extra Groq rewrite.
            if sent in {1, len(sentences)} and valid_paraphrases:
                groq_feedback = self._evaluate_with_groq(valid_paraphrases[0])
                if groq_feedback.strip():
                    valid_paraphrases[0] = groq_feedback.strip()
            if valid_paraphrases:
                corrected = self.grammar_pipeline(
                    valid_paraphrases[0],
                    max_length=150,
                    num_return_sequences=1
                )[0]["generated_text"]
                corrected = self._humanize_text(corrected)
                corrected = self._correct_formatting(corrected)
                enhanced_sentences.append(corrected)
            else:
                sentence = self._correct_formatting(sentence)
                enhanced_sentences.append(sentence)
        enhanced_text = ". ".join(sentence.rstrip(".") for sentence in enhanced_sentences) + "."
        return enhanced_text
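    # Applies small random edits (expanding the odd contraction, adding a comma before
    # "and", swapping "is" for "happens to be") so the output reads a little less templated.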
    def _humanize_text(self, text):
        contractions = {"can't": "cannot", "won't": "will not", "I'm": "I am", "it's": "it is"}
        words = text.split()
        # Expand a contraction roughly 10% of the time.
        text = " ".join([contractions.get(word, word) if random.random() > 0.9 else word for word in words])
        if random.random() > 0.7:
            text = text.replace(" and ", ", and ")
        # Minor variations in sentence structure
        if random.random() > 0.5:
            text = text.replace(" is ", " happens to be ")
        return text
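# Builds the Gradio Blocks UI: a header, an input box for the SoP, a "Paraphrase" button,
# and an output box wired to process_text.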
def create_interface():
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold=75):
        # similarity_threshold is expressed as a percentage (0-100).
        try:
            enhanced = enhancer.enhance_text(
                text,
                min_similarity=similarity_threshold / 100,
                max_variations=10
            )
            print("grammar enhanced")
            return enhanced
        except Exception as e:
            return f"Error: {str(e)}"
    interface = gr.Blocks()
    with interface:
        with gr.Row(elem_id="header", variant="panel"):
            gr.HTML("""
                <div style="display: flex; align-items: center; justify-content: center; gap: 10px; margin-bottom: 20px;">
                    <img src="https://raw.githubusercontent.com/juicjaane/blueai/main/logo_2.jpg" style="width: 50px; height: 50px;">
                    <h1 style="color: gold; font-size: 2em; margin: 0;">Konect U</h1>
                </div>
            """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Your SoP")
                input_text = gr.Textbox(label="Input", placeholder="Enter SoP to Paraphrase...", lines=10)
                submit_button = gr.Button("Paraphrase")
            with gr.Column(scale=1):
                gr.Markdown("### Paraphrased SoP")
                enhanced_text = gr.Textbox(label="SoP", lines=10)
        submit_button.click(process_text, inputs=[input_text], outputs=enhanced_text)
    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=True)
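# The imports above assume gradio, torch, transformers, sentence-transformers, nltk,
# tensorflow, and requests are installed (on Spaces this is usually handled by a requirements.txt).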