import re

import nltk
import openai
from nltk import sent_tokenize

nltk.download('punkt')


class SynonymEditor:
    def __init__(self, api_key, model_engine, max_tokens, temperature, language):
        openai.api_key = api_key
        self.model_engine = model_engine
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.language = language

    # Experiment with the prompt variants below to see their effect on output quality.
    # Note that the longer the prompt, the more tokens are used and hence the higher the billing.
    def _get_prompt(self, sentence, few_shots):
        # few_shots is either False or a string of example input/output pairs.
        if few_shots:
            if self.language == "de":
                # German: "Modernize the text. Keep the contents of the quotation marks "" unchanged."
                prompt = ('Modernisiere den Text. Behalte die Inhalte von Klammern "" bei.\n'
                          + few_shots + "\nEingang:" + sentence + " Ausgang:")
            else:
                prompt = ("Replace exactly one word with a synonym while preserving "
                          "the overall sentence structure and meaning.\n"
                          + few_shots + "\nInput:" + sentence + " Output:")
        elif "__QUOTE__" in sentence:
            if self.language == "de":
                prompt = 'Modernisiere den Text. Behalte die Inhalte von Klammern "" bei.\n' + sentence + '\n'
            else:
                prompt = ("Replace exactly one word with a synonym while preserving "
                          "__QUOTE__ in the following sentence:\n" + sentence + "\n")
        else:
            if self.language == "de":
                prompt = 'Modernisiere den Text. Behalte die Inhalte von Klammern "" bei.\n' + sentence + '\n'
            else:
                prompt = "Replace exactly one word with a synonym in the following sentence:\n" + sentence + "\n"
        return prompt

    # Call the OpenAI completions API (legacy openai<1.0 interface) for a single sentence.
    def _call_ai(self, sentence, few_shots):
        prompt = self._get_prompt(sentence, few_shots)
        print(prompt)  # Debug output: the prompt sent to the API.
        response = openai.Completion.create(
            model=self.model_engine,
            prompt=prompt,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return self._post_process_sentence(response.choices[0].text.strip())

    # Split the paragraph into sentences, masking quotation marks so they survive editing.
    def _split_into_sentences(self, text, few_shots):
        if not few_shots:
            text = text.replace('"', '__QUOTE__')
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        # Use the German Punkt model for German input so abbreviations split correctly.
        tokenize_language = 'german' if self.language == "de" else 'english'
        return sent_tokenize(text, language=tokenize_language)

    # Restore the quotation marks that were masked before tokenization.
    def _post_process_sentence(self, text):
        print(text)  # Debug output: the raw model response.
        print("==============")
        return text.replace('__QUOTE__', '"')

    # Preprocess the text, edit it sentence by sentence, and join it back into the original format.
    def _edit_text(self, text, few_shots=False):
        paragraphs = text.split("\n\n")
        edited_paragraphs = []
        for paragraph in paragraphs:
            sentences = self._split_into_sentences(paragraph, few_shots)
            edited_sentences = []
            for sentence in sentences:
                new_sentence = self._call_ai(sentence, few_shots)
                edited_sentences.append(new_sentence)
            # Join the edited sentences to form an edited paragraph.
            edited_paragraphs.append(' '.join(edited_sentences))
        # Join the edited paragraphs to form the edited text.
        return '\n\n'.join(edited_paragraphs)

    # Read the input file, edit its contents, and write the result to the output file.
    def edit_file(self, input_file, output_file):
        print("Opening File")
        with open(input_file, "r", encoding="utf8", errors="ignore") as f:
            text = f.read()
        print("Editing")
        edited_text = self._edit_text(text)
        print("Finishing up")
        with open(output_file, "w", encoding="utf8") as f:
            f.write(edited_text)
        print("Done!")
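
# A minimal usage sketch. Everything below is an assumption, not part of the
# original: the API key placeholder, the "text-davinci-003" model engine, the
# parameter values, and the file names are all hypothetical; adjust them to
# your own setup.
if __name__ == "__main__":
    editor = SynonymEditor(
        api_key="YOUR_OPENAI_API_KEY",    # hypothetical placeholder
        model_engine="text-davinci-003",  # assumed completions-capable model
        max_tokens=256,
        temperature=0.7,
        language="en",  # use "de" for the German modernization prompts
    )
    # Rewrites input.txt sentence by sentence and writes the result to output.txt.
    editor.edit_file("input.txt", "output.txt")
    # _edit_text also accepts a few-shot example string (format inferred from
    # _get_prompt, e.g. "Input: <sentence> Output: <edited sentence>"), but
    # edit_file currently always calls it with few_shots=False.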