from typing import Dict, List, Optional, Tuple

import nltk
import spacy


class TextPreprocessor:
    """Prepare free text for the downstream pipeline.

    Calling an instance performs, in order:

    1. Coreference resolution over the whole input (``coref``).
    2. Sentence splitting with NLTK.
    3. Splitting of every sentence into a positive and a negative prompt
       at its first negation (``neg_prompt``).

    No embeddings are computed by this class; it only produces text.
    """

    def __init__(self):
        """Load the tokenizer data and the spaCy pipeline with coreference.

        Resources are downloaded only when they are not already available
        locally, so repeated construction does not hit the network.
        """
        # Sentence/word tokenizer data used by nltk.sent_tokenize and
        # nltk.word_tokenize; nltk.data.find raises LookupError if missing.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        # spacy.load raises OSError when the model package is not installed.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # NOTE(review): the "fastcoref" factory is not registered anywhere in
        # this file; presumably `fastcoref.spacy_component` is imported by
        # the caller before this class is instantiated - confirm.
        self.nlp.add_pipe(
            "fastcoref",
            config={
                'model_architecture': 'LingMessCoref',
                'model_path': 'biu-nlp/lingmess-coref',
                'device': 'cpu',
            },
        )

    def coref(self, text: Optional[str] = None) -> str:
        """Resolve coreferences in a paragraph.

        Parameters:
            text: the input paragraph whose coreference is to be resolved.
                Any falsy value (``None`` or ``""``) is replaced by the
                default: "Alice goes down the rabbit hole. Where she would
                discover a new reality beyond her expectations."

        Returns:
            The paragraph with coreferences resolved, as produced by the
            "fastcoref" component's ``resolved_text`` extension.
        """
        if not text:
            text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'

        doc = self.nlp(
            text,
            component_cfg={"fastcoref": {'resolve_text': True}},
        )
        # doc._.coref_clusters holds the cluster information if needed.
        return doc._.resolved_text

    def neg_prompt(self, string: str) -> Dict[str, str]:
        """Split a sentence into a positive and a negative prompt.

        The sentence is tokenized with NLTK. At the first "not" / "n't"
        token the sentence is cut: tokens before the word that precedes the
        negation form the positive prompt (that preceding word, e.g. the
        auxiliary in "does not", is dropped along with the negation itself)
        and every token after the negation forms the negative prompt.
        Without a negation the whole sentence is the positive prompt.

        Parameters:
            string: a single sentence.

        Returns:
            ``{'pos': <positive prompt>, 'neg': <negative prompt>}``. Both
            strings start with a space and each token in the loop is
            prefixed by a space, so results carry leading whitespace; an
            absent part is the single-space string ``" "``.
        """
        words = nltk.word_tokenize(string)
        if not words:
            return {'pos': " ", 'neg': " "}

        negations = ("n't", 'not')
        kept: List[str] = []
        for i, word in enumerate(words[:-1]):
            if words[i + 1].lower() in negations:
                # `word` (the token right before the negation) is
                # intentionally not kept, matching the historic behaviour.
                positive = " " + "".join(" " + w for w in kept)
                negative = " " + "".join(" " + w for w in words[i + 2:])
                return {'pos': positive, 'neg': negative}
            kept.append(word)

        last = words[-1]
        positive = " " + "".join(" " + w for w in kept)
        # Trailing punctuation is attached directly ("runs."), but a real
        # final word must be separated from the previous one; otherwise an
        # unpunctuated sentence would end in two glued words.
        if kept and any(ch.isalnum() for ch in last):
            positive += " "
        positive += last
        return {'pos': positive, 'neg': " "}

    def __call__(self, text: Optional[str]) -> Tuple[List[Dict[str, str]], List[str]]:
        """Run the full preprocessing on ``text``.

        Parameters:
            text: the input paragraph (see ``coref`` for the handling of
                falsy values).

        Returns:
            A tuple ``(processed_sentences, sentences)`` where ``sentences``
            is the list of coreference-resolved sentences and
            ``processed_sentences`` holds the ``neg_prompt`` result for each
            of them, in the same order.
        """
        coref_text = self.coref(text)
        sentences = nltk.sent_tokenize(coref_text)
        processed_sentences = [self.neg_prompt(sentence) for sentence in sentences]
        return processed_sentences, sentences