Videobook_v1.5 / Videobook /TextPreprocessor.py
Warlord-K's picture
Trying to get spacy to work
e3a5b4c
raw
history blame
2.06 kB
import nltk
import spacy
class TextPreprocessor:
    """
    Prepares raw text for the rest of the pipeline.

    Calling an instance resolves coreferences in the text, splits the result
    into sentences, and splits every sentence into a positive and a negative
    prompt around its first negation token.

    NOTE(review): no embedding step is implemented in this class; any CLIP
    embedding presumably happens downstream -- confirm against the caller.
    """

    # Tokens that NLTK's word tokenizer emits for a negation ("don't" -> "do", "n't").
    _NEGATION_TOKENS = ("n't", "not")

    # Sample paragraph used by coref() when the caller supplies no text.
    _DEFAULT_TEXT = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'

    def __init__(self):
        """
        Makes sure the NLTK tokenizer data and the spaCy model are available,
        then builds the spaCy pipeline with the fastcoref component on CPU.
        """
        # Only download when the resource is actually missing, so that creating
        # an instance does not trigger a download attempt every time.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # spacy.load raises OSError when the model package is not installed.
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # NOTE(review): the "fastcoref" factory has to be registered before this
        # call; fastcoref's documentation does so with
        # `from fastcoref import spacy_component`. That import is not visible in
        # this file -- confirm it happens somewhere before instantiation.
        self.nlp.add_pipe(
            "fastcoref",
            config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
        )

    def coref(self, text=None):
        """
        Does Coreference Resolution.

        Parameters:
            text: the input paragraph whose coreferences are to be resolved.
                A falsy value (None or "") falls back to a built-in sample
                paragraph about Alice.
        Returns:
            The coreference-resolved paragraph.
        """
        if not text:
            text = self._DEFAULT_TEXT
        doc = self.nlp(
            text,
            component_cfg={"fastcoref": {'resolve_text': True}}
        )
        # Check doc._.coref_clusters for cluster info
        return doc._.resolved_text

    @staticmethod
    def _split_on_negation(words):
        """
        Splits an already tokenized sentence into positive and negative prompts.

        The first negation token found at index 1 or later ends the positive
        part. The token directly before it (e.g. the "do" of "do n't") is
        dropped, and everything after it becomes the negative part. A negation
        token at index 0 is not treated as a split point.

        Parameters:
            words: list of string tokens.
        Returns:
            dict with keys 'pos' and 'neg'. Both values start with a blank and
            every token that is added is preceded by one more blank.
        """
        def glue(tokens):
            # Same layout the prompts always had: " " followed by " <token>" each.
            return " " + "".join(" " + token for token in tokens)

        for index in range(1, len(words)):
            if words[index].lower() in TextPreprocessor._NEGATION_TOKENS:
                return {'pos': glue(words[:index - 1]), 'neg': glue(words[index + 1:])}

        positive = glue(words[:-1])
        if words:
            last = words[-1]
            # Trailing punctuation stays attached to the previous word, but a
            # real word needs a separator so it does not fuse with it
            # ("runs fast" must not become "runsfast").
            if any(char.isalnum() for char in last):
                positive += " "
            positive += last
        return {'pos': positive, 'neg': " "}

    def neg_prompt(self, string: str):
        """
        Splits Text Into Positive and Negative Prompt.

        Parameters:
            string: a single sentence.
        Returns:
            dict with the positive prompt under 'pos' and the negative prompt
            under 'neg'.
        """
        return self._split_on_negation(nltk.word_tokenize(string))

    def __call__(self, text):
        """
        Runs the whole preprocessing on a paragraph.

        Parameters:
            text: the paragraph to process; passed to coref() unchanged.
        Returns:
            A tuple (processed_sentences, sentences): the list of
            {'pos', 'neg'} dicts, one per sentence, and the list of
            coreference-resolved sentences they were built from.
        """
        coref_text = self.coref(text)
        sentences = nltk.sent_tokenize(coref_text)
        processed_sentences = [self.neg_prompt(sentence) for sentence in sentences]
        return processed_sentences, sentences