ToS-Summarization / keyphrase_extraction.py
EE21's picture
Update keyphrase_extraction.py
210facd
raw
history blame
1.13 kB
from rake_nltk import Rake
import re
# Words and phrases that signal an obligation or permission in legal text.
# extract_sentences_with_obligations() compares each lower-cased keyphrase
# against these entries, so every entry is written in lower case.
obligation_words = [
    "must",
    "will",
    "use",
    "may",
    "provides",
    "is obliged to",
    "has to",
    "needs to",
    "is required to",
    "shall",
    "should",
    "ought to",
    "required",
    "obligated",
    "duty",
]
def extract_sentences_with_obligations(text):
    """Return the sentences of *text* whose keyphrases mention an obligation.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Each sentence is run through RAKE, and the sentence is kept when
    any of its ranked keyphrases contains an entry of ``obligation_words`` as
    a whole word or phrase (so "use" does not match "user" or "because").

    Args:
        text: The document to scan, as a single string.

    Returns:
        A list of the matching sentences, in their original order. The list
        is empty when nothing matches.
    """
    # An empty alternation would match at every word boundary, so bail out
    # early instead of flagging every sentence.
    if not obligation_words:
        return []

    # NOTE(review): Rake() is created with its default settings, which
    # presumably means rake_nltk's default stopword list is applied rather
    # than "keeping all words" -- confirm this is the intended configuration.
    rake = Rake()

    # Whole-word matcher built once per call; re.escape keeps any regex
    # metacharacters in the configured phrases literal.
    obligation_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in obligation_words) + r")\b",
        re.IGNORECASE,
    )

    # Split after sentence-ending punctuation, keeping the punctuation
    # attached to the sentence it terminates.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    obligation_sentences = []
    for sentence in sentences:
        # Rake keeps state between calls; re-extracting replaces the
        # keyphrases of the previous sentence with those of this one.
        rake.extract_keywords_from_text(sentence)
        ranked_keyphrases = rake.get_ranked_phrases()
        if any(obligation_pattern.search(kp) for kp in ranked_keyphrases):
            obligation_sentences.append(sentence)
    return obligation_sentences