# extractive.py
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import networkx as nx
import numpy as np
import torch

nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases split the punkt data; download both to be safe.
nltk.download('punkt_tab')


def preprocess_text(text):
    """Split raw text into a list of sentences."""
    sentences = sent_tokenize(text)
    return sentences


def get_sentence_embeddings(sentences, model, tokenizer):
    """Encode each sentence with the given transformer encoder and
    mean-pool the final hidden states into one vector per sentence."""
    embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            # Mean-pool over the token dimension: (1, seq, hidden) -> (1, hidden).
            sentence_embedding = torch.mean(outputs.last_hidden_state, dim=1)
            embeddings.append(sentence_embedding.squeeze().numpy())
    return np.array(embeddings)


def build_semantic_graph(embeddings, similarity_threshold=0.75):
    """Build an undirected graph whose nodes are sentence indices and whose
    edges connect pairs with cosine similarity above the threshold."""
    graph = nx.Graph()
    # Add every sentence as a node up front so sentences with no edge above
    # the threshold still exist in the graph and receive a PageRank score.
    graph.add_nodes_from(range(len(embeddings)))
    # Iterate over unordered pairs only; the graph is undirected, so
    # computing each similarity twice would be redundant.
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            similarity = np.dot(embeddings[i], embeddings[j]) / (
                np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))
            if similarity >= similarity_threshold:
                graph.add_edge(i, j, weight=similarity)
    return graph


def apply_textrank(graph, sentences, damping_factor=0.85, max_iter=100):
    """Score sentences with weighted PageRank and return (score, index)
    pairs sorted from most to least central."""
    num_nodes = len(sentences)
    personalization = {i: 1 / num_nodes for i in range(num_nodes)}
    # Pass the damping factor through as PageRank's alpha; the original
    # accepted the argument but never used it.
    scores = nx.pagerank(graph, alpha=damping_factor,
                         personalization=personalization, max_iter=max_iter)
    ranked_sentences = sorted(((score, idx) for idx, score in scores.items()),
                              reverse=True)
    return ranked_sentences


def generate_summary(ranked_sentences, sentences, max_length_ratio=0.5):
    """Greedily take the highest-ranked sentences (with English stop words
    stripped from the output) until the summary reaches max_length_ratio
    of the original word count."""
    stop_words = set(stopwords.words('english'))
    summary = []
    current_length = 0
    total_length = sum(len(sentence.split()) for sentence in sentences)
    max_length = int(total_length * max_length_ratio)
    for score, idx in ranked_sentences:
        sentence = sentences[idx]
        # The budget is charged at the full sentence length, even though
        # stop words are dropped from the emitted text.
        sentence_length = len(sentence.split())
        sentence_words = [word for word in sentence.split()
                          if word.lower() not in stop_words]
        if current_length + sentence_length <= max_length:
            summary.append(" ".join(sentence_words))
            current_length += sentence_length
        else:
            break
    return " ".join(summary)
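

# Example usage: a minimal end-to-end sketch. It assumes the Hugging Face
# `transformers` package and uses `bert-base-uncased` purely for illustration;
# any encoder that exposes `last_hidden_state` works with
# get_sentence_embeddings above. The sample text and the lowered similarity
# threshold are illustrative choices, not part of the module itself.
if __name__ == "____main__".replace("____", "__"):  # i.e. "__main__"
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
    model.eval()

    text = (
        "TextRank is a graph-based ranking algorithm for text. "
        "It builds a graph of sentences and scores them with PageRank. "
        "The highest-scoring sentences form an extractive summary. "
        "Sentences that share little content with the rest rank lower."
    )

    sentences = preprocess_text(text)
    embeddings = get_sentence_embeddings(sentences, model, tokenizer)
    # A lower threshold keeps this toy graph connected; the 0.75 default
    # is aimed at longer documents.
    graph = build_semantic_graph(embeddings, similarity_threshold=0.5)
    ranked = apply_textrank(graph, sentences)
    print(generate_summary(ranked, sentences))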