arousrihab committed
Commit
22a5607
1 Parent(s): 2828552

Upload 3 files

Files changed (3)
  1. extractive.py +68 -0
  2. requirements.txt +6 -0
  3. utils.py +10 -0
extractive.py ADDED
@@ -0,0 +1,68 @@
+ # extractive.py
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import sent_tokenize
+ import networkx as nx
+ import numpy as np
+ import torch
+
+ nltk.download('stopwords')
+ nltk.download('punkt')
+
+ def preprocess_text(text):
+     # Split the input text into sentences.
+     sentences = sent_tokenize(text)
+     return sentences
+
+ def get_sentence_embeddings(sentences, model, tokenizer):
+     # Encode each sentence and mean-pool the last hidden state into one vector.
+     embeddings = []
+     with torch.no_grad():
+         for sentence in sentences:
+             inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
+             outputs = model(**inputs)
+             sentence_embedding = torch.mean(outputs.last_hidden_state, dim=1)
+             embeddings.append(sentence_embedding.squeeze().numpy())
+     return np.array(embeddings)
+
+ def build_semantic_graph(embeddings, similarity_threshold=0.75):
+     # Connect sentence pairs whose cosine similarity meets the threshold.
+     graph = nx.Graph()
+     # Add every sentence as a node so isolated sentences still receive a score.
+     graph.add_nodes_from(range(len(embeddings)))
+     for i, emb1 in enumerate(embeddings):
+         for j, emb2 in enumerate(embeddings):
+             if i != j:
+                 similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
+                 if similarity >= similarity_threshold:
+                     graph.add_edge(i, j, weight=similarity)
+     return graph
+
+ def apply_textrank(graph, sentences, damping_factor=0.85, max_iter=100):
+     # Score sentences with PageRank; damping_factor is passed through as alpha.
+     num_nodes = len(sentences)
+     personalization = {i: 1 / num_nodes for i in range(num_nodes)}
+     scores = nx.pagerank(graph, alpha=damping_factor, personalization=personalization, max_iter=max_iter)
+     ranked_sentences = sorted(((score, idx) for idx, score in scores.items()), reverse=True)
+     return ranked_sentences
+
+ def generate_summary(ranked_sentences, sentences, max_length_ratio=0.5):
+     # Greedily take top-ranked sentences (with stopwords stripped) until the word budget is spent.
+     stop_words = set(stopwords.words('english'))
+     summary = []
+     current_length = 0
+     total_length = sum(len(sentence.split()) for sentence in sentences)
+     max_length = int(total_length * max_length_ratio)
+
+     for score, idx in ranked_sentences:
+         sentence = sentences[idx]
+         sentence_length = len(sentence.split())
+         sentence_words = [word for word in sentence.split() if word.lower() not in stop_words]
+
+         if current_length + sentence_length <= max_length:
+             summary.append(" ".join(sentence_words))
+             current_length += sentence_length
+         else:
+             break
+
+     return " ".join(summary)
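The file defines each stage but never wires them together. Here is a minimal usage sketch, assuming a generic Hugging Face encoder; bert-base-uncased is an assumed checkpoint, not necessarily the one this Space actually uses:

# usage sketch (illustrative, not part of the commit)
from transformers import AutoModel, AutoTokenizer
from extractive import (preprocess_text, get_sentence_embeddings,
                        build_semantic_graph, apply_textrank, generate_summary)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
model = AutoModel.from_pretrained("bert-base-uncased")

text = "Some long document to summarize. It has several sentences. ..."
sentences = preprocess_text(text)
embeddings = get_sentence_embeddings(sentences, model, tokenizer)
graph = build_semantic_graph(embeddings, similarity_threshold=0.75)
ranked = apply_textrank(graph, sentences)
print(generate_summary(ranked, sentences, max_length_ratio=0.5))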
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ spacy
+ torch
+ transformers
+ nltk
+ networkx
utils.py ADDED
@@ -0,0 +1,10 @@
+ # utils.py
+ import spacy
+
+ nlp = spacy.load("en_core_sci_lg")
+
+ def extract_named_entities(text):
+     # Return the surface text of each entity found by the scispacy model.
+     doc = nlp(text)
+     entities = [ent.text for ent in doc.ents]
+     return entities
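One caveat and a quick usage sketch: en_core_sci_lg is a scispacy model, so the scispacy package and the model itself must be installed from scispacy's releases; the spacy entry in requirements.txt alone will not provide it.

# illustrative call, assuming en_core_sci_lg is installed
from utils import extract_named_entities

print(extract_named_entities("Aspirin reduces the risk of myocardial infarction."))
# e.g. ['Aspirin', 'myocardial infarction'] (illustrative output)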