Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -27,7 +27,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
27 |
return cosine_similarity([embed1], [embed2])[0][0]
|
28 |
|
29 |
def create_lda_model(texts, stopwords):
|
30 |
-
vectorizer = CountVectorizer(
|
31 |
doc_term_matrix = vectorizer.fit_transform(texts)
|
32 |
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
|
33 |
lda.fit(doc_term_matrix)
|
@@ -49,12 +49,17 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
49 |
unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
|
50 |
lexical_diversity = len(unique_words) / len(words) if words else 0
|
51 |
|
52 |
-
# Combine factors
|
53 |
-
importance = (0.
|
54 |
return importance
|
55 |
|
56 |
# Split the text into sentences
|
57 |
sentences = sent_tokenize(full_text)
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
text_lang = detect_language(full_text)
|
60 |
|
@@ -91,7 +96,6 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
91 |
|
92 |
return ' '.join(compressed_text)
|
93 |
|
94 |
-
|
95 |
async def predict(text, word_reduction_factor):
|
96 |
if len(text.split()) > 5000:
|
97 |
return "Text is too long for this demo. Please provide a text with less than 5000 words."
|
|
|
27 |
return cosine_similarity([embed1], [embed2])[0][0]
|
28 |
|
29 |
def create_lda_model(texts, stopwords):
|
30 |
+
vectorizer = CountVectorizer(stop_words=stopwords)
|
31 |
doc_term_matrix = vectorizer.fit_transform(texts)
|
32 |
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
|
33 |
lda.fit(doc_term_matrix)
|
|
|
49 |
unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
|
50 |
lexical_diversity = len(unique_words) / len(words) if words else 0
|
51 |
|
52 |
+
# Combine factors
|
53 |
+
importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
|
54 |
return importance
|
55 |
|
56 |
# Split the text into sentences
|
57 |
sentences = sent_tokenize(full_text)
|
58 |
+
final_sentences = []
|
59 |
+
for s in sentences:
|
60 |
+
broken_sentences = s.split('\n')
|
61 |
+
final_sentences.extend(broken_sentences)
|
62 |
+
sentences = final_sentences
|
63 |
|
64 |
text_lang = detect_language(full_text)
|
65 |
|
|
|
96 |
|
97 |
return ' '.join(compressed_text)
|
98 |
|
|
|
99 |
async def predict(text, word_reduction_factor):
|
100 |
if len(text.split()) > 5000:
|
101 |
return "Text is too long for this demo. Please provide a text with less than 5000 words."
|