from flask import Flask, request, render_template, jsonify
import pandas as pd
import requests
import os
import re
import networkx as nx
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
matplotlib.use("Agg")  # non-GUI backend so plotting works inside a headless server process
import matplotlib.pyplot as plt
import nltk
# Download the NLTK resources needed for tokenization and stopword removal
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
# Initialize the Flask application
app = Flask(__name__)
# Scrape the article title and body text from a news URL
def scrape_news(url):
    isi = []
    judul = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        article_full = BeautifulSoup(response.content, "html.parser")
        # NOTE: the CSS classes below are specific to the target news site's
        # markup and will need adjusting for other sites
        judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold")
        if judul_artikel:
            judul_artikel = judul_artikel.text.strip()
        else:
            judul_artikel = "Judul tidak ditemukan"  # "Title not found"
        artikel_element = article_full.find("div", class_="detail-text")
        if artikel_element:
            artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")]
            artikel_content = "\n".join(artikel_teks)
        else:
            artikel_content = "Konten artikel tidak ditemukan"  # "Article content not found"
        isi.append(artikel_content)
        judul.append(judul_artikel)
    except requests.exceptions.RequestException as e:
        judul.append("Error")
        isi.append(f"Gagal mengambil data: {e}")  # "Failed to fetch data"
    return pd.DataFrame({"judul": judul, "isi": isi})
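# Example (hypothetical URL): scrape_news("https://news.example.com/artikel-123")
# returns a one-row DataFrame with columns "judul" (title) and "isi" (body),
# or an "Error" row when the request fails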
# Preprocess text: lowercase, strip digits and punctuation, remove stopwords
def preprocess_text(content):
    content = content.lower()
    # keep only lowercase letters, spaces, and periods (periods are preserved
    # so the text can still be sentence-tokenized afterwards)
    content = re.sub(r"[0-9]|[/(){}\[\]\|@,;_]|[^a-z .]+", " ", content)
    content = re.sub(r"\s+", " ", content).strip()
    tokens = word_tokenize(content)
    stopword = set(stopwords.words("indonesian"))
    tokens = [word for word in tokens if word not in stopword]
    return " ".join(tokens)
# Build the summary and the sentence-similarity graph visualization
def summarize_and_visualize(content):
    kalimat = sent_tokenize(content)
    # Preprocess each sentence individually so the preprocessed list stays
    # aligned index-for-index with `kalimat`; sentence-tokenizing the whole
    # preprocessed text can merge sentences (preprocessing strips "!" and "?"
    # endings) and misalign the summary lookup below
    kalimat_preprocessing = [preprocess_text(k) for k in kalimat]
    # TF-IDF and cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(kalimat_preprocessing)
    cossim_prep = cosine_similarity(tfidf_matrix, tfidf_matrix)
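    # cossim_prep[i][j] is the cosine similarity between the TF-IDF vectors of
    # sentences i and j: 0 means no shared terms, 1 means identical term weights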
    # Network analysis with NetworkX: sentences are nodes, edges connect
    # sufficiently similar sentence pairs
    G = nx.DiGraph()
    for i in range(len(cossim_prep)):
        G.add_node(i)
        for j in range(len(cossim_prep)):
            if cossim_prep[i][j] > 0.1 and i != j:
                G.add_edge(i, j)
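    # 0.1 is the similarity cutoff: raising it gives a sparser graph (fewer
    # edges), lowering it a denser one; pairs at or below the cutoff are
    # treated as unrelated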
    # Compute closeness centrality and build the summary from the three
    # most central sentences
    closeness_scores = nx.closeness_centrality(G)
    sorted_closeness = sorted(closeness_scores.items(), key=lambda x: x[1], reverse=True)
    ringkasan = " ".join(kalimat[node] for node, _ in sorted_closeness[:3])
    # Visualize the graph
    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G, k=2)
    nx.draw_networkx_nodes(G, pos, node_size=500, node_color="b")
    nx.draw_networkx_edges(G, pos, edge_color="red", arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=10)
    plt.title("Graph Representation of Sentence Similarity")
    # Remove any previous graph.png so the new figure replaces it
    graph_path = "static/graph.png"
    os.makedirs("static", exist_ok=True)  # make sure the static directory exists
    if os.path.exists(graph_path):
        os.remove(graph_path)
    # Save the graph as a new file
    plt.savefig(graph_path)
    plt.close()
    return ringkasan
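# The function above is graph-based extractive summarization: the three
# sentences most central to the similarity graph (by closeness centrality)
# are taken verbatim as the summary, a centrality-based variant of
# TextRank-style methods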
# Main route for scraping and analysis
@app.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        url = request.form.get("url")
        if url:
            # Scrape the news article
            df = scrape_news(url)
            # scrape_news always returns one row; treat the "Error" marker
            # as a failed fetch rather than rendering it as an article
            if not df.empty and df["judul"].iloc[0] != "Error":
                content = df["isi"].iloc[0]
                title = df["judul"].iloc[0]
                # Preprocess, summarize, and visualize
                ringkasan = summarize_and_visualize(content)
                return render_template("result.html", title=title, content=content, summary=ringkasan, graph_url="static/graph.png")
            else:
                # error message: "Failed to fetch data from the URL."
                return render_template("summary.html", error="Gagal mengambil data dari URL.")
        else:
            # error message: "URL must not be empty."
            return render_template("summary.html", error="URL tidak boleh kosong.")
    return render_template("summary.html")
# Run the Flask application
if __name__ == "__main__":
    app.run(debug=True, port=5002)
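# Once the server is running, the route can be exercised from a browser form
# or the command line, e.g. (hypothetical article URL):
#   curl -X POST -d "url=https://news.example.com/artikel-123" http://localhost:5002/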