from flask import Flask, request, render_template, url_for
import pandas as pd
import requests
import os
import re
import networkx as nx
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
matplotlib.use("Agg")  # non-interactive backend, required when plotting inside a web server
import matplotlib.pyplot as plt
import nltk

# Download the NLTK resources needed for tokenization and stopword removal
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# Initialize the Flask application
app = Flask(__name__)

# Scrape one news article: extract the headline and body text.
# The CSS classes below are specific to a single news site's markup
# and will need adjusting for other sources.
def scrape_news(url):
    isi = []
    judul = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        article_full = BeautifulSoup(response.content, "html.parser")
        # Headline, with a fallback when the expected element is missing
        judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold")
        if judul_artikel:
            judul_artikel = judul_artikel.text.strip()
        else:
            judul_artikel = "Title not found"
        # Body text: join every paragraph inside the article container
        artikel_element = article_full.find("div", class_="detail-text")
        if artikel_element:
            artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")]
            artikel_content = "\n".join(artikel_teks)
        else:
            artikel_content = "Article content not found"
        isi.append(artikel_content)
        judul.append(judul_artikel)
    except requests.exceptions.RequestException as e:
        judul.append("Error")
        isi.append(f"Failed to fetch data: {e}")

    return pd.DataFrame({"judul": judul, "isi": isi})
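
# A minimal usage sketch (hypothetical URL; the selectors above match only
# the one news site this scraper was written for):
#
#   df = scrape_news("https://example.com/news/some-article")
#   print(df["judul"].iloc[0])  # headline, or "Error" on failure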

# Normalize text for similarity scoring: lowercase it, keep only letters,
# spaces, and periods (periods preserve sentence boundaries), then drop
# Indonesian stopwords.
def preprocess_text(content):
    content = content.lower()
    # Keep only a-z, spaces, and periods; everything else becomes a space
    content = re.sub(r"[^a-z .]+", " ", content)
    content = re.sub(r"\s+", " ", content).strip()
    tokens = word_tokenize(content)
    stopword = set(stopwords.words("indonesian"))
    tokens = [word for word in tokens if word not in stopword]
    return " ".join(tokens)
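
# Rough illustration of the function above (assuming NLTK's Indonesian
# stopword list contains common function words such as "yang" and "di"):
#
#   preprocess_text("Harga BBM naik di Jakarta, yang memicu protes.")
#   -> "harga bbm naik jakarta memicu protes ."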

# Build an extractive summary: rank sentences by closeness centrality on a
# TF-IDF cosine-similarity graph, and render that graph to static/graph.png.
def summarize_and_visualize(content):
    kalimat = sent_tokenize(content)
    # Preprocess sentence by sentence so graph node indices stay aligned
    # with the original sentences (re-tokenizing the preprocessed text as
    # a whole can yield a different sentence count)
    kalimat_preprocessing = [preprocess_text(k) for k in kalimat]

    # TF-IDF vectors and pairwise cosine similarity between sentences
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(kalimat_preprocessing)
    cossim_prep = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Build a directed graph with NetworkX: connect sentence pairs whose similarity exceeds 0.1
    G = nx.DiGraph()
    for i in range(len(cossim_prep)):
        G.add_node(i)
        for j in range(len(cossim_prep)):
            if cossim_prep[i][j] > 0.1 and i != j:
                G.add_edge(i, j)

    # Rank sentences by closeness centrality; the three most central ones
    # form the summary, re-sorted into document order for readability
    closeness_scores = nx.closeness_centrality(G)
    sorted_closeness = sorted(closeness_scores.items(), key=lambda x: x[1], reverse=True)
    top_nodes = sorted(node for node, _ in sorted_closeness[:3])
    ringkasan = " ".join(kalimat[node] for node in top_nodes)

    # Render the similarity graph
    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G, k=2)
    nx.draw_networkx_nodes(G, pos, node_size=500, node_color="b")
    nx.draw_networkx_edges(G, pos, edge_color="red", arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=10)
    plt.title("Graph Representation of Sentence Similarity")
    # Ensure the static/ directory exists, then clear out any previous render
    graph_path = "static/graph.png"
    os.makedirs("static", exist_ok=True)
    if os.path.exists(graph_path):
        os.remove(graph_path)

    # Save the graph as a fresh file
    plt.savefig(graph_path)
    plt.close()

    return ringkasan
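
# Note: ranking by closeness centrality is one design choice; classic
# TextRank uses PageRank over the same similarity graph. With networkx
# that is a one-line swap (a sketch, not wired into the app):
#
#   scores = nx.pagerank(G)  # instead of nx.closeness_centrality(G)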

# Main route: accept a URL via the form, scrape it, and show the summary
@app.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        url = request.form.get("url")
        if url:
            # Scrape the article
            df = scrape_news(url)
            # scrape_news reports failures as a row titled "Error", so
            # check for that marker as well as for an empty frame
            if not df.empty and df["judul"].iloc[0] != "Error":
                content = df["isi"].iloc[0]
                title = df["judul"].iloc[0]

                # Preprocessing, summarizing, and visualizing
                ringkasan = summarize_and_visualize(content)
                return render_template(
                    "result.html",
                    title=title,
                    content=content,
                    summary=ringkasan,
                    graph_url=url_for("static", filename="graph.png"),
                )
            else:
                return render_template("summary.html", error="Failed to fetch data from the URL.")
        else:
            return render_template("summary.html", error="URL must not be empty.")
    return render_template("summary.html")

# Run the Flask development server
if __name__ == "__main__":
    app.run(debug=True, port=5002)
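
# To try it locally (assumes templates/summary.html and templates/result.html
# exist next to this file; "app.py" is an assumed filename):
#
#   $ python app.py
#   then open http://127.0.0.1:5002/ and submit a news-article URL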