repository_reader/output/scan_result_20241030_210745.txt
DeL-TaiseiOzaki
first commit
227e75d
# File path
Get_URL_list/get_url_list.py
------------
import json
import requests
from bs4 import BeautifulSoup
# Load URLs from JSON file
with open('ideabte_scraping/Get_URL_list/URL_json_output/debate_urls.json', 'r') as f:
    json_urls = json.load(f)

# Function to get sub-page URLs from a main theme URL
def get_debate_topic_urls(main_url):
    response = requests.get(main_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all links from the main URL page
    links = soup.find_all('a', href=True)
    # Filter for links that are debate topics
    topic_urls = [link['href'] for link in links if link['href'].startswith('/')]
    # Make URLs absolute
    full_urls = [f"https://idebate.net{url}" for url in topic_urls if "~b" in url]
    return full_urls

# Dictionary to store all debate topic URLs for each main theme URL
all_debate_topic_urls = {}
for theme_url in json_urls:
    theme_name = theme_url.split("/")[-2].replace("~", "_")
    all_debate_topic_urls[theme_name] = get_debate_topic_urls(theme_url)

# Output the results
with open('ideabte_scraping/Get_URL_list/output/debate_topic_urls.json', 'w') as f:
    json.dump(all_debate_topic_urls, f, indent=4)
print("Debate topic URLs have been saved to debate_topic_urls.json")
# File path
scraping_idebate/run_main.sh
------------
#!/bin/bash
# Set default paths
JSON_FILE="ideabte_scraping/Get_URL_list/output/debate_topic_urls.json"
OUTPUT_DIR="ideabte_scraping/scraping_idebate/output"
# Check if the JSON file exists
if [ ! -f "$JSON_FILE" ]; then
    echo "Error: JSON file '$JSON_FILE' does not exist."
    exit 1
fi
# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Run the Python script
python3 ideabte_scraping/scraping_idebate/src/scraping.py "$JSON_FILE" "$OUTPUT_DIR"
echo "Scraping completed. Output files are stored in $OUTPUT_DIR"
# File path
scraping_idebate/src/scraping.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse
def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
    points_list = []

    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_path}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    for category, urls in url_data.items():
        for url in urls:
            try:
                scrape_url(url, output_dir)
            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
# File path
scraping_idebate/src/scraping_test.py
------------
import requests
from bs4 import BeautifulSoup
url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"
# Fetch the web page
response = requests.get(url)
response.raise_for_status()  # Error check

# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Get the "Points For" div element
points_for_section = soup.find('div', class_='points-vote points-vote--for')

# Get the accordion items that contain the points
accordion_items = points_for_section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')

# Extract the text of each point
points = []
for item in accordion_items:
    point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
    point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
    points.append(f"**{point_subtitle}**\n{point_body}")

# Print the extracted points
for point in points:
    print(point)
    print("-" * 20)  # Separator line
# File path
scraping_idebate/src/scraping_tqdm.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse
from tqdm import tqdm
def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
    points_list = []

    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    return output_path


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    total_urls = sum(len(urls) for urls in url_data.values())

    with tqdm(total=total_urls, desc="Scraping Progress") as pbar:
        for category, urls in url_data.items():
            for url in urls:
                try:
                    output_path = scrape_url(url, output_dir)
                    pbar.set_postfix_str(f"Saved: {output_path}")
                    pbar.update(1)
                except Exception as e:
                    pbar.set_postfix_str(f"Error: {url}")
                    print(f"\nError scraping {url}: {str(e)}")
                    pbar.update(1)

    print("\nScraping completed. All data saved to the output directory.")