# File path: Get_URL_list/get_url_list.py ------------
import json
import requests
from bs4 import BeautifulSoup

# Load the list of main theme URLs from the JSON file
with open('ideabte_scraping/Get_URL_list/URL_json_output/debate_urls.json', 'r') as f:
    json_urls = json.load(f)

# Function to get sub-page (debate topic) URLs from a main theme URL
def get_debate_topic_urls(main_url):
    response = requests.get(main_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all links from the main URL page
    links = soup.find_all('a', href=True)

    # Filter for links that are debate topics (site-relative paths)
    topic_urls = [link['href'] for link in links if link['href'].startswith('/')]

    # Make URLs absolute; only topic pages contain "~b" in their path
    full_urls = [f"https://idebate.net{url}" for url in topic_urls if "~b" in url]

    return full_urls

# Dictionary to store all debate topic URLs for each main theme URL
all_debate_topic_urls = {}

for theme_url in json_urls:
    theme_name = theme_url.split("/")[-2].replace("~", "_")
    all_debate_topic_urls[theme_name] = get_debate_topic_urls(theme_url)

# Output the results
with open('ideabte_scraping/Get_URL_list/output/debate_topic_urls.json', 'w') as f:
    json.dump(all_debate_topic_urls, f, indent=4)

print("Debate topic URLs have been saved to debate_topic_urls.json")


# File path: scraping_idebate/run_main.sh ------------
#!/bin/bash

# Set default paths
JSON_FILE="ideabte_scraping/Get_URL_list/output/debate_topic_urls.json"
OUTPUT_DIR="ideabte_scraping/scraping_idebate/output"

# Check if the JSON file exists
if [ ! -f "$JSON_FILE" ]; then
    echo "Error: JSON file '$JSON_FILE' does not exist."
    exit 1
fi

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Run the Python script
python3 ideabte_scraping/scraping_idebate/src/scraping.py "$JSON_FILE" "$OUTPUT_DIR"

echo "Scraping completed. Output files are stored in $OUTPUT_DIR"


# File path: scraping_idebate/src/scraping.py ------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Debate topic title
    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)

    points_list = []

    # Collect every point in the accordion that follows the given section
    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_path}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    for category, urls in url_data.items():
        for url in urls:
            try:
                scrape_url(url, output_dir)
            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
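
For reference, scraping.py consumes the JSON written by get_url_list.py (a mapping from theme name to a list of topic URLs) and writes one JSON file per topic containing a flat list of point objects. Below is a minimal sketch of both shapes; the theme key, topic title, and point text are hypothetical placeholders, and the sample URL is the one used in scraping_test.py further down.

# Sketch of the data shapes handled by scraping.py (placeholder values only).
# Input: debate_topic_urls.json, as produced by get_url_list.py
example_input = {
    "example_theme": [  # hypothetical theme key
        "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"
    ]
}

# Output: one <url-path>.json file per topic, as written by scrape_url()
example_output = [
    {
        "topic": "<debate topic title>",
        "section": "Points For",  # or "Points Against"
        "context": "**<point subtitle>**\n<point body>"
    }
]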

# File path: scraping_idebate/src/scraping_test.py ------------
import requests
from bs4 import BeautifulSoup

url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"

# Fetch the web page
response = requests.get(url)
response.raise_for_status()  # Error check

# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Get the "Points For" div element
points_for_section = soup.find('div', class_='points-vote points-vote--for')

# Get the accordion items that contain the points
accordion_items = points_for_section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')

# Extract the text of each point
points = []
for item in accordion_items:
    point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
    point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
    points.append(f"**{point_subtitle}**\n{point_body}")

# Print the extracted points
for point in points:
    print(point)
    print("-" * 20)  # Separator line


# File path: scraping_idebate/src/scraping_tqdm.py ------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse
from tqdm import tqdm

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Debate topic title
    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)

    points_list = []

    # Collect every point in the accordion that follows the given section
    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    return output_path

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    total_urls = sum(len(urls) for urls in url_data.values())

    with tqdm(total=total_urls, desc="Scraping Progress") as pbar:
        for category, urls in url_data.items():
            for url in urls:
                try:
                    output_path = scrape_url(url, output_dir)
                    pbar.set_postfix_str(f"Saved: {output_path}")
                    pbar.update(1)
                except Exception as e:
                    pbar.set_postfix_str(f"Error: {url}")
                    print(f"\nError scraping {url}: {str(e)}")
                    pbar.update(1)

    print("\nScraping completed. All data saved to the output directory.")
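
For a quick single-page check without first generating debate_topic_urls.json, scrape_url can also be called directly. This is a minimal sketch, assuming it is run from scraping_idebate/src so that scraping.py is importable; the output directory name is an arbitrary example and the URL is the sample from scraping_test.py.

# Smoke test: scrape one topic page directly (assumes the working directory is scraping_idebate/src).
import os
from scraping import scrape_url

sample_url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"
output_dir = "smoke_test_output"  # hypothetical directory for this one-off check
os.makedirs(output_dir, exist_ok=True)
scrape_url(sample_url, output_dir)  # prints "Data saved to ..." and writes one JSON file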