# File path
Get_URL_list/get_url_list.py
------------
import json

import requests
from bs4 import BeautifulSoup

# Load the list of main theme URLs from the JSON file
with open('ideabte_scraping/Get_URL_list/URL_json_output/debate_urls.json', 'r') as f:
    json_urls = json.load(f)

# Function to get sub-page (debate topic) URLs from a main theme URL
def get_debate_topic_urls(main_url):
    response = requests.get(main_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all links from the main URL page
    links = soup.find_all('a', href=True)
    # Filter for links that are debate topics (site-relative links)
    topic_urls = [link['href'] for link in links if link['href'].startswith('/')]
    # Make URLs absolute, keeping only debate topic pages (marked with "~b")
    full_urls = [f"https://idebate.net{url}" for url in topic_urls if "~b" in url]
    return full_urls

# Dictionary to store all debate topic URLs for each main theme URL
all_debate_topic_urls = {}
for theme_url in json_urls:
    # Derive a key from the last path segment (assumes the theme URL ends with a trailing slash)
    theme_name = theme_url.split("/")[-2].replace("~", "_")
    all_debate_topic_urls[theme_name] = get_debate_topic_urls(theme_url)

# Output the results
with open('ideabte_scraping/Get_URL_list/output/debate_topic_urls.json', 'w') as f:
    json.dump(all_debate_topic_urls, f, indent=4)

print("Debate topic URLs have been saved to debate_topic_urls.json")
# File path
scraping_idebate/run_main.sh
------------
#!/bin/bash

# Set default paths
JSON_FILE="ideabte_scraping/Get_URL_list/output/debate_topic_urls.json"
OUTPUT_DIR="ideabte_scraping/scraping_idebate/output"

# Check if the JSON file exists
if [ ! -f "$JSON_FILE" ]; then
    echo "Error: JSON file '$JSON_FILE' does not exist."
    exit 1
fi

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Run the Python script
python3 ideabte_scraping/scraping_idebate/src/scraping.py "$JSON_FILE" "$OUTPUT_DIR"

echo "Scraping completed. Output files are stored in $OUTPUT_DIR"
# File path
scraping_idebate/src/scraping.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
    points_list = []

    # Collect every point (subtitle + body) from the accordion that follows a section header
    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_path}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    for category, urls in url_data.items():
        for url in urls:
            try:
                scrape_url(url, output_dir)
            except Exception as e:
                print(f"Error scraping {url}: {str(e)}")
# File path
scraping_idebate/src/scraping_test.py
------------
import requests
from bs4 import BeautifulSoup

url = "https://idebate.net/this-house-would-make-all-museums-free-of-charge~b641/"

# Fetch the web page
response = requests.get(url)
response.raise_for_status()  # Error check

# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')

# Get the "Points For" div element
points_for_section = soup.find('div', class_='points-vote points-vote--for')

# Get the accordion elements that contain the points
accordion_items = points_for_section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')

# Extract the text of each point
points = []
for item in accordion_items:
    point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
    point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
    points.append(f"**{point_subtitle}**\n{point_body}")

# Print the extracted points
for point in points:
    print(point)
    print("-" * 20)  # Separator line
# File path
scraping_idebate/src/scraping_tqdm.py
------------
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from urllib.parse import urlparse
from tqdm import tqdm

def scrape_url(url, output_dir):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    topic = soup.find("h1", class_="blog-post__title").get_text(strip=True)
    points_list = []

    def extract_points(section, section_name):
        accordion_items = section.find_next_sibling('div', class_='accordion').find_all('div', class_='accordion__item')
        for item in accordion_items:
            point_subtitle = item.find('h4', class_='accordion__subtitle').get_text().strip()
            point_body = item.find('div', class_='accordion__body').find('p').get_text().strip()
            points_list.append({
                "topic": topic,
                "section": section_name,
                "context": f"**{point_subtitle}**\n{point_body}"
            })

    points_for_section = soup.find('div', class_='points-vote points-vote--for')
    if points_for_section:
        extract_points(points_for_section, "Points For")

    points_against_section = soup.find('div', class_='points-vote points-vote--against')
    if points_against_section:
        extract_points(points_against_section, "Points Against")

    # Generate a unique filename based on the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.path.strip('/').replace('/', '_')}.json"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(points_list, f, ensure_ascii=False, indent=4)

    return output_path

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file> <output_dir>")
        sys.exit(1)

    json_file = sys.argv[1]
    output_dir = sys.argv[2]

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file, 'r') as f:
        url_data = json.load(f)

    total_urls = sum(len(urls) for urls in url_data.values())

    with tqdm(total=total_urls, desc="Scraping Progress") as pbar:
        for category, urls in url_data.items():
            for url in urls:
                try:
                    output_path = scrape_url(url, output_dir)
                    pbar.set_postfix_str(f"Saved: {output_path}")
                    pbar.update(1)
                except Exception as e:
                    pbar.set_postfix_str(f"Error: {url}")
                    print(f"\nError scraping {url}: {str(e)}")
                    pbar.update(1)

    print("\nScraping completed. All data saved to the output directory.")