markqiu's picture
Upload folder using huggingface_hub
cd36062
import concurrent.futures
import logging
import re

import requests
from bs4 import BeautifulSoup

import extensions.superboogav2.parameters as parameters

from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Fetch ``url`` with ``requests.get`` and return the response content.

    The request is made with ``timeout=5``. Any status code other than 200
    is treated as a failure and raises ``Exception``.
    """
    reply = requests.get(url, timeout=5)
    if reply.status_code != 200:
        raise Exception("Failed to download URL")
    return reply.content
def _download_urls(urls, threads=1):
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
futures = []
for url in urls:
future = executor.submit(_download_single, url)
futures.append(future)
results = []
i = 0
for future in concurrent.futures.as_completed(futures):
try:
result = future.result()
results.append(result)
i += 1
yield f"{i}/{len(urls)}", results
except Exception:
pass
yield "Done", results
def feed_url_into_collector(urls, collector):
    """Download a newline-separated list of URLs and index their text.

    This is a generator that yields status strings while it works. Each
    downloaded page is parsed with BeautifulSoup, its ``<script>`` and
    ``<style>`` elements are removed, and the remaining text of all pages
    is handed to ``process_and_add_to_collector`` in a single call.

    Args:
        urls: String with one URL per line. Blank lines and whitespace
            around each URL are ignored.
        collector: Passed through to ``process_and_add_to_collector``.

    Yields:
        The cumulative status text so far.
    """
    # Drop blank lines and surrounding whitespace so they are neither
    # counted in the status message nor requested as URLs.
    url_list = [line.strip() for line in urls.splitlines() if line.strip()]
    num_threads = parameters.get_num_threads()

    cumulative = f'Loading {len(url_list)} URLs with {num_threads} threads...\n\n'
    yield cumulative

    # The last value yielded by _download_urls carries the full result list.
    contents = []
    for update, contents in _download_urls(url_list, threads=num_threads):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative

    strong_cleanup = parameters.get_is_strong_cleanup()
    page_texts = []
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")
        for tag in soup(["script", "style"]):
            tag.extract()

        strings = soup.stripped_strings
        if strong_cleanup:
            # Keep only strings that contain a letter followed by a space.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        page_text = '\n'.join(s.strip() for s in strings)
        if page_text:
            page_texts.append(page_text)

    # Separate pages with a newline so the last string of one page is not
    # fused with the first string of the next.
    all_text = '\n'.join(page_texts)
    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))