Spaces:

Nymbo
/

Webscout

Build error

File size: 7,332 Bytes

9e7090f


from pydantic import BaseModel, Field
from typing import Union

from DeepWEBS.utilsdw.logger import logger
from DeepWEBS.networks.google_searcher import GoogleSearcher
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor
from DeepWEBS.utilsdw.logger import logger
import argparse

class DeepWEBS:
    """Run Google searches for a list of queries and optionally add page text.

    The pipeline is: search each query (``GoogleSearcher``), parse the saved
    result page (``QueryResultsExtractor``) and, when requested, fetch every
    result URL (``BatchWebpageFetcher``) and extract its main content
    (``BatchWebpageContentExtractor``).
    """

    def __init__(self):
        pass

    # Request options for one search run. Documented with comments rather than
    # a docstring so the generated model schema is left unchanged.
    class DeepSearch(BaseModel):
        queries: list = Field(
            default=[""],
            description="(list[str]) Queries to search",
        )
        result_num: int = Field(
            default=10,
            description="(int) Number of search results",
        )
        safe: bool = Field(
            default=False,
            description="(bool) Enable SafeSearch",
        )
        # NOTE(review): `types` is declared here but is not read anywhere in
        # this class - confirm whether the searcher should receive it.
        types: list = Field(
            default=["web"],
            description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
        )
        extract_webpage: bool = Field(
            default=False,
            description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
        )
        overwrite_query_html: bool = Field(
            default=False,
            description="(bool) Overwrite HTML file of query results",
        )
        overwrite_webpage_html: bool = Field(
            default=False,
            description="(bool) Overwrite HTML files of webpages from query results",
        )

    def queries_to_search_results(self, item: DeepSearch):
        """Search every non-blank query in ``item`` and collect parsed results.

        Queries that are empty or whitespace-only are skipped. A query whose
        search or result extraction raises is logged and skipped, so one bad
        query does not abort the others.

        Args:
            item: Search options; ``queries``, ``result_num``, ``safe``,
                ``overwrite_query_html``, ``extract_webpage`` and
                ``overwrite_webpage_html`` are read here.

        Returns:
            A list with one extracted-results entry per successfully processed
            query, in query order. When ``item.extract_webpage`` is set, the
            list has additionally been passed through ``extract_webpages``.
        """
        google_searcher = GoogleSearcher()
        queries_search_results = []
        for query in item.queries:
            if not query.strip():
                continue

            # Created only for queries that will actually be searched; a fresh
            # instance per query is kept from the original flow (presumably the
            # extractor holds per-query state - TODO confirm).
            query_results_extractor = QueryResultsExtractor()
            try:
                query_html_path = google_searcher.search(
                    query=query,
                    result_num=item.result_num,
                    safe=item.safe,
                    overwrite=item.overwrite_query_html,
                )
            except Exception as e:
                logger.error(f"Failed to search for query '{query}': {e}")
                continue

            try:
                query_search_results = query_results_extractor.extract(query_html_path)
            except Exception as e:
                logger.error(f"Failed to extract search results for query '{query}': {e}")
                continue

            queries_search_results.append(query_search_results)
        logger.note(queries_search_results)

        if item.extract_webpage:
            queries_search_results = self.extract_webpages(
                queries_search_results,
                overwrite_webpage_html=item.overwrite_webpage_html,
            )
        return queries_search_results

    def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
        """Attach extracted page content to each search result, query by query.

        For every entry of ``queries_search_results`` the result URLs are
        fetched, the saved HTML files are run through the content extractor,
        and the extracted content is stored under the ``"text"`` key of each
        item in ``"query_results"`` (empty string when nothing was extracted
        for that URL).

        Processing is best-effort per query: if fetching, extracting or
        matching extracted content back to URLs fails, the error is logged and
        that query's results are left without a ``"text"`` key while the
        remaining queries are still processed.

        Args:
            queries_search_results: Per-query result dicts, read through their
                ``"query"`` and ``"query_results"`` keys; each query result is
                read through its ``"url"`` key.
            overwrite_webpage_html: Forwarded to the fetcher as ``overwrite``.

        Returns:
            The same ``queries_search_results`` object, updated in place.
        """
        for query_search_results in queries_search_results:
            try:
                # Fetch webpages with urls
                batch_webpage_fetcher = BatchWebpageFetcher()
                urls = [
                    query_result["url"]
                    for query_result in query_search_results["query_results"]
                ]
                url_and_html_path_list = batch_webpage_fetcher.fetch(
                    urls,
                    overwrite=overwrite_webpage_html,
                    output_parent=query_search_results["query"],
                )
            except Exception as e:
                logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
                continue

            try:
                # Extract webpage contents from htmls
                html_paths = [
                    str(url_and_html_path["html_path"])
                    for url_and_html_path in url_and_html_path_list
                ]
                batch_webpage_content_extractor = BatchWebpageContentExtractor()
                html_path_and_extracted_content_list = (
                    batch_webpage_content_extractor.extract(html_paths)
                )

                # Build the map of url to extracted_content. Both sides of the
                # lookup are normalised with str() so a path object returned by
                # the extractor still matches the string key built here.
                html_path_to_url_dict = {
                    str(url_and_html_path["html_path"]): url_and_html_path["url"]
                    for url_and_html_path in url_and_html_path_list
                }
                url_to_extracted_content_dict = {}
                for html_path_and_extracted_content in html_path_and_extracted_content_list:
                    html_path = str(html_path_and_extracted_content["html_path"])
                    if html_path not in html_path_to_url_dict:
                        # An unmatched path only affects this one page; skip it
                        # instead of aborting the whole batch with a KeyError.
                        logger.error(
                            f"No fetched url matches extracted html_path '{html_path}' "
                            f"for query '{query_search_results['query']}'"
                        )
                        continue
                    url_to_extracted_content_dict[
                        html_path_to_url_dict[html_path]
                    ] = html_path_and_extracted_content["extracted_content"]
            except Exception as e:
                logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
                continue

            # Write extracted contents (as 'text' field) to query_search_results
            for query_result in query_search_results["query_results"]:
                query_result["text"] = url_to_extracted_content_dict.get(
                    query_result["url"], ""
                )

        return queries_search_results


class ArgParser(argparse.ArgumentParser):
    """Command-line parser for the search options.

    Constructing an instance registers all options and immediately parses the
    process arguments; the parsed namespace is stored on ``self.args``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # (flags, add_argument keyword options), registered in this order.
        option_table = (
            (
                ("-q", "--queries"),
                dict(type=str, nargs="+", required=True, help="Queries to search"),
            ),
            (
                ("-n", "--result_num"),
                dict(type=int, default=10, help="Number of search results"),
            ),
            (
                ("-s", "--safe"),
                dict(default=False, action="store_true", help="Enable SafeSearch"),
            ),
            (
                ("-t", "--types"),
                dict(
                    type=str,
                    nargs="+",
                    default=["web"],
                    choices=["web", "image", "videos", "news"],
                    help="Types of search results",
                ),
            ),
            (
                ("-e", "--extract_webpage"),
                dict(
                    default=False,
                    action="store_true",
                    help="Enable extracting main text contents from webpage",
                ),
            ),
            (
                ("-o", "--overwrite_query_html"),
                dict(
                    default=False,
                    action="store_true",
                    help="Overwrite HTML file of query results",
                ),
            ),
            (
                ("-w", "--overwrite_webpage_html"),
                dict(
                    default=False,
                    action="store_true",
                    help="Overwrite HTML files of webpages from query results",
                ),
            ),
        )
        for flags, options in option_table:
            self.add_argument(*flags, **options)

        # Parsing happens at construction time; callers read `self.args`.
        self.args = self.parse_args()