Spaces:

alamin655
/

spacex

Runtime error

App Files Files Community

Zsombor Gegesy committed on Sep 24, 2023

Commit

57c73d3

•

1 Parent(s): 75a77d2

Refactor the search result parsing

Browse files

Files changed (4) hide show

src/engines/duckduckgo.rs +10 -30
src/engines/search_result_parser.rs +41 -8
src/engines/searx.rs +13 -29
src/models/engine_models.rs +1 -1

src/engines/duckduckgo.rs CHANGED Viewed

@@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
 /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct DuckDuckGo {
     parser: SearchResultParser,
 }
 impl DuckDuckGo {
     pub fn new() -> Result<Self, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
@@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
             &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
-        if document.select(&self.parser.no_result).next().is_some() {
             return Err(Report::new(EngineError::EmptyResultSet));
         }
         // scrape all the results from the html
-        Ok(document
-            .select(&self.parser.results)
-            .map(|result| {
-                SearchResult::new(
-                    result
-                        .select(&self.parser.result_title)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    format!(
-                        "https://{}",
-                        result
-                            .select(&self.parser.result_url)
-                            .next()
-                            .unwrap()
-                            .inner_html()
-                            .trim()
-                    )
-                    .as_str(),
-                    result
-                        .select(&self.parser.result_desc)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
                     &["duckduckgo"],
-                )
             })
-            .map(|search_result| (search_result.url.clone(), search_result))
-            .collect())
     }
 }

 /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct DuckDuckGo {
+    /// The parser, used to interpret the search result.
     parser: SearchResultParser,
 }
 impl DuckDuckGo {
+    /// Creates a new DuckDuckGo engine together with its search result parser.
     pub fn new() -> Result<Self, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
             &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
+        if self.parser.parse_for_no_results(&document).next().is_some() {
             return Err(Report::new(EngineError::EmptyResultSet));
         }
         // scrape all the results from the html
+        self.parser
+            .parse_for_results(&document, |title, url, desc| {
+                Some(SearchResult::new(
+                    title.inner_html().trim(),
+                    &format!("https://{}", url.inner_html().trim()),
+                    desc.inner_html().trim(),
                     &["duckduckgo"],
+                ))
             })
     }
 }

src/engines/search_result_parser.rs CHANGED Viewed

@@ -1,16 +1,21 @@
-use crate::models::engine_models::EngineError;
-use error_stack::{Report, Result, ResultExt};
-use scraper::{Html, Selector};
 pub struct SearchResultParser {
-    pub no_result: Selector,
-    pub results: Selector,
-    pub result_title: Selector,
-    pub result_url: Selector,
-    pub result_desc: Selector,
 }
 impl SearchResultParser {
     pub fn new(
         no_result_selector: &str,
         results_selector: &str,
@@ -26,8 +31,36 @@ impl SearchResultParser {
             result_desc: new_selector(result_desc_selector)?,
         })
     }
 }
 fn new_selector(selector: &str) -> Result<Selector, EngineError> {
     Selector::parse(selector).map_err(|err| {
         Report::new(EngineError::UnexpectedError).attach_printable(format!(

+//! This module provides helper functionality for parsing an HTML document into internal `SearchResult` values.
+use std::collections::HashMap;
+use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
+use error_stack::{Report, Result};
+use scraper::{html::Select, ElementRef, Html, Selector};
+/// An HTML search result parser, based on predefined CSS selectors.
 pub struct SearchResultParser {
+    no_result: Selector,
+    results: Selector,
+    result_title: Selector,
+    result_url: Selector,
+    result_desc: Selector,
 }
 impl SearchResultParser {
+    /// Creates a new parser if all the selectors are valid; otherwise returns an `EngineError`.
     pub fn new(
         no_result_selector: &str,
         results_selector: &str,
             result_desc: new_selector(result_desc_selector)?,
         })
     }
+    /// Returns an iterator over the elements of `document` that match the
+    /// 'no result found' selector this parser was created with.
+    pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
+        Html::select(document, &self.no_result)
+    }
+    /// Extracts the search results from `document` and returns them keyed by their URL.
+    ///
+    /// For every element matching the results selector, the first title, URL and
+    /// description sub-elements are looked up and passed to `builder`. Entries that lack
+    /// any of the three, or for which `builder` returns `None`, are skipped. When several
+    /// results share a URL, the one encountered last is kept.
+    ///
+    /// This function currently always returns `Ok`.
+    pub fn parse_for_results(
+        &self,
+        document: &Html,
+        builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
+        let mut by_url = HashMap::new();
+        for entry in document.select(&self.results) {
+            // Only the first match of each sub-selector is considered.
+            let parts = (
+                entry.select(&self.result_title).next(),
+                entry.select(&self.result_url).next(),
+                entry.select(&self.result_desc).next(),
+            );
+            if let (Some(title), Some(link), Some(summary)) = parts {
+                if let Some(item) = builder(&title, &link, &summary) {
+                    by_url.insert(item.url.clone(), item);
+                }
+            }
+        }
+        Ok(by_url)
+    }
 }
+/// Creates a `Selector` if the given string is a valid CSS selector expression; otherwise converts the parse error into an `EngineError`.
 fn new_selector(selector: &str) -> Result<Selector, EngineError> {
     Selector::parse(selector).map_err(|err| {
         Report::new(EngineError::UnexpectedError).attach_printable(format!(

src/engines/searx.rs CHANGED Viewed

@@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
 /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct Searx {
     parser: SearchResultParser,
 }
 impl Searx {
-    // new Searchx engine
     pub fn new() -> Result<Searx, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
@@ -70,7 +71,7 @@ impl SearchEngine for Searx {
             &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
-        if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
             if no_result_msg.inner_html()
             == "we didn't find any results. Please use another query or search in more categories"
         {
@@ -79,33 +80,16 @@ impl SearchEngine for Searx {
         }
         // scrape all the results from the html
-        Ok(document
-            .select(&self.parser.results)
-            .map(|result| {
-                SearchResult::new(
-                    result
-                        .select(&self.parser.result_title)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    result
-                        .select(&self.parser.result_url)
-                        .next()
-                        .unwrap()
-                        .value()
-                        .attr("href")
-                        .unwrap(),
-                    result
-                        .select(&self.parser.result_desc)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    &["searx"],
-                )
             })
-            .map(|search_result| (search_result.url.clone(), search_result))
-            .collect())
     }
 }

 /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct Searx {
+    /// The parser, used to interpret the search result.
     parser: SearchResultParser,
 }
 impl Searx {
+    /// Creates a new Searx engine together with its search result parser.
     pub fn new() -> Result<Searx, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
             &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
+        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
             if no_result_msg.inner_html()
             == "we didn't find any results. Please use another query or search in more categories"
         {
         }
         // scrape all the results from the html
+        self.parser
+            .parse_for_results(&document, |title, url, desc| {
+                url.value().attr("href").map(|url| {
+                    SearchResult::new(
+                        title.inner_html().trim(),
+                        url,
+                        desc.inner_html().trim(),
+                        &["searx"],
+                    )
+                })
             })
     }
 }

src/models/engine_models.rs CHANGED Viewed

@@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
 pub enum EngineError {
-    // No matching engine found
     EngineNotFound,
     /// This variant handles all request related errors like forbidden, not found,
     /// etc.

 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
 pub enum EngineError {
+    /// No matching engine found
     EngineNotFound,
     /// This variant handles all request related errors like forbidden, not found,
     /// etc.