Zsombor Gegesy committed on
Commit
57c73d3
1 Parent(s): 75a77d2

Refactor the search result parsing

Browse files
src/engines/duckduckgo.rs CHANGED
@@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
18
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
19
  /// reduce code duplication as well as allows to create vector of different search engines easily.
20
  pub struct DuckDuckGo {
 
21
  parser: SearchResultParser,
22
  }
23
 
24
  impl DuckDuckGo {
 
25
  pub fn new() -> Result<Self, EngineError> {
26
  Ok(Self {
27
  parser: SearchResultParser::new(
@@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
77
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
78
  );
79
 
80
- if document.select(&self.parser.no_result).next().is_some() {
81
  return Err(Report::new(EngineError::EmptyResultSet));
82
  }
83
 
84
  // scrape all the results from the html
85
- Ok(document
86
- .select(&self.parser.results)
87
- .map(|result| {
88
- SearchResult::new(
89
- result
90
- .select(&self.parser.result_title)
91
- .next()
92
- .unwrap()
93
- .inner_html()
94
- .trim(),
95
- format!(
96
- "https://{}",
97
- result
98
- .select(&self.parser.result_url)
99
- .next()
100
- .unwrap()
101
- .inner_html()
102
- .trim()
103
- )
104
- .as_str(),
105
- result
106
- .select(&self.parser.result_desc)
107
- .next()
108
- .unwrap()
109
- .inner_html()
110
- .trim(),
111
  &["duckduckgo"],
112
- )
113
  })
114
- .map(|search_result| (search_result.url.clone(), search_result))
115
- .collect())
116
  }
117
  }
 
18
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
19
  /// reduce code duplication as well as allows to create vector of different search engines easily.
20
  pub struct DuckDuckGo {
21
+ // The parser, used to interpret the search result.
22
  parser: SearchResultParser,
23
  }
24
 
25
  impl DuckDuckGo {
26
+ /// Creates the DuckDuckGo parser.
27
  pub fn new() -> Result<Self, EngineError> {
28
  Ok(Self {
29
  parser: SearchResultParser::new(
 
79
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
80
  );
81
 
82
+ if self.parser.parse_for_no_results(&document).next().is_some() {
83
  return Err(Report::new(EngineError::EmptyResultSet));
84
  }
85
 
86
  // scrape all the results from the html
87
+ self.parser
88
+ .parse_for_results(&document, |title, url, desc| {
89
+ Some(SearchResult::new(
90
+ title.inner_html().trim(),
91
+ &format!("https://{}", url.inner_html().trim()),
92
+ desc.inner_html().trim(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  &["duckduckgo"],
94
+ ))
95
  })
 
 
96
  }
97
  }
src/engines/search_result_parser.rs CHANGED
@@ -1,16 +1,21 @@
1
- use crate::models::engine_models::EngineError;
2
- use error_stack::{Report, Result, ResultExt};
3
- use scraper::{Html, Selector};
4
 
 
 
 
 
 
5
  pub struct SearchResultParser {
6
- pub no_result: Selector,
7
- pub results: Selector,
8
- pub result_title: Selector,
9
- pub result_url: Selector,
10
- pub result_desc: Selector,
11
  }
12
 
13
  impl SearchResultParser {
 
14
  pub fn new(
15
  no_result_selector: &str,
16
  results_selector: &str,
@@ -26,8 +31,36 @@ impl SearchResultParser {
26
  result_desc: new_selector(result_desc_selector)?,
27
  })
28
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  }
30
 
 
31
  fn new_selector(selector: &str) -> Result<Selector, EngineError> {
32
  Selector::parse(selector).map_err(|err| {
33
  Report::new(EngineError::UnexpectedError).attach_printable(format!(
 
1
+ //! This module provides helper functionality for parsing an HTML document into an internal SearchResult.
2
+ use std::collections::HashMap;
 
3
 
4
+ use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
5
+ use error_stack::{Report, Result};
6
+ use scraper::{html::Select, ElementRef, Html, Selector};
7
+
8
+ /// An HTML search result parser, based on predefined CSS selectors.
9
  pub struct SearchResultParser {
10
+ no_result: Selector,
11
+ results: Selector,
12
+ result_title: Selector,
13
+ result_url: Selector,
14
+ result_desc: Selector,
15
  }
16
 
17
  impl SearchResultParser {
18
+ /// Creates a new parser if all the selectors are valid, otherwise returns an EngineError.
19
  pub fn new(
20
  no_result_selector: &str,
21
  results_selector: &str,
 
31
  result_desc: new_selector(result_desc_selector)?,
32
  })
33
  }
34
+
35
+ /// Parses the HTML and returns the element representing the 'no result found' response.
36
+ pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
37
+ document.select(&self.no_result)
38
+ }
39
+
40
+ /// Parses the HTML and converts the results to SearchResult with the help of the builder function.
41
+ pub fn parse_for_results(
42
+ &self,
43
+ document: &Html,
44
+ builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
45
+ ) -> Result<HashMap<String, SearchResult>, EngineError> {
46
+ let res = document
47
+ .select(&self.results)
48
+ .filter_map(|result| {
49
+ let title = result.select(&self.result_title).next();
50
+ let url = result.select(&self.result_url).next();
51
+ let desc = result.select(&self.result_desc).next();
52
+ match (title, url, desc) {
53
+ (Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
54
+ _ => None,
55
+ }
56
+ })
57
+ .map(|search_result| (search_result.url.clone(), search_result))
58
+ .collect();
59
+ Ok(res)
60
+ }
61
  }
62
 
63
+ /// Creates a Selector struct if the given parameter is a valid CSS expression, otherwise converts it into an EngineError.
64
  fn new_selector(selector: &str) -> Result<Selector, EngineError> {
65
  Selector::parse(selector).map_err(|err| {
66
  Report::new(EngineError::UnexpectedError).attach_printable(format!(
src/engines/searx.rs CHANGED
@@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
16
  pub struct Searx {
 
17
  parser: SearchResultParser,
18
  }
19
 
20
  impl Searx {
21
- // new Searchx engine
22
  pub fn new() -> Result<Searx, EngineError> {
23
  Ok(Self {
24
  parser: SearchResultParser::new(
@@ -70,7 +71,7 @@ impl SearchEngine for Searx {
70
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
71
  );
72
 
73
- if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
74
  if no_result_msg.inner_html()
75
  == "we didn't find any results. Please use another query or search in more categories"
76
  {
@@ -79,33 +80,16 @@ impl SearchEngine for Searx {
79
  }
80
 
81
  // scrape all the results from the html
82
- Ok(document
83
- .select(&self.parser.results)
84
- .map(|result| {
85
- SearchResult::new(
86
- result
87
- .select(&self.parser.result_title)
88
- .next()
89
- .unwrap()
90
- .inner_html()
91
- .trim(),
92
- result
93
- .select(&self.parser.result_url)
94
- .next()
95
- .unwrap()
96
- .value()
97
- .attr("href")
98
- .unwrap(),
99
- result
100
- .select(&self.parser.result_desc)
101
- .next()
102
- .unwrap()
103
- .inner_html()
104
- .trim(),
105
- &["searx"],
106
- )
107
  })
108
- .map(|search_result| (search_result.url.clone(), search_result))
109
- .collect())
110
  }
111
  }
 
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
16
  pub struct Searx {
17
+ // The parser, used to interpret the search result.
18
  parser: SearchResultParser,
19
  }
20
 
21
  impl Searx {
22
+ /// Creates a Searx parser.
23
  pub fn new() -> Result<Searx, EngineError> {
24
  Ok(Self {
25
  parser: SearchResultParser::new(
 
71
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
72
  );
73
 
74
+ if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
75
  if no_result_msg.inner_html()
76
  == "we didn't find any results. Please use another query or search in more categories"
77
  {
 
80
  }
81
 
82
  // scrape all the results from the html
83
+ self.parser
84
+ .parse_for_results(&document, |title, url, desc| {
85
+ url.value().attr("href").map(|url| {
86
+ SearchResult::new(
87
+ title.inner_html().trim(),
88
+ url,
89
+ desc.inner_html().trim(),
90
+ &["searx"],
91
+ )
92
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  })
 
 
94
  }
95
  }
src/models/engine_models.rs CHANGED
@@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
11
- // No matching engine found
12
  EngineNotFound,
13
  /// This variant handles all request related errors like forbidden, not found,
14
  /// etc.
 
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
11
+ /// No matching engine found
12
  EngineNotFound,
13
  /// This variant handles all request related errors like forbidden, not found,
14
  /// etc.