alamin655 committed on
Commit
40b7e6d
2 Parent(s): 8156f7e dac2c0c

Merge pull request #273 from gzsombor/search-result-parse-refactor

Browse files

⚙️ Rewrite the search result parser code to make the engine code more concise

src/config/parser.rs CHANGED
@@ -3,6 +3,7 @@
3
 
4
  use crate::handler::paths::{file_path, FileType};
5
 
 
6
  use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style};
7
  use log::LevelFilter;
8
  use mlua::Lua;
@@ -28,7 +29,7 @@ pub struct Config {
28
  /// It stores the option to whether enable or disable debug mode.
29
  pub debug: bool,
30
  /// It stores all the engine names that were enabled by the user.
31
- pub upstream_search_engines: Vec<crate::models::engine_models::EngineHandler>,
32
  /// It stores the time (secs) which controls the server request timeout.
33
  pub request_timeout: u8,
34
  /// It stores the number of threads which controls the app will use to run.
@@ -111,8 +112,8 @@ impl Config {
111
  .get::<_, HashMap<String, bool>>("upstream_search_engines")?
112
  .into_iter()
113
  .filter_map(|(key, value)| value.then_some(key))
114
- .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
115
- .collect(),
116
  request_timeout: globals.get::<_, u8>("request_timeout")?,
117
  threads,
118
  rate_limiter: RateLimiter {
 
3
 
4
  use crate::handler::paths::{file_path, FileType};
5
 
6
+ use crate::models::engine_models::{EngineError, EngineHandler};
7
  use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style};
8
  use log::LevelFilter;
9
  use mlua::Lua;
 
29
  /// It stores the option to whether enable or disable debug mode.
30
  pub debug: bool,
31
  /// It stores all the engine names that were enabled by the user.
32
+ pub upstream_search_engines: Vec<EngineHandler>,
33
  /// It stores the time (secs) which controls the server request timeout.
34
  pub request_timeout: u8,
35
  /// It stores the number of threads which controls the app will use to run.
 
112
  .get::<_, HashMap<String, bool>>("upstream_search_engines")?
113
  .into_iter()
114
  .filter_map(|(key, value)| value.then_some(key))
115
+ .map(|engine| EngineHandler::new(&engine))
116
+ .collect::<Result<Vec<EngineHandler>, error_stack::Report<EngineError>>>()?,
117
  request_timeout: globals.get::<_, u8>("request_timeout")?,
118
  threads,
119
  rate_limiter: RateLimiter {
src/engines/duckduckgo.rs CHANGED
@@ -5,7 +5,7 @@
5
  use std::collections::HashMap;
6
 
7
  use reqwest::header::HeaderMap;
8
- use scraper::{Html, Selector};
9
 
10
  use crate::models::aggregation_models::SearchResult;
11
 
@@ -13,9 +13,29 @@ use crate::models::engine_models::{EngineError, SearchEngine};
13
 
14
  use error_stack::{Report, Result, ResultExt};
15
 
 
 
16
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
17
  /// reduce code duplication as well as allows to create vector of different search engines easily.
18
- pub struct DuckDuckGo;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  #[async_trait::async_trait]
21
  impl SearchEngine for DuckDuckGo {
@@ -59,58 +79,19 @@ impl SearchEngine for DuckDuckGo {
59
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
60
  );
61
 
62
- let no_result: Selector = Selector::parse(".no-results")
63
- .map_err(|_| Report::new(EngineError::UnexpectedError))
64
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
65
-
66
- if document.select(&no_result).next().is_some() {
67
  return Err(Report::new(EngineError::EmptyResultSet));
68
  }
69
 
70
- let results: Selector = Selector::parse(".result")
71
- .map_err(|_| Report::new(EngineError::UnexpectedError))
72
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
73
- let result_title: Selector = Selector::parse(".result__a")
74
- .map_err(|_| Report::new(EngineError::UnexpectedError))
75
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
76
- let result_url: Selector = Selector::parse(".result__url")
77
- .map_err(|_| Report::new(EngineError::UnexpectedError))
78
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
79
- let result_desc: Selector = Selector::parse(".result__snippet")
80
- .map_err(|_| Report::new(EngineError::UnexpectedError))
81
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
82
-
83
  // scrape all the results from the html
84
- Ok(document
85
- .select(&results)
86
- .map(|result| {
87
- SearchResult::new(
88
- result
89
- .select(&result_title)
90
- .next()
91
- .unwrap()
92
- .inner_html()
93
- .trim(),
94
- format!(
95
- "https://{}",
96
- result
97
- .select(&result_url)
98
- .next()
99
- .unwrap()
100
- .inner_html()
101
- .trim()
102
- )
103
- .as_str(),
104
- result
105
- .select(&result_desc)
106
- .next()
107
- .unwrap()
108
- .inner_html()
109
- .trim(),
110
  &["duckduckgo"],
111
- )
112
  })
113
- .map(|search_result| (search_result.url.clone(), search_result))
114
- .collect())
115
  }
116
  }
 
5
  use std::collections::HashMap;
6
 
7
  use reqwest::header::HeaderMap;
8
+ use scraper::Html;
9
 
10
  use crate::models::aggregation_models::SearchResult;
11
 
 
13
 
14
  use error_stack::{Report, Result, ResultExt};
15
 
16
+ use super::search_result_parser::SearchResultParser;
17
+
18
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
19
  /// reduce code duplication as well as allows to create vector of different search engines easily.
20
+ pub struct DuckDuckGo {
21
+ /// The parser, used to interpret the search result.
22
+ parser: SearchResultParser,
23
+ }
24
+
25
+ impl DuckDuckGo {
26
+ /// Creates the DuckDuckGo parser.
27
+ pub fn new() -> Result<Self, EngineError> {
28
+ Ok(Self {
29
+ parser: SearchResultParser::new(
30
+ ".no-results",
31
+ ".result",
32
+ ".result__a",
33
+ ".result__url",
34
+ ".result__snippet",
35
+ )?,
36
+ })
37
+ }
38
+ }
39
 
40
  #[async_trait::async_trait]
41
  impl SearchEngine for DuckDuckGo {
 
79
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
80
  );
81
 
82
+ if self.parser.parse_for_no_results(&document).next().is_some() {
 
 
 
 
83
  return Err(Report::new(EngineError::EmptyResultSet));
84
  }
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  // scrape all the results from the html
87
+ self.parser
88
+ .parse_for_results(&document, |title, url, desc| {
89
+ Some(SearchResult::new(
90
+ title.inner_html().trim(),
91
+ &format!("https://{}", url.inner_html().trim()),
92
+ desc.inner_html().trim(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  &["duckduckgo"],
94
+ ))
95
  })
 
 
96
  }
97
  }
src/engines/mod.rs CHANGED
@@ -4,4 +4,5 @@
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
6
  pub mod duckduckgo;
 
7
  pub mod searx;
 
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
6
  pub mod duckduckgo;
7
+ pub mod search_result_parser;
8
  pub mod searx;
src/engines/search_result_parser.rs ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //! This module provides helper functionality for parsing an html document into the internal SearchResult.
2
+ use std::collections::HashMap;
3
+
4
+ use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
5
+ use error_stack::{Report, Result};
6
+ use scraper::{html::Select, ElementRef, Html, Selector};
7
+
8
+ /// An html search result parser, based on predefined CSS selectors.
9
+ pub struct SearchResultParser {
10
+ /// selector to locate the element which is displayed, if there were nothing found.
11
+ no_result: Selector,
12
+ /// selector to locate the element which contains one item from the search result.
13
+ results: Selector,
14
+ /// selector to locate the title relative to the search result item.
15
+ result_title: Selector,
16
+ /// selector to locate the url relative to the search result item.
17
+ result_url: Selector,
18
+ /// selector to locate the description relative to the search result item.
19
+ result_desc: Selector,
20
+ }
21
+
22
+ impl SearchResultParser {
23
+ /// Creates a new parser, if all the selectors are valid, otherwise it returns an EngineError
24
+ pub fn new(
25
+ no_result_selector: &str,
26
+ results_selector: &str,
27
+ result_title_selector: &str,
28
+ result_url_selector: &str,
29
+ result_desc_selector: &str,
30
+ ) -> Result<SearchResultParser, EngineError> {
31
+ Ok(SearchResultParser {
32
+ no_result: new_selector(no_result_selector)?,
33
+ results: new_selector(results_selector)?,
34
+ result_title: new_selector(result_title_selector)?,
35
+ result_url: new_selector(result_url_selector)?,
36
+ result_desc: new_selector(result_desc_selector)?,
37
+ })
38
+ }
39
+
40
+ /// Parses the html and returns the element representing the 'no result found' response.
41
+ pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
42
+ document.select(&self.no_result)
43
+ }
44
+
45
+ /// Parse the html, and convert the results to SearchResult with the help of the builder function
46
+ pub fn parse_for_results(
47
+ &self,
48
+ document: &Html,
49
+ builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
50
+ ) -> Result<HashMap<String, SearchResult>, EngineError> {
51
+ let res = document
52
+ .select(&self.results)
53
+ .filter_map(|result| {
54
+ let title = result.select(&self.result_title).next();
55
+ let url = result.select(&self.result_url).next();
56
+ let desc = result.select(&self.result_desc).next();
57
+ match (title, url, desc) {
58
+ (Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
59
+ _ => None,
60
+ }
61
+ })
62
+ .map(|search_result| (search_result.url.clone(), search_result))
63
+ .collect();
64
+ Ok(res)
65
+ }
66
+ }
67
+
68
+ /// Creates a Selector struct if the given parameter is a valid css expression; otherwise converts the failure into an EngineError.
69
+ fn new_selector(selector: &str) -> Result<Selector, EngineError> {
70
+ Selector::parse(selector).map_err(|err| {
71
+ Report::new(EngineError::UnexpectedError).attach_printable(format!(
72
+ "invalid CSS selector: {}, err: {:?}",
73
+ selector, err
74
+ ))
75
+ })
76
+ }
src/engines/searx.rs CHANGED
@@ -3,16 +3,35 @@
3
  //! number if provided.
4
 
5
  use reqwest::header::HeaderMap;
6
- use scraper::{Html, Selector};
7
  use std::collections::HashMap;
8
 
 
9
  use crate::models::aggregation_models::SearchResult;
10
  use crate::models::engine_models::{EngineError, SearchEngine};
11
  use error_stack::{Report, Result, ResultExt};
12
 
13
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
14
  /// reduce code duplication as well as allows to create vector of different search engines easily.
15
- pub struct Searx;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  #[async_trait::async_trait]
18
  impl SearchEngine for Searx {
@@ -52,13 +71,7 @@ impl SearchEngine for Searx {
52
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
53
  );
54
 
55
- let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
56
- .map_err(|_| Report::new(EngineError::UnexpectedError))
57
- .attach_printable_lazy(|| {
58
- format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
59
- })?;
60
-
61
- if let Some(no_result_msg) = document.select(&no_result).nth(1) {
62
  if no_result_msg.inner_html()
63
  == "we didn't find any results. Please use another query or search in more categories"
64
  {
@@ -66,48 +79,17 @@ impl SearchEngine for Searx {
66
  }
67
  }
68
 
69
- let results: Selector = Selector::parse(".result")
70
- .map_err(|_| Report::new(EngineError::UnexpectedError))
71
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
72
- let result_title: Selector = Selector::parse("h3>a")
73
- .map_err(|_| Report::new(EngineError::UnexpectedError))
74
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
75
- let result_url: Selector = Selector::parse("h3>a")
76
- .map_err(|_| Report::new(EngineError::UnexpectedError))
77
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
78
-
79
- let result_desc: Selector = Selector::parse(".content")
80
- .map_err(|_| Report::new(EngineError::UnexpectedError))
81
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
82
-
83
  // scrape all the results from the html
84
- Ok(document
85
- .select(&results)
86
- .map(|result| {
87
- SearchResult::new(
88
- result
89
- .select(&result_title)
90
- .next()
91
- .unwrap()
92
- .inner_html()
93
- .trim(),
94
- result
95
- .select(&result_url)
96
- .next()
97
- .unwrap()
98
- .value()
99
- .attr("href")
100
- .unwrap(),
101
- result
102
- .select(&result_desc)
103
- .next()
104
- .unwrap()
105
- .inner_html()
106
- .trim(),
107
- &["searx"],
108
- )
109
  })
110
- .map(|search_result| (search_result.url.clone(), search_result))
111
- .collect())
112
  }
113
  }
 
3
  //! number if provided.
4
 
5
  use reqwest::header::HeaderMap;
6
+ use scraper::Html;
7
  use std::collections::HashMap;
8
 
9
+ use super::search_result_parser::SearchResultParser;
10
  use crate::models::aggregation_models::SearchResult;
11
  use crate::models::engine_models::{EngineError, SearchEngine};
12
  use error_stack::{Report, Result, ResultExt};
13
 
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
16
+ pub struct Searx {
17
+ /// The parser, used to interpret the search result.
18
+ parser: SearchResultParser,
19
+ }
20
+
21
+ impl Searx {
22
+ /// creates a Searx parser
23
+ pub fn new() -> Result<Searx, EngineError> {
24
+ Ok(Self {
25
+ parser: SearchResultParser::new(
26
+ "#urls>.dialog-error>p",
27
+ ".result",
28
+ "h3>a",
29
+ "h3>a",
30
+ ".content",
31
+ )?,
32
+ })
33
+ }
34
+ }
35
 
36
  #[async_trait::async_trait]
37
  impl SearchEngine for Searx {
 
71
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
72
  );
73
 
74
+ if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
 
 
 
 
 
 
75
  if no_result_msg.inner_html()
76
  == "we didn't find any results. Please use another query or search in more categories"
77
  {
 
79
  }
80
  }
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  // scrape all the results from the html
83
+ self.parser
84
+ .parse_for_results(&document, |title, url, desc| {
85
+ url.value().attr("href").map(|url| {
86
+ SearchResult::new(
87
+ title.inner_html().trim(),
88
+ url,
89
+ desc.inner_html().trim(),
90
+ &["searx"],
91
+ )
92
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  })
 
 
94
  }
95
  }
src/models/aggregation_models.rs CHANGED
@@ -85,12 +85,14 @@ impl EngineErrorInfo {
85
  pub fn new(error: &EngineError, engine: &str) -> Self {
86
  Self {
87
  error: match error {
 
88
  EngineError::RequestError => "RequestError".to_owned(),
89
  EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
90
  EngineError::UnexpectedError => "UnexpectedError".to_owned(),
91
  },
92
  engine: engine.to_owned(),
93
  severity_color: match error {
 
94
  EngineError::RequestError => "green".to_owned(),
95
  EngineError::EmptyResultSet => "blue".to_owned(),
96
  EngineError::UnexpectedError => "red".to_owned(),
 
85
  pub fn new(error: &EngineError, engine: &str) -> Self {
86
  Self {
87
  error: match error {
88
+ EngineError::NoSuchEngineFound(_) => "EngineNotFound".to_owned(),
89
  EngineError::RequestError => "RequestError".to_owned(),
90
  EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
91
  EngineError::UnexpectedError => "UnexpectedError".to_owned(),
92
  },
93
  engine: engine.to_owned(),
94
  severity_color: match error {
95
+ EngineError::NoSuchEngineFound(_) => "red".to_owned(),
96
  EngineError::RequestError => "green".to_owned(),
97
  EngineError::EmptyResultSet => "blue".to_owned(),
98
  EngineError::UnexpectedError => "red".to_owned(),
src/models/engine_models.rs CHANGED
@@ -2,12 +2,14 @@
2
  //! the upstream search engines with the search query provided by the user.
3
 
4
  use super::aggregation_models::SearchResult;
5
- use error_stack::{Result, ResultExt};
6
  use std::{collections::HashMap, fmt, time::Duration};
7
 
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
 
 
11
  /// This variant handles all request related errors like forbidden, not found,
12
  /// etc.
13
  EmptyResultSet,
@@ -24,6 +26,9 @@ pub enum EngineError {
24
  impl fmt::Display for EngineError {
25
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26
  match self {
 
 
 
27
  EngineError::EmptyResultSet => {
28
  write!(f, "The upstream search engine returned an empty result set")
29
  }
@@ -134,18 +139,25 @@ impl EngineHandler {
134
  /// # Returns
135
  ///
136
  /// It returns an option either containing the value or a none if the engine is unknown
137
- pub fn new(engine_name: &str) -> Option<Self> {
138
  let engine: (&'static str, Box<dyn SearchEngine>) =
139
  match engine_name.to_lowercase().as_str() {
140
- "duckduckgo" => (
141
- "duckduckgo",
142
- Box::new(crate::engines::duckduckgo::DuckDuckGo),
143
- ),
144
- "searx" => ("searx", Box::new(crate::engines::searx::Searx)),
145
- _ => return None,
 
 
 
 
 
 
 
146
  };
147
 
148
- Some(Self {
149
  engine: engine.1,
150
  name: engine.0,
151
  })
 
2
  //! the upstream search engines with the search query provided by the user.
3
 
4
  use super::aggregation_models::SearchResult;
5
+ use error_stack::{Report, Result, ResultExt};
6
  use std::{collections::HashMap, fmt, time::Duration};
7
 
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
11
+ /// No matching engine found
12
+ NoSuchEngineFound(String),
13
  /// This variant handles all request related errors like forbidden, not found,
14
  /// etc.
15
  EmptyResultSet,
 
26
  impl fmt::Display for EngineError {
27
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28
  match self {
29
+ EngineError::NoSuchEngineFound(engine) => {
30
+ write!(f, "No such engine with the name '{engine}' found")
31
+ }
32
  EngineError::EmptyResultSet => {
33
  write!(f, "The upstream search engine returned an empty result set")
34
  }
 
139
  /// # Returns
140
  ///
141
  /// It returns a Result either containing the EngineHandler or an EngineError if the engine is unknown
142
+ pub fn new(engine_name: &str) -> Result<Self, EngineError> {
143
  let engine: (&'static str, Box<dyn SearchEngine>) =
144
  match engine_name.to_lowercase().as_str() {
145
+ "duckduckgo" => {
146
+ let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
147
+ ("duckduckgo", Box::new(engine))
148
+ }
149
+ "searx" => {
150
+ let engine = crate::engines::searx::Searx::new()?;
151
+ ("searx", Box::new(engine))
152
+ }
153
+ _ => {
154
+ return Err(Report::from(EngineError::NoSuchEngineFound(
155
+ engine_name.to_string(),
156
+ )))
157
+ }
158
  };
159
 
160
+ Ok(Self {
161
  engine: engine.1,
162
  name: engine.0,
163
  })
src/server/routes/search.rs CHANGED
@@ -191,7 +191,7 @@ async fn results(
191
  let engines: Vec<EngineHandler> = cookie_value
192
  .engines
193
  .iter()
194
- .filter_map(|name| EngineHandler::new(name))
195
  .collect();
196
 
197
  safe_search_level = match config.safe_search {
 
191
  let engines: Vec<EngineHandler> = cookie_value
192
  .engines
193
  .iter()
194
+ .filter_map(|name| EngineHandler::new(name).ok())
195
  .collect();
196
 
197
  safe_search_level = match config.safe_search {