Zsombor Gegesy committed on
Commit
75a77d2
1 Parent(s): 769d870

Create separate search_result_parser

Browse files
src/config/parser.rs CHANGED
@@ -111,7 +111,7 @@ impl Config {
111
  .get::<_, HashMap<String, bool>>("upstream_search_engines")?
112
  .into_iter()
113
  .filter_map(|(key, value)| value.then_some(key))
114
- .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
115
  .collect(),
116
  request_timeout: globals.get::<_, u8>("request_timeout")?,
117
  threads,
 
111
  .get::<_, HashMap<String, bool>>("upstream_search_engines")?
112
  .into_iter()
113
  .filter_map(|(key, value)| value.then_some(key))
114
+ .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine).ok())
115
  .collect(),
116
  request_timeout: globals.get::<_, u8>("request_timeout")?,
117
  threads,
src/engines/duckduckgo.rs CHANGED
@@ -5,7 +5,7 @@
5
  use std::collections::HashMap;
6
 
7
  use reqwest::header::HeaderMap;
8
- use scraper::{Html, Selector};
9
 
10
  use crate::models::aggregation_models::SearchResult;
11
 
@@ -13,9 +13,27 @@ use crate::models::engine_models::{EngineError, SearchEngine};
13
 
14
  use error_stack::{Report, Result, ResultExt};
15
 
 
 
16
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
17
  /// reduce code duplication as well as allows to create vector of different search engines easily.
18
- pub struct DuckDuckGo;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  #[async_trait::async_trait]
21
  impl SearchEngine for DuckDuckGo {
@@ -59,34 +77,17 @@ impl SearchEngine for DuckDuckGo {
59
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
60
  );
61
 
62
- let no_result: Selector = Selector::parse(".no-results")
63
- .map_err(|_| Report::new(EngineError::UnexpectedError))
64
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
65
-
66
- if document.select(&no_result).next().is_some() {
67
  return Err(Report::new(EngineError::EmptyResultSet));
68
  }
69
 
70
- let results: Selector = Selector::parse(".result")
71
- .map_err(|_| Report::new(EngineError::UnexpectedError))
72
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
73
- let result_title: Selector = Selector::parse(".result__a")
74
- .map_err(|_| Report::new(EngineError::UnexpectedError))
75
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
76
- let result_url: Selector = Selector::parse(".result__url")
77
- .map_err(|_| Report::new(EngineError::UnexpectedError))
78
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
79
- let result_desc: Selector = Selector::parse(".result__snippet")
80
- .map_err(|_| Report::new(EngineError::UnexpectedError))
81
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
82
-
83
  // scrape all the results from the html
84
  Ok(document
85
- .select(&results)
86
  .map(|result| {
87
  SearchResult::new(
88
  result
89
- .select(&result_title)
90
  .next()
91
  .unwrap()
92
  .inner_html()
@@ -94,7 +95,7 @@ impl SearchEngine for DuckDuckGo {
94
  format!(
95
  "https://{}",
96
  result
97
- .select(&result_url)
98
  .next()
99
  .unwrap()
100
  .inner_html()
@@ -102,7 +103,7 @@ impl SearchEngine for DuckDuckGo {
102
  )
103
  .as_str(),
104
  result
105
- .select(&result_desc)
106
  .next()
107
  .unwrap()
108
  .inner_html()
 
5
  use std::collections::HashMap;
6
 
7
  use reqwest::header::HeaderMap;
8
+ use scraper::Html;
9
 
10
  use crate::models::aggregation_models::SearchResult;
11
 
 
13
 
14
  use error_stack::{Report, Result, ResultExt};
15
 
16
+ use super::search_result_parser::SearchResultParser;
17
+
18
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
19
  /// reduce code duplication as well as allows to create vector of different search engines easily.
20
+ pub struct DuckDuckGo {
21
+ parser: SearchResultParser,
22
+ }
23
+
24
+ impl DuckDuckGo {
25
+ pub fn new() -> Result<Self, EngineError> {
26
+ Ok(Self {
27
+ parser: SearchResultParser::new(
28
+ ".no-results",
29
+ ".result",
30
+ ".result__a",
31
+ ".result__url",
32
+ ".result__snippet",
33
+ )?,
34
+ })
35
+ }
36
+ }
37
 
38
  #[async_trait::async_trait]
39
  impl SearchEngine for DuckDuckGo {
 
77
  &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
78
  );
79
 
80
+ if document.select(&self.parser.no_result).next().is_some() {
 
 
 
 
81
  return Err(Report::new(EngineError::EmptyResultSet));
82
  }
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  // scrape all the results from the html
85
  Ok(document
86
+ .select(&self.parser.results)
87
  .map(|result| {
88
  SearchResult::new(
89
  result
90
+ .select(&self.parser.result_title)
91
  .next()
92
  .unwrap()
93
  .inner_html()
 
95
  format!(
96
  "https://{}",
97
  result
98
+ .select(&self.parser.result_url)
99
  .next()
100
  .unwrap()
101
  .inner_html()
 
103
  )
104
  .as_str(),
105
  result
106
+ .select(&self.parser.result_desc)
107
  .next()
108
  .unwrap()
109
  .inner_html()
src/engines/mod.rs CHANGED
@@ -4,4 +4,5 @@
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
6
  pub mod duckduckgo;
 
7
  pub mod searx;
 
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
6
  pub mod duckduckgo;
7
+ pub mod search_result_parser;
8
  pub mod searx;
src/engines/search_result_parser.rs ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use crate::models::engine_models::EngineError;
2
+ use error_stack::{Report, Result, ResultExt};
3
+ use scraper::{Html, Selector};
4
+
5
+ pub struct SearchResultParser {
6
+ pub no_result: Selector,
7
+ pub results: Selector,
8
+ pub result_title: Selector,
9
+ pub result_url: Selector,
10
+ pub result_desc: Selector,
11
+ }
12
+
13
+ impl SearchResultParser {
14
+ pub fn new(
15
+ no_result_selector: &str,
16
+ results_selector: &str,
17
+ result_title_selector: &str,
18
+ result_url_selector: &str,
19
+ result_desc_selector: &str,
20
+ ) -> Result<SearchResultParser, EngineError> {
21
+ Ok(SearchResultParser {
22
+ no_result: new_selector(no_result_selector)?,
23
+ results: new_selector(results_selector)?,
24
+ result_title: new_selector(result_title_selector)?,
25
+ result_url: new_selector(result_url_selector)?,
26
+ result_desc: new_selector(result_desc_selector)?,
27
+ })
28
+ }
29
+ }
30
+
31
+ fn new_selector(selector: &str) -> Result<Selector, EngineError> {
32
+ Selector::parse(selector).map_err(|err| {
33
+ Report::new(EngineError::UnexpectedError).attach_printable(format!(
34
+ "invalid CSS selector: {}, err: {:?}",
35
+ selector, err
36
+ ))
37
+ })
38
+ }
src/engines/searx.rs CHANGED
@@ -3,16 +3,34 @@
3
  //! number if provided.
4
 
5
  use reqwest::header::HeaderMap;
6
- use scraper::{Html, Selector};
7
  use std::collections::HashMap;
8
 
 
9
  use crate::models::aggregation_models::SearchResult;
10
  use crate::models::engine_models::{EngineError, SearchEngine};
11
  use error_stack::{Report, Result, ResultExt};
12
 
13
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
14
  /// reduce code duplication as well as allows to create vector of different search engines easily.
15
- pub struct Searx;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  #[async_trait::async_trait]
18
  impl SearchEngine for Searx {
@@ -52,13 +70,7 @@ impl SearchEngine for Searx {
52
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
53
  );
54
 
55
- let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
56
- .map_err(|_| Report::new(EngineError::UnexpectedError))
57
- .attach_printable_lazy(|| {
58
- format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
59
- })?;
60
-
61
- if let Some(no_result_msg) = document.select(&no_result).nth(1) {
62
  if no_result_msg.inner_html()
63
  == "we didn't find any results. Please use another query or search in more categories"
64
  {
@@ -66,40 +78,26 @@ impl SearchEngine for Searx {
66
  }
67
  }
68
 
69
- let results: Selector = Selector::parse(".result")
70
- .map_err(|_| Report::new(EngineError::UnexpectedError))
71
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
72
- let result_title: Selector = Selector::parse("h3>a")
73
- .map_err(|_| Report::new(EngineError::UnexpectedError))
74
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
75
- let result_url: Selector = Selector::parse("h3>a")
76
- .map_err(|_| Report::new(EngineError::UnexpectedError))
77
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
78
-
79
- let result_desc: Selector = Selector::parse(".content")
80
- .map_err(|_| Report::new(EngineError::UnexpectedError))
81
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
82
-
83
  // scrape all the results from the html
84
  Ok(document
85
- .select(&results)
86
  .map(|result| {
87
  SearchResult::new(
88
  result
89
- .select(&result_title)
90
  .next()
91
  .unwrap()
92
  .inner_html()
93
  .trim(),
94
  result
95
- .select(&result_url)
96
  .next()
97
  .unwrap()
98
  .value()
99
  .attr("href")
100
  .unwrap(),
101
  result
102
- .select(&result_desc)
103
  .next()
104
  .unwrap()
105
  .inner_html()
 
3
  //! number if provided.
4
 
5
  use reqwest::header::HeaderMap;
6
+ use scraper::Html;
7
  use std::collections::HashMap;
8
 
9
+ use super::search_result_parser::SearchResultParser;
10
  use crate::models::aggregation_models::SearchResult;
11
  use crate::models::engine_models::{EngineError, SearchEngine};
12
  use error_stack::{Report, Result, ResultExt};
13
 
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
16
+ pub struct Searx {
17
+ parser: SearchResultParser,
18
+ }
19
+
20
+ impl Searx {
21
+ /// Creates a new Searx engine
22
+ pub fn new() -> Result<Searx, EngineError> {
23
+ Ok(Self {
24
+ parser: SearchResultParser::new(
25
+ "#urls>.dialog-error>p",
26
+ ".result",
27
+ "h3>a",
28
+ "h3>a",
29
+ ".content",
30
+ )?,
31
+ })
32
+ }
33
+ }
34
 
35
  #[async_trait::async_trait]
36
  impl SearchEngine for Searx {
 
70
  &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
71
  );
72
 
73
+ if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
 
 
 
 
 
 
74
  if no_result_msg.inner_html()
75
  == "we didn't find any results. Please use another query or search in more categories"
76
  {
 
78
  }
79
  }
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  // scrape all the results from the html
82
  Ok(document
83
+ .select(&self.parser.results)
84
  .map(|result| {
85
  SearchResult::new(
86
  result
87
+ .select(&self.parser.result_title)
88
  .next()
89
  .unwrap()
90
  .inner_html()
91
  .trim(),
92
  result
93
+ .select(&self.parser.result_url)
94
  .next()
95
  .unwrap()
96
  .value()
97
  .attr("href")
98
  .unwrap(),
99
  result
100
+ .select(&self.parser.result_desc)
101
  .next()
102
  .unwrap()
103
  .inner_html()
src/models/aggregation_models.rs CHANGED
@@ -85,12 +85,14 @@ impl EngineErrorInfo {
85
  pub fn new(error: &EngineError, engine: &str) -> Self {
86
  Self {
87
  error: match error {
 
88
  EngineError::RequestError => "RequestError".to_owned(),
89
  EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
90
  EngineError::UnexpectedError => "UnexpectedError".to_owned(),
91
  },
92
  engine: engine.to_owned(),
93
  severity_color: match error {
 
94
  EngineError::RequestError => "green".to_owned(),
95
  EngineError::EmptyResultSet => "blue".to_owned(),
96
  EngineError::UnexpectedError => "red".to_owned(),
 
85
  pub fn new(error: &EngineError, engine: &str) -> Self {
86
  Self {
87
  error: match error {
88
+ EngineError::EngineNotFound => "EngineNotFound".to_owned(),
89
  EngineError::RequestError => "RequestError".to_owned(),
90
  EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
91
  EngineError::UnexpectedError => "UnexpectedError".to_owned(),
92
  },
93
  engine: engine.to_owned(),
94
  severity_color: match error {
95
+ EngineError::EngineNotFound => "red".to_owned(),
96
  EngineError::RequestError => "green".to_owned(),
97
  EngineError::EmptyResultSet => "blue".to_owned(),
98
  EngineError::UnexpectedError => "red".to_owned(),
src/models/engine_models.rs CHANGED
@@ -2,12 +2,14 @@
2
  //! the upstream search engines with the search query provided by the user.
3
 
4
  use super::aggregation_models::SearchResult;
5
- use error_stack::{Result, ResultExt};
6
  use std::{collections::HashMap, fmt, time::Duration};
7
 
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
 
 
11
  /// This variant handles all request related errors like forbidden, not found,
12
  /// etc.
13
  EmptyResultSet,
@@ -24,6 +26,9 @@ pub enum EngineError {
24
  impl fmt::Display for EngineError {
25
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26
  match self {
 
 
 
27
  EngineError::EmptyResultSet => {
28
  write!(f, "The upstream search engine returned an empty result set")
29
  }
@@ -134,18 +139,21 @@ impl EngineHandler {
134
  /// # Returns
135
  ///
136
  /// It returns an option either containing the value or a none if the engine is unknown
137
- pub fn new(engine_name: &str) -> Option<Self> {
138
  let engine: (&'static str, Box<dyn SearchEngine>) =
139
  match engine_name.to_lowercase().as_str() {
140
- "duckduckgo" => (
141
- "duckduckgo",
142
- Box::new(crate::engines::duckduckgo::DuckDuckGo),
143
- ),
144
- "searx" => ("searx", Box::new(crate::engines::searx::Searx)),
145
- _ => return None,
 
 
 
146
  };
147
 
148
- Some(Self {
149
  engine: engine.1,
150
  name: engine.0,
151
  })
 
2
  //! the upstream search engines with the search query provided by the user.
3
 
4
  use super::aggregation_models::SearchResult;
5
+ use error_stack::{Report, Result, ResultExt};
6
  use std::{collections::HashMap, fmt, time::Duration};
7
 
8
  /// A custom error type used for handle engine associated errors.
9
  #[derive(Debug)]
10
  pub enum EngineError {
11
+ /// No matching engine found
12
+ EngineNotFound,
13
  /// This variant handles all request related errors like forbidden, not found,
14
  /// etc.
15
  EmptyResultSet,
 
26
  impl fmt::Display for EngineError {
27
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28
  match self {
29
+ EngineError::EngineNotFound => {
30
+ write!(f, "Search engine not found")
31
+ }
32
  EngineError::EmptyResultSet => {
33
  write!(f, "The upstream search engine returned an empty result set")
34
  }
 
139
  /// # Returns
140
  ///
141
  /// It returns a `Result` either containing the engine handler or an `EngineError` if the engine is unknown
142
+ pub fn new(engine_name: &str) -> Result<Self, EngineError> {
143
  let engine: (&'static str, Box<dyn SearchEngine>) =
144
  match engine_name.to_lowercase().as_str() {
145
+ "duckduckgo" => {
146
+ let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
147
+ ("duckduckgo", Box::new(engine))
148
+ }
149
+ "searx" => {
150
+ let engine = crate::engines::searx::Searx::new()?;
151
+ ("searx", Box::new(engine))
152
+ }
153
+ _ => return Err(Report::from(EngineError::EngineNotFound)),
154
  };
155
 
156
+ Ok(Self {
157
  engine: engine.1,
158
  name: engine.0,
159
  })
src/server/routes/search.rs CHANGED
@@ -191,7 +191,7 @@ async fn results(
191
  let engines: Vec<EngineHandler> = cookie_value
192
  .engines
193
  .iter()
194
- .filter_map(|name| EngineHandler::new(name))
195
  .collect();
196
 
197
  safe_search_level = match config.safe_search {
 
191
  let engines: Vec<EngineHandler> = cookie_value
192
  .engines
193
  .iter()
194
+ .filter_map(|name| EngineHandler::new(name).ok())
195
  .collect();
196
 
197
  safe_search_level = match config.safe_search {