Spaces:
Runtime error
Runtime error
Zsombor Gegesy
committed on
Commit
•
75a77d2
1
Parent(s):
769d870
Create separate search_result_parser
Browse files- src/config/parser.rs +1 -1
- src/engines/duckduckgo.rs +25 -24
- src/engines/mod.rs +1 -0
- src/engines/search_result_parser.rs +38 -0
- src/engines/searx.rs +25 -27
- src/models/aggregation_models.rs +2 -0
- src/models/engine_models.rs +17 -9
- src/server/routes/search.rs +1 -1
src/config/parser.rs
CHANGED
@@ -111,7 +111,7 @@ impl Config {
|
|
111 |
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
112 |
.into_iter()
|
113 |
.filter_map(|(key, value)| value.then_some(key))
|
114 |
-
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
|
115 |
.collect(),
|
116 |
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
117 |
threads,
|
|
|
111 |
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
112 |
.into_iter()
|
113 |
.filter_map(|(key, value)| value.then_some(key))
|
114 |
+
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine).ok())
|
115 |
.collect(),
|
116 |
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
117 |
threads,
|
src/engines/duckduckgo.rs
CHANGED
@@ -5,7 +5,7 @@
|
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
use reqwest::header::HeaderMap;
|
8 |
-
use scraper::
|
9 |
|
10 |
use crate::models::aggregation_models::SearchResult;
|
11 |
|
@@ -13,9 +13,27 @@ use crate::models::engine_models::{EngineError, SearchEngine};
|
|
13 |
|
14 |
use error_stack::{Report, Result, ResultExt};
|
15 |
|
|
|
|
|
16 |
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
17 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
18 |
-
pub struct DuckDuckGo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
#[async_trait::async_trait]
|
21 |
impl SearchEngine for DuckDuckGo {
|
@@ -59,34 +77,17 @@ impl SearchEngine for DuckDuckGo {
|
|
59 |
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
60 |
);
|
61 |
|
62 |
-
|
63 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
64 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
|
65 |
-
|
66 |
-
if document.select(&no_result).next().is_some() {
|
67 |
return Err(Report::new(EngineError::EmptyResultSet));
|
68 |
}
|
69 |
|
70 |
-
let results: Selector = Selector::parse(".result")
|
71 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
72 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
73 |
-
let result_title: Selector = Selector::parse(".result__a")
|
74 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
75 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
|
76 |
-
let result_url: Selector = Selector::parse(".result__url")
|
77 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
78 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
|
79 |
-
let result_desc: Selector = Selector::parse(".result__snippet")
|
80 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
81 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
|
82 |
-
|
83 |
// scrape all the results from the html
|
84 |
Ok(document
|
85 |
-
.select(&results)
|
86 |
.map(|result| {
|
87 |
SearchResult::new(
|
88 |
result
|
89 |
-
.select(&result_title)
|
90 |
.next()
|
91 |
.unwrap()
|
92 |
.inner_html()
|
@@ -94,7 +95,7 @@ impl SearchEngine for DuckDuckGo {
|
|
94 |
format!(
|
95 |
"https://{}",
|
96 |
result
|
97 |
-
.select(&result_url)
|
98 |
.next()
|
99 |
.unwrap()
|
100 |
.inner_html()
|
@@ -102,7 +103,7 @@ impl SearchEngine for DuckDuckGo {
|
|
102 |
)
|
103 |
.as_str(),
|
104 |
result
|
105 |
-
.select(&result_desc)
|
106 |
.next()
|
107 |
.unwrap()
|
108 |
.inner_html()
|
|
|
5 |
use std::collections::HashMap;
|
6 |
|
7 |
use reqwest::header::HeaderMap;
|
8 |
+
use scraper::Html;
|
9 |
|
10 |
use crate::models::aggregation_models::SearchResult;
|
11 |
|
|
|
13 |
|
14 |
use error_stack::{Report, Result, ResultExt};
|
15 |
|
16 |
+
use super::search_result_parser::SearchResultParser;
|
17 |
+
|
18 |
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
19 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
20 |
+
pub struct DuckDuckGo {
|
21 |
+
parser: SearchResultParser,
|
22 |
+
}
|
23 |
+
|
24 |
+
impl DuckDuckGo {
|
25 |
+
pub fn new() -> Result<Self, EngineError> {
|
26 |
+
Ok(Self {
|
27 |
+
parser: SearchResultParser::new(
|
28 |
+
".no-results",
|
29 |
+
".result",
|
30 |
+
".result__a",
|
31 |
+
".result__url",
|
32 |
+
".result__snippet",
|
33 |
+
)?,
|
34 |
+
})
|
35 |
+
}
|
36 |
+
}
|
37 |
|
38 |
#[async_trait::async_trait]
|
39 |
impl SearchEngine for DuckDuckGo {
|
|
|
77 |
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
78 |
);
|
79 |
|
80 |
+
if document.select(&self.parser.no_result).next().is_some() {
|
|
|
|
|
|
|
|
|
81 |
return Err(Report::new(EngineError::EmptyResultSet));
|
82 |
}
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
// scrape all the results from the html
|
85 |
Ok(document
|
86 |
+
.select(&self.parser.results)
|
87 |
.map(|result| {
|
88 |
SearchResult::new(
|
89 |
result
|
90 |
+
.select(&self.parser.result_title)
|
91 |
.next()
|
92 |
.unwrap()
|
93 |
.inner_html()
|
|
|
95 |
format!(
|
96 |
"https://{}",
|
97 |
result
|
98 |
+
.select(&self.parser.result_url)
|
99 |
.next()
|
100 |
.unwrap()
|
101 |
.inner_html()
|
|
|
103 |
)
|
104 |
.as_str(),
|
105 |
result
|
106 |
+
.select(&self.parser.result_desc)
|
107 |
.next()
|
108 |
.unwrap()
|
109 |
.inner_html()
|
src/engines/mod.rs
CHANGED
@@ -4,4 +4,5 @@
|
|
4 |
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
|
5 |
|
6 |
pub mod duckduckgo;
|
|
|
7 |
pub mod searx;
|
|
|
4 |
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
|
5 |
|
6 |
pub mod duckduckgo;
|
7 |
+
pub mod search_result_parser;
|
8 |
pub mod searx;
|
src/engines/search_result_parser.rs
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
use crate::models::engine_models::EngineError;
|
2 |
+
use error_stack::{Report, Result, ResultExt};
|
3 |
+
use scraper::{Html, Selector};
|
4 |
+
|
5 |
+
pub struct SearchResultParser {
|
6 |
+
pub no_result: Selector,
|
7 |
+
pub results: Selector,
|
8 |
+
pub result_title: Selector,
|
9 |
+
pub result_url: Selector,
|
10 |
+
pub result_desc: Selector,
|
11 |
+
}
|
12 |
+
|
13 |
+
impl SearchResultParser {
|
14 |
+
pub fn new(
|
15 |
+
no_result_selector: &str,
|
16 |
+
results_selector: &str,
|
17 |
+
result_title_selector: &str,
|
18 |
+
result_url_selector: &str,
|
19 |
+
result_desc_selector: &str,
|
20 |
+
) -> Result<SearchResultParser, EngineError> {
|
21 |
+
Ok(SearchResultParser {
|
22 |
+
no_result: new_selector(no_result_selector)?,
|
23 |
+
results: new_selector(results_selector)?,
|
24 |
+
result_title: new_selector(result_title_selector)?,
|
25 |
+
result_url: new_selector(result_url_selector)?,
|
26 |
+
result_desc: new_selector(result_desc_selector)?,
|
27 |
+
})
|
28 |
+
}
|
29 |
+
}
|
30 |
+
|
31 |
+
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
32 |
+
Selector::parse(selector).map_err(|err| {
|
33 |
+
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
34 |
+
"invalid CSS selector: {}, err: {:?}",
|
35 |
+
selector, err
|
36 |
+
))
|
37 |
+
})
|
38 |
+
}
|
src/engines/searx.rs
CHANGED
@@ -3,16 +3,34 @@
|
|
3 |
//! number if provided.
|
4 |
|
5 |
use reqwest::header::HeaderMap;
|
6 |
-
use scraper::
|
7 |
use std::collections::HashMap;
|
8 |
|
|
|
9 |
use crate::models::aggregation_models::SearchResult;
|
10 |
use crate::models::engine_models::{EngineError, SearchEngine};
|
11 |
use error_stack::{Report, Result, ResultExt};
|
12 |
|
13 |
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
14 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
15 |
-
pub struct Searx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
#[async_trait::async_trait]
|
18 |
impl SearchEngine for Searx {
|
@@ -52,13 +70,7 @@ impl SearchEngine for Searx {
|
|
52 |
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
53 |
);
|
54 |
|
55 |
-
let
|
56 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
57 |
-
.attach_printable_lazy(|| {
|
58 |
-
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
|
59 |
-
})?;
|
60 |
-
|
61 |
-
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
62 |
if no_result_msg.inner_html()
|
63 |
== "we didn't find any results. Please use another query or search in more categories"
|
64 |
{
|
@@ -66,40 +78,26 @@ impl SearchEngine for Searx {
|
|
66 |
}
|
67 |
}
|
68 |
|
69 |
-
let results: Selector = Selector::parse(".result")
|
70 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
71 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
72 |
-
let result_title: Selector = Selector::parse("h3>a")
|
73 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
74 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
75 |
-
let result_url: Selector = Selector::parse("h3>a")
|
76 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
77 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
78 |
-
|
79 |
-
let result_desc: Selector = Selector::parse(".content")
|
80 |
-
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
81 |
-
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
|
82 |
-
|
83 |
// scrape all the results from the html
|
84 |
Ok(document
|
85 |
-
.select(&results)
|
86 |
.map(|result| {
|
87 |
SearchResult::new(
|
88 |
result
|
89 |
-
.select(&result_title)
|
90 |
.next()
|
91 |
.unwrap()
|
92 |
.inner_html()
|
93 |
.trim(),
|
94 |
result
|
95 |
-
.select(&result_url)
|
96 |
.next()
|
97 |
.unwrap()
|
98 |
.value()
|
99 |
.attr("href")
|
100 |
.unwrap(),
|
101 |
result
|
102 |
-
.select(&result_desc)
|
103 |
.next()
|
104 |
.unwrap()
|
105 |
.inner_html()
|
|
|
3 |
//! number if provided.
|
4 |
|
5 |
use reqwest::header::HeaderMap;
|
6 |
+
use scraper::Html;
|
7 |
use std::collections::HashMap;
|
8 |
|
9 |
+
use super::search_result_parser::SearchResultParser;
|
10 |
use crate::models::aggregation_models::SearchResult;
|
11 |
use crate::models::engine_models::{EngineError, SearchEngine};
|
12 |
use error_stack::{Report, Result, ResultExt};
|
13 |
|
14 |
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
15 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
16 |
+
pub struct Searx {
|
17 |
+
parser: SearchResultParser,
|
18 |
+
}
|
19 |
+
|
20 |
+
impl Searx {
|
21 |
+
// new Searx engine
|
22 |
+
pub fn new() -> Result<Searx, EngineError> {
|
23 |
+
Ok(Self {
|
24 |
+
parser: SearchResultParser::new(
|
25 |
+
"#urls>.dialog-error>p",
|
26 |
+
".result",
|
27 |
+
"h3>a",
|
28 |
+
"h3>a",
|
29 |
+
".content",
|
30 |
+
)?,
|
31 |
+
})
|
32 |
+
}
|
33 |
+
}
|
34 |
|
35 |
#[async_trait::async_trait]
|
36 |
impl SearchEngine for Searx {
|
|
|
70 |
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
71 |
);
|
72 |
|
73 |
+
if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
if no_result_msg.inner_html()
|
75 |
== "we didn't find any results. Please use another query or search in more categories"
|
76 |
{
|
|
|
78 |
}
|
79 |
}
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
// scrape all the results from the html
|
82 |
Ok(document
|
83 |
+
.select(&self.parser.results)
|
84 |
.map(|result| {
|
85 |
SearchResult::new(
|
86 |
result
|
87 |
+
.select(&self.parser.result_title)
|
88 |
.next()
|
89 |
.unwrap()
|
90 |
.inner_html()
|
91 |
.trim(),
|
92 |
result
|
93 |
+
.select(&self.parser.result_url)
|
94 |
.next()
|
95 |
.unwrap()
|
96 |
.value()
|
97 |
.attr("href")
|
98 |
.unwrap(),
|
99 |
result
|
100 |
+
.select(&self.parser.result_desc)
|
101 |
.next()
|
102 |
.unwrap()
|
103 |
.inner_html()
|
src/models/aggregation_models.rs
CHANGED
@@ -85,12 +85,14 @@ impl EngineErrorInfo {
|
|
85 |
pub fn new(error: &EngineError, engine: &str) -> Self {
|
86 |
Self {
|
87 |
error: match error {
|
|
|
88 |
EngineError::RequestError => "RequestError".to_owned(),
|
89 |
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
|
90 |
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
|
91 |
},
|
92 |
engine: engine.to_owned(),
|
93 |
severity_color: match error {
|
|
|
94 |
EngineError::RequestError => "green".to_owned(),
|
95 |
EngineError::EmptyResultSet => "blue".to_owned(),
|
96 |
EngineError::UnexpectedError => "red".to_owned(),
|
|
|
85 |
pub fn new(error: &EngineError, engine: &str) -> Self {
|
86 |
Self {
|
87 |
error: match error {
|
88 |
+
EngineError::EngineNotFound => "EngineNotFound".to_owned(),
|
89 |
EngineError::RequestError => "RequestError".to_owned(),
|
90 |
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
|
91 |
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
|
92 |
},
|
93 |
engine: engine.to_owned(),
|
94 |
severity_color: match error {
|
95 |
+
EngineError::EngineNotFound => "red".to_owned(),
|
96 |
EngineError::RequestError => "green".to_owned(),
|
97 |
EngineError::EmptyResultSet => "blue".to_owned(),
|
98 |
EngineError::UnexpectedError => "red".to_owned(),
|
src/models/engine_models.rs
CHANGED
@@ -2,12 +2,14 @@
|
|
2 |
//! the upstream search engines with the search query provided by the user.
|
3 |
|
4 |
use super::aggregation_models::SearchResult;
|
5 |
-
use error_stack::{Result, ResultExt};
|
6 |
use std::{collections::HashMap, fmt, time::Duration};
|
7 |
|
8 |
/// A custom error type used for handle engine associated errors.
|
9 |
#[derive(Debug)]
|
10 |
pub enum EngineError {
|
|
|
|
|
11 |
/// This variant handles all request related errors like forbidden, not found,
|
12 |
/// etc.
|
13 |
EmptyResultSet,
|
@@ -24,6 +26,9 @@ pub enum EngineError {
|
|
24 |
impl fmt::Display for EngineError {
|
25 |
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
26 |
match self {
|
|
|
|
|
|
|
27 |
EngineError::EmptyResultSet => {
|
28 |
write!(f, "The upstream search engine returned an empty result set")
|
29 |
}
|
@@ -134,18 +139,21 @@ impl EngineHandler {
|
|
134 |
/// # Returns
|
135 |
///
|
136 |
/// It returns an option either containing the value or a none if the engine is unknown
|
137 |
-
pub fn new(engine_name: &str) ->
|
138 |
let engine: (&'static str, Box<dyn SearchEngine>) =
|
139 |
match engine_name.to_lowercase().as_str() {
|
140 |
-
"duckduckgo" =>
|
141 |
-
|
142 |
-
Box::new(
|
143 |
-
|
144 |
-
"searx" =>
|
145 |
-
|
|
|
|
|
|
|
146 |
};
|
147 |
|
148 |
-
|
149 |
engine: engine.1,
|
150 |
name: engine.0,
|
151 |
})
|
|
|
2 |
//! the upstream search engines with the search query provided by the user.
|
3 |
|
4 |
use super::aggregation_models::SearchResult;
|
5 |
+
use error_stack::{Report, Result, ResultExt};
|
6 |
use std::{collections::HashMap, fmt, time::Duration};
|
7 |
|
8 |
/// A custom error type used for handle engine associated errors.
|
9 |
#[derive(Debug)]
|
10 |
pub enum EngineError {
|
11 |
+
// No matching engine found
|
12 |
+
EngineNotFound,
|
13 |
/// This variant handles all request related errors like forbidden, not found,
|
14 |
/// etc.
|
15 |
EmptyResultSet,
|
|
|
26 |
impl fmt::Display for EngineError {
|
27 |
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
28 |
match self {
|
29 |
+
EngineError::EngineNotFound => {
|
30 |
+
write!(f, "Search engine not found")
|
31 |
+
}
|
32 |
EngineError::EmptyResultSet => {
|
33 |
write!(f, "The upstream search engine returned an empty result set")
|
34 |
}
|
|
|
139 |
/// # Returns
|
140 |
///
|
141 |
/// It returns an option either containing the value or a none if the engine is unknown
|
142 |
+
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
|
143 |
let engine: (&'static str, Box<dyn SearchEngine>) =
|
144 |
match engine_name.to_lowercase().as_str() {
|
145 |
+
"duckduckgo" => {
|
146 |
+
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
|
147 |
+
("duckduckgo", Box::new(engine))
|
148 |
+
}
|
149 |
+
"searx" => {
|
150 |
+
let engine = crate::engines::searx::Searx::new()?;
|
151 |
+
("searx", Box::new(engine))
|
152 |
+
}
|
153 |
+
_ => return Err(Report::from(EngineError::EngineNotFound)),
|
154 |
};
|
155 |
|
156 |
+
Ok(Self {
|
157 |
engine: engine.1,
|
158 |
name: engine.0,
|
159 |
})
|
src/server/routes/search.rs
CHANGED
@@ -191,7 +191,7 @@ async fn results(
|
|
191 |
let engines: Vec<EngineHandler> = cookie_value
|
192 |
.engines
|
193 |
.iter()
|
194 |
-
.filter_map(|name| EngineHandler::new(name))
|
195 |
.collect();
|
196 |
|
197 |
safe_search_level = match config.safe_search {
|
|
|
191 |
let engines: Vec<EngineHandler> = cookie_value
|
192 |
.engines
|
193 |
.iter()
|
194 |
+
.filter_map(|name| EngineHandler::new(name).ok())
|
195 |
.collect();
|
196 |
|
197 |
safe_search_level = match config.safe_search {
|