Spaces:
Runtime error
Runtime error
Zsombor Gegesy
committed on
Commit
•
57c73d3
1
Parent(s):
75a77d2
Refactor the search result parsing
Browse files
- src/engines/duckduckgo.rs +10 -30
- src/engines/search_result_parser.rs +41 -8
- src/engines/searx.rs +13 -29
- src/models/engine_models.rs +1 -1
src/engines/duckduckgo.rs
CHANGED
@@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
|
|
18 |
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
19 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
20 |
pub struct DuckDuckGo {
|
|
|
21 |
parser: SearchResultParser,
|
22 |
}
|
23 |
|
24 |
impl DuckDuckGo {
|
|
|
25 |
pub fn new() -> Result<Self, EngineError> {
|
26 |
Ok(Self {
|
27 |
parser: SearchResultParser::new(
|
@@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
|
|
77 |
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
78 |
);
|
79 |
|
80 |
-
if
|
81 |
return Err(Report::new(EngineError::EmptyResultSet));
|
82 |
}
|
83 |
|
84 |
// scrape all the results from the html
|
85 |
-
|
86 |
-
.
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
.next()
|
92 |
-
.unwrap()
|
93 |
-
.inner_html()
|
94 |
-
.trim(),
|
95 |
-
format!(
|
96 |
-
"https://{}",
|
97 |
-
result
|
98 |
-
.select(&self.parser.result_url)
|
99 |
-
.next()
|
100 |
-
.unwrap()
|
101 |
-
.inner_html()
|
102 |
-
.trim()
|
103 |
-
)
|
104 |
-
.as_str(),
|
105 |
-
result
|
106 |
-
.select(&self.parser.result_desc)
|
107 |
-
.next()
|
108 |
-
.unwrap()
|
109 |
-
.inner_html()
|
110 |
-
.trim(),
|
111 |
&["duckduckgo"],
|
112 |
-
)
|
113 |
})
|
114 |
-
.map(|search_result| (search_result.url.clone(), search_result))
|
115 |
-
.collect())
|
116 |
}
|
117 |
}
|
|
|
18 |
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
19 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
20 |
pub struct DuckDuckGo {
|
21 |
+
// The parser, used to interpret the search result.
|
22 |
parser: SearchResultParser,
|
23 |
}
|
24 |
|
25 |
impl DuckDuckGo {
|
26 |
+
/// Creates the DuckDuckGo parser.
|
27 |
pub fn new() -> Result<Self, EngineError> {
|
28 |
Ok(Self {
|
29 |
parser: SearchResultParser::new(
|
|
|
79 |
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
80 |
);
|
81 |
|
82 |
+
if self.parser.parse_for_no_results(&document).next().is_some() {
|
83 |
return Err(Report::new(EngineError::EmptyResultSet));
|
84 |
}
|
85 |
|
86 |
// scrape all the results from the html
|
87 |
+
self.parser
|
88 |
+
.parse_for_results(&document, |title, url, desc| {
|
89 |
+
Some(SearchResult::new(
|
90 |
+
title.inner_html().trim(),
|
91 |
+
&format!("https://{}", url.inner_html().trim()),
|
92 |
+
desc.inner_html().trim(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
&["duckduckgo"],
|
94 |
+
))
|
95 |
})
|
|
|
|
|
96 |
}
|
97 |
}
|
src/engines/search_result_parser.rs
CHANGED
@@ -1,16 +1,21 @@
|
|
1 |
-
|
2 |
-
use
|
3 |
-
use scraper::{Html, Selector};
|
4 |
|
|
|
|
|
|
|
|
|
|
|
5 |
pub struct SearchResultParser {
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
}
|
12 |
|
13 |
impl SearchResultParser {
|
|
|
14 |
pub fn new(
|
15 |
no_result_selector: &str,
|
16 |
results_selector: &str,
|
@@ -26,8 +31,36 @@ impl SearchResultParser {
|
|
26 |
result_desc: new_selector(result_desc_selector)?,
|
27 |
})
|
28 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
}
|
30 |
|
|
|
31 |
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
32 |
Selector::parse(selector).map_err(|err| {
|
33 |
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
|
|
1 |
+
//! This module provides helper functionality for parsing an HTML document into internal `SearchResult` values.
|
2 |
+
use std::collections::HashMap;
|
|
|
3 |
|
4 |
+
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
|
5 |
+
use error_stack::{Report, Result};
|
6 |
+
use scraper::{html::Select, ElementRef, Html, Selector};
|
7 |
+
|
8 |
+
/// An HTML search result parser based on predefined CSS selectors.
|
9 |
pub struct SearchResultParser {
|
10 |
+
no_result: Selector,
|
11 |
+
results: Selector,
|
12 |
+
result_title: Selector,
|
13 |
+
result_url: Selector,
|
14 |
+
result_desc: Selector,
|
15 |
}
|
16 |
|
17 |
impl SearchResultParser {
|
18 |
+
/// Creates a new parser if all the selectors are valid; otherwise returns an `EngineError`.
|
19 |
pub fn new(
|
20 |
no_result_selector: &str,
|
21 |
results_selector: &str,
|
|
|
31 |
result_desc: new_selector(result_desc_selector)?,
|
32 |
})
|
33 |
}
|
34 |
+
|
35 |
+
/// Parses the HTML and returns the elements representing the 'no result found' response.
|
36 |
+
pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
|
37 |
+
document.select(&self.no_result)
|
38 |
+
}
|
39 |
+
|
40 |
+
/// Parses the HTML and converts the results to `SearchResult` values with the help of the builder function.
|
41 |
+
pub fn parse_for_results(
|
42 |
+
&self,
|
43 |
+
document: &Html,
|
44 |
+
builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
|
45 |
+
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
46 |
+
let res = document
|
47 |
+
.select(&self.results)
|
48 |
+
.filter_map(|result| {
|
49 |
+
let title = result.select(&self.result_title).next();
|
50 |
+
let url = result.select(&self.result_url).next();
|
51 |
+
let desc = result.select(&self.result_desc).next();
|
52 |
+
match (title, url, desc) {
|
53 |
+
(Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
|
54 |
+
_ => None,
|
55 |
+
}
|
56 |
+
})
|
57 |
+
.map(|search_result| (search_result.url.clone(), search_result))
|
58 |
+
.collect();
|
59 |
+
Ok(res)
|
60 |
+
}
|
61 |
}
|
62 |
|
63 |
+
/// Creates a `Selector` if the given parameter is a valid CSS expression; otherwise converts the parse failure into an `EngineError`.
|
64 |
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
65 |
Selector::parse(selector).map_err(|err| {
|
66 |
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
src/engines/searx.rs
CHANGED
@@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
|
|
14 |
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
15 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
16 |
pub struct Searx {
|
|
|
17 |
parser: SearchResultParser,
|
18 |
}
|
19 |
|
20 |
impl Searx {
|
21 |
-
|
22 |
pub fn new() -> Result<Searx, EngineError> {
|
23 |
Ok(Self {
|
24 |
parser: SearchResultParser::new(
|
@@ -70,7 +71,7 @@ impl SearchEngine for Searx {
|
|
70 |
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
71 |
);
|
72 |
|
73 |
-
if let Some(no_result_msg) =
|
74 |
if no_result_msg.inner_html()
|
75 |
== "we didn't find any results. Please use another query or search in more categories"
|
76 |
{
|
@@ -79,33 +80,16 @@ impl SearchEngine for Searx {
|
|
79 |
}
|
80 |
|
81 |
// scrape all the results from the html
|
82 |
-
|
83 |
-
.
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
.
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
result
|
93 |
-
.select(&self.parser.result_url)
|
94 |
-
.next()
|
95 |
-
.unwrap()
|
96 |
-
.value()
|
97 |
-
.attr("href")
|
98 |
-
.unwrap(),
|
99 |
-
result
|
100 |
-
.select(&self.parser.result_desc)
|
101 |
-
.next()
|
102 |
-
.unwrap()
|
103 |
-
.inner_html()
|
104 |
-
.trim(),
|
105 |
-
&["searx"],
|
106 |
-
)
|
107 |
})
|
108 |
-
.map(|search_result| (search_result.url.clone(), search_result))
|
109 |
-
.collect())
|
110 |
}
|
111 |
}
|
|
|
14 |
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
15 |
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
16 |
pub struct Searx {
|
17 |
+
// The parser, used to interpret the search result.
|
18 |
parser: SearchResultParser,
|
19 |
}
|
20 |
|
21 |
impl Searx {
|
22 |
+
/// Creates the Searx parser.
|
23 |
pub fn new() -> Result<Searx, EngineError> {
|
24 |
Ok(Self {
|
25 |
parser: SearchResultParser::new(
|
|
|
71 |
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
72 |
);
|
73 |
|
74 |
+
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
|
75 |
if no_result_msg.inner_html()
|
76 |
== "we didn't find any results. Please use another query or search in more categories"
|
77 |
{
|
|
|
80 |
}
|
81 |
|
82 |
// scrape all the results from the html
|
83 |
+
self.parser
|
84 |
+
.parse_for_results(&document, |title, url, desc| {
|
85 |
+
url.value().attr("href").map(|url| {
|
86 |
+
SearchResult::new(
|
87 |
+
title.inner_html().trim(),
|
88 |
+
url,
|
89 |
+
desc.inner_html().trim(),
|
90 |
+
&["searx"],
|
91 |
+
)
|
92 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
})
|
|
|
|
|
94 |
}
|
95 |
}
|
src/models/engine_models.rs
CHANGED
@@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
|
|
8 |
/// A custom error type used for handle engine associated errors.
|
9 |
#[derive(Debug)]
|
10 |
pub enum EngineError {
|
11 |
-
|
12 |
EngineNotFound,
|
13 |
/// This variant handles all request related errors like forbidden, not found,
|
14 |
/// etc.
|
|
|
8 |
/// A custom error type used for handle engine associated errors.
|
9 |
#[derive(Debug)]
|
10 |
pub enum EngineError {
|
11 |
+
/// No matching engine found
|
12 |
EngineNotFound,
|
13 |
/// This variant handles all request related errors like forbidden, not found,
|
14 |
/// etc.
|