Spaces:
Runtime error
Runtime error
neon_arch
committed on
Commit
•
94ef62e
1
Parent(s):
8133de1
✨ feat: add documentation to code
Browse files- src/config_parser/parser.rs +9 -1
- src/engines/duckduckgo.rs +19 -18
- src/engines/engine_models.rs +2 -1
- src/engines/searx.rs +20 -18
- src/search_results_handler/aggregation_models.rs +10 -0
- src/search_results_handler/aggregator.rs +16 -6
- src/server/routes.rs +15 -1
src/config_parser/parser.rs
CHANGED
@@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua";
|
|
18 |
/// * `style` - It stores the theming options for the website.
|
19 |
/// * `redis_connection_url` - It stores the redis connection url address on which the redis
|
20 |
/// client should connect.
|
|
|
|
|
|
|
|
|
21 |
#[derive(Clone)]
|
22 |
pub struct Config {
|
23 |
pub port: u16,
|
@@ -31,9 +35,13 @@ pub struct Config {
|
|
31 |
}
|
32 |
|
33 |
/// Configuration options for the aggregator.
|
|
|
|
|
|
|
|
|
|
|
34 |
#[derive(Clone)]
|
35 |
pub struct AggreatorConfig {
|
36 |
-
/// Whether to introduce a random delay before sending the request to the search engine.
|
37 |
pub random_delay: bool,
|
38 |
}
|
39 |
|
|
|
18 |
/// * `style` - It stores the theming options for the website.
|
19 |
/// * `redis_connection_url` - It stores the redis connection url address on which the redis
|
20 |
/// client should connect.
|
21 |
+
/// * `aggregator` - It stores the aggregator options (`AggreatorConfig`), e.g. whether to enable or disable random delays between requests.
|
22 |
+
/// * `logging` - It stores the option of whether to enable or disable logs.
|
23 |
+
/// * `debug` - It stores the option of whether to enable or disable debug mode.
|
24 |
+
/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
|
25 |
#[derive(Clone)]
|
26 |
pub struct Config {
|
27 |
pub port: u16,
|
|
|
35 |
}
|
36 |
|
37 |
/// Configuration options for the aggregator.
|
38 |
+
///
|
39 |
+
/// # Fields
|
40 |
+
///
|
41 |
+
/// * `random_delay` - It stores the option of whether to enable or disable random delays between
|
42 |
+
/// requests.
|
43 |
#[derive(Clone)]
|
44 |
pub struct AggreatorConfig {
|
|
|
45 |
pub random_delay: bool,
|
46 |
}
|
47 |
|
src/engines/duckduckgo.rs
CHANGED
@@ -13,28 +13,29 @@ use super::engine_models::{EngineError, SearchEngine};
|
|
13 |
|
14 |
use error_stack::{IntoReport, Report, Result, ResultExt};
|
15 |
|
16 |
-
///
|
17 |
-
///
|
18 |
-
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
19 |
-
/// values are RawSearchResult struct and then returns it within a Result enum.
|
20 |
-
///
|
21 |
-
/// # Arguments
|
22 |
-
///
|
23 |
-
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
24 |
-
/// * `page` - Takes an u32 as an argument.
|
25 |
-
/// * `user_agent` - Takes a random user agent string as an argument.
|
26 |
-
///
|
27 |
-
/// # Errors
|
28 |
-
///
|
29 |
-
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
|
30 |
-
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
31 |
-
/// provide results for the requested search query and also returns error if the scraping selector
|
32 |
-
/// or HeaderMap fails to initialize.
|
33 |
-
|
34 |
pub struct DuckDuckGo;
|
35 |
|
36 |
#[async_trait::async_trait]
|
37 |
impl SearchEngine for DuckDuckGo {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
async fn results(
|
39 |
&self,
|
40 |
query: String,
|
|
|
13 |
|
14 |
use error_stack::{IntoReport, Report, Result, ResultExt};
|
15 |
|
16 |
+
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
17 |
+
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
pub struct DuckDuckGo;
|
19 |
|
20 |
#[async_trait::async_trait]
|
21 |
impl SearchEngine for DuckDuckGo {
|
22 |
+
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
|
23 |
+
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
24 |
+
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
25 |
+
/// values are RawSearchResult struct and then returns it within a Result enum.
|
26 |
+
///
|
27 |
+
/// # Arguments
|
28 |
+
///
|
29 |
+
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
30 |
+
/// * `page` - Takes an u32 as an argument.
|
31 |
+
/// * `user_agent` - Takes a random user agent string as an argument.
|
32 |
+
///
|
33 |
+
/// # Errors
|
34 |
+
///
|
35 |
+
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
|
36 |
+
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
37 |
+
/// provide results for the requested search query and also returns error if the scraping selector
|
38 |
+
/// or HeaderMap fails to initialize.
|
39 |
async fn results(
|
40 |
&self,
|
41 |
query: String,
|
src/engines/engine_models.rs
CHANGED
@@ -43,6 +43,7 @@ impl fmt::Display for EngineError {
|
|
43 |
|
44 |
impl error_stack::Context for EngineError {}
|
45 |
|
|
|
46 |
#[async_trait::async_trait]
|
47 |
pub trait SearchEngine {
|
48 |
async fn fetch_html_from_upstream(
|
@@ -53,7 +54,7 @@ pub trait SearchEngine {
|
|
53 |
// fetch the html from upstream search engine
|
54 |
Ok(reqwest::Client::new()
|
55 |
.get(url)
|
56 |
-
.timeout(Duration::from_secs(30))
|
57 |
.headers(header_map) // add spoofed headers to emulate human behaviour
|
58 |
.send()
|
59 |
.await
|
|
|
43 |
|
44 |
impl error_stack::Context for EngineError {}
|
45 |
|
46 |
+
/// A trait to define common behaviour for all search engines.
|
47 |
#[async_trait::async_trait]
|
48 |
pub trait SearchEngine {
|
49 |
async fn fetch_html_from_upstream(
|
|
|
54 |
// fetch the html from upstream search engine
|
55 |
Ok(reqwest::Client::new()
|
56 |
.get(url)
|
57 |
+
.timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server
|
58 |
.headers(header_map) // add spoofed headers to emulate human behaviour
|
59 |
.send()
|
60 |
.await
|
src/engines/searx.rs
CHANGED
@@ -11,28 +11,30 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
11 |
use super::engine_models::{EngineError, SearchEngine};
|
12 |
use error_stack::{IntoReport, Report, Result, ResultExt};
|
13 |
|
14 |
-
///
|
15 |
-
///
|
16 |
-
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
17 |
-
/// values are RawSearchResult struct and then returns it within a Result enum.
|
18 |
-
///
|
19 |
-
/// # Arguments
|
20 |
-
///
|
21 |
-
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
22 |
-
/// * `page` - Takes an u32 as an argument.
|
23 |
-
/// * `user_agent` - Takes a random user agent string as an argument.
|
24 |
-
///
|
25 |
-
/// # Errors
|
26 |
-
///
|
27 |
-
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
|
28 |
-
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
29 |
-
/// provide results for the requested search query and also returns error if the scraping selector
|
30 |
-
/// or HeaderMap fails to initialize.
|
31 |
-
|
32 |
pub struct Searx;
|
33 |
|
34 |
#[async_trait::async_trait]
|
35 |
impl SearchEngine for Searx {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
async fn results(
|
37 |
&self,
|
38 |
query: String,
|
|
|
11 |
use super::engine_models::{EngineError, SearchEngine};
|
12 |
use error_stack::{IntoReport, Report, Result, ResultExt};
|
13 |
|
14 |
+
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
15 |
+
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
pub struct Searx;
|
17 |
|
18 |
#[async_trait::async_trait]
|
19 |
impl SearchEngine for Searx {
|
20 |
+
/// This function scrapes results from the upstream engine searx and puts all the scraped
|
21 |
+
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
22 |
+
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
23 |
+
/// values are RawSearchResult struct and then returns it within a Result enum.
|
24 |
+
///
|
25 |
+
/// # Arguments
|
26 |
+
///
|
27 |
+
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
28 |
+
/// * `page` - Takes an u32 as an argument.
|
29 |
+
/// * `user_agent` - Takes a random user agent string as an argument.
|
30 |
+
///
|
31 |
+
/// # Errors
|
32 |
+
///
|
33 |
+
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
|
34 |
+
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
35 |
+
/// provide results for the requested search query and also returns error if the scraping selector
|
36 |
+
/// or HeaderMap fails to initialize.
|
37 |
+
|
38 |
async fn results(
|
39 |
&self,
|
40 |
query: String,
|
src/search_results_handler/aggregation_models.rs
CHANGED
@@ -143,6 +143,11 @@ impl EngineErrorInfo {
|
|
143 |
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
144 |
/// `SearchResult` structs.
|
145 |
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
|
|
|
|
|
|
|
|
|
|
146 |
#[derive(Serialize, Deserialize)]
|
147 |
#[serde(rename_all = "camelCase")]
|
148 |
pub struct SearchResults {
|
@@ -162,6 +167,8 @@ impl SearchResults {
|
|
162 |
/// and stores it into a vector of `SearchResult` structs.
|
163 |
/// * `page_query` - Takes an argument of current page`s search query `q` provided in
|
164 |
/// the search url.
|
|
|
|
|
165 |
pub fn new(
|
166 |
results: Vec<SearchResult>,
|
167 |
page_query: String,
|
@@ -176,14 +183,17 @@ impl SearchResults {
|
|
176 |
}
|
177 |
}
|
178 |
|
|
|
179 |
pub fn add_style(&mut self, style: Style) {
|
180 |
self.style = style;
|
181 |
}
|
182 |
|
|
|
183 |
pub fn is_empty_result_set(&self) -> bool {
|
184 |
self.results.is_empty()
|
185 |
}
|
186 |
|
|
|
187 |
pub fn set_empty_result_set(&mut self) {
|
188 |
self.empty_result_set = true;
|
189 |
}
|
|
|
143 |
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
144 |
/// `SearchResult` structs.
|
145 |
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
146 |
+
/// * `style` - Stores the theming options for the website.
|
147 |
+
/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
|
148 |
+
/// and the type of error that caused it.
|
149 |
+
/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
|
150 |
+
/// given search query.
|
151 |
#[derive(Serialize, Deserialize)]
|
152 |
#[serde(rename_all = "camelCase")]
|
153 |
pub struct SearchResults {
|
|
|
167 |
/// and stores it into a vector of `SearchResult` structs.
|
168 |
/// * `page_query` - Takes an argument of current page`s search query `q` provided in
|
169 |
/// the search url.
|
170 |
+
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
|
171 |
+
/// given search query.
|
172 |
pub fn new(
|
173 |
results: Vec<SearchResult>,
|
174 |
page_query: String,
|
|
|
183 |
}
|
184 |
}
|
185 |
|
186 |
+
/// A setter function to add website style to the return search results.
|
187 |
pub fn add_style(&mut self, style: Style) {
|
188 |
self.style = style;
|
189 |
}
|
190 |
|
191 |
+
/// A function which checks whether the results stored are empty or not.
|
192 |
pub fn is_empty_result_set(&self) -> bool {
|
193 |
self.results.is_empty()
|
194 |
}
|
195 |
|
196 |
+
/// A setter function which sets the empty_result_set to true.
|
197 |
pub fn set_empty_result_set(&mut self) {
|
198 |
self.empty_result_set = true;
|
199 |
}
|
src/search_results_handler/aggregator.rs
CHANGED
@@ -18,14 +18,21 @@ use crate::engines::{
|
|
18 |
searx,
|
19 |
};
|
20 |
|
|
|
21 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
|
22 |
|
23 |
-
/// A function that aggregates all the scraped results from the above
|
24 |
-
///
|
25 |
-
///
|
26 |
-
///
|
27 |
-
///
|
28 |
-
///
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
///
|
30 |
/// # Example:
|
31 |
///
|
@@ -37,6 +44,9 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<
|
|
37 |
/// * `query` - Accepts a string to query with the above upstream search engines.
|
38 |
/// * `page` - Accepts an u32 page number.
|
39 |
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
|
|
|
|
|
|
|
40 |
///
|
41 |
/// # Error
|
42 |
///
|
|
|
18 |
searx,
|
19 |
};
|
20 |
|
21 |
+
/// Aliases for long type annotations
|
22 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
|
23 |
|
24 |
+
/// A function that aggregates all the scraped results from the above user selected upstream
|
25 |
+
/// search engines either selected from the UI or from the config file which is handled by the code
|
26 |
+
/// by matching over the selected search engines and adding the selected ones to the vector which
|
27 |
+
/// is then used to create an async task vector with `tokio::spawn` which returns a future which
|
28 |
+
/// is then awaited on in another loop and then all the collected results is filtered for errors
|
29 |
+
/// and proper results and if an error is found is then sent to the UI with the engine name and the
|
30 |
+
/// error type that caused it by putting them finally in the returned `SearchResults` struct. Also
|
31 |
+
/// the same process also removes duplicate results and if two results are found to be from two or
|
32 |
+
/// more engines then puts their names together to show the results are fetched from these upstream
|
33 |
+
/// engines and then removes all data from the HashMap and puts into a struct of all results aggregated
|
34 |
+
/// into a vector and also adds the query used into the struct. This is necessary because otherwise the
|
35 |
+
/// search bar in search remains empty if searched from the query url.
|
36 |
///
|
37 |
/// # Example:
|
38 |
///
|
|
|
44 |
/// * `query` - Accepts a string to query with the above upstream search engines.
|
45 |
/// * `page` - Accepts an u32 page number.
|
46 |
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
|
47 |
+
/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
|
48 |
+
/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
|
49 |
+
/// user through the UI or the config file.
|
50 |
///
|
51 |
/// # Error
|
52 |
///
|
src/server/routes.rs
CHANGED
@@ -51,6 +51,13 @@ pub async fn not_found(
|
|
51 |
.body(page_content))
|
52 |
}
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
#[allow(dead_code)]
|
55 |
#[derive(Deserialize)]
|
56 |
struct Cookie {
|
@@ -126,7 +133,7 @@ pub async fn search(
|
|
126 |
|
127 |
// fetch the cached results json.
|
128 |
let cached_results_json = redis_cache.cached_results_json(&page_url);
|
129 |
-
// check if fetched results was indeed fetched or it was an error and if so
|
130 |
// handle the data accordingly.
|
131 |
match cached_results_json {
|
132 |
Ok(results_json) => {
|
@@ -135,6 +142,10 @@ pub async fn search(
|
|
135 |
Ok(HttpResponse::Ok().body(page_content))
|
136 |
}
|
137 |
Err(_) => {
|
|
|
|
|
|
|
|
|
138 |
let mut results_json: crate::search_results_handler::aggregation_models::SearchResults = match req.cookie("appCookie") {
|
139 |
Some(cookie_value) => {
|
140 |
let cookie_value:Cookie = serde_json::from_str(cookie_value.name_value().1)?;
|
@@ -143,6 +154,9 @@ pub async fn search(
|
|
143 |
None => aggregate(query.clone(), page, config.aggregator.random_delay, config.debug, config.upstream_search_engines.clone()).await?,
|
144 |
};
|
145 |
results_json.add_style(config.style.clone());
|
|
|
|
|
|
|
146 |
if results_json.is_empty_result_set() {
|
147 |
results_json.set_empty_result_set();
|
148 |
}
|
|
|
51 |
.body(page_content))
|
52 |
}
|
53 |
|
54 |
+
/// A named struct which is used to deserialize the cookies fetched from the client side.
|
55 |
+
///
|
56 |
+
/// # Fields
|
57 |
+
///
|
58 |
+
/// * `theme` - It stores the theme name used in the website.
|
59 |
+
/// * `colorscheme` - It stores the colorscheme name used for the website theme.
|
60 |
+
/// * `engines` - It stores the user selected upstream search engines selected from the UI.
|
61 |
#[allow(dead_code)]
|
62 |
#[derive(Deserialize)]
|
63 |
struct Cookie {
|
|
|
133 |
|
134 |
// fetch the cached results json.
|
135 |
let cached_results_json = redis_cache.cached_results_json(&page_url);
|
136 |
+
// check if the cached results were indeed fetched or it was an error and if so
|
137 |
// handle the data accordingly.
|
138 |
match cached_results_json {
|
139 |
Ok(results_json) => {
|
|
|
142 |
Ok(HttpResponse::Ok().body(page_content))
|
143 |
}
|
144 |
Err(_) => {
|
145 |
+
// check if the cookie value is empty or not if it is empty then use the
|
146 |
+
// default selected upstream search engines from the config file otherwise
|
147 |
+
// parse the non-empty cookie and grab the user selected engines from the
|
148 |
+
// UI and use that.
|
149 |
let mut results_json: crate::search_results_handler::aggregation_models::SearchResults = match req.cookie("appCookie") {
|
150 |
Some(cookie_value) => {
|
151 |
let cookie_value:Cookie = serde_json::from_str(cookie_value.name_value().1)?;
|
|
|
154 |
None => aggregate(query.clone(), page, config.aggregator.random_delay, config.debug, config.upstream_search_engines.clone()).await?,
|
155 |
};
|
156 |
results_json.add_style(config.style.clone());
|
157 |
+
// check whether the results grabbed from the upstream engines are empty or
|
158 |
+
// not if they are empty then set the empty_result_set option to true in
|
159 |
+
// the result json.
|
160 |
if results_json.is_empty_result_set() {
|
161 |
results_json.set_empty_result_set();
|
162 |
}
|