Spaces:
Runtime error
Runtime error
File size: 6,362 Bytes
5962cca 493c56b 2a68081 b72af01 5962cca 019b332 9a4cf94 049b1c1 5962cca 049b1c1 9a4cf94 049b1c1 9a4cf94 5962cca 9a4cf94 5962cca 9a4cf94 5962cca 9a4cf94 ebb9e9e c2280b7 5962cca b72af01 15dfda6 b72af01 5aca5c0 049b1c1 b72af01 2a68081 b72af01 2d47e8d b72af01 2d47e8d 15dfda6 b72af01 049b1c1 b72af01 2a68081 b72af01 2a68081 2d47e8d cbad560 5aca5c0 049b1c1 5aca5c0 049b1c1 5aca5c0 049b1c1 5aca5c0 049b1c1 5aca5c0 493c56b 5aca5c0 049b1c1 5aca5c0 b72af01 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
//! This module provides the error enum used to handle the different errors that can occur while
//! requesting data from the upstream search engines with the search query provided by the user.
//! It also defines the `SearchEngine` trait and the `EngineHandler` type.
use super::aggregation_models::SearchResult;
use error_stack::{Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used to handle engine-associated errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EngineError {
    /// This variant handles the "no results found" case, i.e. the upstream
    /// search engine returned an empty result set.
    EmptyResultSet,
    /// This variant handles all request related errors like forbidden, not found,
    /// etc. which occur while requesting data from the upstream search engine.
    RequestError,
    /// This variant handles all the errors which are unexpected or occur rarely
    /// and are errors mostly related to failure in initialization of HeaderMap,
    /// Selector errors and all other errors occurring within the code handling
    /// the `upstream search engines`.
    UnexpectedError,
}
impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EngineError::EmptyResultSet => {
write!(f, "The upstream search engine returned an empty result set")
}
EngineError::RequestError => {
write!(
f,
"Error occurred while requesting data from upstream search engine"
)
}
EngineError::UnexpectedError => {
write!(f, "An unexpected error occurred while processing the data")
}
}
}
}
// Empty marker impl (no methods overridden): implementing `error_stack::Context` is what
// lets `EngineError` be handed to `change_context`, as `fetch_html_from_upstream` does below.
impl error_stack::Context for EngineError {}
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine: Sync + Send {
    /// This helper function fetches/requests the search results from the upstream search engine
    /// in an html form.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes the url of the upstream search engine with the user requested search
    /// query appended in the search parameters.
    /// * `header_map` - It takes the http request headers to be sent to the upstream engine in
    /// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
    /// * `request_timeout` - It takes the request timeout value in seconds, which limits how
    /// long each request may stay connected while waiting for the upstream engine to provide
    /// the results.
    ///
    /// # Errors
    ///
    /// It returns the html data as a string if the upstream engine provides the data as expected,
    /// otherwise it returns a custom `EngineError`: both a failure to send the request and a
    /// failure to read the response body are reported as `EngineError::RequestError`.
    async fn fetch_html_from_upstream(
        &self,
        url: &str,
        header_map: reqwest::header::HeaderMap,
        request_timeout: u8,
    ) -> Result<String, EngineError> {
        // NOTE(review): `error_for_status()` is never called, so HTTP error statuses (4xx/5xx)
        // appear to be returned as body text rather than as `RequestError` - confirm intended.

        // fetch the html from upstream search engine
        reqwest::Client::new()
            .get(url)
            // Add timeout to request to avoid DDOSing the server; `u64::from` is a lossless
            // widening of the `u8` seconds value.
            .timeout(Duration::from_secs(u64::from(request_timeout)))
            .headers(header_map) // add spoofed headers to emulate human behavior
            .send()
            .await
            .change_context(EngineError::RequestError)?
            .text()
            .await
            // Already the function's return type, so no `Ok(..?)` re-wrapping is needed.
            .change_context(EngineError::RequestError)
    }

    /// This function scrapes results from the upstream engine and puts all the scraped results
    /// like title, visiting_url (href in html), engine (from which engine it was fetched from)
    /// and description in a `SearchResult` and then adds that to a HashMap whose keys are urls
    /// and values are `SearchResult` structs and then returns it within a Result enum.
    ///
    /// # Arguments
    ///
    /// * `query` - Takes the user provided query to query to the upstream search engine with.
    /// * `page` - Takes a u32 as an argument (presumably the results page number to request;
    /// confirm against the implementors).
    /// * `user_agent` - Takes a random user agent string as an argument.
    /// * `request_timeout` - Takes a time (secs) as a value which controls the server request
    /// timeout.
    /// * `safe_search` - Takes a u8 as an argument (presumably the safe search level to apply
    /// to the upstream query; confirm the accepted values against the implementors).
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` if the user is not connected to the internet or if there is a
    /// failure to reach the above `upstream search engine` page or if the
    /// `upstream search engine` is unable to provide results for the requested search query and
    /// also returns an error if the scraping selector or HeaderMap fails to initialize.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        request_timeout: u8,
        safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError>;
}
/// Pairs a boxed search engine implementation with the name of the engine it belongs to.
pub struct EngineHandler {
    /// The name of the engine this handler is associated with.
    name: &'static str,
    /// The engine itself, kept behind a `Box` as a trait object because it is only known
    /// through the `SearchEngine` trait.
    engine: Box<dyn SearchEngine>,
}
impl Clone for EngineHandler {
    /// Clones the handler by building a fresh one from the stored engine name.
    ///
    /// The boxed `dyn SearchEngine` is not cloned directly; instead a new handler is obtained
    /// through `EngineHandler::new(self.name)`.
    ///
    /// # Panics
    ///
    /// Panics if `EngineHandler::new` does not recognize `self.name`. That should not happen as
    /// long as handlers are only ever created through `EngineHandler::new`, which stores one of
    /// the names it accepts.
    fn clone(&self) -> Self {
        Self::new(self.name)
            .expect("the stored engine name must be one that EngineHandler::new accepts")
    }
}
impl EngineHandler {
    /// Builds an engine handler from an engine name.
    ///
    /// The name is lowercased before it is matched, so the lookup is case-insensitive. The
    /// recognized names are `"duckduckgo"` and `"searx"`.
    ///
    /// # Arguments
    ///
    /// * `engine_name` - The name of the engine the handler should be associated with.
    ///
    /// # Returns
    ///
    /// `Some` handler holding the matching engine and its canonical lowercase name, or `None`
    /// if the engine name is unknown.
    pub fn new(engine_name: &str) -> Option<Self> {
        let lowered = engine_name.to_lowercase();
        let (name, engine): (&'static str, Box<dyn SearchEngine>) = match lowered.as_str() {
            "duckduckgo" => ("duckduckgo", Box::new(crate::engines::duckduckgo::DuckDuckGo)),
            "searx" => ("searx", Box::new(crate::engines::searx::Searx)),
            // Unknown engine: there is nothing to construct.
            _ => return None,
        };

        Some(Self { engine, name })
    }

    /// Consumes the handler and splits it into its parts: the engine name followed by the
    /// boxed engine.
    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
        let Self { engine, name } = self;
        (name, engine)
    }
}
|