Spaces:
Runtime error
Runtime error
File size: 9,068 Bytes
5962cca 493c56b 75a77d2 b42adaa 991f3f5 5962cca 019b332 9a4cf94 57c73d3 f56002d 049b1c1 5962cca 049b1c1 9a4cf94 049b1c1 9a4cf94 5962cca 9a4cf94 5962cca f56002d 75a77d2 9a4cf94 5962cca 9a4cf94 ebb9e9e c2280b7 5962cca b72af01 15dfda6 b72af01 5aca5c0 049b1c1 b72af01 2a68081 b72af01 b42adaa b72af01 b42adaa b72af01 15dfda6 b72af01 669e365 049b1c1 b72af01 2a68081 b72af01 2a68081 b42adaa cbad560 991f3f5 5aca5c0 049b1c1 5aca5c0 049b1c1 5aca5c0 049b1c1 5aca5c0 049b1c1 75a77d2 5aca5c0 75a77d2 27bc52c e1e426c c02006c 9f23a1c 50aa52c f56002d 5aca5c0 75a77d2 5aca5c0 049b1c1 5aca5c0 b72af01 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
//! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user.
use super::aggregation_models::SearchResult;
use error_stack::{Report, Result, ResultExt};
use reqwest::Client;
use std::fmt;
/// A custom error type used for handle engine associated errors.
#[derive(Debug)]
pub enum EngineError {
/// No matching engine found
NoSuchEngineFound(String),
/// This variant handles all request related errors like forbidden, not found,
/// etc.
EmptyResultSet,
/// This variant handles the not results found error provide by the upstream
/// search engines.
RequestError,
/// This variant handles all the errors which are unexpected or occur rarely
/// and are errors mostly related to failure in initialization of HeaderMap,
/// Selector errors and all other errors occurring within the code handling
/// the `upstream search engines`.
UnexpectedError,
}
impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EngineError::NoSuchEngineFound(engine) => {
write!(f, "No such engine with the name '{engine}' found")
}
EngineError::EmptyResultSet => {
write!(f, "The upstream search engine returned an empty result set")
}
EngineError::RequestError => {
write!(
f,
"Error occurred while requesting data from upstream search engine"
)
}
EngineError::UnexpectedError => {
write!(f, "An unexpected error occurred while processing the data")
}
}
}
}
impl error_stack::Context for EngineError {}
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine: Sync + Send {
/// This helper function fetches/requests the search results from the upstream search engine in
/// an html form.
///
/// # Arguments
///
/// * `url` - It takes the url of the upstream search engine with the user requested search
/// query appended in the search parameters.
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
/// the amount of time for each request to remain connected when until the results can be provided
/// by the upstream engine.
///
/// # Error
///
/// It returns the html data as a string if the upstream engine provides the data as expected
/// otherwise it returns a custom `EngineError`.
async fn fetch_html_from_upstream(
&self,
url: &str,
header_map: reqwest::header::HeaderMap,
client: &Client,
) -> Result<String, EngineError> {
// fetch the html from upstream search engine
Ok(client
.get(url)
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.change_context(EngineError::RequestError)?
.text()
.await
.change_context(EngineError::RequestError)?)
}
/// This helper function fetches/requests the json search results from the upstream search engine as a vector of bytes.
///
/// # Arguments
///
/// * `url` - It takes the url of the upstream search engine with the user requested search
/// query appended in the search parameters.
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
/// the amount of time for each request to remain connected when until the results can be provided
/// by the upstream engine.
///
/// # Error
///
/// It returns the html data as a vector of bytes if the upstream engine provides the data as expected
/// otherwise it returns a custom `EngineError`.
async fn fetch_json_as_bytes_from_upstream(
&self,
url: &str,
header_map: reqwest::header::HeaderMap,
client: &Client,
) -> Result<Vec<u8>, EngineError> {
// fetch the json response from upstream search engine
Ok(client
.get(url)
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.change_context(EngineError::RequestError)?
.bytes()
.await
.change_context(EngineError::RequestError)?
.to_vec())
}
/// This function scrapes results from the upstream engine and puts all the scraped results like
/// title, visiting_url (href in html),engine (from which engine it was fetched from) and description
/// in a RawSearchResult and then adds that to HashMap whose keys are url and values are RawSearchResult
/// struct and then returns it within a Result enum.
///
/// # Arguments
///
/// * `query` - Takes the user provided query to query to the upstream search engine with.
/// * `page` - Takes an u32 as an argument.
/// * `user_agent` - Takes a random user agent string as an argument.
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
///
/// # Errors
///
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
/// provide results for the requested search query and also returns error if the scraping selector
/// or HeaderMap fails to initialize.
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
client: &Client,
safe_search: u8,
) -> Result<Vec<(String, SearchResult)>, EngineError>;
}
/// A named struct which stores the engine struct with the name of the associated engine.
pub struct EngineHandler {
/// It stores the engine struct wrapped in a box smart pointer as the engine struct implements
/// the `SearchEngine` trait.
engine: Box<dyn SearchEngine>,
/// It stores the name of the engine to which the struct is associated to.
name: &'static str,
}
impl Clone for EngineHandler {
fn clone(&self) -> Self {
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// Parses an engine name into an engine handler.
///
/// # Arguments
///
/// * `engine_name` - It takes the name of the engine to which the struct was associated to.
///
/// # Returns
///
/// It returns an option either containing the value or a none if the engine is unknown
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => {
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
("duckduckgo", Box::new(engine))
}
"searx" => {
let engine = crate::engines::searx::Searx::new()?;
("searx", Box::new(engine))
}
"brave" => {
let engine = crate::engines::brave::Brave::new()?;
("brave", Box::new(engine))
}
"startpage" => {
let engine = crate::engines::startpage::Startpage::new()?;
("startpage", Box::new(engine))
}
"librex" => {
let engine = crate::engines::librex::LibreX::new()?;
("librex", Box::new(engine))
}
"mojeek" => {
let engine = crate::engines::mojeek::Mojeek::new()?;
("mojeek", Box::new(engine))
}
"bing" => {
let engine = crate::engines::bing::Bing::new()?;
("bing", Box::new(engine))
}
_ => {
return Err(Report::from(EngineError::NoSuchEngineFound(
engine_name.to_string(),
)))
}
};
Ok(Self {
engine: engine.1,
name: engine.0,
})
}
/// This function converts the EngineHandler type into a tuple containing the engine name and
/// the associated engine struct.
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
}
|