File size: 9,068 Bytes
5962cca
 
 
493c56b
75a77d2
b42adaa
991f3f5
5962cca
 
019b332
9a4cf94
57c73d3
f56002d
049b1c1
 
5962cca
049b1c1
 
9a4cf94
049b1c1
 
 
 
9a4cf94
5962cca
 
9a4cf94
 
5962cca
f56002d
 
75a77d2
9a4cf94
5962cca
 
9a4cf94
ebb9e9e
 
 
 
 
 
 
c2280b7
5962cca
 
 
 
b72af01
 
15dfda6
b72af01
5aca5c0
049b1c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b72af01
 
2a68081
b72af01
b42adaa
b72af01
 
b42adaa
b72af01
15dfda6
b72af01
 
 
 
 
 
 
 
669e365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
049b1c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b72af01
 
2a68081
b72af01
2a68081
b42adaa
cbad560
991f3f5
5aca5c0
 
049b1c1
5aca5c0
049b1c1
 
5aca5c0
049b1c1
5aca5c0
 
 
 
 
 
 
 
 
 
049b1c1
 
 
 
 
 
 
 
 
75a77d2
5aca5c0
 
75a77d2
 
 
 
 
 
 
 
27bc52c
 
 
 
e1e426c
 
 
 
c02006c
 
 
 
9f23a1c
 
 
 
50aa52c
 
 
 
f56002d
 
 
 
 
5aca5c0
 
75a77d2
5aca5c0
 
 
 
 
049b1c1
 
5aca5c0
 
 
b72af01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
//! This module provides the `EngineError` enum, the `SearchEngine` trait and the `EngineHandler`
//! type, which are used when requesting data from the upstream search engines with the search
//! query provided by the user.

use super::aggregation_models::SearchResult;
use error_stack::{Report, Result, ResultExt};
use reqwest::Client;
use std::fmt;

/// A custom error type used for handling engine associated errors.
#[derive(Debug)]
pub enum EngineError {
    /// No engine matching the requested name was found. It carries the engine name that was
    /// requested.
    NoSuchEngineFound(String),
    /// This variant handles the no results found case, i.e. the upstream search engine
    /// returned an empty result set.
    EmptyResultSet,
    /// This variant handles the errors which occur while requesting data from the upstream
    /// search engine.
    RequestError,
    /// This variant handles all the errors which are unexpected or occur rarely
    /// and are errors mostly related to failure in initialization of HeaderMap,
    /// Selector errors and all other errors occurring within the code handling
    /// the `upstream search engines`.
    UnexpectedError,
}

impl fmt::Display for EngineError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            EngineError::NoSuchEngineFound(engine) => {
                write!(f, "No such engine with the name '{engine}' found")
            }
            EngineError::EmptyResultSet => {
                write!(f, "The upstream search engine returned an empty result set")
            }
            EngineError::RequestError => {
                write!(
                    f,
                    "Error occurred while requesting data from upstream search engine"
                )
            }
            EngineError::UnexpectedError => {
                write!(f, "An unexpected error occurred while processing the data")
            }
        }
    }
}

// Marker impl: lets `EngineError` be used as the context of an `error_stack::Report`.
impl error_stack::Context for EngineError {}

/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine: Sync + Send {
    /// This helper function fetches/requests the search results from the upstream search engine
    /// as an html string.
    ///
    /// The response status code is not inspected here: whatever body the upstream engine sends
    /// back is returned to the caller.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes the url of the upstream search engine with the user requested search
    ///   query appended in the search parameters.
    /// * `header_map` - It takes the http request headers to be sent to the upstream engine in
    ///   order to prevent being detected as a bot. It takes the header as a HeaderMap type.
    /// * `client` - It takes the `reqwest` client used to send the request. Presumably any
    ///   request timeout is configured on this client by the caller (TODO confirm).
    ///
    /// # Errors
    ///
    /// Returns `EngineError::RequestError` if sending the request or reading the response body
    /// as text fails.
    async fn fetch_html_from_upstream(
        &self,
        url: &str,
        header_map: reqwest::header::HeaderMap,
        client: &Client,
    ) -> Result<String, EngineError> {
        // fetch the html from upstream search engine
        let response = client
            .get(url)
            .headers(header_map) // add spoofed headers to emulate human behavior
            .send()
            .await
            .change_context(EngineError::RequestError)?;

        response
            .text()
            .await
            .change_context(EngineError::RequestError)
    }

    /// This helper function fetches/requests the json search results from the upstream search
    /// engine as a vector of bytes.
    ///
    /// The response status code is not inspected here: whatever body the upstream engine sends
    /// back is returned to the caller.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes the url of the upstream search engine with the user requested search
    ///   query appended in the search parameters.
    /// * `header_map` - It takes the http request headers to be sent to the upstream engine in
    ///   order to prevent being detected as a bot. It takes the header as a HeaderMap type.
    /// * `client` - It takes the `reqwest` client used to send the request. Presumably any
    ///   request timeout is configured on this client by the caller (TODO confirm).
    ///
    /// # Errors
    ///
    /// Returns `EngineError::RequestError` if sending the request or reading the response body
    /// fails.
    async fn fetch_json_as_bytes_from_upstream(
        &self,
        url: &str,
        header_map: reqwest::header::HeaderMap,
        client: &Client,
    ) -> Result<Vec<u8>, EngineError> {
        // fetch the json response from upstream search engine
        let response = client
            .get(url)
            .headers(header_map) // add spoofed headers to emulate human behavior
            .send()
            .await
            .change_context(EngineError::RequestError)?;

        let body = response
            .bytes()
            .await
            .change_context(EngineError::RequestError)?;

        // Copy the response body into an owned vector for the caller.
        Ok(body.to_vec())
    }

    /// This function fetches the results from the upstream engine and returns them as a vector
    /// of `(String, SearchResult)` pairs within a Result enum.
    ///
    /// # Arguments
    ///
    /// * `query` - Takes the user provided query to query to the upstream search engine with.
    /// * `page` - Takes an u32 as an argument (presumably the page number of the results to
    ///   fetch - verify against the implementations).
    /// * `user_agent` - Takes the user agent string to use for the request.
    /// * `client` - Takes the `reqwest` client used to send requests to the upstream engine.
    /// * `safe_search` - Takes an u8 as an argument (presumably the safe search level to
    ///   request from the upstream engine - verify against the implementations).
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` if the user is not connected to the internet or if there is a
    /// failure to reach the `upstream search engine` page or if the `upstream search engine` is
    /// unable to provide results for the requested search query and also returns an error if
    /// the scraping selector or HeaderMap fails to initialize.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        safe_search: u8,
    ) -> Result<Vec<(String, SearchResult)>, EngineError>;
}

/// A named struct which pairs an engine instance with the name of that engine.
pub struct EngineHandler {
    /// It stores the engine struct wrapped in a box smart pointer, as a trait object of
    /// the `SearchEngine` trait.
    engine: Box<dyn SearchEngine>,
    /// It stores the name of the engine with which the struct is associated.
    name: &'static str,
}

impl Clone for EngineHandler {
    /// Creates a new handler by constructing a fresh engine from the stored engine name.
    ///
    /// # Panics
    ///
    /// Panics if `EngineHandler::new` returns an error for the stored name.
    fn clone(&self) -> Self {
        // The `SearchEngine` trait has no `Clone` bound, so the boxed engine cannot be copied
        // directly; the handler is rebuilt from its name instead.
        Self::new(self.name)
            .expect("rebuilding an engine from the name of an existing handler should not fail")
    }
}

impl EngineHandler {
    /// Parses an engine name into an engine handler.
    ///
    /// # Arguments
    ///
    /// * `engine_name` - It takes the name of the engine to build the handler for. The name is
    ///   lowercased before it is matched, so the lookup is case-insensitive.
    ///
    /// # Errors
    ///
    /// Returns `EngineError::NoSuchEngineFound` carrying the name as it was provided if it does
    /// not match any of the known engines. Any error returned by the constructor of the selected
    /// engine is propagated to the caller.
    pub fn new(engine_name: &str) -> Result<Self, EngineError> {
        let (name, engine): (&'static str, Box<dyn SearchEngine>) =
            match engine_name.to_lowercase().as_str() {
                "duckduckgo" => (
                    "duckduckgo",
                    Box::new(crate::engines::duckduckgo::DuckDuckGo::new()?),
                ),
                "searx" => ("searx", Box::new(crate::engines::searx::Searx::new()?)),
                "brave" => ("brave", Box::new(crate::engines::brave::Brave::new()?)),
                "startpage" => (
                    "startpage",
                    Box::new(crate::engines::startpage::Startpage::new()?),
                ),
                "librex" => ("librex", Box::new(crate::engines::librex::LibreX::new()?)),
                "mojeek" => ("mojeek", Box::new(crate::engines::mojeek::Mojeek::new()?)),
                "bing" => ("bing", Box::new(crate::engines::bing::Bing::new()?)),
                _ => {
                    // Report the name exactly as the caller supplied it, not the lowercased
                    // form used for matching.
                    return Err(Report::from(EngineError::NoSuchEngineFound(
                        engine_name.to_string(),
                    )));
                }
            };

        Ok(Self { engine, name })
    }

    /// This function converts the EngineHandler type into a tuple containing the engine name and
    /// the associated engine struct.
    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
        let Self { engine, name } = self;
        (name, engine)
    }
}