File size: 9,336 Bytes
fc69ace
 
 
c5c1684
 
0781385
c5c1684
0781385
15fc415
f94ac50
a3edf70
f94ac50
 
15fc415
0781385
 
 
 
 
15fc415
94ef62e
a3edf70
 
cff7de9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc69ace
 
 
 
 
c5c1684
fc69ace
 
 
c170de8
0527288
94ef62e
 
9cb582a
94ef62e
fc69ace
 
 
c5c1684
fc69ace
 
15fc415
0781385
c170de8
0527288
13632f1
0781385
2d47e8d
15fc415
f94ac50
15fc415
 
c5c1684
13632f1
0527288
 
 
 
c5c1684
 
2f01651
0781385
 
2f01651
 
 
0781385
 
c5c1684
2f01651
 
a3edf70
 
 
 
 
2d47e8d
 
 
 
 
a3edf70
 
1ebf888
2f01651
15fc415
0781385
2f01651
a3edf70
2f01651
0781385
15fc415
a3edf70
 
a28d559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0781385
 
 
 
 
a3edf70
2f01651
0781385
 
 
a3edf70
653d08c
a3edf70
 
 
 
0781385
 
 
 
 
a3edf70
2f01651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0781385
 
a3edf70
653d08c
a3edf70
 
 
 
0781385
 
 
 
f94ac50
15fc415
f94ac50
 
 
 
 
 
 
 
 
 
 
 
 
 
a3edf70
f94ac50
15fc415
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
//! This module provides the functionality to scrape and gather all the results from the upstream
//! search engines and then remove duplicate results.

use std::{collections::HashMap, time::Duration};

use error_stack::Report;
use rand::Rng;
use tokio::task::JoinHandle;

use super::{
    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
    user_agent::random_user_agent,
};

use crate::engines::{
    duckduckgo,
    engine_models::{EngineError, SearchEngine},
    searx,
};

/// Alias for the long type annotation of the spawned engine tasks: a vector of `tokio` join
/// handles, each resolving to either a map of `String` keys to `RawSearchResult` values or an
/// `EngineError` report.
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;

/// The function aggregates the scraped results from the user-selected upstream search engines.
/// These engines can be chosen either from the user interface (UI) or from the configuration file.
/// The code handles this process by matching the selected search engines and adding them to a vector.
/// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns
/// a future. This future is awaited in another loop. Once the results are collected, they are filtered
/// to remove any errors and ensure only proper results are included. If an error is encountered, it is
/// sent to the UI along with the name of the engine and the type of error. This information is finally
/// placed in the returned `SearchResults` struct.
///
/// Additionally, the function eliminates duplicate results. If two results are identified as coming from
/// multiple engines, their names are combined to indicate that the results were fetched from these upstream
/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
///
/// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors,
/// removing duplicates, and organizing the data for display in the UI.
///
/// # Example:
///
/// If you search from the url like `https://127.0.0.1/search?q=huston` then the search bar should
/// contain the word huston and not remain empty.
///
/// # Arguments
///
/// * `query` - Accepts a string to query with the above upstream search engines.
/// * `page` - Accepts an u32 page number.
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
/// * `request_timeout` - Accepts a time (secs) as a value which controls the server request timeout.
/// user through the UI or the config file.
///
/// # Error
///
/// Returns an error a reqwest and scraping selector errors if any error occurs in the results
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
/// containing appropriate values.
pub async fn aggregate(
    query: String,
    page: u32,
    random_delay: bool,
    debug: bool,
    upstream_search_engines: Vec<String>,
    request_timeout: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
    let user_agent: String = random_user_agent();
    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();

    // Add a random delay before making the request.
    if random_delay || !debug {
        let mut rng = rand::thread_rng();
        let delay_secs = rng.gen_range(1..10);
        std::thread::sleep(Duration::from_secs(delay_secs));
    }

    // fetch results from upstream search engines simultaneously/concurrently.
    let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
        .iter()
        .map(|engine| match engine.to_lowercase().as_str() {
            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
            &_ => panic!("Config Error: Incorrect config file option provided"),
        })
        .collect();

    let task_capacity: usize = search_engines.len();

    let tasks: FutureVec = search_engines
        .into_iter()
        .map(|search_engine| {
            let query: String = query.clone();
            let user_agent: String = user_agent.clone();
            tokio::spawn(async move {
                search_engine
                    .results(query, page, user_agent.clone(), request_timeout)
                    .await
            })
        })
        .collect();

    let mut outputs = Vec::with_capacity(task_capacity);

    for task in tasks {
        if let Ok(result) = task.await {
            outputs.push(result)
        }
    }

    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();

    // The code block `outputs.iter()` determines whether it is the first time the code is being run.
    // It does this by checking the initial flag. If it is the first time, the code selects the first
    // engine from which results are fetched and adds or extends them into the `result_map`. If the
    // initially selected engine fails, the code automatically selects another engine to map or extend
    // into the `result_map`. On the other hand, if an engine selected for the first time successfully
    // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
    // the code iterates through the remaining engines one by one. It compares the fetched results from each
    // engine with the results already present in the `result_map` to identify any duplicates. If duplicate
    // results are found, the code groups them together with the name of the engine from which they were
    // fetched, and automatically removes the duplicate results from the newly fetched data.
    //
    // Additionally, the code handles errors returned by the engines. It keeps track of which engines
    // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
    // Each structure in this vector contains the name of the engine and the type of error it returned.
    // These structures will later be added to the final `SearchResults` structure. The `SearchResults`
    // structure is used to display an error box in the UI containing the relevant information from
    // the `EngineErrorInfo` structure.
    //
    // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
    // of errors in order to populate the `result_map` and provide informative feedback to the user through the
    // `SearchResults` structure.
    let mut initial: bool = true;
    let mut counter: usize = 0;
    outputs.iter().for_each(|results| {
        if initial {
            match results {
                Ok(result) => {
                    result_map.extend(result.clone());
                    counter += 1;
                    initial = false
                }
                Err(error_type) => {
                    log::error!("Engine Error: {:?}", error_type);
                    engine_errors_info.push(EngineErrorInfo::new(
                        error_type.downcast_ref::<EngineError>().unwrap(),
                        upstream_search_engines[counter].clone(),
                    ));
                    counter += 1
                }
            }
        } else {
            match results {
                Ok(result) => {
                    result.clone().into_iter().for_each(|(key, value)| {
                        result_map
                            .entry(key)
                            .and_modify(|result| {
                                result.add_engines(value.clone().engine());
                            })
                            .or_insert_with(|| -> RawSearchResult {
                                RawSearchResult::new(
                                    value.title.clone(),
                                    value.visiting_url.clone(),
                                    value.description.clone(),
                                    value.engine.clone(),
                                )
                            });
                    });
                    counter += 1
                }
                Err(error_type) => {
                    log::error!("Engine Error: {:?}", error_type);
                    engine_errors_info.push(EngineErrorInfo::new(
                        error_type.downcast_ref::<EngineError>().unwrap(),
                        upstream_search_engines[counter].clone(),
                    ));
                    counter += 1
                }
            }
        }
    });

    Ok(SearchResults::new(
        result_map
            .into_iter()
            .map(|(key, value)| {
                SearchResult::new(
                    value.title,
                    value.visiting_url,
                    key,
                    value.description,
                    value.engine,
                )
            })
            .collect(),
        query.to_string(),
        engine_errors_info,
    ))
}