neon_arch commited on
Commit
4f28711
1 Parent(s): 9d2fb6c

✨ feat: add code to filter aggregated search results using lists (#163)

Browse files
Files changed (1) hide show
  1. src/results/aggregator.rs +47 -8
src/results/aggregator.rs CHANGED
@@ -1,18 +1,22 @@
1
  //! This module provides the functionality to scrape and gather all the results from the upstream
2
  //! search engines and then removes duplicate results.
3
 
4
- use std::{collections::HashMap, time::Duration};
5
-
6
- use error_stack::Report;
7
- use rand::Rng;
8
- use tokio::task::JoinHandle;
9
 
10
  use super::{
11
  aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
12
  user_agent::random_user_agent,
13
  };
 
 
 
 
 
14
 
15
- use crate::engines::engine_models::{EngineError, EngineHandler};
 
 
 
16
 
17
  /// Aliases for long type annotations
18
  type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
@@ -106,7 +110,7 @@ pub async fn aggregate(
106
  log::error!("Engine Error: {:?}", error);
107
  engine_errors_info.push(EngineErrorInfo::new(
108
  error.downcast_ref::<EngineError>().unwrap(),
109
- engine_name.to_string(),
110
  ));
111
  };
112
 
@@ -143,7 +147,22 @@ pub async fn aggregate(
143
  }
144
  }
145
 
146
- let results = result_map.into_values().collect();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  Ok(SearchResults::new(
149
  results,
@@ -151,3 +170,23 @@ pub async fn aggregate(
151
  engine_errors_info,
152
  ))
153
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  //! This module provides the functionality to scrape and gather all the results from the upstream
2
  //! search engines and then removes duplicate results.
3
 
4
+ use std::{collections::HashMap, io::BufReader, time::Duration};
 
 
 
 
5
 
6
  use super::{
7
  aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
8
  user_agent::random_user_agent,
9
  };
10
+ use error_stack::Report;
11
+ use rand::Rng;
12
+ use regex::Regex;
13
+ use std::{fs::File, io::BufRead};
14
+ use tokio::task::JoinHandle;
15
 
16
+ use crate::{
17
+ engines::engine_models::{EngineError, EngineHandler},
18
+ handler::paths::{file_path, FileType},
19
+ };
20
 
21
  /// Aliases for long type annotations
22
  type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
110
  log::error!("Engine Error: {:?}", error);
111
  engine_errors_info.push(EngineErrorInfo::new(
112
  error.downcast_ref::<EngineError>().unwrap(),
113
+ engine_name,
114
  ));
115
  };
116
 
 
147
  }
148
  }
149
 
150
+ let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
151
+ filter_with_lists(
152
+ &mut result_map,
153
+ &mut blacklist_map,
154
+ &file_path(FileType::BlockList)?,
155
+ )?;
156
+
157
+ filter_with_lists(
158
+ &mut blacklist_map,
159
+ &mut result_map,
160
+ &file_path(FileType::AllowList)?,
161
+ )?;
162
+
163
+ drop(blacklist_map);
164
+
165
+ let results: Vec<SearchResult> = result_map.into_values().collect();
166
 
167
  Ok(SearchResults::new(
168
  results,
 
170
  engine_errors_info,
171
  ))
172
  }
173
+
174
+ fn filter_with_lists(
175
+ map_to_be_filtered: &mut HashMap<String, SearchResult>,
176
+ resultant_map: &mut HashMap<String, SearchResult>,
177
+ file_path: &str,
178
+ ) -> Result<(), Box<dyn std::error::Error>> {
179
+ for (url, search_result) in map_to_be_filtered.clone().into_iter() {
180
+ let reader = BufReader::new(File::open(file_path)?);
181
+ for line in reader.lines() {
182
+ let re = Regex::new(&line?)?;
183
+ if re.is_match(&url.to_lowercase())
184
+ || re.is_match(&search_result.title.to_lowercase())
185
+ || re.is_match(&search_result.description.to_lowercase())
186
+ {
187
+ resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
188
+ }
189
+ }
190
+ }
191
+ Ok(())
192
+ }