Spaces:
Runtime error
Runtime error
neon_arch
committed on
Commit
•
4f28711
1
Parent(s):
9d2fb6c
✨ feat: add code to filter aggregated search results using lists (#163)
Browse files- src/results/aggregator.rs +47 -8
src/results/aggregator.rs
CHANGED
@@ -1,18 +1,22 @@
|
|
1 |
//! This module provides the functionality to scrape and gather all the results from the upstream
|
2 |
//! search engines and then removes duplicate results.
|
3 |
|
4 |
-
use std::{collections::HashMap, time::Duration};
|
5 |
-
|
6 |
-
use error_stack::Report;
|
7 |
-
use rand::Rng;
|
8 |
-
use tokio::task::JoinHandle;
|
9 |
|
10 |
use super::{
|
11 |
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
|
12 |
user_agent::random_user_agent,
|
13 |
};
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
use crate::
|
|
|
|
|
|
|
16 |
|
17 |
/// Aliases for long type annotations
|
18 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
@@ -106,7 +110,7 @@ pub async fn aggregate(
|
|
106 |
log::error!("Engine Error: {:?}", error);
|
107 |
engine_errors_info.push(EngineErrorInfo::new(
|
108 |
error.downcast_ref::<EngineError>().unwrap(),
|
109 |
-
engine_name
|
110 |
));
|
111 |
};
|
112 |
|
@@ -143,7 +147,22 @@ pub async fn aggregate(
|
|
143 |
}
|
144 |
}
|
145 |
|
146 |
-
let
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
Ok(SearchResults::new(
|
149 |
results,
|
@@ -151,3 +170,23 @@ pub async fn aggregate(
|
|
151 |
engine_errors_info,
|
152 |
))
|
153 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
//! This module provides the functionality to scrape and gather all the results from the upstream
|
2 |
//! search engines and then removes duplicate results.
|
3 |
|
4 |
+
use std::{collections::HashMap, io::BufReader, time::Duration};
|
|
|
|
|
|
|
|
|
5 |
|
6 |
use super::{
|
7 |
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
|
8 |
user_agent::random_user_agent,
|
9 |
};
|
10 |
+
use error_stack::Report;
|
11 |
+
use rand::Rng;
|
12 |
+
use regex::Regex;
|
13 |
+
use std::{fs::File, io::BufRead};
|
14 |
+
use tokio::task::JoinHandle;
|
15 |
|
16 |
+
use crate::{
|
17 |
+
engines::engine_models::{EngineError, EngineHandler},
|
18 |
+
handler::paths::{file_path, FileType},
|
19 |
+
};
|
20 |
|
21 |
/// Aliases for long type annotations
|
22 |
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
|
|
110 |
log::error!("Engine Error: {:?}", error);
|
111 |
engine_errors_info.push(EngineErrorInfo::new(
|
112 |
error.downcast_ref::<EngineError>().unwrap(),
|
113 |
+
engine_name,
|
114 |
));
|
115 |
};
|
116 |
|
|
|
147 |
}
|
148 |
}
|
149 |
|
150 |
+
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
|
151 |
+
filter_with_lists(
|
152 |
+
&mut result_map,
|
153 |
+
&mut blacklist_map,
|
154 |
+
&file_path(FileType::BlockList)?,
|
155 |
+
)?;
|
156 |
+
|
157 |
+
filter_with_lists(
|
158 |
+
&mut blacklist_map,
|
159 |
+
&mut result_map,
|
160 |
+
&file_path(FileType::AllowList)?,
|
161 |
+
)?;
|
162 |
+
|
163 |
+
drop(blacklist_map);
|
164 |
+
|
165 |
+
let results: Vec<SearchResult> = result_map.into_values().collect();
|
166 |
|
167 |
Ok(SearchResults::new(
|
168 |
results,
|
|
|
170 |
engine_errors_info,
|
171 |
))
|
172 |
}
|
173 |
+
|
174 |
+
fn filter_with_lists(
|
175 |
+
map_to_be_filtered: &mut HashMap<String, SearchResult>,
|
176 |
+
resultant_map: &mut HashMap<String, SearchResult>,
|
177 |
+
file_path: &str,
|
178 |
+
) -> Result<(), Box<dyn std::error::Error>> {
|
179 |
+
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
|
180 |
+
let reader = BufReader::new(File::open(file_path)?);
|
181 |
+
for line in reader.lines() {
|
182 |
+
let re = Regex::new(&line?)?;
|
183 |
+
if re.is_match(&url.to_lowercase())
|
184 |
+
|| re.is_match(&search_result.title.to_lowercase())
|
185 |
+
|| re.is_match(&search_result.description.to_lowercase())
|
186 |
+
{
|
187 |
+
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
|
188 |
+
}
|
189 |
+
}
|
190 |
+
}
|
191 |
+
Ok(())
|
192 |
+
}
|