alamin655 committed on
Commit
4cd1810
2 Parent(s): fb2b660 b68e06c

Merge pull request #419 from neon-mmd/FEAT/314_startpage-for-the-search-engine

Browse files
Cargo.lock CHANGED
@@ -4066,7 +4066,7 @@ checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
4066
 
4067
  [[package]]
4068
  name = "websurfx"
4069
- version = "1.3.6"
4070
  dependencies = [
4071
  "actix-cors",
4072
  "actix-files",
 
4066
 
4067
  [[package]]
4068
  name = "websurfx"
4069
+ version = "1.4.0"
4070
  dependencies = [
4071
  "actix-cors",
4072
  "actix-files",
Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
  [package]
2
  name = "websurfx"
3
- version = "1.3.6"
4
  edition = "2021"
5
  description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
6
  repository = "https://github.com/neon-mmd/websurfx"
 
1
  [package]
2
  name = "websurfx"
3
+ version = "1.4.0"
4
  edition = "2021"
5
  description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
6
  repository = "https://github.com/neon-mmd/websurfx"
src/engines/mod.rs CHANGED
@@ -7,3 +7,4 @@ pub mod brave;
7
  pub mod duckduckgo;
8
  pub mod search_result_parser;
9
  pub mod searx;
 
 
7
  pub mod duckduckgo;
8
  pub mod search_result_parser;
9
  pub mod searx;
10
+ pub mod startpage;
src/engines/startpage.rs ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //! The `startpage` module handles the scraping of results from the startpage search engine
2
+ //! by querying the upstream startpage search engine with user provided query and with a page
3
+ //! number if provided.
4
+
5
+ use std::collections::HashMap;
6
+
7
+ use reqwest::header::HeaderMap;
8
+ use reqwest::Client;
9
+ use scraper::Html;
10
+
11
+ use crate::models::aggregation_models::SearchResult;
12
+
13
+ use crate::models::engine_models::{EngineError, SearchEngine};
14
+
15
+ use error_stack::{Report, Result, ResultExt};
16
+
17
+ use super::search_result_parser::SearchResultParser;
18
+
19
+ /// A new Startpage engine type defined in order to implement the `SearchEngine` trait, which helps
20
+ /// reduce code duplication and makes it easy to create a vector of different search engines.
21
+ pub struct Startpage {
22
+ /// The parser, used to interpret the search result.
23
+ parser: SearchResultParser,
24
+ }
25
+
26
+ impl Startpage {
27
+ /// Creates the Startpage parser.
28
+ pub fn new() -> Result<Self, EngineError> {
29
+ Ok(Self {
30
+ parser: SearchResultParser::new(
31
+ ".no-results",
32
+ ".w-gl__result__main",
33
+ ".w-gl__result-second-line-container>.w-gl__result-title>h3",
34
+ ".w-gl__result-url",
35
+ ".w-gl__description",
36
+ )?,
37
+ })
38
+ }
39
+ }
40
+
41
+ #[async_trait::async_trait]
42
+ impl SearchEngine for Startpage {
43
+ async fn results(
44
+ &self,
45
+ query: &str,
46
+ page: u32,
47
+ user_agent: &str,
48
+ client: &Client,
49
+ _safe_search: u8,
50
+ ) -> Result<HashMap<String, SearchResult>, EngineError> {
51
+ // The page number is turned into a result offset (`start`): pages 0 and 1 both map to the
52
+ // first results page, so that the upstream server receives a valid offset.
53
+ let url: String = match page {
54
+ 1 | 0 => {
55
+ format!("https://startpage.com/do/dsearch?q={query}&num=10&start=0")
56
+ }
57
+ _ => {
58
+ format!(
59
+ "https://startpage.com/do/dsearch?q={query}&num=10&start={}",
60
+ page * 10,
61
+ )
62
+ }
63
+ };
64
+
65
+ // initializing HeaderMap and adding appropriate headers.
66
+ let header_map = HeaderMap::try_from(&HashMap::from([
67
+ ("USER_AGENT".to_string(), user_agent.to_string()),
68
+ ("REFERER".to_string(), "https://google.com/".to_string()),
69
+ (
70
+ "CONTENT_TYPE".to_string(),
71
+ "application/x-www-form-urlencoded".to_string(),
72
+ ),
73
+ ("COOKIE".to_string(), "preferences=connect_to_serverEEE0N1Ndate_timeEEEworldN1Ndisable_family_filterEEE0N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE1N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fnight%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE10N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius".to_string()),
74
+ ]))
75
+ .change_context(EngineError::UnexpectedError)?;
76
+
77
+ let document: Html = Html::parse_document(
78
+ &Startpage::fetch_html_from_upstream(self, &url, header_map, client).await?,
79
+ );
80
+
81
+ if self.parser.parse_for_no_results(&document).next().is_some() {
82
+ return Err(Report::new(EngineError::EmptyResultSet));
83
+ }
84
+
85
+ // scrape all the results from the html
86
+ self.parser
87
+ .parse_for_results(&document, |title, url, desc| {
88
+ Some(SearchResult::new(
89
+ title.inner_html().trim(),
90
+ &format!("{}", url.inner_html().trim()),
91
+ desc.inner_html().trim(),
92
+ &["startpage"],
93
+ ))
94
+ })
95
+ }
96
+ }
src/models/engine_models.rs CHANGED
@@ -154,6 +154,10 @@ impl EngineHandler {
154
  let engine = crate::engines::brave::Brave::new()?;
155
  ("brave", Box::new(engine))
156
  }
 
 
 
 
157
  _ => {
158
  return Err(Report::from(EngineError::NoSuchEngineFound(
159
  engine_name.to_string(),
 
154
  let engine = crate::engines::brave::Brave::new()?;
155
  ("brave", Box::new(engine))
156
  }
157
+ "startpage" => {
158
+ let engine = crate::engines::startpage::Startpage::new()?;
159
+ ("startpage", Box::new(engine))
160
+ }
161
  _ => {
162
  return Err(Report::from(EngineError::NoSuchEngineFound(
163
  engine_name.to_string(),
websurfx/config.lua CHANGED
@@ -1,18 +1,18 @@
1
  -- ### General ###
2
  logging = true -- an option to enable or disable logs.
3
- debug = false -- an option to enable or disable debug mode.
4
- threads = 10 -- the amount of threads that the app will use to run (the value should be greater than 0).
5
 
6
  -- ### Server ###
7
- port = "8080" -- port on which server should be launched
8
  binding_ip = "127.0.0.1" -- ip address on which the server should be launched.
9
- production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one))
10
  -- if production_use is set to true
11
  -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
12
- request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
13
  rate_limiter = {
14
- number_of_requests = 20, -- The number of request that are allowed within a provided time limit.
15
- time_limit = 3, -- The time limit in which the quantity of requests that should be accepted.
16
  }
17
 
18
  -- ### Search ###
@@ -43,14 +43,15 @@ safe_search = 2
43
  -- tomorrow-night
44
  -- }}
45
  colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
46
- theme = "simple" -- the theme name which should be used for the website
47
 
48
  -- ### Caching ###
49
  redis_url = "redis://127.0.0.1:8082" -- redis connection url to which the client should connect.
50
- cache_expiry_time = 600 -- This option takes the expiry time of the search results (value in seconds and the value should be greater than or equal to 60 seconds).
51
  -- ### Search Engines ###
52
  upstream_search_engines = {
53
- DuckDuckGo = true,
54
- Searx = false,
55
- Brave = false,
 
56
  } -- select the upstream search engines from which the results should be fetched.
 
1
  -- ### General ###
2
  logging = true -- an option to enable or disable logs.
3
+ debug = false -- an option to enable or disable debug mode.
4
+ threads = 10 -- the amount of threads that the app will use to run (the value should be greater than 0).
5
 
6
  -- ### Server ###
7
+ port = "8080" -- port on which server should be launched
8
  binding_ip = "127.0.0.1" -- ip address on which the server should be launched.
9
+ production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one))
10
  -- if production_use is set to true
11
  -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
12
+ request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
13
  rate_limiter = {
14
+ number_of_requests = 20, -- The number of requests that are allowed within the provided time limit.
15
+ time_limit = 3, -- The time limit within which the above number of requests is accepted.
16
  }
17
 
18
  -- ### Search ###
 
43
  -- tomorrow-night
44
  -- }}
45
  colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
46
+ theme = "simple" -- the theme name which should be used for the website
47
 
48
  -- ### Caching ###
49
  redis_url = "redis://127.0.0.1:8082" -- redis connection url to which the client should connect.
50
+ cache_expiry_time = 600 -- This option takes the expiry time of the search results (value in seconds and the value should be greater than or equal to 60 seconds).
51
  -- ### Search Engines ###
52
  upstream_search_engines = {
53
+ DuckDuckGo = true,
54
+ Searx = false,
55
+ Brave = false,
56
+ Startpage = false,
57
  } -- select the upstream search engines from which the results should be fetched.