Spaces:
Runtime error
Runtime error
File size: 5,421 Bytes
9f23a1c 991f3f5 9f23a1c 991f3f5 9f23a1c 991f3f5 9f23a1c 991f3f5 9f23a1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
//! The `mojeek` module handles the scraping of results from the mojeek search engine
//! by querying the upstream mojeek search engine with user provided query and with a page
//! number if provided.
use std::collections::HashMap;
use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
use super::search_result_parser::SearchResultParser;
/// A new Mojeek engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Mojeek {
/// The parser, used to interpret the search result.
parser: SearchResultParser,
}
impl Mojeek {
/// Creates the Mojeek parser.
pub fn new() -> Result<Self, EngineError> {
Ok(Self {
parser: SearchResultParser::new(
".result-col",
".results-standard li",
"a span.url",
"h2 a.title",
"p.s",
)?,
})
}
}
#[async_trait::async_trait]
impl SearchEngine for Mojeek {
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
client: &Client,
safe_search: u8,
) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Mojeek uses `start results from this number` convention
// So, for 10 results per page, page 0 starts at 1, page 1
// starts at 11, and so on.
let results_per_page = 10;
let start_result = results_per_page * page + 1;
let results_per_page = results_per_page.to_string();
let start_result = start_result.to_string();
let search_engines = vec![
"Bing",
"Brave",
"DuckDuckGo",
"Ecosia",
"Google",
"Lilo",
"Metager",
"Qwant",
"Startpage",
"Swisscows",
"Yandex",
"Yep",
"You",
];
let qss = search_engines.join("%2C");
// A branchless condition to check whether the `safe_search` parameter has the
// value 0 or not. If it is zero then it sets the value 0 otherwise it sets
// the value to 1 for all other values of `safe_search`
//
// Moreover, the below branchless code is equivalent to the following code below:
//
// ```rust
// let safe = if safe_search == 0 { 0 } else { 1 }.to_string();
// ```
//
// For more information on branchless programming. See:
//
// * https://piped.video/watch?v=bVJ-mWWL7cE
let safe = u8::from(safe_search != 0).to_string();
// Mojeek detects automated requests, these are preferences that are
// able to circumvent the countermeasure. Some of these are
// not documented in their Search API
let query_params: Vec<(&str, &str)> = vec![
("t", results_per_page.as_str()),
("theme", "dark"),
("arc", "none"),
("date", "1"),
("cdate", "1"),
("tlen", "100"),
("ref", "1"),
("hp", "minimal"),
("lb", "en"),
("qss", &qss),
("safe", &safe),
];
let mut query_params_string = String::new();
for (k, v) in &query_params {
query_params_string.push_str(&format!("&{k}={v}"));
}
let url: String = match page {
0 => {
format!("https://www.mojeek.com/search?q={query}{query_params_string}")
}
_ => {
format!(
"https://www.mojeek.com/search?q={query}&s={start_result}{query_params_string}"
)
}
};
let mut cookie_string = String::new();
for (k, v) in &query_params {
cookie_string.push_str(&format!("{k}={v}; "));
}
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
("REFERER".to_string(), "https://google.com/".to_string()),
(
"CONTENT_TYPE".to_string(),
"application/x-www-form-urlencoded".to_string(),
),
("COOKIE".to_string(), cookie_string),
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&Mojeek::fetch_html_from_upstream(self, &url, header_map, client).await?,
);
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
if no_result_msg
.inner_html()
.contains("No pages found matching:")
{
return Err(Report::new(EngineError::EmptyResultSet));
}
}
// scrape all the results from the html
self.parser
.parse_for_results(&document, |title, url, desc| {
Some(SearchResult::new(
title.inner_html().trim(),
url.inner_html().trim(),
desc.inner_html().trim(),
&["mojeek"],
))
})
}
}
|