Spaces:
Runtime error
Runtime error
File size: 4,830 Bytes
9f23a1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
//! The `mojeek` module handles the scraping of results from the Mojeek search engine
//! by querying the upstream Mojeek search engine with the user-provided query and with a page
//! number, if one is provided.
use std::collections::HashMap;
use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
use super::search_result_parser::SearchResultParser;
/// Engine type for the Mojeek upstream.
///
/// It exists so that the `SearchEngine` trait can be implemented for it, which reduces code
/// duplication and makes it easy to build a vector of different search engines.
pub struct Mojeek {
/// The parser used to interpret the fetched search result page. It is built once in
/// `Mojeek::new` and reused for every call to `results`.
parser: SearchResultParser,
}
impl Mojeek {
    /// Builds a `Mojeek` engine together with its result parser.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `SearchResultParser::new` returns for the selector
    /// strings given below.
    pub fn new() -> Result<Self, EngineError> {
        // The selectors are passed positionally; the role of each position is defined by
        // `SearchResultParser::new`.
        // NOTE(review): the third selector targets a `span.url` and the fourth an `a.title`,
        // while the closure in `results` names its arguments `title, url, desc` — confirm
        // the order matches the parser's parameter order.
        let parser = SearchResultParser::new(
            ".result-col",
            ".results-standard li",
            "a span.url",
            "h2 a.title",
            "p.s",
        )?;

        Ok(Self { parser })
    }
}
#[async_trait::async_trait]
impl SearchEngine for Mojeek {
    /// Fetches one page of Mojeek results for `query` and parses it into search results.
    ///
    /// # Arguments
    ///
    /// * `query` - The search text; it is interpolated verbatim into the request URL.
    /// * `page` - Zero-based page number. Page 0 omits the `s` (start offset) parameter.
    /// * `user_agent` - Value sent as the `User-Agent` request header.
    /// * `client` - The `reqwest` client handed to `fetch_html_from_upstream`.
    /// * `safe_search` - `0` sends `safe=0`; any other value sends `safe=1`.
    ///
    /// # Errors
    ///
    /// Returns `EngineError::UnexpectedError` when the request headers cannot be built and
    /// `EngineError::EmptyResultSet` when the page contains "No pages found matching:".
    /// Errors from fetching the page or from the result parser are propagated unchanged.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Mojeek uses a `start results from this number` convention.
        // So, for 10 results per page, page 0 starts at 1, page 1
        // starts at 11, and so on.
        const RESULTS_PER_PAGE: u32 = 10;
        let results_per_page = RESULTS_PER_PAGE.to_string();
        // Saturating arithmetic keeps an absurdly large `page` from overflowing `u32`
        // (which would panic in debug builds and wrap around in release builds).
        let start_result = RESULTS_PER_PAGE
            .saturating_mul(page)
            .saturating_add(1)
            .to_string();

        let search_engines = [
            "Bing",
            "Brave",
            "DuckDuckGo",
            "Ecosia",
            "Google",
            "Lilo",
            "Metager",
            "Qwant",
            "Startpage",
            "Swisscows",
            "Yandex",
            "Yep",
            "You",
        ];
        // `%2C` is a percent-encoded comma, so the list is already URL-safe.
        let qss = search_engines.join("%2C");
        let safe = if safe_search == 0 { "0" } else { "1" };

        // Mojeek detects automated requests, these are preferences that are
        // able to circumvent the countermeasure. Some of these are
        // not documented in their Search API
        let query_params = [
            ("t", results_per_page.as_str()),
            ("theme", "dark"),
            ("arc", "none"),
            ("date", "1"),
            ("cdate", "1"),
            ("tlen", "100"),
            ("ref", "1"),
            ("hp", "minimal"),
            ("lb", "en"),
            ("qss", qss.as_str()),
            ("safe", safe),
        ];

        // The preferences are sent twice: once as URL query parameters ...
        let query_params_string: String = query_params
            .iter()
            .map(|(k, v)| format!("&{k}={v}"))
            .collect();
        // ... and once as a cookie.
        let cookie_string: String = query_params
            .iter()
            .map(|(k, v)| format!("{k}={v}; "))
            .collect();

        // NOTE(review): `query` is interpolated verbatim. Presumably the caller or the HTTP
        // client percent-encodes it — confirm, since a `&` or `#` inside the query would
        // otherwise change the meaning of the URL.
        let url: String = match page {
            0 => {
                format!("https://www.mojeek.com/search?q={query}{query_params_string}")
            }
            _ => {
                format!(
                    "https://www.mojeek.com/search?q={query}&s={start_result}{query_params_string}"
                )
            }
        };

        // Header names use their canonical hyphenated spelling. Underscore is a legal
        // header-name character, so a name such as `USER_AGENT` is accepted but sent as a
        // separate custom header (`user_agent`) instead of the standard `User-Agent`.
        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), "https://google.com/".to_string()),
            (
                "Content-Type".to_string(),
                "application/x-www-form-urlencoded".to_string(),
            ),
            ("Cookie".to_string(), cookie_string),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &Mojeek::fetch_html_from_upstream(self, &url, header_map, client).await?,
        );

        // Only the first element matched by the "no results" selector is inspected.
        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).next() {
            if no_result_msg
                .inner_html()
                .contains("No pages found matching:")
            {
                return Err(Report::new(EngineError::EmptyResultSet));
            }
        }

        // scrape all the results from the html
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                Some(SearchResult::new(
                    title.inner_html().trim(),
                    url.inner_html().trim(),
                    desc.inner_html().trim(),
                    &["mojeek"],
                ))
            })
    }
}
|