Spaces:
Runtime error
Runtime error
File size: 6,017 Bytes
ed13a16 0502a8f 15fc415 c5c1684 15fc415 493c56b 15fc415 493c56b 9a4cf94 5962cca 94ef62e f9b9e87 15fc415 f9b9e87 2d47e8d 5aca5c0 f9b9e87 a8791de 15fc415 f9b9e87 5962cca 2d47e8d 5962cca f9b9e87 5962cca 9a4cf94 5962cca f9b9e87 5962cca f9b9e87 9a4cf94 f9b9e87 15fc415 f9b9e87 5aca5c0 f9b9e87 15dfda6 f9b9e87 15fc415 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
//! The `searx` module scrapes search results from an upstream searx instance by querying it
//! with the user-provided query and, if provided, a page number.
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};
/// Marker type for the searx search engine.
///
/// It is a field-less unit struct that exists so the `SearchEngine` trait can be implemented
/// for it, which reduces code duplication and makes it easy to build a vector of different
/// search engines.
pub struct Searx;
#[async_trait::async_trait]
impl SearchEngine for Searx {
async fn results(
&self,
query: String,
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number.
let url: String = match page {
0 | 1 => format!("https://searx.work/search?q={query}&pageno=1"),
_ => format!("https://searx.work/search?q={query}&pageno={page}"),
};
// initializing headers and adding appropriate headers.
let mut header_map = HeaderMap::new();
header_map.insert(
USER_AGENT,
user_agent
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
REFERER,
"https://google.com/"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
CONTENT_TYPE,
"application/x-www-form-urlencoded"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
let document: Html = Html::parse_document(
&Searx::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
);
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| {
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
})?;
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
if no_result_msg.inner_html()
== "we didn't find any results. Please use another query or search in more categories"
{
return Err(Report::new(EngineError::EmptyResultSet));
}
}
let results: Selector = Selector::parse(".result")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
let result_title: Selector = Selector::parse("h3>a")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
let result_url: Selector = Selector::parse("h3>a")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
let result_desc: Selector = Selector::parse(".content")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
// scrape all the results from the html
Ok(document
.select(&results)
.map(|result| {
SearchResult::new(
result
.select(&result_title)
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
result
.select(&result_url)
.next()
.unwrap()
.value()
.attr("href")
.unwrap()
.to_string(),
result
.select(&result_desc)
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
vec!["searx".to_string()],
)
})
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
}
|