Spaces:
Runtime error
Runtime error
File size: 3,382 Bytes
ed13a16 c223fed 15fc415 f5f0488 fe74f2e 75a77d2 15fc415 493c56b 15fc415 493c56b 9a4cf94 f5f0488 5962cca 75a77d2 94ef62e 75a77d2 32abacb 75a77d2 57c73d3 75a77d2 c170de8 f9b9e87 453dbdc f9b9e87 f5f0488 f9b9e87 f5f0488 fe74f2e 6fa45ec 5aca5c0 f9b9e87 a8791de f9b9e87 15fc415 f9b9e87 f5f0488 5962cca f9b9e87 fe74f2e f9b9e87 5962cca 57c73d3 f9b9e87 15fc415 f9b9e87 57c73d3 f5f0488 57c73d3 f9b9e87 15fc415 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
//! The `duckduckgo` module handles the scraping of results from the duckduckgo search engine
//! by querying the upstream duckduckgo search engine with user provided query and with a page
//! number if provided.
use std::collections::HashMap;
use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
use super::search_result_parser::SearchResultParser;
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct DuckDuckGo {
/// The parser, used to interpret the search result.
parser: SearchResultParser,
}
impl DuckDuckGo {
/// Creates the DuckDuckGo parser.
pub fn new() -> Result<Self, EngineError> {
Ok(Self {
parser: SearchResultParser::new(
".no-results",
".result",
".result__a",
".result__url",
".result__snippet",
)?,
})
}
}
#[async_trait::async_trait]
impl SearchEngine for DuckDuckGo {
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
client: &Client,
_safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number.
let url: String = match page {
1 | 0 => {
format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
}
_ => {
format!(
"https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
query,
(page / 2 + (page % 2)) * 30,
(page / 2 + (page % 2)) * 30 + 1
)
}
};
// initializing HeaderMap and adding appropriate headers.
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
("REFERER".to_string(), "https://google.com/".to_string()),
(
"CONTENT_TYPE".to_string(),
"application/x-www-form-urlencoded".to_string(),
),
("COOKIE".to_string(), "kl=wt-wt".to_string()),
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
);
if self.parser.parse_for_no_results(&document).next().is_some() {
return Err(Report::new(EngineError::EmptyResultSet));
}
// scrape all the results from the html
self.parser
.parse_for_results(&document, |title, url, desc| {
Some(SearchResult::new(
title.inner_html().trim(),
&format!("https://{}", url.inner_html().trim()),
desc.inner_html().trim(),
&["duckduckgo"],
))
})
}
}
|