File size: 5,225 Bytes
ed13a16
 
 
 
c223fed
15fc415
0502a8f
15fc415
 
493c56b
15fc415
493c56b
9a4cf94
 
5962cca
94ef62e
 
f9b9e87
c170de8
f9b9e87
 
453dbdc
f9b9e87
 
 
 
2d47e8d
5aca5c0
f9b9e87
 
 
a8791de
f9b9e87
 
 
 
 
 
 
 
 
 
 
15fc415
f9b9e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5962cca
f9b9e87
2d47e8d
f9b9e87
5962cca
f9b9e87
 
 
5962cca
f9b9e87
 
 
15fc415
f9b9e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5aca5c0
f94ac50
f9b9e87
f94ac50
 
 
 
f9b9e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15dfda6
f9b9e87
 
15fc415
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
//! The `duckduckgo` module handles the scraping of results from the duckduckgo search engine.
//! It queries the upstream duckduckgo search engine with the user-provided query and, when
//! one is given, the page number.

use std::collections::HashMap;

use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};

use crate::models::aggregation_models::SearchResult;

use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{IntoReport, Report, Result, ResultExt};

/// Unit type representing the DuckDuckGo upstream search engine.
///
/// It holds no data of its own; it is defined in order to implement the `SearchEngine` trait,
/// which reduces code duplication and makes it easy to create a vector of different search
/// engines.
pub struct DuckDuckGo;

#[async_trait::async_trait]
impl SearchEngine for DuckDuckGo {
    async fn results(
        &self,
        query: String,
        page: u32,
        user_agent: String,
        request_timeout: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server recieves valid page number.
        let url: String = match page {
            1 | 0 => {
                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
            }
            _ => {
                format!(
                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
                    query,
                    (page / 2 + (page % 2)) * 30,
                    (page / 2 + (page % 2)) * 30 + 1
                )
            }
        };

        // initializing HeaderMap and adding appropriate headers.
        let mut header_map = HeaderMap::new();
        header_map.insert(
            USER_AGENT,
            user_agent
                .parse()
                .into_report()
                .change_context(EngineError::UnexpectedError)?,
        );
        header_map.insert(
            REFERER,
            "https://google.com/"
                .parse()
                .into_report()
                .change_context(EngineError::UnexpectedError)?,
        );
        header_map.insert(
            CONTENT_TYPE,
            "application/x-www-form-urlencoded"
                .parse()
                .into_report()
                .change_context(EngineError::UnexpectedError)?,
        );
        header_map.insert(
            COOKIE,
            "kl=wt-wt"
                .parse()
                .into_report()
                .change_context(EngineError::UnexpectedError)?,
        );

        let document: Html = Html::parse_document(
            &DuckDuckGo::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
        );

        let no_result: Selector = Selector::parse(".no-results")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;

        if document.select(&no_result).next().is_some() {
            return Err(Report::new(EngineError::EmptyResultSet));
        }

        let results: Selector = Selector::parse(".result")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
        let result_title: Selector = Selector::parse(".result__a")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
        let result_url: Selector = Selector::parse(".result__url")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
        let result_desc: Selector = Selector::parse(".result__snippet")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;

        // scrape all the results from the html
        Ok(document
            .select(&results)
            .map(|result| {
                SearchResult::new(
                    result
                        .select(&result_title)
                        .next()
                        .unwrap()
                        .inner_html()
                        .trim()
                        .to_string(),
                    format!(
                        "https://{}",
                        result
                            .select(&result_url)
                            .next()
                            .unwrap()
                            .inner_html()
                            .trim()
                    ),
                    result
                        .select(&result_desc)
                        .next()
                        .unwrap()
                        .inner_html()
                        .trim()
                        .to_string(),
                    vec!["duckduckgo".to_string()],
                )
            })
            .map(|search_result| (search_result.url.clone(), search_result))
            .collect())
    }
}