File size: 4,830 Bytes
9f23a1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
//! The `mojeek` module handles the scraping of results from the Mojeek search engine
//! by querying the upstream Mojeek search engine with the user-provided query and,
//! if provided, a page number.

use std::collections::HashMap;

use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;

use crate::models::aggregation_models::SearchResult;

use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::search_result_parser::SearchResultParser;

/// Scraper for the upstream Mojeek search engine.
///
/// Implements the `SearchEngine` trait so Mojeek can be used interchangeably with the
/// project's other engines (e.g. collected into a vector of engines) without code
/// duplication.
pub struct Mojeek {
    /// CSS-selector-based parser used to extract results from the returned HTML page.
    parser: SearchResultParser,
}

impl Mojeek {
    /// Builds a `Mojeek` engine with the CSS selectors needed to parse its result pages.
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` if any of the CSS selectors fails to compile.
    pub fn new() -> Result<Self, EngineError> {
        // Selectors, in order: no-results message, result item, URL, title, description.
        let parser = SearchResultParser::new(
            ".result-col",
            ".results-standard li",
            "a span.url",
            "h2 a.title",
            "p.s",
        )?;
        Ok(Self { parser })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Mojeek {
    /// Fetches and parses one page of Mojeek search results.
    ///
    /// # Arguments
    ///
    /// * `query` - The user-provided search query. NOTE(review): it is interpolated
    ///   into the URL without percent-encoding — assumes callers sanitize it; confirm.
    /// * `page` - Zero-based page number, translated to Mojeek's `s` (start-result) param.
    /// * `user_agent` - Value sent as the `User-Agent` request header.
    /// * `client` - Shared HTTP client used for the upstream request.
    /// * `safe_search` - Safe-search level; `0` disables it, any other value enables it.
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` when the header map cannot be built, the upstream
    /// request fails, or Mojeek reports that no pages matched the query.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Mojeek uses a `start results from this number` convention:
        // with 10 results per page, page 0 starts at 1, page 1 at 11, and so on.
        let results_per_page: u32 = 10;
        let start_result = (results_per_page * page + 1).to_string();
        let results_per_page = results_per_page.to_string();

        // Engines whose results Mojeek may blend in; joined with an already
        // percent-encoded comma (`%2C`) for the `qss` parameter.
        const SEARCH_ENGINES: [&str; 13] = [
            "Bing",
            "Brave",
            "DuckDuckGo",
            "Ecosia",
            "Google",
            "Lilo",
            "Metager",
            "Qwant",
            "Startpage",
            "Swisscows",
            "Yandex",
            "Yep",
            "You",
        ];
        let qss = SEARCH_ENGINES.join("%2C");
        let safe = if safe_search == 0 { "0" } else { "1" };

        // Mojeek detects automated requests; these preferences are able to
        // circumvent the countermeasure. Some of these are not documented in
        // their Search API.
        let query_params: [(&str, &str); 11] = [
            ("t", results_per_page.as_str()),
            ("theme", "dark"),
            ("arc", "none"),
            ("date", "1"),
            ("cdate", "1"),
            ("tlen", "100"),
            ("ref", "1"),
            ("hp", "minimal"),
            ("lb", "en"),
            ("qss", &qss),
            ("safe", safe),
        ];

        // Serialize the parameters once as URL query fragments (`&k=v`) ...
        let query_params_string: String = query_params
            .iter()
            .map(|(k, v)| format!("&{k}={v}"))
            .collect();
        // ... and once more in cookie syntax (`k=v; `), mirroring the query string.
        let cookie_string: String = query_params
            .iter()
            .map(|(k, v)| format!("{k}={v}; "))
            .collect();

        // The first page omits the `s` parameter entirely rather than sending `s=1`.
        let url: String = match page {
            0 => {
                format!("https://www.mojeek.com/search?q={query}{query_params_string}")
            }
            _ => {
                format!(
                    "https://www.mojeek.com/search?q={query}&s={start_result}{query_params_string}"
                )
            }
        };

        let header_map = HeaderMap::try_from(&HashMap::from([
            ("USER_AGENT".to_string(), user_agent.to_string()),
            ("REFERER".to_string(), "https://google.com/".to_string()),
            (
                "CONTENT_TYPE".to_string(),
                "application/x-www-form-urlencoded".to_string(),
            ),
            ("COOKIE".to_string(), cookie_string),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &Mojeek::fetch_html_from_upstream(self, &url, header_map, client).await?,
        );

        // Mojeek renders an explicit "no pages found" message instead of an empty list;
        // translate that into a proper empty-result error for the aggregator.
        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).next() {
            if no_result_msg
                .inner_html()
                .contains("No pages found matching:")
            {
                return Err(Report::new(EngineError::EmptyResultSet));
            }
        }

        // Scrape all the results from the HTML document.
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                Some(SearchResult::new(
                    title.inner_html().trim(),
                    url.inner_html().trim(),
                    desc.inner_html().trim(),
                    &["mojeek"],
                ))
            })
    }
}