Abhaykoul commited on
Commit
e218904
1 Parent(s): 797f4fc

Update webscout/webscout_search.py

Browse files
Files changed (1) hide show
  1. webscout/webscout_search.py +1055 -1055
webscout/webscout_search.py CHANGED
@@ -1,1056 +1,1056 @@
1
- import logging
2
- import warnings
3
- from concurrent.futures import ThreadPoolExecutor
4
- from datetime import datetime, timezone
5
- from decimal import Decimal
6
- from functools import cached_property
7
- from itertools import cycle, islice
8
- from threading import Event
9
- from types import TracebackType
10
- from typing import Dict, List, Optional, Tuple, Type, Union, cast
11
-
12
- import pyreqwest_impersonate as pri # type: ignore
13
-
14
- try:
15
- from lxml.etree import _Element
16
- from lxml.html import HTMLParser as LHTMLParser
17
- from lxml.html import document_fromstring
18
-
19
- LXML_AVAILABLE = True
20
- except ImportError:
21
- LXML_AVAILABLE = False
22
-
23
- from .exceptions import WebscoutE, RatelimitE, TimeoutE
24
- from .utils import (
25
- _calculate_distance,
26
- _extract_vqd,
27
- _normalize,
28
- _normalize_url,
29
- _text_extract_json,
30
- json_loads,
31
- )
32
-
33
# Module logger; handlers/levels are configured by the consuming application.
# NOTE(review): "webcout_search" looks like a typo for "webscout_search", but the
# name is part of runtime behavior (callers may configure this exact logger) —
# confirm no external config depends on it before renaming.
logger = logging.getLogger("webcout_search.WEBS")
34
-
35
-
36
class WEBS:
    """webcout_search class to get search results from duckduckgo.com."""

    # Process-wide thread pool (default worker count) shared by ALL WEBS
    # instances; the per-page fetch helpers fan out through `_executor.map`.
    # It is intentionally never shut down so repeated instances can reuse it.
    _executor: ThreadPoolExecutor = ThreadPoolExecutor()
41
- def __init__(
42
- self,
43
- headers: Optional[Dict[str, str]] = None,
44
- proxy: Optional[str] = None,
45
- proxies: Union[Dict[str, str], str, None] = None, # deprecated
46
- timeout: Optional[int] = 10,
47
- ) -> None:
48
- """Initialize the WEBS object.
49
-
50
- Args:
51
- headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
52
- proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
53
- example: "http://user:[email protected]:3128". Defaults to None.
54
- timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
55
- """
56
- self.proxy: Optional[str] = proxy
57
- assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
58
- if not proxy and proxies:
59
- warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
60
- self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
61
- self.headers = headers if headers else {}
62
- self.headers["Referer"] = "https://duckduckgo.com/"
63
- self.client = pri.Client(
64
- headers=self.headers,
65
- proxy=self.proxy,
66
- timeout=timeout,
67
- cookie_store=True,
68
- referer=True,
69
- impersonate="chrome_124",
70
- follow_redirects=False,
71
- verify=False,
72
- )
73
- self._exception_event = Event()
74
- self._chat_messages: List[Dict[str, str]] = []
75
- self._chat_vqd: str = ""
76
-
77
- def __enter__(self) -> "WEBS":
78
- return self
79
-
80
- def __exit__(
81
- self,
82
- exc_type: Optional[Type[BaseException]] = None,
83
- exc_val: Optional[BaseException] = None,
84
- exc_tb: Optional[TracebackType] = None,
85
- ) -> None:
86
- pass
87
-
88
- @cached_property
89
- def parser(self) -> "LHTMLParser":
90
- """Get HTML parser."""
91
- return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)
92
-
93
- def _get_url(
94
- self,
95
- method: str,
96
- url: str,
97
- params: Optional[Dict[str, str]] = None,
98
- content: Optional[bytes] = None,
99
- data: Optional[Union[Dict[str, str], bytes]] = None,
100
- ) -> bytes:
101
- if self._exception_event.is_set():
102
- raise WebscoutE("Exception occurred in previous call.")
103
- try:
104
- resp = self.client.request(method, url, params=params, content=content, data=data)
105
- except Exception as ex:
106
- self._exception_event.set()
107
- if "time" in str(ex).lower():
108
- raise TimeoutE(f"{url} {type(ex).__name__}: {ex}") from ex
109
- raise WebscoutE(f"{url} {type(ex).__name__}: {ex}") from ex
110
- logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
111
- if resp.status_code == 200:
112
- return cast(bytes, resp.content)
113
- self._exception_event.set()
114
- if resp.status_code in (202, 301, 403):
115
- raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
116
- raise WebscoutE(f"{resp.url} return None. {params=} {content=} {data=}")
117
-
118
- def _get_vqd(self, keywords: str) -> str:
119
- """Get vqd value for a search query."""
120
- resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
121
- return _extract_vqd(resp_content, keywords)
122
-
123
- def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
124
- """Initiates a chat session with Webscout AI.
125
-
126
- Args:
127
- keywords (str): The initial message or question to send to the AI.
128
- model (str): The model to use: "gpt-3.5", "claude-3-haiku". Defaults to "gpt-3.5".
129
-
130
- Returns:
131
- str: The response from the AI.
132
- """
133
- models = {"claude-3-haiku": "claude-3-haiku-20240307", "gpt-3.5": "gpt-3.5-turbo-0125"}
134
- # vqd
135
- if not self._chat_vqd:
136
- resp = self.client.get("https://duckduckgo.com/duckchat/v1/status", headers={"x-vqd-accept": "1"})
137
- self._chat_vqd = resp.headers.get("x-vqd-4", "")
138
-
139
- self._chat_messages.append({"role": "user", "content": keywords})
140
-
141
- json_data = {
142
- "model": models[model],
143
- "messages": self._chat_messages,
144
- }
145
- resp = self.client.post(
146
- "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
147
- )
148
- self._chat_vqd = resp.headers.get("x-vqd-4", "")
149
-
150
- messages = []
151
- for line in resp.text.replace("data: ", "").replace("[DONE]", "").split("\n\n"):
152
- x = line.strip()
153
- if x:
154
- j = json_loads(x)
155
- message = j.get("message", "")
156
- messages.append(message)
157
- result = "".join(messages)
158
- self._chat_messages.append({"role": "assistant", "content": result})
159
- return result
160
-
161
- def text(
162
- self,
163
- keywords: str,
164
- region: str = "wt-wt",
165
- safesearch: str = "moderate",
166
- timelimit: Optional[str] = None,
167
- backend: str = "api",
168
- max_results: Optional[int] = None,
169
- ) -> List[Dict[str, str]]:
170
- """Webscout text search. Query params: https://duckduckgo.com/params.
171
-
172
- Args:
173
- keywords: keywords for query.
174
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
175
- safesearch: on, moderate, off. Defaults to "moderate".
176
- timelimit: d, w, m, y. Defaults to None.
177
- backend: api, html, lite. Defaults to api.
178
- api - collect data from https://duckduckgo.com,
179
- html - collect data from https://html.duckduckgo.com,
180
- lite - collect data from https://lite.duckduckgo.com.
181
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
182
-
183
- Returns:
184
- List of dictionaries with search results, or None if there was an error.
185
-
186
- Raises:
187
- WebscoutE: Base exception for webcout_search errors.
188
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
189
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
190
- """
191
- if LXML_AVAILABLE is False and backend != "api":
192
- backend = "api"
193
- warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
194
-
195
- if backend == "api":
196
- results = self._text_api(keywords, region, safesearch, timelimit, max_results)
197
- elif backend == "html":
198
- results = self._text_html(keywords, region, safesearch, timelimit, max_results)
199
- elif backend == "lite":
200
- results = self._text_lite(keywords, region, timelimit, max_results)
201
- return results
202
-
203
- def _text_api(
204
- self,
205
- keywords: str,
206
- region: str = "wt-wt",
207
- safesearch: str = "moderate",
208
- timelimit: Optional[str] = None,
209
- max_results: Optional[int] = None,
210
- ) -> List[Dict[str, str]]:
211
- """Webscout text search. Query params: https://duckduckgo.com/params.
212
-
213
- Args:
214
- keywords: keywords for query.
215
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
216
- safesearch: on, moderate, off. Defaults to "moderate".
217
- timelimit: d, w, m, y. Defaults to None.
218
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
219
-
220
- Returns:
221
- List of dictionaries with search results.
222
-
223
- Raises:
224
- WebscoutE: Base exception for webcout_search errors.
225
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
226
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
227
- """
228
- assert keywords, "keywords is mandatory"
229
-
230
- vqd = self._get_vqd(keywords)
231
-
232
- payload = {
233
- "q": keywords,
234
- "kl": region,
235
- "l": region,
236
- "p": "",
237
- "s": "0",
238
- "df": "",
239
- "vqd": vqd,
240
- "ex": "",
241
- }
242
- safesearch = safesearch.lower()
243
- if safesearch == "moderate":
244
- payload["ex"] = "-1"
245
- elif safesearch == "off":
246
- payload["ex"] = "-2"
247
- elif safesearch == "on": # strict
248
- payload["p"] = "1"
249
- if timelimit:
250
- payload["df"] = timelimit
251
-
252
- cache = set()
253
- results: List[Dict[str, str]] = []
254
-
255
- def _text_api_page(s: int) -> List[Dict[str, str]]:
256
- payload["s"] = f"{s}"
257
- resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
258
- page_data = _text_extract_json(resp_content, keywords)
259
- page_results = []
260
- for row in page_data:
261
- href = row.get("u", None)
262
- if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
263
- cache.add(href)
264
- body = _normalize(row["a"])
265
- if body:
266
- result = {
267
- "title": _normalize(row["t"]),
268
- "href": _normalize_url(href),
269
- "body": body,
270
- }
271
- page_results.append(result)
272
- return page_results
273
-
274
- slist = [0]
275
- if max_results:
276
- max_results = min(max_results, 500)
277
- slist.extend(range(23, max_results, 50))
278
- try:
279
- for r in self._executor.map(_text_api_page, slist):
280
- results.extend(r)
281
- except Exception as e:
282
- raise e
283
-
284
- return list(islice(results, max_results))
285
-
286
- def _text_html(
287
- self,
288
- keywords: str,
289
- region: str = "wt-wt",
290
- safesearch: str = "moderate",
291
- timelimit: Optional[str] = None,
292
- max_results: Optional[int] = None,
293
- ) -> List[Dict[str, str]]:
294
- """Webscout text search. Query params: https://duckduckgo.com/params.
295
-
296
- Args:
297
- keywords: keywords for query.
298
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
299
- safesearch: on, moderate, off. Defaults to "moderate".
300
- timelimit: d, w, m, y. Defaults to None.
301
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
302
-
303
- Returns:
304
- List of dictionaries with search results.
305
-
306
- Raises:
307
- WebscoutE: Base exception for webcout_search errors.
308
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
309
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
310
- """
311
- assert keywords, "keywords is mandatory"
312
-
313
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
314
- payload = {
315
- "q": keywords,
316
- "kl": region,
317
- "p": safesearch_base[safesearch.lower()],
318
- "o": "json",
319
- "api": "d.js",
320
- }
321
- if timelimit:
322
- payload["df"] = timelimit
323
- if max_results and max_results > 20:
324
- vqd = self._get_vqd(keywords)
325
- payload["vqd"] = vqd
326
-
327
- cache = set()
328
- results: List[Dict[str, str]] = []
329
-
330
- def _text_html_page(s: int) -> List[Dict[str, str]]:
331
- payload["s"] = f"{s}"
332
- resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
333
- if b"No results." in resp_content:
334
- return []
335
-
336
- page_results = []
337
- tree = document_fromstring(resp_content, self.parser)
338
- elements = tree.xpath("//div[h2]")
339
- if not isinstance(elements, List):
340
- return []
341
- for e in elements:
342
- if isinstance(e, _Element):
343
- hrefxpath = e.xpath("./a/@href")
344
- href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
345
- if (
346
- href
347
- and href not in cache
348
- and not href.startswith(
349
- ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
350
- )
351
- ):
352
- cache.add(href)
353
- titlexpath = e.xpath("./h2/a/text()")
354
- title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
355
- bodyxpath = e.xpath("./a//text()")
356
- body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
357
- result = {
358
- "title": _normalize(title),
359
- "href": _normalize_url(href),
360
- "body": _normalize(body),
361
- }
362
- page_results.append(result)
363
- return page_results
364
-
365
- slist = [0]
366
- if max_results:
367
- max_results = min(max_results, 500)
368
- slist.extend(range(23, max_results, 50))
369
- try:
370
- for r in self._executor.map(_text_html_page, slist):
371
- results.extend(r)
372
- except Exception as e:
373
- raise e
374
-
375
- return list(islice(results, max_results))
376
-
377
    def _text_lite(
        self,
        keywords: str,
        region: str = "wt-wt",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout text search. Query params: https://duckduckgo.com/params.

        Scrapes https://lite.duckduckgo.com/lite/ (requires lxml). Result rows
        arrive as <tr> elements in repeating groups of four: the 1st row holds
        the link/title, the 2nd the snippet; rows 3 and 4 are ignored.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webcout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "o": "json",
            "api": "d.js",
            "kl": region,
        }
        if timelimit:
            payload["df"] = timelimit

        cache = set()  # hrefs already emitted, deduped across pages
        results: List[Dict[str, str]] = []

        def _text_lite_page(s: int) -> List[Dict[str, str]]:
            # Fetch one page at offset `s`; shares payload/cache with siblings.
            payload["s"] = f"{s}"
            resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
            if b"No more results." in resp_content:
                return []

            page_results = []
            tree = document_fromstring(resp_content, self.parser)
            elements = tree.xpath("//table[last()]//tr")
            if not isinstance(elements, List):
                return []

            # Tag each <tr> with its position (1..4) inside a result group.
            # `data` is consumed both by the for-loop and by the skip below.
            data = zip(cycle(range(1, 5)), elements)
            for i, e in data:
                if isinstance(e, _Element):
                    if i == 1:
                        hrefxpath = e.xpath(".//a//@href")
                        href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
                        if (
                            href is None
                            or href in cache
                            or href.startswith(
                                ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
                            )
                        ):
                            # Consume the remaining 3 rows of this group so the
                            # cycle counter stays aligned with result groups.
                            [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
                        else:
                            cache.add(href)
                            titlexpath = e.xpath(".//a//text()")
                            title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
                    elif i == 2:
                        # Snippet row: emit the result started on the i==1 row
                        # (href/title carry over from that iteration).
                        bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
                        body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
                        if href:
                            result = {
                                "title": _normalize(title),
                                "href": _normalize_url(href),
                                "body": _normalize(body),
                            }
                            page_results.append(result)
            return page_results

        # Page offsets: first page at 0, then every 50 starting at 23 (cap 500).
        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(23, max_results, 50))
        try:
            for r in self._executor.map(_text_lite_page, slist):
                results.extend(r)
        except Exception as e:
            raise e

        return list(islice(results, max_results))
467
-
468
- def images(
469
- self,
470
- keywords: str,
471
- region: str = "wt-wt",
472
- safesearch: str = "moderate",
473
- timelimit: Optional[str] = None,
474
- size: Optional[str] = None,
475
- color: Optional[str] = None,
476
- type_image: Optional[str] = None,
477
- layout: Optional[str] = None,
478
- license_image: Optional[str] = None,
479
- max_results: Optional[int] = None,
480
- ) -> List[Dict[str, str]]:
481
- """Webscout images search. Query params: https://duckduckgo.com/params.
482
-
483
- Args:
484
- keywords: keywords for query.
485
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
486
- safesearch: on, moderate, off. Defaults to "moderate".
487
- timelimit: Day, Week, Month, Year. Defaults to None.
488
- size: Small, Medium, Large, Wallpaper. Defaults to None.
489
- color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
490
- Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
491
- type_image: photo, clipart, gif, transparent, line.
492
- Defaults to None.
493
- layout: Square, Tall, Wide. Defaults to None.
494
- license_image: any (All Creative Commons), Public (PublicDomain),
495
- Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
496
- Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
497
- Use Commercially). Defaults to None.
498
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
499
-
500
- Returns:
501
- List of dictionaries with images search results.
502
-
503
- Raises:
504
- WebscoutE: Base exception for webcout_search errors.
505
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
506
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
507
- """
508
- assert keywords, "keywords is mandatory"
509
-
510
- vqd = self._get_vqd(keywords)
511
-
512
- safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
513
- timelimit = f"time:{timelimit}" if timelimit else ""
514
- size = f"size:{size}" if size else ""
515
- color = f"color:{color}" if color else ""
516
- type_image = f"type:{type_image}" if type_image else ""
517
- layout = f"layout:{layout}" if layout else ""
518
- license_image = f"license:{license_image}" if license_image else ""
519
- payload = {
520
- "l": region,
521
- "o": "json",
522
- "q": keywords,
523
- "vqd": vqd,
524
- "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
525
- "p": safesearch_base[safesearch.lower()],
526
- }
527
-
528
- cache = set()
529
- results: List[Dict[str, str]] = []
530
-
531
- def _images_page(s: int) -> List[Dict[str, str]]:
532
- payload["s"] = f"{s}"
533
- resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
534
- resp_json = json_loads(resp_content)
535
-
536
- page_data = resp_json.get("results", [])
537
- page_results = []
538
- for row in page_data:
539
- image_url = row.get("image")
540
- if image_url and image_url not in cache:
541
- cache.add(image_url)
542
- result = {
543
- "title": row["title"],
544
- "image": _normalize_url(image_url),
545
- "thumbnail": _normalize_url(row["thumbnail"]),
546
- "url": _normalize_url(row["url"]),
547
- "height": row["height"],
548
- "width": row["width"],
549
- "source": row["source"],
550
- }
551
- page_results.append(result)
552
- return page_results
553
-
554
- slist = [0]
555
- if max_results:
556
- max_results = min(max_results, 500)
557
- slist.extend(range(100, max_results, 100))
558
- try:
559
- for r in self._executor.map(_images_page, slist):
560
- results.extend(r)
561
- except Exception as e:
562
- raise e
563
-
564
- return list(islice(results, max_results))
565
-
566
- def videos(
567
- self,
568
- keywords: str,
569
- region: str = "wt-wt",
570
- safesearch: str = "moderate",
571
- timelimit: Optional[str] = None,
572
- resolution: Optional[str] = None,
573
- duration: Optional[str] = None,
574
- license_videos: Optional[str] = None,
575
- max_results: Optional[int] = None,
576
- ) -> List[Dict[str, str]]:
577
- """Webscout videos search. Query params: https://duckduckgo.com/params.
578
-
579
- Args:
580
- keywords: keywords for query.
581
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
582
- safesearch: on, moderate, off. Defaults to "moderate".
583
- timelimit: d, w, m. Defaults to None.
584
- resolution: high, standart. Defaults to None.
585
- duration: short, medium, long. Defaults to None.
586
- license_videos: creativeCommon, youtube. Defaults to None.
587
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
588
-
589
- Returns:
590
- List of dictionaries with videos search results.
591
-
592
- Raises:
593
- WebscoutE: Base exception for webcout_search errors.
594
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
595
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
596
- """
597
- assert keywords, "keywords is mandatory"
598
-
599
- vqd = self._get_vqd(keywords)
600
-
601
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
602
- timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
603
- resolution = f"videoDefinition:{resolution}" if resolution else ""
604
- duration = f"videoDuration:{duration}" if duration else ""
605
- license_videos = f"videoLicense:{license_videos}" if license_videos else ""
606
- payload = {
607
- "l": region,
608
- "o": "json",
609
- "q": keywords,
610
- "vqd": vqd,
611
- "f": f"{timelimit},{resolution},{duration},{license_videos}",
612
- "p": safesearch_base[safesearch.lower()],
613
- }
614
-
615
- cache = set()
616
- results: List[Dict[str, str]] = []
617
-
618
- def _videos_page(s: int) -> List[Dict[str, str]]:
619
- payload["s"] = f"{s}"
620
- resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
621
- resp_json = json_loads(resp_content)
622
-
623
- page_data = resp_json.get("results", [])
624
- page_results = []
625
- for row in page_data:
626
- if row["content"] not in cache:
627
- cache.add(row["content"])
628
- page_results.append(row)
629
- return page_results
630
-
631
- slist = [0]
632
- if max_results:
633
- max_results = min(max_results, 400)
634
- slist.extend(range(59, max_results, 59))
635
- try:
636
- for r in self._executor.map(_videos_page, slist):
637
- results.extend(r)
638
- except Exception as e:
639
- raise e
640
-
641
- return list(islice(results, max_results))
642
-
643
- def news(
644
- self,
645
- keywords: str,
646
- region: str = "wt-wt",
647
- safesearch: str = "moderate",
648
- timelimit: Optional[str] = None,
649
- max_results: Optional[int] = None,
650
- ) -> List[Dict[str, str]]:
651
- """Webscout news search. Query params: https://duckduckgo.com/params.
652
-
653
- Args:
654
- keywords: keywords for query.
655
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
656
- safesearch: on, moderate, off. Defaults to "moderate".
657
- timelimit: d, w, m. Defaults to None.
658
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
659
-
660
- Returns:
661
- List of dictionaries with news search results.
662
-
663
- Raises:
664
- WebscoutE: Base exception for webcout_search errors.
665
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
666
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
667
- """
668
- assert keywords, "keywords is mandatory"
669
-
670
- vqd = self._get_vqd(keywords)
671
-
672
- safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
673
- payload = {
674
- "l": region,
675
- "o": "json",
676
- "noamp": "1",
677
- "q": keywords,
678
- "vqd": vqd,
679
- "p": safesearch_base[safesearch.lower()],
680
- }
681
- if timelimit:
682
- payload["df"] = timelimit
683
-
684
- cache = set()
685
- results: List[Dict[str, str]] = []
686
-
687
- def _news_page(s: int) -> List[Dict[str, str]]:
688
- payload["s"] = f"{s}"
689
- resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
690
- resp_json = json_loads(resp_content)
691
- page_data = resp_json.get("results", [])
692
- page_results = []
693
- for row in page_data:
694
- if row["url"] not in cache:
695
- cache.add(row["url"])
696
- image_url = row.get("image", None)
697
- result = {
698
- "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
699
- "title": row["title"],
700
- "body": _normalize(row["excerpt"]),
701
- "url": _normalize_url(row["url"]),
702
- "image": _normalize_url(image_url),
703
- "source": row["source"],
704
- }
705
- page_results.append(result)
706
- return page_results
707
-
708
- slist = [0]
709
- if max_results:
710
- max_results = min(max_results, 200)
711
- slist.extend(range(29, max_results, 29))
712
- try:
713
- for r in self._executor.map(_news_page, slist):
714
- results.extend(r)
715
- except Exception as e:
716
- raise e
717
-
718
- return list(islice(results, max_results))
719
-
720
- def answers(self, keywords: str) -> List[Dict[str, str]]:
721
- """Webscout instant answers. Query params: https://duckduckgo.com/params.
722
-
723
- Args:
724
- keywords: keywords for query,
725
-
726
- Returns:
727
- List of dictionaries with instant answers results.
728
-
729
- Raises:
730
- WebscoutE: Base exception for webcout_search errors.
731
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
732
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
733
- """
734
- assert keywords, "keywords is mandatory"
735
-
736
- payload = {
737
- "q": f"what is {keywords}",
738
- "format": "json",
739
- }
740
- resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
741
- page_data = json_loads(resp_content)
742
-
743
- results = []
744
- answer = page_data.get("AbstractText")
745
- url = page_data.get("AbstractURL")
746
- if answer:
747
- results.append(
748
- {
749
- "icon": None,
750
- "text": answer,
751
- "topic": None,
752
- "url": url,
753
- }
754
- )
755
-
756
- # related
757
- payload = {
758
- "q": f"{keywords}",
759
- "format": "json",
760
- }
761
- resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
762
- resp_json = json_loads(resp_content)
763
- page_data = resp_json.get("RelatedTopics", [])
764
-
765
- for row in page_data:
766
- topic = row.get("Name")
767
- if not topic:
768
- icon = row["Icon"].get("URL")
769
- results.append(
770
- {
771
- "icon": f"https://duckduckgo.com{icon}" if icon else "",
772
- "text": row["Text"],
773
- "topic": None,
774
- "url": row["FirstURL"],
775
- }
776
- )
777
- else:
778
- for subrow in row["Topics"]:
779
- icon = subrow["Icon"].get("URL")
780
- results.append(
781
- {
782
- "icon": f"https://duckduckgo.com{icon}" if icon else "",
783
- "text": subrow["Text"],
784
- "topic": topic,
785
- "url": subrow["FirstURL"],
786
- }
787
- )
788
-
789
- return results
790
-
791
- def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
792
- """Webscout suggestions. Query params: https://duckduckgo.com/params.
793
-
794
- Args:
795
- keywords: keywords for query.
796
- region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
797
-
798
- Returns:
799
- List of dictionaries with suggestions results.
800
-
801
- Raises:
802
- WebscoutE: Base exception for webcout_search errors.
803
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
804
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
805
- """
806
- assert keywords, "keywords is mandatory"
807
-
808
- payload = {
809
- "q": keywords,
810
- "kl": region,
811
- }
812
- resp_content = self._get_url("GET", "https://duckduckgo.com/ac/", params=payload)
813
- page_data = json_loads(resp_content)
814
- return [r for r in page_data]
815
-
816
- def maps(
817
- self,
818
- keywords: str,
819
- place: Optional[str] = None,
820
- street: Optional[str] = None,
821
- city: Optional[str] = None,
822
- county: Optional[str] = None,
823
- state: Optional[str] = None,
824
- country: Optional[str] = None,
825
- postalcode: Optional[str] = None,
826
- latitude: Optional[str] = None,
827
- longitude: Optional[str] = None,
828
- radius: int = 0,
829
- max_results: Optional[int] = None,
830
- ) -> List[Dict[str, str]]:
831
- """Webscout maps search. Query params: https://duckduckgo.com/params.
832
-
833
- Args:
834
- keywords: keywords for query
835
- place: if set, the other parameters are not used. Defaults to None.
836
- street: house number/street. Defaults to None.
837
- city: city of search. Defaults to None.
838
- county: county of search. Defaults to None.
839
- state: state of search. Defaults to None.
840
- country: country of search. Defaults to None.
841
- postalcode: postalcode of search. Defaults to None.
842
- latitude: geographic coordinate (north-south position). Defaults to None.
843
- longitude: geographic coordinate (east-west position); if latitude and
844
- longitude are set, the other parameters are not used. Defaults to None.
845
- radius: expand the search square by the distance in kilometers. Defaults to 0.
846
- max_results: max number of results. If None, returns results only from the first response. Defaults to None.
847
-
848
- Returns:
849
- List of dictionaries with maps search results, or None if there was an error.
850
-
851
- Raises:
852
- WebscoutE: Base exception for webcout_search errors.
853
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
854
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
855
- """
856
- assert keywords, "keywords is mandatory"
857
-
858
- vqd = self._get_vqd(keywords)
859
-
860
- # if longitude and latitude are specified, skip the request about bbox to the nominatim api
861
- if latitude and longitude:
862
- lat_t = Decimal(latitude.replace(",", "."))
863
- lat_b = Decimal(latitude.replace(",", "."))
864
- lon_l = Decimal(longitude.replace(",", "."))
865
- lon_r = Decimal(longitude.replace(",", "."))
866
- if radius == 0:
867
- radius = 1
868
- # otherwise request about bbox to nominatim api
869
- else:
870
- if place:
871
- params = {
872
- "q": place,
873
- "polygon_geojson": "0",
874
- "format": "jsonv2",
875
- }
876
- else:
877
- params = {
878
- "polygon_geojson": "0",
879
- "format": "jsonv2",
880
- }
881
- if street:
882
- params["street"] = street
883
- if city:
884
- params["city"] = city
885
- if county:
886
- params["county"] = county
887
- if state:
888
- params["state"] = state
889
- if country:
890
- params["country"] = country
891
- if postalcode:
892
- params["postalcode"] = postalcode
893
- # request nominatim api to get coordinates box
894
- resp_content = self._get_url(
895
- "GET",
896
- "https://nominatim.openstreetmap.org/search.php",
897
- params=params,
898
- )
899
- if resp_content == b"[]":
900
- raise WebscoutE("maps() Coordinates are not found, check function parameters.")
901
- resp_json = json_loads(resp_content)
902
- coordinates = resp_json[0]["boundingbox"]
903
- lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
904
- lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
905
-
906
- # if a radius is specified, expand the search square
907
- lat_t += Decimal(radius) * Decimal(0.008983)
908
- lat_b -= Decimal(radius) * Decimal(0.008983)
909
- lon_l -= Decimal(radius) * Decimal(0.008983)
910
- lon_r += Decimal(radius) * Decimal(0.008983)
911
- logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
912
-
913
- cache = set()
914
- results: List[Dict[str, str]] = []
915
-
916
- def _maps_page(
917
- bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
918
- ) -> Optional[List[Dict[str, str]]]:
919
- if max_results and len(results) >= max_results:
920
- return None
921
- lat_t, lon_l, lat_b, lon_r = bbox
922
- params = {
923
- "q": keywords,
924
- "vqd": vqd,
925
- "tg": "maps_places",
926
- "rt": "D",
927
- "mkexp": "b",
928
- "wiki_info": "1",
929
- "is_requery": "1",
930
- "bbox_tl": f"{lat_t},{lon_l}",
931
- "bbox_br": f"{lat_b},{lon_r}",
932
- "strict_bbox": "1",
933
- }
934
- resp_content = self._get_url("GET", "https://duckduckgo.com/local.js", params=params)
935
- resp_json = json_loads(resp_content)
936
- page_data = resp_json.get("results", [])
937
-
938
- page_results = []
939
- for res in page_data:
940
- r_name = f'{res["name"]} {res["address"]}'
941
- if r_name in cache:
942
- continue
943
- else:
944
- cache.add(r_name)
945
- result = {
946
- "title": res["name"],
947
- "address": res["address"],
948
- "country_code": res["country_code"],
949
- "url": _normalize_url(res["website"]),
950
- "phone": res["phone"] or "",
951
- "latitude": res["coordinates"]["latitude"],
952
- "longitude": res["coordinates"]["longitude"],
953
- "source": _normalize_url(res["url"]),
954
- "image": x.get("image", "") if (x := res["embed"]) else "",
955
- "desc": x.get("description", "") if (x := res["embed"]) else "",
956
- "hours": res["hours"] or "",
957
- "category": res["ddg_category"] or "",
958
- "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
959
- "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
960
- "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
961
- }
962
- page_results.append(result)
963
- return page_results
964
-
965
- # search squares (bboxes)
966
- start_bbox = (lat_t, lon_l, lat_b, lon_r)
967
- work_bboxes = [start_bbox]
968
- while work_bboxes:
969
- queue_bboxes = [] # for next iteration, at the end of the iteration work_bboxes = queue_bboxes
970
- tasks = []
971
- for bbox in work_bboxes:
972
- tasks.append(bbox)
973
- # if distance between coordinates > 1, divide the square into 4 parts and save them in queue_bboxes
974
- if _calculate_distance(lat_t, lon_l, lat_b, lon_r) > 1:
975
- lat_t, lon_l, lat_b, lon_r = bbox
976
- lat_middle = (lat_t + lat_b) / 2
977
- lon_middle = (lon_l + lon_r) / 2
978
- bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
979
- bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
980
- bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
981
- bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
982
- queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])
983
-
984
- # gather tasks using asyncio.wait_for and timeout
985
- work_bboxes_results = []
986
- try:
987
- for r in self._executor.map(_maps_page, tasks):
988
- if r:
989
- work_bboxes_results.extend(r)
990
- except Exception as e:
991
- raise e
992
-
993
- for x in work_bboxes_results:
994
- if isinstance(x, list):
995
- results.extend(x)
996
- elif isinstance(x, dict):
997
- results.append(x)
998
-
999
- work_bboxes = queue_bboxes
1000
- if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
1001
- break
1002
-
1003
- return list(islice(results, max_results))
1004
-
1005
- def translate(
1006
- self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
1007
- ) -> List[Dict[str, str]]:
1008
- """Webscout translate.
1009
-
1010
- Args:
1011
- keywords: string or list of strings to translate.
1012
- from_: translate from (defaults automatically). Defaults to None.
1013
- to: what language to translate. Defaults to "en".
1014
-
1015
- Returns:
1016
- List od dictionaries with translated keywords.
1017
-
1018
- Raises:
1019
- WebscoutE: Base exception for webcout_search errors.
1020
- RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
1021
- TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
1022
- """
1023
- assert keywords, "keywords is mandatory"
1024
-
1025
- vqd = self._get_vqd("translate")
1026
-
1027
- payload = {
1028
- "vqd": vqd,
1029
- "query": "translate",
1030
- "to": to,
1031
- }
1032
- if from_:
1033
- payload["from"] = from_
1034
-
1035
- def _translate_keyword(keyword: str) -> Dict[str, str]:
1036
- resp_content = self._get_url(
1037
- "POST",
1038
- "https://duckduckgo.com/translation.js",
1039
- params=payload,
1040
- content=keyword.encode(),
1041
- )
1042
- page_data: Dict[str, str] = json_loads(resp_content)
1043
- page_data["original"] = keyword
1044
- return page_data
1045
-
1046
- if isinstance(keywords, str):
1047
- keywords = [keywords]
1048
-
1049
- results = []
1050
- try:
1051
- for r in self._executor.map(_translate_keyword, keywords):
1052
- results.append(r)
1053
- except Exception as e:
1054
- raise e
1055
-
1056
  return results
 
1
+ import logging
2
+ import warnings
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from datetime import datetime, timezone
5
+ from decimal import Decimal
6
+ from functools import cached_property
7
+ from itertools import cycle, islice
8
+ from threading import Event
9
+ from types import TracebackType
10
+ from typing import Dict, List, Optional, Tuple, Type, Union, cast
11
+
12
+ import pyreqwest_impersonate as pri # type: ignore
13
+
14
+ try:
15
+ from lxml.etree import _Element
16
+ from lxml.html import HTMLParser as LHTMLParser
17
+ from lxml.html import document_fromstring
18
+
19
+ LXML_AVAILABLE = True
20
+ except ImportError:
21
+ LXML_AVAILABLE = False
22
+
23
+ from webscout.utils.exceptions import WebscoutE, RatelimitE, TimeoutE
24
+ from webscout.utils.utils import (
25
+ _calculate_distance,
26
+ _extract_vqd,
27
+ _normalize,
28
+ _normalize_url,
29
+ _text_extract_json,
30
+ json_loads,
31
+ )
32
+
33
+ logger = logging.getLogger("webcout_search.WEBS")
34
+
35
+
36
+ class WEBS:
37
+ """webcout_search class to get search results from duckduckgo.com."""
38
+
39
+ _executor: ThreadPoolExecutor = ThreadPoolExecutor()
40
+
41
+ def __init__(
42
+ self,
43
+ headers: Optional[Dict[str, str]] = None,
44
+ proxy: Optional[str] = None,
45
+ proxies: Union[Dict[str, str], str, None] = None, # deprecated
46
+ timeout: Optional[int] = 10,
47
+ ) -> None:
48
+ """Initialize the WEBS object.
49
+
50
+ Args:
51
+ headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
52
+ proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
53
+ example: "http://user:[email protected]:3128". Defaults to None.
54
+ timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
55
+ """
56
+ self.proxy: Optional[str] = proxy
57
+ assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
58
+ if not proxy and proxies:
59
+ warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
60
+ self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
61
+ self.headers = headers if headers else {}
62
+ self.headers["Referer"] = "https://duckduckgo.com/"
63
+ self.client = pri.Client(
64
+ headers=self.headers,
65
+ proxy=self.proxy,
66
+ timeout=timeout,
67
+ cookie_store=True,
68
+ referer=True,
69
+ impersonate="chrome_124",
70
+ follow_redirects=False,
71
+ verify=False,
72
+ )
73
+ self._exception_event = Event()
74
+ self._chat_messages: List[Dict[str, str]] = []
75
+ self._chat_vqd: str = ""
76
+
77
+ def __enter__(self) -> "WEBS":
78
+ return self
79
+
80
+ def __exit__(
81
+ self,
82
+ exc_type: Optional[Type[BaseException]] = None,
83
+ exc_val: Optional[BaseException] = None,
84
+ exc_tb: Optional[TracebackType] = None,
85
+ ) -> None:
86
+ pass
87
+
88
+ @cached_property
89
+ def parser(self) -> "LHTMLParser":
90
+ """Get HTML parser."""
91
+ return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)
92
+
93
+ def _get_url(
94
+ self,
95
+ method: str,
96
+ url: str,
97
+ params: Optional[Dict[str, str]] = None,
98
+ content: Optional[bytes] = None,
99
+ data: Optional[Union[Dict[str, str], bytes]] = None,
100
+ ) -> bytes:
101
+ if self._exception_event.is_set():
102
+ raise WebscoutE("Exception occurred in previous call.")
103
+ try:
104
+ resp = self.client.request(method, url, params=params, content=content, data=data)
105
+ except Exception as ex:
106
+ self._exception_event.set()
107
+ if "time" in str(ex).lower():
108
+ raise TimeoutE(f"{url} {type(ex).__name__}: {ex}") from ex
109
+ raise WebscoutE(f"{url} {type(ex).__name__}: {ex}") from ex
110
+ logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
111
+ if resp.status_code == 200:
112
+ return cast(bytes, resp.content)
113
+ self._exception_event.set()
114
+ if resp.status_code in (202, 301, 403):
115
+ raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
116
+ raise WebscoutE(f"{resp.url} return None. {params=} {content=} {data=}")
117
+
118
+ def _get_vqd(self, keywords: str) -> str:
119
+ """Get vqd value for a search query."""
120
+ resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
121
+ return _extract_vqd(resp_content, keywords)
122
+
123
+ def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
124
+ """Initiates a chat session with Webscout AI.
125
+
126
+ Args:
127
+ keywords (str): The initial message or question to send to the AI.
128
+ model (str): The model to use: "gpt-3.5", "claude-3-haiku". Defaults to "gpt-3.5".
129
+
130
+ Returns:
131
+ str: The response from the AI.
132
+ """
133
+ models = {"claude-3-haiku": "claude-3-haiku-20240307", "gpt-3.5": "gpt-3.5-turbo-0125"}
134
+ # vqd
135
+ if not self._chat_vqd:
136
+ resp = self.client.get("https://duckduckgo.com/duckchat/v1/status", headers={"x-vqd-accept": "1"})
137
+ self._chat_vqd = resp.headers.get("x-vqd-4", "")
138
+
139
+ self._chat_messages.append({"role": "user", "content": keywords})
140
+
141
+ json_data = {
142
+ "model": models[model],
143
+ "messages": self._chat_messages,
144
+ }
145
+ resp = self.client.post(
146
+ "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
147
+ )
148
+ self._chat_vqd = resp.headers.get("x-vqd-4", "")
149
+
150
+ messages = []
151
+ for line in resp.text.replace("data: ", "").replace("[DONE]", "").split("\n\n"):
152
+ x = line.strip()
153
+ if x:
154
+ j = json_loads(x)
155
+ message = j.get("message", "")
156
+ messages.append(message)
157
+ result = "".join(messages)
158
+ self._chat_messages.append({"role": "assistant", "content": result})
159
+ return result
160
+
161
+ def text(
162
+ self,
163
+ keywords: str,
164
+ region: str = "wt-wt",
165
+ safesearch: str = "moderate",
166
+ timelimit: Optional[str] = None,
167
+ backend: str = "api",
168
+ max_results: Optional[int] = None,
169
+ ) -> List[Dict[str, str]]:
170
+ """Webscout text search. Query params: https://duckduckgo.com/params.
171
+
172
+ Args:
173
+ keywords: keywords for query.
174
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
175
+ safesearch: on, moderate, off. Defaults to "moderate".
176
+ timelimit: d, w, m, y. Defaults to None.
177
+ backend: api, html, lite. Defaults to api.
178
+ api - collect data from https://duckduckgo.com,
179
+ html - collect data from https://html.duckduckgo.com,
180
+ lite - collect data from https://lite.duckduckgo.com.
181
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
182
+
183
+ Returns:
184
+ List of dictionaries with search results, or None if there was an error.
185
+
186
+ Raises:
187
+ WebscoutE: Base exception for webcout_search errors.
188
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
189
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
190
+ """
191
+ if LXML_AVAILABLE is False and backend != "api":
192
+ backend = "api"
193
+ warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
194
+
195
+ if backend == "api":
196
+ results = self._text_api(keywords, region, safesearch, timelimit, max_results)
197
+ elif backend == "html":
198
+ results = self._text_html(keywords, region, safesearch, timelimit, max_results)
199
+ elif backend == "lite":
200
+ results = self._text_lite(keywords, region, timelimit, max_results)
201
+ return results
202
+
203
+ def _text_api(
204
+ self,
205
+ keywords: str,
206
+ region: str = "wt-wt",
207
+ safesearch: str = "moderate",
208
+ timelimit: Optional[str] = None,
209
+ max_results: Optional[int] = None,
210
+ ) -> List[Dict[str, str]]:
211
+ """Webscout text search. Query params: https://duckduckgo.com/params.
212
+
213
+ Args:
214
+ keywords: keywords for query.
215
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
216
+ safesearch: on, moderate, off. Defaults to "moderate".
217
+ timelimit: d, w, m, y. Defaults to None.
218
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
219
+
220
+ Returns:
221
+ List of dictionaries with search results.
222
+
223
+ Raises:
224
+ WebscoutE: Base exception for webcout_search errors.
225
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
226
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
227
+ """
228
+ assert keywords, "keywords is mandatory"
229
+
230
+ vqd = self._get_vqd(keywords)
231
+
232
+ payload = {
233
+ "q": keywords,
234
+ "kl": region,
235
+ "l": region,
236
+ "p": "",
237
+ "s": "0",
238
+ "df": "",
239
+ "vqd": vqd,
240
+ "ex": "",
241
+ }
242
+ safesearch = safesearch.lower()
243
+ if safesearch == "moderate":
244
+ payload["ex"] = "-1"
245
+ elif safesearch == "off":
246
+ payload["ex"] = "-2"
247
+ elif safesearch == "on": # strict
248
+ payload["p"] = "1"
249
+ if timelimit:
250
+ payload["df"] = timelimit
251
+
252
+ cache = set()
253
+ results: List[Dict[str, str]] = []
254
+
255
+ def _text_api_page(s: int) -> List[Dict[str, str]]:
256
+ payload["s"] = f"{s}"
257
+ resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
258
+ page_data = _text_extract_json(resp_content, keywords)
259
+ page_results = []
260
+ for row in page_data:
261
+ href = row.get("u", None)
262
+ if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
263
+ cache.add(href)
264
+ body = _normalize(row["a"])
265
+ if body:
266
+ result = {
267
+ "title": _normalize(row["t"]),
268
+ "href": _normalize_url(href),
269
+ "body": body,
270
+ }
271
+ page_results.append(result)
272
+ return page_results
273
+
274
+ slist = [0]
275
+ if max_results:
276
+ max_results = min(max_results, 500)
277
+ slist.extend(range(23, max_results, 50))
278
+ try:
279
+ for r in self._executor.map(_text_api_page, slist):
280
+ results.extend(r)
281
+ except Exception as e:
282
+ raise e
283
+
284
+ return list(islice(results, max_results))
285
+
286
+ def _text_html(
287
+ self,
288
+ keywords: str,
289
+ region: str = "wt-wt",
290
+ safesearch: str = "moderate",
291
+ timelimit: Optional[str] = None,
292
+ max_results: Optional[int] = None,
293
+ ) -> List[Dict[str, str]]:
294
+ """Webscout text search. Query params: https://duckduckgo.com/params.
295
+
296
+ Args:
297
+ keywords: keywords for query.
298
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
299
+ safesearch: on, moderate, off. Defaults to "moderate".
300
+ timelimit: d, w, m, y. Defaults to None.
301
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
302
+
303
+ Returns:
304
+ List of dictionaries with search results.
305
+
306
+ Raises:
307
+ WebscoutE: Base exception for webcout_search errors.
308
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
309
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
310
+ """
311
+ assert keywords, "keywords is mandatory"
312
+
313
+ safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
314
+ payload = {
315
+ "q": keywords,
316
+ "kl": region,
317
+ "p": safesearch_base[safesearch.lower()],
318
+ "o": "json",
319
+ "api": "d.js",
320
+ }
321
+ if timelimit:
322
+ payload["df"] = timelimit
323
+ if max_results and max_results > 20:
324
+ vqd = self._get_vqd(keywords)
325
+ payload["vqd"] = vqd
326
+
327
+ cache = set()
328
+ results: List[Dict[str, str]] = []
329
+
330
+ def _text_html_page(s: int) -> List[Dict[str, str]]:
331
+ payload["s"] = f"{s}"
332
+ resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
333
+ if b"No results." in resp_content:
334
+ return []
335
+
336
+ page_results = []
337
+ tree = document_fromstring(resp_content, self.parser)
338
+ elements = tree.xpath("//div[h2]")
339
+ if not isinstance(elements, List):
340
+ return []
341
+ for e in elements:
342
+ if isinstance(e, _Element):
343
+ hrefxpath = e.xpath("./a/@href")
344
+ href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
345
+ if (
346
+ href
347
+ and href not in cache
348
+ and not href.startswith(
349
+ ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
350
+ )
351
+ ):
352
+ cache.add(href)
353
+ titlexpath = e.xpath("./h2/a/text()")
354
+ title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
355
+ bodyxpath = e.xpath("./a//text()")
356
+ body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
357
+ result = {
358
+ "title": _normalize(title),
359
+ "href": _normalize_url(href),
360
+ "body": _normalize(body),
361
+ }
362
+ page_results.append(result)
363
+ return page_results
364
+
365
+ slist = [0]
366
+ if max_results:
367
+ max_results = min(max_results, 500)
368
+ slist.extend(range(23, max_results, 50))
369
+ try:
370
+ for r in self._executor.map(_text_html_page, slist):
371
+ results.extend(r)
372
+ except Exception as e:
373
+ raise e
374
+
375
+ return list(islice(results, max_results))
376
+
377
+ def _text_lite(
378
+ self,
379
+ keywords: str,
380
+ region: str = "wt-wt",
381
+ timelimit: Optional[str] = None,
382
+ max_results: Optional[int] = None,
383
+ ) -> List[Dict[str, str]]:
384
+ """Webscout text search. Query params: https://duckduckgo.com/params.
385
+
386
+ Args:
387
+ keywords: keywords for query.
388
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
389
+ timelimit: d, w, m, y. Defaults to None.
390
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
391
+
392
+ Returns:
393
+ List of dictionaries with search results.
394
+
395
+ Raises:
396
+ WebscoutE: Base exception for webcout_search errors.
397
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
398
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
399
+ """
400
+ assert keywords, "keywords is mandatory"
401
+
402
+ payload = {
403
+ "q": keywords,
404
+ "o": "json",
405
+ "api": "d.js",
406
+ "kl": region,
407
+ }
408
+ if timelimit:
409
+ payload["df"] = timelimit
410
+
411
+ cache = set()
412
+ results: List[Dict[str, str]] = []
413
+
414
+ def _text_lite_page(s: int) -> List[Dict[str, str]]:
415
+ payload["s"] = f"{s}"
416
+ resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
417
+ if b"No more results." in resp_content:
418
+ return []
419
+
420
+ page_results = []
421
+ tree = document_fromstring(resp_content, self.parser)
422
+ elements = tree.xpath("//table[last()]//tr")
423
+ if not isinstance(elements, List):
424
+ return []
425
+
426
+ data = zip(cycle(range(1, 5)), elements)
427
+ for i, e in data:
428
+ if isinstance(e, _Element):
429
+ if i == 1:
430
+ hrefxpath = e.xpath(".//a//@href")
431
+ href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
432
+ if (
433
+ href is None
434
+ or href in cache
435
+ or href.startswith(
436
+ ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
437
+ )
438
+ ):
439
+ [next(data, None) for _ in range(3)] # skip block(i=1,2,3,4)
440
+ else:
441
+ cache.add(href)
442
+ titlexpath = e.xpath(".//a//text()")
443
+ title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
444
+ elif i == 2:
445
+ bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
446
+ body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
447
+ if href:
448
+ result = {
449
+ "title": _normalize(title),
450
+ "href": _normalize_url(href),
451
+ "body": _normalize(body),
452
+ }
453
+ page_results.append(result)
454
+ return page_results
455
+
456
+ slist = [0]
457
+ if max_results:
458
+ max_results = min(max_results, 500)
459
+ slist.extend(range(23, max_results, 50))
460
+ try:
461
+ for r in self._executor.map(_text_lite_page, slist):
462
+ results.extend(r)
463
+ except Exception as e:
464
+ raise e
465
+
466
+ return list(islice(results, max_results))
467
+
468
+ def images(
469
+ self,
470
+ keywords: str,
471
+ region: str = "wt-wt",
472
+ safesearch: str = "moderate",
473
+ timelimit: Optional[str] = None,
474
+ size: Optional[str] = None,
475
+ color: Optional[str] = None,
476
+ type_image: Optional[str] = None,
477
+ layout: Optional[str] = None,
478
+ license_image: Optional[str] = None,
479
+ max_results: Optional[int] = None,
480
+ ) -> List[Dict[str, str]]:
481
+ """Webscout images search. Query params: https://duckduckgo.com/params.
482
+
483
+ Args:
484
+ keywords: keywords for query.
485
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
486
+ safesearch: on, moderate, off. Defaults to "moderate".
487
+ timelimit: Day, Week, Month, Year. Defaults to None.
488
+ size: Small, Medium, Large, Wallpaper. Defaults to None.
489
+ color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
490
+ Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
491
+ type_image: photo, clipart, gif, transparent, line.
492
+ Defaults to None.
493
+ layout: Square, Tall, Wide. Defaults to None.
494
+ license_image: any (All Creative Commons), Public (PublicDomain),
495
+ Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
496
+ Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
497
+ Use Commercially). Defaults to None.
498
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
499
+
500
+ Returns:
501
+ List of dictionaries with images search results.
502
+
503
+ Raises:
504
+ WebscoutE: Base exception for webcout_search errors.
505
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
506
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
507
+ """
508
+ assert keywords, "keywords is mandatory"
509
+
510
+ vqd = self._get_vqd(keywords)
511
+
512
+ safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
513
+ timelimit = f"time:{timelimit}" if timelimit else ""
514
+ size = f"size:{size}" if size else ""
515
+ color = f"color:{color}" if color else ""
516
+ type_image = f"type:{type_image}" if type_image else ""
517
+ layout = f"layout:{layout}" if layout else ""
518
+ license_image = f"license:{license_image}" if license_image else ""
519
+ payload = {
520
+ "l": region,
521
+ "o": "json",
522
+ "q": keywords,
523
+ "vqd": vqd,
524
+ "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
525
+ "p": safesearch_base[safesearch.lower()],
526
+ }
527
+
528
+ cache = set()
529
+ results: List[Dict[str, str]] = []
530
+
531
+ def _images_page(s: int) -> List[Dict[str, str]]:
532
+ payload["s"] = f"{s}"
533
+ resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
534
+ resp_json = json_loads(resp_content)
535
+
536
+ page_data = resp_json.get("results", [])
537
+ page_results = []
538
+ for row in page_data:
539
+ image_url = row.get("image")
540
+ if image_url and image_url not in cache:
541
+ cache.add(image_url)
542
+ result = {
543
+ "title": row["title"],
544
+ "image": _normalize_url(image_url),
545
+ "thumbnail": _normalize_url(row["thumbnail"]),
546
+ "url": _normalize_url(row["url"]),
547
+ "height": row["height"],
548
+ "width": row["width"],
549
+ "source": row["source"],
550
+ }
551
+ page_results.append(result)
552
+ return page_results
553
+
554
+ slist = [0]
555
+ if max_results:
556
+ max_results = min(max_results, 500)
557
+ slist.extend(range(100, max_results, 100))
558
+ try:
559
+ for r in self._executor.map(_images_page, slist):
560
+ results.extend(r)
561
+ except Exception as e:
562
+ raise e
563
+
564
+ return list(islice(results, max_results))
565
+
566
+ def videos(
567
+ self,
568
+ keywords: str,
569
+ region: str = "wt-wt",
570
+ safesearch: str = "moderate",
571
+ timelimit: Optional[str] = None,
572
+ resolution: Optional[str] = None,
573
+ duration: Optional[str] = None,
574
+ license_videos: Optional[str] = None,
575
+ max_results: Optional[int] = None,
576
+ ) -> List[Dict[str, str]]:
577
+ """Webscout videos search. Query params: https://duckduckgo.com/params.
578
+
579
+ Args:
580
+ keywords: keywords for query.
581
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
582
+ safesearch: on, moderate, off. Defaults to "moderate".
583
+ timelimit: d, w, m. Defaults to None.
584
+ resolution: high, standart. Defaults to None.
585
+ duration: short, medium, long. Defaults to None.
586
+ license_videos: creativeCommon, youtube. Defaults to None.
587
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
588
+
589
+ Returns:
590
+ List of dictionaries with videos search results.
591
+
592
+ Raises:
593
+ WebscoutE: Base exception for webcout_search errors.
594
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
595
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
596
+ """
597
+ assert keywords, "keywords is mandatory"
598
+
599
+ vqd = self._get_vqd(keywords)
600
+
601
+ safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
602
+ timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
603
+ resolution = f"videoDefinition:{resolution}" if resolution else ""
604
+ duration = f"videoDuration:{duration}" if duration else ""
605
+ license_videos = f"videoLicense:{license_videos}" if license_videos else ""
606
+ payload = {
607
+ "l": region,
608
+ "o": "json",
609
+ "q": keywords,
610
+ "vqd": vqd,
611
+ "f": f"{timelimit},{resolution},{duration},{license_videos}",
612
+ "p": safesearch_base[safesearch.lower()],
613
+ }
614
+
615
+ cache = set()
616
+ results: List[Dict[str, str]] = []
617
+
618
+ def _videos_page(s: int) -> List[Dict[str, str]]:
619
+ payload["s"] = f"{s}"
620
+ resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
621
+ resp_json = json_loads(resp_content)
622
+
623
+ page_data = resp_json.get("results", [])
624
+ page_results = []
625
+ for row in page_data:
626
+ if row["content"] not in cache:
627
+ cache.add(row["content"])
628
+ page_results.append(row)
629
+ return page_results
630
+
631
+ slist = [0]
632
+ if max_results:
633
+ max_results = min(max_results, 400)
634
+ slist.extend(range(59, max_results, 59))
635
+ try:
636
+ for r in self._executor.map(_videos_page, slist):
637
+ results.extend(r)
638
+ except Exception as e:
639
+ raise e
640
+
641
+ return list(islice(results, max_results))
642
+
643
+ def news(
644
+ self,
645
+ keywords: str,
646
+ region: str = "wt-wt",
647
+ safesearch: str = "moderate",
648
+ timelimit: Optional[str] = None,
649
+ max_results: Optional[int] = None,
650
+ ) -> List[Dict[str, str]]:
651
+ """Webscout news search. Query params: https://duckduckgo.com/params.
652
+
653
+ Args:
654
+ keywords: keywords for query.
655
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
656
+ safesearch: on, moderate, off. Defaults to "moderate".
657
+ timelimit: d, w, m. Defaults to None.
658
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
659
+
660
+ Returns:
661
+ List of dictionaries with news search results.
662
+
663
+ Raises:
664
+ WebscoutE: Base exception for webcout_search errors.
665
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
666
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
667
+ """
668
+ assert keywords, "keywords is mandatory"
669
+
670
+ vqd = self._get_vqd(keywords)
671
+
672
+ safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
673
+ payload = {
674
+ "l": region,
675
+ "o": "json",
676
+ "noamp": "1",
677
+ "q": keywords,
678
+ "vqd": vqd,
679
+ "p": safesearch_base[safesearch.lower()],
680
+ }
681
+ if timelimit:
682
+ payload["df"] = timelimit
683
+
684
+ cache = set()
685
+ results: List[Dict[str, str]] = []
686
+
687
+ def _news_page(s: int) -> List[Dict[str, str]]:
688
+ payload["s"] = f"{s}"
689
+ resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
690
+ resp_json = json_loads(resp_content)
691
+ page_data = resp_json.get("results", [])
692
+ page_results = []
693
+ for row in page_data:
694
+ if row["url"] not in cache:
695
+ cache.add(row["url"])
696
+ image_url = row.get("image", None)
697
+ result = {
698
+ "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
699
+ "title": row["title"],
700
+ "body": _normalize(row["excerpt"]),
701
+ "url": _normalize_url(row["url"]),
702
+ "image": _normalize_url(image_url),
703
+ "source": row["source"],
704
+ }
705
+ page_results.append(result)
706
+ return page_results
707
+
708
+ slist = [0]
709
+ if max_results:
710
+ max_results = min(max_results, 200)
711
+ slist.extend(range(29, max_results, 29))
712
+ try:
713
+ for r in self._executor.map(_news_page, slist):
714
+ results.extend(r)
715
+ except Exception as e:
716
+ raise e
717
+
718
+ return list(islice(results, max_results))
719
+
720
+ def answers(self, keywords: str) -> List[Dict[str, str]]:
721
+ """Webscout instant answers. Query params: https://duckduckgo.com/params.
722
+
723
+ Args:
724
+ keywords: keywords for query,
725
+
726
+ Returns:
727
+ List of dictionaries with instant answers results.
728
+
729
+ Raises:
730
+ WebscoutE: Base exception for webcout_search errors.
731
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
732
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
733
+ """
734
+ assert keywords, "keywords is mandatory"
735
+
736
+ payload = {
737
+ "q": f"what is {keywords}",
738
+ "format": "json",
739
+ }
740
+ resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
741
+ page_data = json_loads(resp_content)
742
+
743
+ results = []
744
+ answer = page_data.get("AbstractText")
745
+ url = page_data.get("AbstractURL")
746
+ if answer:
747
+ results.append(
748
+ {
749
+ "icon": None,
750
+ "text": answer,
751
+ "topic": None,
752
+ "url": url,
753
+ }
754
+ )
755
+
756
+ # related
757
+ payload = {
758
+ "q": f"{keywords}",
759
+ "format": "json",
760
+ }
761
+ resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
762
+ resp_json = json_loads(resp_content)
763
+ page_data = resp_json.get("RelatedTopics", [])
764
+
765
+ for row in page_data:
766
+ topic = row.get("Name")
767
+ if not topic:
768
+ icon = row["Icon"].get("URL")
769
+ results.append(
770
+ {
771
+ "icon": f"https://duckduckgo.com{icon}" if icon else "",
772
+ "text": row["Text"],
773
+ "topic": None,
774
+ "url": row["FirstURL"],
775
+ }
776
+ )
777
+ else:
778
+ for subrow in row["Topics"]:
779
+ icon = subrow["Icon"].get("URL")
780
+ results.append(
781
+ {
782
+ "icon": f"https://duckduckgo.com{icon}" if icon else "",
783
+ "text": subrow["Text"],
784
+ "topic": topic,
785
+ "url": subrow["FirstURL"],
786
+ }
787
+ )
788
+
789
+ return results
790
+
791
+ def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
792
+ """Webscout suggestions. Query params: https://duckduckgo.com/params.
793
+
794
+ Args:
795
+ keywords: keywords for query.
796
+ region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
797
+
798
+ Returns:
799
+ List of dictionaries with suggestions results.
800
+
801
+ Raises:
802
+ WebscoutE: Base exception for webcout_search errors.
803
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
804
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
805
+ """
806
+ assert keywords, "keywords is mandatory"
807
+
808
+ payload = {
809
+ "q": keywords,
810
+ "kl": region,
811
+ }
812
+ resp_content = self._get_url("GET", "https://duckduckgo.com/ac/", params=payload)
813
+ page_data = json_loads(resp_content)
814
+ return [r for r in page_data]
815
+
816
+ def maps(
817
+ self,
818
+ keywords: str,
819
+ place: Optional[str] = None,
820
+ street: Optional[str] = None,
821
+ city: Optional[str] = None,
822
+ county: Optional[str] = None,
823
+ state: Optional[str] = None,
824
+ country: Optional[str] = None,
825
+ postalcode: Optional[str] = None,
826
+ latitude: Optional[str] = None,
827
+ longitude: Optional[str] = None,
828
+ radius: int = 0,
829
+ max_results: Optional[int] = None,
830
+ ) -> List[Dict[str, str]]:
831
+ """Webscout maps search. Query params: https://duckduckgo.com/params.
832
+
833
+ Args:
834
+ keywords: keywords for query
835
+ place: if set, the other parameters are not used. Defaults to None.
836
+ street: house number/street. Defaults to None.
837
+ city: city of search. Defaults to None.
838
+ county: county of search. Defaults to None.
839
+ state: state of search. Defaults to None.
840
+ country: country of search. Defaults to None.
841
+ postalcode: postalcode of search. Defaults to None.
842
+ latitude: geographic coordinate (north-south position). Defaults to None.
843
+ longitude: geographic coordinate (east-west position); if latitude and
844
+ longitude are set, the other parameters are not used. Defaults to None.
845
+ radius: expand the search square by the distance in kilometers. Defaults to 0.
846
+ max_results: max number of results. If None, returns results only from the first response. Defaults to None.
847
+
848
+ Returns:
849
+ List of dictionaries with maps search results, or None if there was an error.
850
+
851
+ Raises:
852
+ WebscoutE: Base exception for webcout_search errors.
853
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
854
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
855
+ """
856
+ assert keywords, "keywords is mandatory"
857
+
858
+ vqd = self._get_vqd(keywords)
859
+
860
+ # if longitude and latitude are specified, skip the request about bbox to the nominatim api
861
+ if latitude and longitude:
862
+ lat_t = Decimal(latitude.replace(",", "."))
863
+ lat_b = Decimal(latitude.replace(",", "."))
864
+ lon_l = Decimal(longitude.replace(",", "."))
865
+ lon_r = Decimal(longitude.replace(",", "."))
866
+ if radius == 0:
867
+ radius = 1
868
+ # otherwise request about bbox to nominatim api
869
+ else:
870
+ if place:
871
+ params = {
872
+ "q": place,
873
+ "polygon_geojson": "0",
874
+ "format": "jsonv2",
875
+ }
876
+ else:
877
+ params = {
878
+ "polygon_geojson": "0",
879
+ "format": "jsonv2",
880
+ }
881
+ if street:
882
+ params["street"] = street
883
+ if city:
884
+ params["city"] = city
885
+ if county:
886
+ params["county"] = county
887
+ if state:
888
+ params["state"] = state
889
+ if country:
890
+ params["country"] = country
891
+ if postalcode:
892
+ params["postalcode"] = postalcode
893
+ # request nominatim api to get coordinates box
894
+ resp_content = self._get_url(
895
+ "GET",
896
+ "https://nominatim.openstreetmap.org/search.php",
897
+ params=params,
898
+ )
899
+ if resp_content == b"[]":
900
+ raise WebscoutE("maps() Coordinates are not found, check function parameters.")
901
+ resp_json = json_loads(resp_content)
902
+ coordinates = resp_json[0]["boundingbox"]
903
+ lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
904
+ lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
905
+
906
+ # if a radius is specified, expand the search square
907
+ lat_t += Decimal(radius) * Decimal(0.008983)
908
+ lat_b -= Decimal(radius) * Decimal(0.008983)
909
+ lon_l -= Decimal(radius) * Decimal(0.008983)
910
+ lon_r += Decimal(radius) * Decimal(0.008983)
911
+ logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
912
+
913
+ cache = set()
914
+ results: List[Dict[str, str]] = []
915
+
916
+ def _maps_page(
917
+ bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
918
+ ) -> Optional[List[Dict[str, str]]]:
919
+ if max_results and len(results) >= max_results:
920
+ return None
921
+ lat_t, lon_l, lat_b, lon_r = bbox
922
+ params = {
923
+ "q": keywords,
924
+ "vqd": vqd,
925
+ "tg": "maps_places",
926
+ "rt": "D",
927
+ "mkexp": "b",
928
+ "wiki_info": "1",
929
+ "is_requery": "1",
930
+ "bbox_tl": f"{lat_t},{lon_l}",
931
+ "bbox_br": f"{lat_b},{lon_r}",
932
+ "strict_bbox": "1",
933
+ }
934
+ resp_content = self._get_url("GET", "https://duckduckgo.com/local.js", params=params)
935
+ resp_json = json_loads(resp_content)
936
+ page_data = resp_json.get("results", [])
937
+
938
+ page_results = []
939
+ for res in page_data:
940
+ r_name = f'{res["name"]} {res["address"]}'
941
+ if r_name in cache:
942
+ continue
943
+ else:
944
+ cache.add(r_name)
945
+ result = {
946
+ "title": res["name"],
947
+ "address": res["address"],
948
+ "country_code": res["country_code"],
949
+ "url": _normalize_url(res["website"]),
950
+ "phone": res["phone"] or "",
951
+ "latitude": res["coordinates"]["latitude"],
952
+ "longitude": res["coordinates"]["longitude"],
953
+ "source": _normalize_url(res["url"]),
954
+ "image": x.get("image", "") if (x := res["embed"]) else "",
955
+ "desc": x.get("description", "") if (x := res["embed"]) else "",
956
+ "hours": res["hours"] or "",
957
+ "category": res["ddg_category"] or "",
958
+ "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
959
+ "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
960
+ "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
961
+ }
962
+ page_results.append(result)
963
+ return page_results
964
+
965
+ # search squares (bboxes)
966
+ start_bbox = (lat_t, lon_l, lat_b, lon_r)
967
+ work_bboxes = [start_bbox]
968
+ while work_bboxes:
969
+ queue_bboxes = [] # for next iteration, at the end of the iteration work_bboxes = queue_bboxes
970
+ tasks = []
971
+ for bbox in work_bboxes:
972
+ tasks.append(bbox)
973
+ # if distance between coordinates > 1, divide the square into 4 parts and save them in queue_bboxes
974
+ if _calculate_distance(lat_t, lon_l, lat_b, lon_r) > 1:
975
+ lat_t, lon_l, lat_b, lon_r = bbox
976
+ lat_middle = (lat_t + lat_b) / 2
977
+ lon_middle = (lon_l + lon_r) / 2
978
+ bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
979
+ bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
980
+ bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
981
+ bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
982
+ queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])
983
+
984
+ # gather tasks using asyncio.wait_for and timeout
985
+ work_bboxes_results = []
986
+ try:
987
+ for r in self._executor.map(_maps_page, tasks):
988
+ if r:
989
+ work_bboxes_results.extend(r)
990
+ except Exception as e:
991
+ raise e
992
+
993
+ for x in work_bboxes_results:
994
+ if isinstance(x, list):
995
+ results.extend(x)
996
+ elif isinstance(x, dict):
997
+ results.append(x)
998
+
999
+ work_bboxes = queue_bboxes
1000
+ if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
1001
+ break
1002
+
1003
+ return list(islice(results, max_results))
1004
+
1005
+ def translate(
1006
+ self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
1007
+ ) -> List[Dict[str, str]]:
1008
+ """Webscout translate.
1009
+
1010
+ Args:
1011
+ keywords: string or list of strings to translate.
1012
+ from_: translate from (defaults automatically). Defaults to None.
1013
+ to: what language to translate. Defaults to "en".
1014
+
1015
+ Returns:
1016
+ List od dictionaries with translated keywords.
1017
+
1018
+ Raises:
1019
+ WebscoutE: Base exception for webcout_search errors.
1020
+ RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
1021
+ TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
1022
+ """
1023
+ assert keywords, "keywords is mandatory"
1024
+
1025
+ vqd = self._get_vqd("translate")
1026
+
1027
+ payload = {
1028
+ "vqd": vqd,
1029
+ "query": "translate",
1030
+ "to": to,
1031
+ }
1032
+ if from_:
1033
+ payload["from"] = from_
1034
+
1035
+ def _translate_keyword(keyword: str) -> Dict[str, str]:
1036
+ resp_content = self._get_url(
1037
+ "POST",
1038
+ "https://duckduckgo.com/translation.js",
1039
+ params=payload,
1040
+ content=keyword.encode(),
1041
+ )
1042
+ page_data: Dict[str, str] = json_loads(resp_content)
1043
+ page_data["original"] = keyword
1044
+ return page_data
1045
+
1046
+ if isinstance(keywords, str):
1047
+ keywords = [keywords]
1048
+
1049
+ results = []
1050
+ try:
1051
+ for r in self._executor.map(_translate_keyword, keywords):
1052
+ results.append(r)
1053
+ except Exception as e:
1054
+ raise e
1055
+
1056
  return results