Add1E committed
Commit 5e7e95f · verified · 1 Parent(s): b86b57c

Delete pytrends

pytrends/__init__.py DELETED
File without changes
pytrends/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (186 Bytes)
 
pytrends/__pycache__/dailydata.cpython-310.pyc DELETED
Binary file (4.73 kB)
 
pytrends/__pycache__/exceptions.cpython-310.pyc DELETED
Binary file (1.11 kB)
 
pytrends/__pycache__/request.cpython-310.pyc DELETED
Binary file (15.4 kB)
 
pytrends/dailydata.py DELETED
@@ -1,127 +0,0 @@
- from datetime import date, timedelta
- from functools import partial
- from time import sleep
- from calendar import monthrange
-
- import pandas as pd
-
- from pytrends.exceptions import ResponseError
- from pytrends.request import TrendReq
-
-
- def get_last_date_of_month(year: int, month: int) -> date:
-     """Given a year and a month, returns an instance of the date class
-     containing the last day of the corresponding month.
-
-     Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
-     """
-     return date(year, month, monthrange(year, month)[1])
-
-
- def convert_dates_to_timeframe(start: date, stop: date) -> str:
-     """Given two dates, returns a stringified version of the interval between
-     the two dates which is used to retrieve data for a specific time frame
-     from Google Trends.
-     """
-     return f"{start.strftime('%Y-%m-%d')} {stop.strftime('%Y-%m-%d')}"
-
-
- def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
-     """Attempts to fetch data and retries in case of a ResponseError."""
-     attempts, fetched = 0, False
-     while not fetched:
-         try:
-             build_payload(timeframe=timeframe)
-         except ResponseError as err:
-             print(err)
-             print(f'Trying again in {60 + 5 * attempts} seconds.')
-             sleep(60 + 5 * attempts)
-             attempts += 1
-             if attempts > 3:
-                 print('Failed after 3 attempts, abort fetching.')
-                 break
-         else:
-             fetched = True
-     return pytrends.interest_over_time()
-
-
- def get_daily_data(word: str,
-                    start_year: int,
-                    start_mon: int,
-                    stop_year: int,
-                    stop_mon: int,
-                    geo: str = 'US',
-                    verbose: bool = True,
-                    wait_time: float = 5.0) -> pd.DataFrame:
-     """Given a word, fetches daily search volume data from Google Trends and
-     returns results in a pandas DataFrame.
-
-     Details: Due to the way Google Trends scales and returns data, special
-     care needs to be taken to make the daily data comparable over different
-     months. To do that, we download daily data on a month by month basis,
-     and also monthly data. The monthly data is downloaded in one go, so that
-     the monthly values are comparable amongst themselves and can be used to
-     scale the daily data. The daily data is scaled by multiplying the daily
-     value by the monthly search volume divided by 100.
-     For a more detailed explanation see http://bit.ly/trendsscaling
-
-     Args:
-         word (str): Word to fetch daily data for.
-         start_year (int): the start year
-         start_mon (int): start at the 1st day of this month
-         stop_year (int): the end year
-         stop_mon (int): end at the last day of this month
-         geo (str): geolocation
-         verbose (bool): If True, then prints the word and current time frame
-             we are fetching the data for.
-
-     Returns:
-         complete (pd.DataFrame): Contains 4 columns.
-             The column named after the word argument contains the daily search
-             volume already scaled and comparable through time.
-             The column f'{word}_unscaled' is the original daily data fetched
-             month by month, and it is not comparable across different months
-             (but is comparable within a month).
-             The column f'{word}_monthly' contains the original monthly data
-             fetched at once. The values in this column have been forward-filled
-             so that there are no NaN present.
-             The column 'scale' contains the scale used to obtain the scaled
-             daily data.
-     """
-
-     # Set up start and stop dates
-     start_date = date(start_year, start_mon, 1)
-     stop_date = get_last_date_of_month(stop_year, stop_mon)
-
-     # Start pytrends for US region
-     pytrends = TrendReq(hl='en-US', tz=360)
-     # Initialize build_payload with the word we need data for
-     build_payload = partial(pytrends.build_payload,
-                             kw_list=[word], cat=0, geo=geo, gprop='')
-
-     # Obtain monthly data for all months in years [start_year, stop_year]
-     monthly = _fetch_data(pytrends, build_payload,
-                           convert_dates_to_timeframe(start_date, stop_date))
-
-     # Get daily data, month by month
-     results = {}
-     # if a timeout or too-many-requests error occurs we need to adjust wait time
-     current = start_date
-     while current < stop_date:
-         last_date_of_month = get_last_date_of_month(current.year, current.month)
-         timeframe = convert_dates_to_timeframe(current, last_date_of_month)
-         if verbose:
-             print(f'{word}:{timeframe}')
-         results[current] = _fetch_data(pytrends, build_payload, timeframe)
-         current = last_date_of_month + timedelta(days=1)
-         sleep(wait_time)  # don't go too fast or Google will send 429s
-
-     daily = pd.concat(results.values()).drop(columns=['isPartial'])
-     complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')
-
-     # Scale daily data by monthly weights so the data is comparable
-     complete[f'{word}_monthly'].ffill(inplace=True)  # fill NaN values
-     complete['scale'] = complete[f'{word}_monthly'] / 100
-     complete[word] = complete[f'{word}_unscaled'] * complete.scale
-
-     return complete
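
For reference, the deleted module was typically driven as in the sketch below (minimal, and the keyword, date range, and geo are illustrative only, not taken from this commit). The daily series is rescaled exactly as the docstring describes: daily value multiplied by (monthly value / 100).

from pytrends.dailydata import get_daily_data

# Hypothetical example: daily, cross-month-comparable volume for 'python' in 2019.
df = get_daily_data(word='python',
                    start_year=2019, start_mon=1,
                    stop_year=2019, stop_mon=12,
                    geo='US', verbose=True, wait_time=5.0)
# 'python_unscaled' is the raw month-by-month data; 'python' is the rescaled series.
print(df[['python_unscaled', 'python_monthly', 'scale', 'python']].head())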
 
pytrends/exceptions.py DELETED
@@ -1,17 +0,0 @@
- class ResponseError(Exception):
-     """ Something was wrong with the response from Google. """
-
-     def __init__(self, message, response):
-         super().__init__(message)
-         # pass response so it can be handled upstream
-         self.response = response
-
-     @classmethod
-     def from_response(cls, response):
-         message = f'The request failed: Google returned a response with code {response.status_code}'
-         return cls(message, response)
-
-
- class TooManyRequestsError(ResponseError):
-     """ Exception raised when the backend returns a 429 error code. """
-     pass
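
Callers generally caught these classes to implement backoff; a minimal sketch (the keyword and the sleep schedule are illustrative):

from time import sleep

from pytrends.exceptions import ResponseError, TooManyRequestsError
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=360)
for attempt in range(4):
    try:
        pytrends.build_payload(kw_list=['python'], timeframe='today 3-m')
        break
    except TooManyRequestsError:
        sleep(60 * (attempt + 1))  # 429 from Google: back off harder on each retry
    except ResponseError as err:
        # the response object is attached so it can be inspected upstream
        print(err.response.status_code)
        sleep(5)

Because TooManyRequestsError subclasses ResponseError, it has to be caught first for the 429-specific branch to run.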
 
pytrends/request.py DELETED
@@ -1,594 +0,0 @@
- import json
-
- import pandas as pd
- import requests
-
- from requests.adapters import HTTPAdapter
- from requests.packages.urllib3.util.retry import Retry
- from requests import status_codes
-
- from pytrends import exceptions
-
- from urllib.parse import quote
-
-
- BASE_TRENDS_URL = 'https://trends.google.com/trends'
-
-
- class TrendReq(object):
-     """
-     Google Trends API
-     """
-     GET_METHOD = 'get'
-     POST_METHOD = 'post'
-     GENERAL_URL = f'{BASE_TRENDS_URL}/api/explore'
-     INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multiline'
-     MULTIRANGE_INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multirange'
-     INTEREST_BY_REGION_URL = f'{BASE_TRENDS_URL}/api/widgetdata/comparedgeo'
-     RELATED_QUERIES_URL = f'{BASE_TRENDS_URL}/api/widgetdata/relatedsearches'
-     TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/hottrends/visualize/internal/data'
-     TOP_CHARTS_URL = f'{BASE_TRENDS_URL}/api/topcharts'
-     SUGGESTIONS_URL = f'{BASE_TRENDS_URL}/api/autocomplete/'
-     CATEGORIES_URL = f'{BASE_TRENDS_URL}/api/explore/pickers/category'
-     TODAY_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/dailytrends'
-     REALTIME_TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/realtimetrends'
-     ERROR_CODES = (500, 502, 504, 429)
-
-     def __init__(self, hl='en-US', tz=360, geo='', timeout=(2, 5), proxies='',
-                  retries=0, backoff_factor=0, requests_args=None):
-         """
-         Initialize default values for params
-         """
-         # google rate limit
-         self.google_rl = 'You have reached your quota limit. Please try again later.'
-         self.results = None
-         # set user defined options used globally
-         self.tz = tz
-         self.hl = hl
-         self.geo = geo
-         self.kw_list = list()
-         self.timeout = timeout
-         self.proxies = proxies  # add a proxy option
-         self.retries = retries
-         self.backoff_factor = backoff_factor
-         self.proxy_index = 0
-         self.requests_args = requests_args or {}
-         self.cookies = self.GetGoogleCookie()
-         # initialize widget payloads
-         self.token_payload = dict()
-         self.interest_over_time_widget = dict()
-         self.interest_by_region_widget = dict()
-         self.related_topics_widget_list = list()
-         self.related_queries_widget_list = list()
-
-         self.headers = {'accept-language': self.hl}
-         self.headers.update(self.requests_args.pop('headers', {}))
-
-     def GetGoogleCookie(self):
-         """
-         Gets google cookie (used for each and every proxy; once on init otherwise)
-         Removes proxy from the list on proxy error
-         """
-         while True:
-             if "proxies" in self.requests_args:
-                 try:
-                     return dict(filter(lambda i: i[0] == 'NID', requests.get(
-                         f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
-                         timeout=self.timeout,
-                         **self.requests_args
-                     ).cookies.items()))
-                 except:
-                     continue
-             else:
-                 if len(self.proxies) > 0:
-                     proxy = {'https': self.proxies[self.proxy_index]}
-                 else:
-                     proxy = ''
-                 try:
-                     return dict(filter(lambda i: i[0] == 'NID', requests.get(
-                         f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
-                         timeout=self.timeout,
-                         proxies=proxy,
-                         **self.requests_args
-                     ).cookies.items()))
-                 except requests.exceptions.ProxyError:
-                     print('Proxy error. Changing IP')
-                     if len(self.proxies) > 1:
-                         self.proxies.remove(self.proxies[self.proxy_index])
-                     else:
-                         print('No more proxies available. Bye!')
-                         raise
-                     continue
-
-     def GetNewProxy(self):
-         """
-         Increment proxy INDEX; zero on overflow
-         """
-         if self.proxy_index < (len(self.proxies) - 1):
-             self.proxy_index += 1
-         else:
-             self.proxy_index = 0
-
-     def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
-         """Send a request to Google and return the JSON response as a Python object
-         :param url: the url to which the request will be sent
-         :param method: the HTTP method ('get' or 'post')
-         :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
-             before this is passed to the JSON parser
-         :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
-         :return:
-         """
-         s = requests.session()
-         # Retry mechanism. Activated when one of these settings is > 0 (best used with a proxy)
-         if self.retries > 0 or self.backoff_factor > 0:
-             retry = Retry(total=self.retries, read=self.retries,
-                           connect=self.retries,
-                           backoff_factor=self.backoff_factor,
-                           status_forcelist=TrendReq.ERROR_CODES,
-                           method_whitelist=frozenset(['GET', 'POST']))
-             s.mount('https://', HTTPAdapter(max_retries=retry))
-
-         s.headers.update(self.headers)
-         if len(self.proxies) > 0:
-             self.cookies = self.GetGoogleCookie()
-             s.proxies.update({'https': self.proxies[self.proxy_index]})
-         if method == TrendReq.POST_METHOD:
-             response = s.post(url, timeout=self.timeout,
-                               cookies=self.cookies, **kwargs,
-                               **self.requests_args)  # DO NOT USE retries or backoff_factor here
-         else:
-             response = s.get(url, timeout=self.timeout, cookies=self.cookies,
-                              **kwargs, **self.requests_args)  # DO NOT USE retries or backoff_factor here
-         # check that the response contains json and throw an exception otherwise
-         # Google mostly sends 'application/json' in the Content-Type header,
-         # but occasionally it sends 'application/javascript'
-         # and sometimes even 'text/javascript'
-         if response.status_code == 200 and (
-                 'application/json' in response.headers['Content-Type'] or
-                 'application/javascript' in response.headers['Content-Type'] or
-                 'text/javascript' in response.headers['Content-Type']):
-             # trim initial characters
-             # some responses start with garbage characters, like ")]}',"
-             # these have to be cleaned before being passed to the json parser
-             content = response.text[trim_chars:]
-             # parse json
-             self.GetNewProxy()
-             return json.loads(content)
-         else:
-             if response.status_code == status_codes.codes.too_many_requests:
-                 raise exceptions.TooManyRequestsError.from_response(response)
-             raise exceptions.ResponseError.from_response(response)
-
-     def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='',
-                       gprop=''):
-         """Create the payload for related queries, interest over time and interest by region"""
-         if gprop not in ['', 'images', 'news', 'youtube', 'froogle']:
-             raise ValueError('gprop must be empty (to indicate web), images, news, youtube, or froogle')
-         self.kw_list = kw_list
-         self.geo = geo or self.geo
-         self.token_payload = {
-             'hl': self.hl,
-             'tz': self.tz,
-             'req': {'comparisonItem': [], 'category': cat, 'property': gprop}
-         }
-
-         # Check if timeframe is a list
-         if isinstance(timeframe, list):
-             for index, kw in enumerate(self.kw_list):
-                 keyword_payload = {'keyword': kw, 'time': timeframe[index], 'geo': self.geo}
-                 self.token_payload['req']['comparisonItem'].append(keyword_payload)
-         else:
-             # build out json for each keyword
-             for kw in self.kw_list:
-                 keyword_payload = {'keyword': kw, 'time': timeframe, 'geo': self.geo}
-                 self.token_payload['req']['comparisonItem'].append(keyword_payload)
-
-         # requests will mangle this if it is not a string
-         self.token_payload['req'] = json.dumps(self.token_payload['req'])
-         # get tokens
-         self._tokens()
-         return
-
-     def _tokens(self):
-         """Makes request to Google to get API tokens for interest over time, interest by region and related queries"""
-         # make the request and parse the returned json
-         widget_dicts = self._get_data(
-             url=TrendReq.GENERAL_URL,
-             method=TrendReq.POST_METHOD,
-             params=self.token_payload,
-             trim_chars=4,
-         )['widgets']
-         # order of the json matters...
-         first_region_token = True
-         # clear self.related_queries_widget_list and self.related_topics_widget_list
-         # of old keywords' widgets
-         self.related_queries_widget_list[:] = []
-         self.related_topics_widget_list[:] = []
-         # assign requests
-         for widget in widget_dicts:
-             if widget['id'] == 'TIMESERIES':
-                 self.interest_over_time_widget = widget
-             if widget['id'] == 'GEO_MAP' and first_region_token:
-                 self.interest_by_region_widget = widget
-                 first_region_token = False
-             # response for each term, put into a list
-             if 'RELATED_TOPICS' in widget['id']:
-                 self.related_topics_widget_list.append(widget)
-             if 'RELATED_QUERIES' in widget['id']:
-                 self.related_queries_widget_list.append(widget)
-         return
-
-     def interest_over_time(self):
-         """Request data from Google's Interest Over Time section and return a dataframe"""
-
-         over_time_payload = {
-             # convert to string as requests will mangle
-             'req': json.dumps(self.interest_over_time_widget['request']),
-             'token': self.interest_over_time_widget['token'],
-             'tz': self.tz
-         }
-
-         # make the request and parse the returned json
-         req_json = self._get_data(
-             url=TrendReq.INTEREST_OVER_TIME_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=over_time_payload,
-         )
-
-         df = pd.DataFrame(req_json['default']['timelineData'])
-         if df.empty:
-             return df
-
-         df['date'] = pd.to_datetime(df['time'].astype(dtype='float64'),
-                                     unit='s')
-         df = df.set_index(['date']).sort_index()
-         # split list columns into separate ones, remove brackets and split on comma
-         result_df = df['value'].apply(lambda x: pd.Series(
-             str(x).replace('[', '').replace(']', '').split(',')))
-         # rename each column with its search term, relying on order that google provides...
-         for idx, kw in enumerate(self.kw_list):
-             # there is currently a bug with assigning columns that may be
-             # parsed as a date in pandas: use explicit insert column method
-             result_df.insert(len(result_df.columns), kw,
-                              result_df[idx].astype('int'))
-             del result_df[idx]
-
-         if 'isPartial' in df:
-             # make another dataframe from the isPartial key data
-             # split list columns into separate ones, remove brackets and split on comma
-             df = df.fillna(False)
-             result_df2 = df['isPartial'].apply(lambda x: pd.Series(
-                 str(x).replace('[', '').replace(']', '').split(',')))
-             result_df2.columns = ['isPartial']
-             # Change to a bool type.
-             result_df2.isPartial = result_df2.isPartial == 'True'
-             # concatenate the two dataframes
-             final = pd.concat([result_df, result_df2], axis=1)
-         else:
-             final = result_df
-             final['isPartial'] = False
-
-         return final
-
-     def multirange_interest_over_time(self):
-         """Request data from Google's Interest Over Time section across different time ranges and return a dataframe"""
-
-         over_time_payload = {
-             # convert to string as requests will mangle
-             'req': json.dumps(self.interest_over_time_widget['request']),
-             'token': self.interest_over_time_widget['token'],
-             'tz': self.tz
-         }
-
-         # make the request and parse the returned json
-         req_json = self._get_data(
-             url=TrendReq.MULTIRANGE_INTEREST_OVER_TIME_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=over_time_payload,
-         )
-
-         df = pd.DataFrame(req_json['default']['timelineData'])
-         if df.empty:
-             return df
-
-         result_df = pd.json_normalize(df['columnData'])
-
-         # Split dictionary columns into separate ones
-         for i, column in enumerate(result_df.columns):
-             result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = result_df[i].apply(pd.Series)["formattedTime"]
-             result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = result_df[i].apply(pd.Series)["value"]
-             result_df = result_df.drop([i], axis=1)
-
-         # Add a row with the averages at the top of the dataframe
-         avg_row = {}
-         for i, avg in enumerate(req_json['default']['averages']):
-             avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = "Average"
-             avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = req_json['default']['averages'][i]
-
-         result_df.loc[-1] = avg_row
-         result_df.index = result_df.index + 1
-         result_df = result_df.sort_index()
-
-         return result_df
-
-     def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
-                            inc_geo_code=False):
-         """Request data from Google's Interest by Region section and return a dataframe"""
-
-         # make the request
-         region_payload = dict()
-         if self.geo == '':
-             self.interest_by_region_widget['request']['resolution'] = resolution
-         elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
-             self.interest_by_region_widget['request']['resolution'] = resolution
-
-         self.interest_by_region_widget['request']['includeLowSearchVolumeGeos'] = inc_low_vol
-
-         # convert to string as requests will mangle
-         region_payload['req'] = json.dumps(
-             self.interest_by_region_widget['request'])
-         region_payload['token'] = self.interest_by_region_widget['token']
-         region_payload['tz'] = self.tz
-
-         # parse returned json
-         req_json = self._get_data(
-             url=TrendReq.INTEREST_BY_REGION_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=region_payload,
-         )
-         df = pd.DataFrame(req_json['default']['geoMapData'])
-         if df.empty:
-             return df
-
-         # rename the column with the search keyword
-         geo_column = 'geoCode' if 'geoCode' in df.columns else 'coordinates'
-         columns = ['geoName', geo_column, 'value']
-         df = df[columns].set_index(['geoName']).sort_index()
-         # split list columns into separate ones, remove brackets and split on comma
-         result_df = df['value'].apply(lambda x: pd.Series(
-             str(x).replace('[', '').replace(']', '').split(',')))
-         if inc_geo_code:
-             if geo_column in df.columns:
-                 result_df[geo_column] = df[geo_column]
-             else:
-                 print('Could not find geo_code column; skipping')
-
-         # rename each column with its search term
-         for idx, kw in enumerate(self.kw_list):
-             result_df[kw] = result_df[idx].astype('int')
-             del result_df[idx]
-
-         return result_df
-
-     def related_topics(self):
-         """Request data from Google's Related Topics section and return a dictionary of dataframes
-
-         If no top and/or rising related topics are found, the value for the key "top" and/or "rising" will be None
-         """
-
-         # make the request
-         related_payload = dict()
-         result_dict = dict()
-         for request_json in self.related_topics_widget_list:
-             # ensure we know which keyword we are looking at rather than relying on order
-             try:
-                 kw = request_json['request']['restriction'][
-                     'complexKeywordsRestriction']['keyword'][0]['value']
-             except KeyError:
-                 kw = ''
-             # convert to string as requests will mangle
-             related_payload['req'] = json.dumps(request_json['request'])
-             related_payload['token'] = request_json['token']
-             related_payload['tz'] = self.tz
-
-             # parse the returned json
-             req_json = self._get_data(
-                 url=TrendReq.RELATED_QUERIES_URL,
-                 method=TrendReq.GET_METHOD,
-                 trim_chars=5,
-                 params=related_payload,
-             )
-
-             # top topics
-             try:
-                 top_list = req_json['default']['rankedList'][0]['rankedKeyword']
-                 df_top = pd.json_normalize(top_list, sep='_')
-             except KeyError:
-                 # in case no top topics are found, the lines above will throw a KeyError
-                 df_top = None
-
-             # rising topics
-             try:
-                 rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
-                 df_rising = pd.json_normalize(rising_list, sep='_')
-             except KeyError:
-                 # in case no rising topics are found, the lines above will throw a KeyError
-                 df_rising = None
-
-             result_dict[kw] = {'rising': df_rising, 'top': df_top}
-         return result_dict
-
-     def related_queries(self):
-         """Request data from Google's Related Queries section and return a dictionary of dataframes
-
-         If no top and/or rising related queries are found, the value for the key "top" and/or "rising" will be None
-         """
-
-         # make the request
-         related_payload = dict()
-         result_dict = dict()
-         for request_json in self.related_queries_widget_list:
-             # ensure we know which keyword we are looking at rather than relying on order
-             try:
-                 kw = request_json['request']['restriction'][
-                     'complexKeywordsRestriction']['keyword'][0]['value']
-             except KeyError:
-                 kw = ''
-             # convert to string as requests will mangle
-             related_payload['req'] = json.dumps(request_json['request'])
-             related_payload['token'] = request_json['token']
-             related_payload['tz'] = self.tz
-
-             # parse the returned json
-             req_json = self._get_data(
-                 url=TrendReq.RELATED_QUERIES_URL,
-                 method=TrendReq.GET_METHOD,
-                 trim_chars=5,
-                 params=related_payload,
-             )
-
-             # top queries
-             try:
-                 top_df = pd.DataFrame(
-                     req_json['default']['rankedList'][0]['rankedKeyword'])
-                 top_df = top_df[['query', 'value']]
-             except KeyError:
-                 # in case no top queries are found, the lines above will throw a KeyError
-                 top_df = None
-
-             # rising queries
-             try:
-                 rising_df = pd.DataFrame(
-                     req_json['default']['rankedList'][1]['rankedKeyword'])
-                 rising_df = rising_df[['query', 'value']]
-             except KeyError:
-                 # in case no rising queries are found, the lines above will throw a KeyError
-                 rising_df = None
-
-             result_dict[kw] = {'top': top_df, 'rising': rising_df}
-         return result_dict
-
-     def trending_searches(self, pn='united_states'):
-         """Request data from Google's Hot Searches section and return a dataframe"""
-
-         # make the request
-         # forms became obsolete due to the new TRENDING_SEARCHES_URL
-         # forms = {'ajax': 1, 'pn': pn, 'htd': '', 'htv': 'l'}
-         req_json = self._get_data(
-             url=TrendReq.TRENDING_SEARCHES_URL,
-             method=TrendReq.GET_METHOD
-         )[pn]
-         result_df = pd.DataFrame(req_json)
-         return result_df
-
-     def today_searches(self, pn='US'):
-         """Request data from Google's Daily Trends section and return a dataframe"""
-         forms = {'ns': 15, 'geo': pn, 'tz': '-180', 'hl': self.hl}
-         req_json = self._get_data(
-             url=TrendReq.TODAY_SEARCHES_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=forms
-         )['default']['trendingSearchesDays'][0]['trendingSearches']
-         # parse the returned json
-         # (self.requests_args is already merged inside _get_data; passing it
-         # again here would raise a duplicate-keyword TypeError)
-         result_df = pd.DataFrame(trend['title'] for trend in req_json)
-         return result_df.iloc[:, -1]
-
-     def realtime_trending_searches(self, pn='US', cat='all', count=300):
-         """Request data from Google's Realtime Search Trends section and return the parsed json"""
-         # Some of these params are undocumented; this follows the implementation
-         # of the nodejs library https://github.com/pat310/google-trends-api/
-
-         # sort: the api accepts only 0 as the value; optional parameter
-
-         # ri: number of trending story IDs returned;
-         # the max value of ri supported is 300, based on empirical evidence
-         ri_value = 300
-         if count < ri_value:
-             ri_value = count
-
-         # rs: purpose unknown, but its max value is never more than the ri_value,
-         # based on empirical evidence; the max value supported is 200
-         rs_value = 200
-         if count < rs_value:
-             rs_value = count - 1
-
-         forms = {'ns': 15, 'geo': pn, 'tz': '300', 'hl': self.hl, 'cat': cat,
-                  'fi': '0', 'fs': '0', 'ri': ri_value, 'rs': rs_value, 'sort': 0}
-         req_json = self._get_data(
-             url=TrendReq.REALTIME_TRENDING_SEARCHES_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=forms
-         )['storySummaries']['trendingStories']
-
-         # parse the returned json
-         # wanted_keys = ["entityNames", "title"]
-         # final_json = [{key: ts[key] for key in ts.keys() if key in wanted_keys} for ts in req_json]
-         # result_df = pd.DataFrame(final_json)
-
-         return req_json
-
-     def top_charts(self, date, hl='en-US', tz=300, geo='GLOBAL'):
-         """Request data from Google's Top Charts section and return a dataframe"""
-
-         try:
-             date = int(date)
-         except:
-             raise ValueError(
-                 'The date must be a year with format YYYY. See https://github.com/GeneralMills/pytrends/issues/355')
-
-         # create the payload
-         chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
-                          'isMobile': False}
-
-         # make the request and parse the returned json
-         req_json = self._get_data(
-             url=TrendReq.TOP_CHARTS_URL,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5,
-             params=chart_payload
-         )
-         try:
-             df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
-         except IndexError:
-             df = None
-         return df
-
-     def suggestions(self, keyword):
-         """Request data from Google's Keyword Suggestion dropdown and return a dictionary"""
-
-         # make the request
-         kw_param = quote(keyword)
-         parameters = {'hl': self.hl}
-
-         req_json = self._get_data(
-             url=TrendReq.SUGGESTIONS_URL + kw_param,
-             params=parameters,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5
-         )['default']['topics']
-         return req_json
-
-     def categories(self):
-         """Request available categories data from Google's API and return a dictionary"""
-
-         params = {'hl': self.hl}
-
-         req_json = self._get_data(
-             url=TrendReq.CATEGORIES_URL,
-             params=params,
-             method=TrendReq.GET_METHOD,
-             trim_chars=5
-         )
-         return req_json
-
-     def get_historical_interest(self, *args, **kwargs):
-         raise NotImplementedError(
-             """This method has been removed for incorrectness. It will be removed completely in v5.
- If you'd like similar functionality, please try implementing it yourself and consider submitting a pull request to add it to pytrends.
-
- There is discussion at:
- https://github.com/GeneralMills/pytrends/pull/542"""
-         )
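
For context, end-to-end use of the deleted client followed this shape (a minimal sketch; the keywords, timeframe, and geo are illustrative, and Google may still answer with 429s):

from pytrends.request import TrendReq

# Hypothetical example: compare two terms over the last 12 months.
pytrends = TrendReq(hl='en-US', tz=360, retries=2, backoff_factor=0.5)
pytrends.build_payload(kw_list=['pizza', 'bagel'], cat=0,
                       timeframe='today 12-m', geo='US', gprop='')

iot = pytrends.interest_over_time()      # indexed by date; one column per keyword plus isPartial
regions = pytrends.interest_by_region()  # indexed by geoName
related = pytrends.related_queries()     # {'pizza': {'top': df, 'rising': df}, ...}
print(iot.tail())

build_payload() fetches the widget tokens up front via _tokens(), so each later accessor is a single GET against the corresponding widgetdata endpoint using the stored token.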