webcrawler / pytrends /request.py
Add1E's picture
Upload 8 files
ff01b82 verified
raw
history blame
25.2 kB
import json
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests import status_codes
from pytrends import exceptions
from urllib.parse import quote
BASE_TRENDS_URL = 'https://trends.google.com/trends'
class TrendReq(object):
"""
Google Trends API
"""
GET_METHOD = 'get'
POST_METHOD = 'post'
GENERAL_URL = f'{BASE_TRENDS_URL}/api/explore'
INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multiline'
MULTIRANGE_INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multirange'
INTEREST_BY_REGION_URL = f'{BASE_TRENDS_URL}/api/widgetdata/comparedgeo'
RELATED_QUERIES_URL = f'{BASE_TRENDS_URL}/api/widgetdata/relatedsearches'
TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/hottrends/visualize/internal/data'
TOP_CHARTS_URL = f'{BASE_TRENDS_URL}/api/topcharts'
SUGGESTIONS_URL = f'{BASE_TRENDS_URL}/api/autocomplete/'
CATEGORIES_URL = f'{BASE_TRENDS_URL}/api/explore/pickers/category'
TODAY_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/dailytrends'
REALTIME_TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/realtimetrends'
TRENDS_URL = f'{BASE_TRENDS_URL}/api/trends'
ERROR_CODES = (500, 502, 504, 429)
def __init__(self, hl='en-US', tz=360, geo='', timeout=(2, 5), proxies='',
retries=0, backoff_factor=0, requests_args=None):
"""
Initialize default values for params
"""
# google rate limit
self.google_rl = 'You have reached your quota limit. Please try again later.'
self.results = None
# set user defined options used globally
self.tz = tz
self.hl = hl
self.geo = geo
self.kw_list = list()
self.timeout = timeout
self.proxies = proxies # add a proxy option
self.retries = retries
self.backoff_factor = backoff_factor
self.proxy_index = 0
self.requests_args = requests_args or {}
self.cookies = self.GetGoogleCookie()
# intialize widget payloads
self.token_payload = dict()
self.interest_over_time_widget = dict()
self.interest_by_region_widget = dict()
self.related_topics_widget_list = list()
self.related_queries_widget_list = list()
self.headers = {'accept-language': self.hl}
self.headers.update(self.requests_args.pop('headers', {}))
def GetGoogleCookie(self):
"""
Gets google cookie (used for each and every proxy; once on init otherwise)
Removes proxy from the list on proxy error
"""
while True:
if "proxies" in self.requests_args:
try:
return dict(filter(lambda i: i[0] == 'NID', requests.get(
f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
timeout=self.timeout,
**self.requests_args
).cookies.items()))
except:
continue
else:
if len(self.proxies) > 0:
proxy = {'https': self.proxies[self.proxy_index]}
else:
proxy = ''
try:
return dict(filter(lambda i: i[0] == 'NID', requests.get(
f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}',
timeout=self.timeout,
proxies=proxy,
**self.requests_args
).cookies.items()))
except requests.exceptions.ProxyError:
print('Proxy error. Changing IP')
if len(self.proxies) > 1:
self.proxies.remove(self.proxies[self.proxy_index])
else:
print('No more proxies available. Bye!')
raise
continue
def GetNewProxy(self):
"""
Increment proxy INDEX; zero on overflow
"""
if self.proxy_index < (len(self.proxies) - 1):
self.proxy_index += 1
else:
self.proxy_index = 0
def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
"""Send a request to Google and return the JSON response as a Python object
:param url: the url to which the request will be sent
:param method: the HTTP method ('get' or 'post')
:param trim_chars: how many characters should be trimmed off the beginning of the content of the response
before this is passed to the JSON parser
:param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
:return:
"""
s = requests.session()
# Retries mechanism. Activated when one of statements >0 (best used for proxy)
if self.retries > 0 or self.backoff_factor > 0:
retry = Retry(total=self.retries, read=self.retries,
connect=self.retries,
backoff_factor=self.backoff_factor,
status_forcelist=TrendReq.ERROR_CODES,
method_whitelist=frozenset(['GET', 'POST']))
s.mount('https://', HTTPAdapter(max_retries=retry))
s.headers.update(self.headers)
if len(self.proxies) > 0:
self.cookies = self.GetGoogleCookie()
s.proxies.update({'https': self.proxies[self.proxy_index]})
if method == TrendReq.POST_METHOD:
response = s.post(url, timeout=self.timeout,
cookies=self.cookies, **kwargs,
**self.requests_args) # DO NOT USE retries or backoff_factor here
else:
response = s.get(url, timeout=self.timeout, cookies=self.cookies,
**kwargs, **self.requests_args) # DO NOT USE retries or backoff_factor here
# check if the response contains json and throw an exception otherwise
# Google mostly sends 'application/json' in the Content-Type header,
# but occasionally it sends 'application/javascript
# and sometimes even 'text/javascript
if response.status_code == 200 and 'application/json' in \
response.headers['Content-Type'] or \
'application/javascript' in response.headers['Content-Type'] or \
'text/javascript' in response.headers['Content-Type']:
# trim initial characters
# some responses start with garbage characters, like ")]}',"
# these have to be cleaned before being passed to the json parser
content = response.text[trim_chars:]
# parse json
self.GetNewProxy()
return json.loads(content)
else:
if response.status_code == status_codes.codes.too_many_requests:
raise exceptions.TooManyRequestsError.from_response(response)
raise exceptions.ResponseError.from_response(response)
def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='',
gprop=''):
"""Create the payload for related queries, interest over time and interest by region"""
if gprop not in ['', 'images', 'news', 'youtube', 'froogle']:
raise ValueError('gprop must be empty (to indicate web), images, news, youtube, or froogle')
self.kw_list = kw_list
self.geo = geo or self.geo
self.token_payload = {
'hl': self.hl,
'tz': self.tz,
'req': {'comparisonItem': [], 'category': cat, 'property': gprop}
}
# Check if timeframe is a list
if isinstance(timeframe, list):
for index, kw in enumerate(self.kw_list):
keyword_payload = {'keyword': kw, 'time': timeframe[index], 'geo': self.geo}
self.token_payload['req']['comparisonItem'].append(keyword_payload)
else:
# build out json for each keyword with
for kw in self.kw_list:
keyword_payload = {'keyword': kw, 'time': timeframe, 'geo': self.geo}
self.token_payload['req']['comparisonItem'].append(keyword_payload)
# requests will mangle this if it is not a string
self.token_payload['req'] = json.dumps(self.token_payload['req'])
# get tokens
self._tokens()
return
def _tokens(self):
"""Makes request to Google to get API tokens for interest over time, interest by region and related queries"""
# make the request and parse the returned json
widget_dicts = self._get_data(
url=TrendReq.GENERAL_URL,
method=TrendReq.POST_METHOD,
params=self.token_payload,
trim_chars=4,
)['widgets']
# order of the json matters...
first_region_token = True
# clear self.related_queries_widget_list and self.related_topics_widget_list
# of old keywords'widgets
self.related_queries_widget_list[:] = []
self.related_topics_widget_list[:] = []
# assign requests
for widget in widget_dicts:
if widget['id'] == 'TIMESERIES':
self.interest_over_time_widget = widget
if widget['id'] == 'GEO_MAP' and first_region_token:
self.interest_by_region_widget = widget
first_region_token = False
# response for each term, put into a list
if 'RELATED_TOPICS' in widget['id']:
self.related_topics_widget_list.append(widget)
if 'RELATED_QUERIES' in widget['id']:
self.related_queries_widget_list.append(widget)
return
def interest_over_time(self):
"""Request data from Google's Interest Over Time section and return a dataframe"""
over_time_payload = {
# convert to string as requests will mangle
'req': json.dumps(self.interest_over_time_widget['request']),
'token': self.interest_over_time_widget['token'],
'tz': self.tz
}
# make the request and parse the returned json
req_json = self._get_data(
url=TrendReq.INTEREST_OVER_TIME_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=over_time_payload,
)
df = pd.DataFrame(req_json['default']['timelineData'])
if (df.empty):
return df
df['date'] = pd.to_datetime(df['time'].astype(dtype='float64'),
unit='s')
df = df.set_index(['date']).sort_index()
# split list columns into seperate ones, remove brackets and split on comma
result_df = df['value'].apply(lambda x: pd.Series(
str(x).replace('[', '').replace(']', '').split(',')))
# rename each column with its search term, relying on order that google provides...
for idx, kw in enumerate(self.kw_list):
# there is currently a bug with assigning columns that may be
# parsed as a date in pandas: use explicit insert column method
result_df.insert(len(result_df.columns), kw,
result_df[idx].astype('int'))
del result_df[idx]
if 'isPartial' in df:
# make other dataframe from isPartial key data
# split list columns into seperate ones, remove brackets and split on comma
df = df.fillna(False)
result_df2 = df['isPartial'].apply(lambda x: pd.Series(
str(x).replace('[', '').replace(']', '').split(',')))
result_df2.columns = ['isPartial']
# Change to a bool type.
result_df2.isPartial = result_df2.isPartial == 'True'
# concatenate the two dataframes
final = pd.concat([result_df, result_df2], axis=1)
else:
final = result_df
final['isPartial'] = False
return final
def multirange_interest_over_time(self):
"""Request data from Google's Interest Over Time section across different time ranges and return a dataframe"""
over_time_payload = {
# convert to string as requests will mangle
'req': json.dumps(self.interest_over_time_widget['request']),
'token': self.interest_over_time_widget['token'],
'tz': self.tz
}
# make the request and parse the returned json
req_json = self._get_data(
url=TrendReq.MULTIRANGE_INTEREST_OVER_TIME_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=over_time_payload,
)
df = pd.DataFrame(req_json['default']['timelineData'])
if (df.empty):
return df
result_df = pd.json_normalize(df['columnData'])
# Split dictionary columns into seperate ones
for i, column in enumerate(result_df.columns):
result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = result_df[i].apply(pd.Series)["formattedTime"]
result_df["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = result_df[i].apply(pd.Series)["value"]
result_df = result_df.drop([i], axis=1)
# Adds a row with the averages at the top of the dataframe
avg_row = {}
for i, avg in enumerate(req_json['default']['averages']):
avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " date"] = "Average"
avg_row["[" + str(i) + "] " + str(self.kw_list[i]) + " value"] = req_json['default']['averages'][i]
result_df.loc[-1] = avg_row
result_df.index = result_df.index + 1
result_df = result_df.sort_index()
return result_df
def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
inc_geo_code=False):
"""Request data from Google's Interest by Region section and return a dataframe"""
# make the request
region_payload = dict()
if self.geo == '':
self.interest_by_region_widget['request'][
'resolution'] = resolution
elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
self.interest_by_region_widget['request'][
'resolution'] = resolution
self.interest_by_region_widget['request'][
'includeLowSearchVolumeGeos'] = inc_low_vol
# convert to string as requests will mangle
region_payload['req'] = json.dumps(
self.interest_by_region_widget['request'])
region_payload['token'] = self.interest_by_region_widget['token']
region_payload['tz'] = self.tz
# parse returned json
req_json = self._get_data(
url=TrendReq.INTEREST_BY_REGION_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=region_payload,
)
df = pd.DataFrame(req_json['default']['geoMapData'])
if (df.empty):
return df
# rename the column with the search keyword
geo_column = 'geoCode' if 'geoCode' in df.columns else 'coordinates'
columns = ['geoName', geo_column, 'value']
df = df[columns].set_index(['geoName']).sort_index()
# split list columns into separate ones, remove brackets and split on comma
result_df = df['value'].apply(lambda x: pd.Series(
str(x).replace('[', '').replace(']', '').split(',')))
if inc_geo_code:
if geo_column in df.columns:
result_df[geo_column] = df[geo_column]
else:
print('Could not find geo_code column; Skipping')
# rename each column with its search term
for idx, kw in enumerate(self.kw_list):
result_df[kw] = result_df[idx].astype('int')
del result_df[idx]
return result_df
def related_topics(self):
"""Request data from Google's Related Topics section and return a dictionary of dataframes
If no top and/or rising related topics are found, the value for the key "top" and/or "rising" will be None
"""
# make the request
related_payload = dict()
result_dict = dict()
for request_json in self.related_topics_widget_list:
# ensure we know which keyword we are looking at rather than relying on order
try:
kw = request_json['request']['restriction'][
'complexKeywordsRestriction']['keyword'][0]['value']
except KeyError:
kw = ''
# convert to string as requests will mangle
related_payload['req'] = json.dumps(request_json['request'])
related_payload['token'] = request_json['token']
related_payload['tz'] = self.tz
# parse the returned json
req_json = self._get_data(
url=TrendReq.RELATED_QUERIES_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=related_payload,
)
# top topics
try:
top_list = req_json['default']['rankedList'][0]['rankedKeyword']
df_top = pd.json_normalize(top_list, sep='_')
except KeyError:
# in case no top topics are found, the lines above will throw a KeyError
df_top = None
# rising topics
try:
rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
df_rising = pd.json_normalize(rising_list, sep='_')
except KeyError:
# in case no rising topics are found, the lines above will throw a KeyError
df_rising = None
result_dict[kw] = {'rising': df_rising, 'top': df_top}
return result_dict
def related_queries(self):
"""Request data from Google's Related Queries section and return a dictionary of dataframes
If no top and/or rising related queries are found, the value for the key "top" and/or "rising" will be None
"""
# make the request
related_payload = dict()
result_dict = dict()
for request_json in self.related_queries_widget_list:
# ensure we know which keyword we are looking at rather than relying on order
try:
kw = request_json['request']['restriction'][
'complexKeywordsRestriction']['keyword'][0]['value']
except KeyError:
kw = ''
# convert to string as requests will mangle
related_payload['req'] = json.dumps(request_json['request'])
related_payload['token'] = request_json['token']
related_payload['tz'] = self.tz
# parse the returned json
req_json = self._get_data(
url=TrendReq.RELATED_QUERIES_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=related_payload,
)
# top queries
try:
top_df = pd.DataFrame(
req_json['default']['rankedList'][0]['rankedKeyword'])
top_df = top_df[['query', 'value']]
except KeyError:
# in case no top queries are found, the lines above will throw a KeyError
top_df = None
# rising queries
try:
rising_df = pd.DataFrame(
req_json['default']['rankedList'][1]['rankedKeyword'])
rising_df = rising_df[['query', 'value']]
except KeyError:
# in case no rising queries are found, the lines above will throw a KeyError
rising_df = None
result_dict[kw] = {'top': top_df, 'rising': rising_df}
return result_dict
def trending_searches(self, pn='united_states'):
"""Request data from Google's Hot Searches section and return a dataframe"""
# make the request
# forms become obsolete due to the new TRENDING_SEARCHES_URL
# forms = {'ajax': 1, 'pn': pn, 'htd': '', 'htv': 'l'}
req_json = self._get_data(
url=TrendReq.TRENDING_SEARCHES_URL,
method=TrendReq.GET_METHOD
)[pn]
print(req_json)
result_df = pd.DataFrame(req_json)
return result_df
def today_searches(self, pn='US'):
"""Request data from Google Daily Trends section and returns a dataframe"""
forms = {'ns': 15, 'geo': pn, 'tz': '-180', 'hl': self.hl}
req_json = self._get_data(
url=TrendReq.TODAY_SEARCHES_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=forms,
**self.requests_args
)['default']['trendingSearchesDays'][0]['trendingSearches']
# parse the returned jso
return req_json
def realtime_trending_searches(self, pn='US', cat='all', count =300):
"""Request data from Google Realtime Search Trends section and returns a dataframe"""
# Don't know what some of the params mean here, followed the nodejs library
# https://github.com/pat310/google-trends-api/ 's implemenration
#sort: api accepts only 0 as the value, optional parameter
# ri: number of trending stories IDs returned,
# max value of ri supported is 300, based on emperical evidence
ri_value = 300
if count < ri_value:
ri_value = count
# rs : don't know what is does but it's max value is never more than the ri_value based on emperical evidence
# max value of ri supported is 200, based on emperical evidence
rs_value = 200
if count < rs_value:
rs_value = count-1
forms = {'ns': 15, 'geo': pn, 'tz': '300', 'hl': self.hl, 'cat': cat, 'fi' : '0', 'fs' : '0', 'ri' : ri_value, 'rs' : rs_value, 'sort' : 0}
req_json = self._get_data(
url=TrendReq.REALTIME_TRENDING_SEARCHES_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=forms
)['storySummaries']['trendingStories']
return req_json
def top_charts(self, date, hl='en-US', tz=300, geo='GLOBAL'):
"""Request data from Google's Top Charts section and return a dataframe"""
try:
date = int(date)
except:
raise ValueError(
'The date must be a year with format YYYY. See https://github.com/GeneralMills/pytrends/issues/355')
# create the payload
chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
'isMobile': False}
# make the request and parse the returned json
req_json = self._get_data(
url=TrendReq.TOP_CHARTS_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=chart_payload
)
try:
df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
except IndexError:
df = None
return df
def trends(self, date, hl='en-US', tz=300, geo='GLOBAL'):
"""Request data from Google's Top Charts section and return a dataframe"""
# create the payload
chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
'isMobile': False}
# make the request and parse the returned json
req_json = self._get_data(
url=TrendReq.GENERAL_URL,
method=TrendReq.GET_METHOD,
trim_chars=5,
params=chart_payload
)
try:
df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
except IndexError:
df = None
return df
def suggestions(self, keyword):
"""Request data from Google's Keyword Suggestion dropdown and return a dictionary"""
# make the request
kw_param = quote(keyword)
parameters = {'hl': self.hl}
req_json = self._get_data(
url=TrendReq.SUGGESTIONS_URL + kw_param,
params=parameters,
method=TrendReq.GET_METHOD,
trim_chars=5
)['default']['topics']
return req_json
def categories(self):
"""Request available categories data from Google's API and return a dictionary"""
params = {'hl': self.hl}
req_json = self._get_data(
url=TrendReq.CATEGORIES_URL,
params=params,
method=TrendReq.GET_METHOD,
trim_chars=5
)
return req_json
def get_historical_interest(self, *args, **kwargs):
raise NotImplementedError(
"""This method has been removed for incorrectness. It will be removed completely in v5.
If you'd like similar functionality, please try implementing it yourself and consider submitting a pull request to add it to pytrends.
There is discussion at:
https://github.com/GeneralMills/pytrends/pull/542"""
)