# Krypto1/scrape_utils.py
# Imports
import datetime
import json
import os
import time

import numpy as np
import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pytrends import dailydata
from pytrends.request import TrendReq
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects

pytrends = TrendReq(hl='en-US')

load_dotenv()
COINMARKET_API_KEY = os.environ["COINMARKET_API_KEY"]
# 1. Historical crypto data
def scrape_historical_series(coin_name, symbol, date_start, date_end):
    """Scrape the historical series for one coin from CoinMarketCap.

    Args:
        coin_name(list): Single-element list with the coin name used as the id.
        symbol(str): Ticker symbol of the coin, e.g. 'BTC'.
        date_start(list): List of values for Year_start, Month_start, Day_start.
        date_end(list): List of values for Year_end, Month_end, Day_end.

    Returns:
        Tuple of (dataframe with the evolution of prices, market capitalization,
        total volume, and the ETH reference price over time; one-row dataframe
        with the latest quotes).
    """
    # Date definitions: convert the start/end dates into unix timestamps for the API
    date_time = datetime.datetime(int(date_start[0]), int(date_start[1]), int(date_start[2]))
    date_time_now = datetime.datetime(int(date_end[0]), int(date_end[1]), int(date_end[2]))
    unix_past = time.mktime(date_time.timetuple())
    unix_now = time.mktime(date_time_now.timetuple())
    # Historical quotes for the target coin
    url = 'https://pro-api.coinmarketcap.com/v2/cryptocurrency/quotes/historical'
    parameters = {
        'time_start': int(unix_past),
        'time_end': int(unix_now),
        'symbol': symbol,
        'convert': 'USD',
        'interval': 'daily',
    }
    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': COINMARKET_API_KEY,
    }
    session = Session()
    session.headers.update(headers)
    try:
        response = session.get(url, params=parameters)
        data_json = json.loads(response.text)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
    # Scrape the same range for ETH (used as a reference price)
    parameters = {
        'time_start': int(unix_past),
        'time_end': int(unix_now),
        'symbol': 'ETH',
        'convert': 'USD',
        'interval': 'daily',
    }
    try:
        response = session.get(url, params=parameters)
        data_json_eth = json.loads(response.text)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
    # Collect the daily quotes into flat lists
    date = []
    price = []
    price_eth = []
    market_caps = []
    total_volumes = []
    for i in range(len(data_json['data'][symbol][0]['quotes'])):
        quote = data_json['data'][symbol][0]['quotes'][i]['quote']['USD']
        date.append(quote['timestamp'])
        price.append(quote['price'])
        market_caps.append(quote['market_cap'])
        total_volumes.append(quote['volume_24h'])
        price_eth.append(data_json_eth['data']['ETH'][0]['quotes'][i]['quote']['USD']['price'])
    ts_coins_cut = pd.DataFrame({'date': date, 'prices': price, 'market_caps': market_caps,
                                 'total_vol': total_volumes, 'price_eth': price_eth})
    ts_coins_cut['id'] = np.repeat(coin_name, len(ts_coins_cut))
    ts_coins_cut['date'] = pd.to_datetime(ts_coins_cut['date'])
    # Scrape the latest quote for the target coin
    url = 'https://pro-api.coinmarketcap.com/v2/cryptocurrency/quotes/latest'
    parameters = {
        'symbol': symbol,
        'convert': 'USD',
    }
    try:
        response = session.get(url, params=parameters)
        data_json = json.loads(response.text)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
    # Latest quote for ETH
    parameters = {
        'symbol': 'ETH',
        'convert': 'USD',
    }
    try:
        response = session.get(url, params=parameters)
        data_json_eth = json.loads(response.text)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
    market_cap = data_json['data'][symbol][0]['quote']['USD']['market_cap']
    total_volumes = data_json['data'][symbol][0]['quote']['USD']['volume_24h']
    price = data_json['data'][symbol][0]['quote']['USD']['price']
    price_eth = data_json_eth['data']['ETH'][0]['quote']['USD']['price']
    # Build the one-row dataframe with today's values
    today = datetime.date.today()
    df_today_row = pd.DataFrame({'id': [coin_name[0]],
                                 'date': [today.strftime('%Y-%m-%d %H:%M:%S')],
                                 'prices': [price],
                                 'market_caps': [market_cap],
                                 'total_vol': [total_volumes],
                                 'price_eth': [price_eth]})
    ts_coins_cut.to_csv('ts_coins_cut.csv')
    return ts_coins_cut, df_today_row
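
# Example usage (hypothetical values; requires a valid COINMARKET_API_KEY in .env):
#   df_hist, df_now = scrape_historical_series(['bitcoin'], 'BTC',
#                                              ['2024', '1', '1'], ['2024', '6', '1'])
#   df_hist.head()  # daily prices, market caps, volumes, and the ETH reference price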
# 2. Macro variables, CLI
def scrape_cli(past, today):
    """Scrape the OECD composite leading indicator (CLI) for the USA.

    Args:
        past(date): Date at which scraping starts.
        today(date): Date at which scraping ends.

    Returns:
        Monthly series with the CLI, indexed by date.
    """
    countries = ['USA']  # ,'OECDE','OECD','NMEC']
    past_date = past.strftime('%Y-%m')
    today_date = today.strftime('%Y-%m')
    types = ['CLI']  # ,'BCLI']
    for cli_type in types:
        # Scrape OECD data and build a time series of the indicator for each country
        for country in countries:
            # if cli_type == 'BCLI':
            #     mainpage = requests.get(f'https://stats.oecd.org/restsdmx/sdmx.ashx/GetData/MEI_CLI/BSCICP03.{country}.M/all?startTime={past_date}&endTime={today_date}')
            if cli_type == 'CLI':
                mainpage = requests.get(f'https://stats.oecd.org/restsdmx/sdmx.ashx/GetData/MEI_CLI/CSCICP03.{country}.M/all?startTime={past_date}&endTime={today_date}')
            soup = BeautifulSoup(mainpage.content, 'xml')
            whatis = soup.find_all("ObsValue")
            whatis_key = soup.find_all("ObsKey")
            ref_areas = [str(whatis_key[i]).split('"REF_AREA" value="')[1][:3] for i in range(len(whatis))]
            dates = [pd.to_datetime(str(whatis_key[i]).split('"TIME_PERIOD" value="')[1][:7]) for i in range(len(whatis))]
            measure = [str(whatis_key[i]).split('"MEASURE" value="')[1][:7][:-2] for i in range(len(whatis))]
            values = [float(str(whatis[i]).split('value="')[1][0:-4]) for i in range(len(whatis))]
            df_cli = pd.DataFrame({'date': dates, 'country': ref_areas, 'measure': measure, cli_type: values})
            df_cli.index = pd.to_datetime(df_cli['date'])
            df_cli = df_cli.loc[df_cli['country'] == 'USA']['CLI'].astype('float').resample('M').mean()
    return df_cli
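
# Example usage (hypothetical dates):
#   cli = scrape_cli(datetime.date(2023, 1, 1), datetime.date.today())
#   cli.tail()  # monthly CLI values for the USA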
def scrape_cpi_employment():
    """Scrape CPI and employment data from the BLS public API.

    Returns:
        Dataframe indexed by date with 'CPI' and 'Employment' columns
        (empty if the API returned no results).
    """
    headers = {'Content-type': 'application/json'}
    variables = ['CUUR0000SA0', 'LNS12000000']  # CPI and civilian employment series ids
    data = json.dumps({"seriesid": variables, "startyear": "2024", "endyear": "2024"})
    p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers)
    json_data = json.loads(p.text)
    year_all = []
    period_all = []
    value_all = []
    series_id = []
    if len(json_data['Results']) > 0:
        for series in json_data['Results']['series']:
            seriesId = series['seriesID']
            for item in series['data']:
                year = item['year']
                period = item['period']
                value = item['value']
                if 'M01' <= period <= 'M12':  # keep monthly observations only
                    year_all = np.append(year_all, year)
                    period_all = np.append(period_all, period)
                    value_all = np.append(value_all, value)
                    if seriesId == 'CUUR0000SA0':
                        series_id = np.append(series_id, 'CPI')
                    if seriesId == 'LNS12000000':
                        series_id = np.append(series_id, 'Employment')
        date = [pd.to_datetime(f"{year_all[i]}-{int(period_all[i][-2:])}") for i in range(len(year_all))]
        df_cpi = pd.DataFrame({'date': date, 'value': value_all})
        df_cpi['series_id'] = series_id
        df_cpi.set_index('date', inplace=True)
        df_cpi = pd.concat([df_cpi.loc[df_cpi['series_id'] == 'CPI'],
                            df_cpi.loc[df_cpi['series_id'] == 'Employment']], axis=1)
        df_cpi = df_cpi.drop(columns='series_id')
        df_cpi.columns = ['CPI', 'Employment']
    else:
        df_cpi = pd.DataFrame()
    return df_cpi
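
# Example usage (the BLS endpoint is public, but unauthenticated calls are rate-limited):
#   df_macro = scrape_cpi_employment()
#   df_macro.head()  # monthly CPI and Employment values for 2024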
def scrape_google_trends(currency, currency_short):
    """Scrape Google Trends and build one concatenated daily series per currency.

    Args:
        currency(str): Full currency name to query, e.g. 'bitcoin'.
        currency_short(str): Short ticker used as a fallback query, e.g. 'BTC'.

    Returns:
        Dataframe indexed by date with 'id' and 'google_trend' columns.
    """
    curr_neni = []
    names_values = [currency]
    names_short = [currency_short]
    today = datetime.date.today()
    Month_end = today.strftime("%m")
    Year_end = today.strftime("%Y")
    past = today - datetime.timedelta(days=200)
    Month_start = past.strftime("%m")
    Year_start = past.strftime("%Y")
    date_all1 = pd.date_range(past, today)
    google_data = pd.DataFrame()
    for run_name in list(names_values):
        time.sleep(5)  # throttle to avoid Google Trends rate limits
        try:
            data = dailydata.get_daily_data(str(run_name), int(Year_start), int(Month_start), int(Year_end), int(Month_end), verbose=False)
            data1 = data.iloc[:, 4]
        except Exception:
            try:
                time.sleep(5)
                new_index = list(names_values).index(run_name)
                data = dailydata.get_daily_data(word=names_short[new_index], start_year=int(Year_start), start_mon=int(Month_start), stop_year=int(Year_end), stop_mon=int(Month_end), verbose=False)
                data1 = data.iloc[:, 4]
            except Exception:
                # No data for this currency in Google Trends: fall back to a zero series
                curr_neni = np.append(curr_neni, run_name)
                data1 = np.repeat(0, len(date_all1))
        data1 = pd.DataFrame({'google_trend': pd.Series(data1)})
        data1.insert(0, 'id', np.repeat(run_name, len(data1)))
        google_data = pd.concat([google_data, data1], axis=0)
    # Change the index from date to date_new to match the old dataset
    google_data.reset_index(inplace=True)
    if int(np.mean(data1['google_trend'])) == 0:
        google_data['date_new'] = date_all1
        google_data.set_index('date_new', inplace=True)
    else:
        google_data.columns = np.append('date_new', google_data.columns[1:])
        google_data.set_index('date_new', inplace=True)
    return google_data
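
# Example usage (hypothetical tickers; Google Trends may throttle repeated calls):
#   trends = scrape_google_trends('bitcoin', 'BTC')
#   trends.head()  # daily 'google_trend' values over the last ~200 days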
def scrape_stocks(past, today):
    """Scrape Yahoo Finance and create a dataset with time series for all the financial variables.

    Args:
        past(date): Start date for scraping.
        today(date): End date for scraping.

    Returns:
        Dataframe indexed by date with S&P 500, gold futures, EUR/USD,
        and 10-year Treasury yield series.
    """
    # Set dates in the form needed for scraping
    date_old = past
    date_new = today
    df = pd.DataFrame()
    # Tickers for the variables we are going to scrape
    codes = ['^GSPC', 'GC=F', 'EURUSD=X', '^TNX']
    codes_names = ['GSPC', 'GC=F', 'EURUSD', 'TNX']
    for code in codes:
        code_index = codes.index(code)
        code_name = codes_names[code_index]
        df_code = yf.download(code, start=date_old, end=date_new, progress=False)
        df_code = pd.DataFrame(df_code)
        df_code = df_code.reset_index()
        df_code_ts = df_code.iloc[:, 1]  # first price column (Open)
        df_code_ts = df_code_ts.rename(code_name)
        df = pd.concat([df, df_code_ts], axis=1)
        df_time = df_code.iloc[:, 0]
    # Set the index to the date column of the last download
    df.insert(0, 'date', df_time)
    df_finance = df.dropna()
    df_finance.set_index('date', inplace=True)
    df_finance.index = pd.to_datetime(df_finance.index)
    return df_finance
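
# Minimal sketch of how the scrapers might be run together (hypothetical dates;
# requires network access and the API key loaded above):
if __name__ == '__main__':
    start = datetime.date.today() - datetime.timedelta(days=200)
    end = datetime.date.today()
    print(scrape_stocks(start, end).tail())
    print(scrape_cli(start, end).tail())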