I do just that (scrape the website).
The code isn't nice so I didn't publish it, but I guess you could use it
as a starting point. It does a bit more since I try to cache the whole
year of data and not spam them when I import prices for the full month.
It's not 100% foolproof since I don't cache the first request in
'get_historical_price'.
Here is how you'd specify it:
2020-01-01 commodity LU0102238812
price: "EUR:price.ft_com.fund/LU0102238812:EUR"
And here is the code (I use it for both ETFs and funds)
import datetime
import requests
import shelve
import tempfile
import re
import json
import os
from bs4 import BeautifulSoup
from beancount.core.number import D
from beancount.prices import source
def now():
    """Return the current UTC time as an integer Unix timestamp."""
    current = datetime.datetime.now(datetime.timezone.utc)
    return int(current.timestamp())
def source_price(price, dt, currency):
    """Build a SourcePrice, normalising GBX (pence) quotes to GBP pounds."""
    # FT quotes some UK-listed instruments in pence (GBX); convert to GBP.
    if currency == 'GBX':
        return source.SourcePrice(price / 100, dt, 'GBP')
    return source.SourcePrice(price, dt, currency)
class BaseSource(source.Source):
    """beancount price source scraping markets.ft.com tearsheet pages.

    Subclasses set ``uri`` to the tearsheet URL for their instrument type
    (fund, ETF, ...).  Historical prices are fetched a full calendar year
    at a time and cached (via ``shelve`` in the temp directory, 24h TTL)
    so that importing a month of prices does not spam FT with requests.
    Note the initial tearsheet request itself is not cached.
    """

    uri = ""

    def find_current_price(self, soup):
        """Extract ``(price_str, currency)`` from a parsed tearsheet page.

        Raises AssertionError if the page does not contain exactly one
        quote <div> whose first child is a <ul> of quote items.
        """
        elements = soup.find_all('div',
                                 class_='mod-tearsheet-overview__quote')
        assert len(elements) == 1
        element = elements[0]
        ul = element.contents[0]
        assert ul.name == "ul"
        li = ul.contents[0]
        assert li.name == "li"
        # First <li> reads like "Price (EUR)" followed by the value.
        currency = re.search(r"Price \((.*)\)",
                             li.contents[0].text).group(1)
        price_str = li.contents[1].text
        return price_str, currency

    def get_latest_price(self, ticker):
        """Fetch the latest quoted price for *ticker* from the tearsheet."""
        response = requests.get(self.uri, {'s': ticker})
        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        element = soup.find('div', class_='mod-tearsheet-overview__quote')
        disclaimer = element.contents[1]
        # Disclaimer text ends like ", as of Mar 03 2021 16:30 GMT."
        quote_date_m = re.search(", as of (.*)$", disclaimer.text)
        if quote_date_m:
            dt = datetime.datetime.strptime(quote_date_m.group(1),
                                            "%b %d %Y %H:%M GMT.")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
        else:
            # No timestamp on the page; return the price undated.
            dt = None
        return source_price(D(price_str), dt, currency)

    def get_historical_price(self, ticker, time):
        """Return the last available price at or before *time*.

        Downloads the whole year of daily prices for the instrument and
        caches the JSON response for 24 hours, keyed by year and FT
        symbol id, so repeated imports reuse the cached data.
        """
        response = requests.get(self.uri, {'s': ticker})
        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        # The historical-prices widget embeds FT's internal symbol id in
        # its JSON config attribute.
        elements = soup.find_all('div', attrs={"data-module-name":
                                               "HistoricalPricesApp"})
        assert len(elements) == 1
        symbol_id = json.loads(elements[0]['data-mod-config'])['symbol']
        # beancount passes the day after the date being priced.
        year = (time - datetime.timedelta(days=1)).year
        temp_dir = tempfile.gettempdir()
        cache_path = os.path.join(temp_dir, f'ft_com-{year}-{symbol_id}')
        data_uri = 'https://markets.ft.com/data/equities/ajax/get-historical-prices'
        # NOTE(review): mixed-case keys ('startdate' vs 'endDate') kept
        # as-is — presumably the FT endpoint accepts both; verify before
        # normalising.
        args = {
            'startdate': f'{year}/01/01',
            'endDate': f'{year}/12/31',
            'symbol': symbol_id
        }
        with shelve.open(cache_path) as db:
            if db.get('expiry', -1) < now():
                # Cache miss or stale: refresh and keep for 24 hours.
                db['expiry'] = now() + (3600 * 24)
                history_response = requests.get(data_uri, args).json()
                db['result'] = history_response
            else:
                history_response = db.get('result')
        history_soup = BeautifulSoup(history_response['html'],
                                     'html.parser')
        # Rows are newest-first; take the first row dated at or before
        # the requested time.  Column 4 is the close price.
        for tr in history_soup.contents:
            date_str = tr.contents[0].contents[0].text
            dt = datetime.datetime.strptime(date_str, "%A, %B %d, %Y")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
            if dt <= time:
                price_str = tr.contents[4].text
                return source_price(D(price_str), dt, currency)
        assert False, "no historical price found at or before requested date"
from .base import BaseSource
class Source(BaseSource):
    """FT funds-tearsheet price source (the beancount entry point).

    Only overrides ``uri``; all scraping logic lives in ``BaseSource``.
    """

    uri = "https://markets.ft.com/data/funds/tearsheet/historical"