I did get this working but I don't think I will continue to use
it. I don't look at my finances often enough. In fact I usually
pick away at updating all my accounts every 6 months (for an hour
or two on about 10 evenings).
The only real downside was restaurant transactions that included
tips. I would only get an email for the initial charge so my
imported value would not have the final total. Then I would have
to comb through everything to find the issues and get my balance
checks to pass.
I am going to pursue CSV import like a normal person, but I am
glad to have tried. I'll dump some code below in case it inspires
and/or horrifies anyone someday.
Importing the emails was quite fiddly...bizarre MIME
quoted-printable stuff. I had planned to use the Unread state to
keep track of what had been processed. Maybe that explains the
~5% mysteriously missing transaction rate.
from datetime import datetime
import quopri
import email
import imaplib
import base64
import os
import re
from email import policy
from email.message import EmailMessage
from email.parser import Parser
from pprint import pprint
mail = imaplib.IMAP4_SSL(email_server, 993)
mail.login(email_user, email_pass)
test2 = mail.select("INBOX.CC-notifications")
test = mail.list()
type, data = mail.search(None, 'UNSEEN')
for num in data[0].split():
typ, data = mail.fetch(num, '(RFC822)' )
for response_part in data:
if isinstance(response_part, tuple):
raw_email = data[0][1]# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
msg = email.message_from_string(raw_email_string)
if msg['Message-Id']:
subject = msg['Subject']
# Like: 'Thu, 11 Jan 2024 22:39:49 -0500 (EST)'
date_with_parens = msg['Date']
date_string = re.sub(r' \([^)]*\)', '', date_with_parens)
date = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')
filename = 'downloads/' + date.strftime('%Y-%m-%d_%H-%M-%S') + '.eml'
with open(filename, 'w') as file:
file.write(raw_email_string)
else:
print('No Message-Id for %s')
Nothing too crazy with parsing the data out of the email body.
It is susceptible to changing formatting
import decimal
import os
import re
from beancount.core import amount
from beancount.core import data
from beancount.core.amount import Amount
from beancount.core.number import D
from beancount.ingest import importer
from beancount.ingest import regression
from chardet import detect
from datetime import datetime
from email import policy
from email.parser import Parser
from pprint import pprint
class Importer(importer.ImporterProtocol):
"""An importer for <credit card> email notifications"""
def __init__(self, lastFour, account, accounts, currency):
self.lastFour = lastFour
self.account = account
self.accounts = accounts
self.currency = currency
def identify(self, file):
#print(self.extract_amount(self.extract_body(file)))
return (re.search(self.lastFour, self.extract_body(file)) is not None)
def file_name(self, file):
return os.path.basename(file.name).split('_')[1]
def file_account(self, _):
return self.account
def file_date(self, file):
return self.extract_date(self.extract_body(file))
def extract(self, file):
entries = []
body = self.extract_body(file)
meta = data.new_metadata(file.name, lineno=0)
date = self.extract_date(body)
flag = "txn"
payee = self.extract_payee(body)
narration = ""
tags = data.EMPTY_SET
links = data.EMPTY_SET
txn = data.Transaction(meta, date, flag, payee, narration, tags, links, postings=[])
account = self.file_account(file)
liabilities_units = Amount(-self.extract_amount(body), self.currency)
expenses_units = Amount(self.extract_amount(body), self.currency)
txn.postings.append(data.Posting(account, liabilities_units, None, None, None, None))
if payee in self.accounts:
txn.postings.append(data.Posting(self.accounts.get(payee), expenses_units, None, None, None, None))
else:
txn.postings.append(data.Posting("Expenses:Uncategorized", expenses_units, None, None, None, None))
entries.append(txn)
return entries
def extract_body(self, file):
with open(file.name, 'r', encoding='utf-8', errors='ignore') as in_file:
msg = Parser(policy=policy.default).parse(in_file)
return msg.get_body().get_content()
def extract_date(self, body):
# Email contains "Jan 1, 2024 at 12:00 PM ET" as an example.
date_text = re.search("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-9]{1,2}, [0-9]{4} at ", body).group()
return datetime.strptime(date_text, "%b %d, %Y at ").date()
def extract_payee(self, body):
# Email contains this as an example:
# <td align="left" style="vertical-align:top; padding:0px 0px 0px 0px; font-family:Arial, Helvetica, sans-serif; font-size:16px; color:#414042;" class="font14">Merchant</td>
# <td align="right" style="vertical-align:top; padding:0px 0px 0px 5px; font-family:Arial, Helvetica, sans-serif; font-size:16px; font-weight:bold; color:#414042;" class="font14">GOOGLE *SyncTech</td>
return re.search("Merchant<\\/td>\\n.*font14\">(.*?)<\\/td>", body).group(1)
def extract_amount(self, body):
# Email contains this as an example:
# <td align="left" style="vertical-align:top; padding:0px 0px 0px 0px; font-family:Arial, Helvetica, sans-serif; font-size:16px; color:#414042;" class="font14">Amount</td>
# <td align="right" style="vertical-align:top; padding:0px 0px 0px 5px; font-family:Arial, Helvetica, sans-serif; font-size:16px; font-weight:bold; color:#414042;" class="font14">$5.36</td>
return D(re.search("Amount<\\/td>\\n.*font14\">\\$(.*?)<\\/td>", body).group(1))