From 689a9429cd5339da73a03cd0eb5b5246a5dad7ef Mon Sep 17 00:00:00 2001 From: Citation Checking Project Date: Sun, 12 Jul 2020 08:25:11 +0000 Subject: [PATCH] implement cache --- .gitignore | 1 + README.md | 4 ++++ analyze.py | 28 ++++++++++++++++++++++------ 3 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8974612 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.cache.db diff --git a/README.md b/README.md index 3208f02..913076d 100644 --- a/README.md +++ b/README.md @@ -38,3 +38,7 @@ Updated data should be regularly saved to the version control system. The script `analyze.py` can be used to locally process the data. It can be run with Python 3.7 or higher. If called with no arguments, it will list all available operating modes. + +**Due to the size of the dataset,** the first invocation of `analyze.py` might +take a minute or two to complete. All subsequent invocations should be nearly +instantaneous thanks to the cache. 
diff --git a/analyze.py b/analyze.py
index 4878fe7..4ab7f81 100755
--- a/analyze.py
+++ b/analyze.py
@@ -18,8 +18,10 @@ from collections import defaultdict
 from decimal import Decimal
 from datetime import datetime as DateTime
 from dataclasses import dataclass
+import os
 from pathlib import Path
 import re
+import shelve
 
 # a sum in US dollars
 Amount = Decimal
@@ -90,23 +92,51 @@ def parse_donations(lines: Iterable[str]) -> Iterable[Donation]:
             amount=Decimal(x['amount'].replace(',', '')),
         )
 
-def all_files():
+def all_source_files():
     return Path().glob('*.txt')
 
+CACHE_FILE = Path('.cache.db')
+def load_data() -> Dict[str, List[Donation]]:
+    '''load all the parsed data, using the cache if possible
+
+    Returns a dict mapping each source file name to its parsed donations.
+    A cached entry is reused only while the source file still has the
+    modification time and size recorded when the entry was written.
+    '''
+    data = {}
+    # Freshness is tracked per entry rather than via CACHE_FILE's own mtime:
+    # the dbm backend may store the shelf under a different file name.
+    # The shelf holds pickles, so never point it at an untrusted file.
+    with shelve.open(os.fspath(CACHE_FILE), protocol=4) as cache:
+        for f in all_source_files():
+            info = f.stat()
+            stamp = (info.st_mtime_ns, info.st_size)
+            entry = cache.get(f.name)
+            if entry is None or entry[0] != stamp:
+                # missing or stale: parse again, closing the file afterwards
+                with f.open() as lines:
+                    entry = (stamp, list(parse_donations(lines)))
+                cache[f.name] = entry
+            data[f.name] = entry[1]
+    return data
+
 def grand(args):
     '''print grand total'''
-    print(sum(d.amount for f in all_files() for d in parse_donations(f.open())))
+    data = load_data()
+    print(sum(d.amount for ds in data.values() for d in ds))
 
 def files(args):
     '''print total per file'''
-    for f in sorted(all_files()):
-        print(f, sum(d.amount for d in parse_donations(f.open())))
+    data = load_data()
+    for (f, ds) in sorted(data.items()):
+        print(f, sum(d.amount for d in ds))
 
 def paypigs(args):
     '''print total per paypig'''
     sums = defaultdict(Amount)
-    for f in all_files():
-        for d in parse_donations(f.open()):
+    data = load_data()
+    for ds in data.values():
+        for d in ds:
             sums[d.paypig] += d.amount
     for paypig, amount in sorted(sums.items(), key=lambda i: i[1]):
         print(paypig, amount)