From 689a9429cd5339da73a03cd0eb5b5246a5dad7ef Mon Sep 17 00:00:00 2001
From: Citation Checking Project <TheCCProject@protonmail.com>
Date: Sun, 12 Jul 2020 08:25:11 +0000
Subject: [PATCH] implement cache

---
 .gitignore |  1 +
 README.md  |  4 ++++
 analyze.py | 28 ++++++++++++++++++++++------
 3 files changed, 27 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8974612
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.cache.db
diff --git a/README.md b/README.md
index 3208f02..913076d 100644
--- a/README.md
+++ b/README.md
@@ -38,3 +38,7 @@ Updated data should be regularly saved to the version control system.
 The script `analyze.py` can be used to locally process the data. It can be run
 with Python 3.7 or higher. If called with no arguments, it will list all
 available operating modes.
+
+**Due to the size of the dataset,** the first invocation of `analyze.py` might
+take a minute or two to complete. All subsequent invocations should be nearly
+instantaneous thanks to the cache.
diff --git a/analyze.py b/analyze.py
index 4878fe7..4ab7f81 100755
--- a/analyze.py
+++ b/analyze.py
@@ -18,8 +18,10 @@ from collections import defaultdict
 from decimal import Decimal
 from datetime import datetime as DateTime
 from dataclasses import dataclass
+import os
 from pathlib import Path
 import re
+import shelve
 
 # a sum in US dollars
 Amount = Decimal
@@ -90,23 +92,37 @@ def parse_donations(lines: Iterable[str]) -> Iterable[Donation]:
                     amount=Decimal(x['amount'].replace(',', '')),
                 )
 
-def all_files():
+def all_source_files() -> Iterable[Path]:  # the *.txt files in the current directory
     return Path().glob('*.txt')
 
+CACHE_FILE = Path('.cache.db')  # shelve database of parsed donations, keyed by source file name
+def load_data() -> Dict[str, List[Donation]]:
+    '''load parsed donations per source file, re-parsing only files that are new or modified'''
+    with shelve.open(os.fspath(CACHE_FILE), protocol=4) as cache:
+        fresh_time = CACHE_FILE.stat().st_mtime  # NOTE: assumes the dbm backend keeps this exact file name
+        for f in all_source_files():
+            if f.name not in cache or f.stat().st_mtime >= fresh_time:
+                with f.open() as lines:  # close the source file once it is fully parsed
+                    cache[f.name] = list(parse_donations(lines))
+        return {f.name: cache[f.name] for f in all_source_files()}
+
 def grand(args):
     '''print grand total'''
-    print(sum(d.amount for f in all_files() for d in parse_donations(f.open())))
+    donations = (d for ds in load_data().values() for d in ds)
+    print(sum(d.amount for d in donations))
 
 def files(args):
     '''print total per file'''
-    for f in sorted(all_files()):
-        print(f, sum(d.amount for d in parse_donations(f.open())))
+    per_file = load_data()
+    for name in sorted(per_file):
+        print(name, sum(d.amount for d in per_file[name]))
 
 def paypigs(args):
     '''print total per paypig'''
     sums = defaultdict(Amount)
-    for f in all_files():
-        for d in parse_donations(f.open()):
+    per_file = load_data()
+    for donations in per_file.values():
+        for d in donations:
             sums[d.paypig] += d.amount
     for paypig, amount in sorted(sums.items(), key=lambda i: i[1]):
         print(paypig, amount)