139 lines
3.5 KiB
Python
139 lines
3.5 KiB
Python
import json
|
|
import os
|
|
import re
|
|
|
|
# URLs on any of the listed archive.* domains (is, li, md, ph, today, vn).
# Named groups:
#   domain - the host that matched
#   wip    - the optional "wip/" path segment (None when absent)
#   key    - the first path segment after the host (and after "wip/", if any)
REGEX_URL_ARCHIVE_LI = r'https{0,1}://(?P<domain>archive\.(is|li|md|ph|today|vn))/(?P<wip>wip/){0,1}(?P<key>[^/]+)/{0,1}'

# web.archive.org URLs of the form /web/<digits>/<rest>.
# Named groups:
#   domain    - always "web.archive.org"
#   datestamp - the run of digits that follows "/web/"
#   original  - everything after the datestamp segment
REGEX_URL_ARCHIVE_ORG = r'https{0,1}://(?P<domain>web\.archive\.org)/web/(?P<datestamp>[0-9]+)/(?P<original>.+)'

# Any http:// or https:// URL with at least one character after the scheme.
REGEX_URL_GENERIC = r'https{0,1}://.+'

# Directory containing this module. Made absolute so the default data file
# location does not depend on the current working directory when __file__
# happens to be a relative path.
REPO_PATH = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
class Clownlib(object):
    '''
    Library of original URL / archive URL records backed by a JSON file.

    Attributes:
        repo_path (str): directory that holds the data file
        data_path (str): path of the "data.json" file inside repo_path
        data (list): records read by load() and extended by add()
        groups (set): group names present in the records
    '''

    def __init__(self, repo_path=REPO_PATH):
        '''
        Set up paths, compile the URL regexes and load the data file.

        Args:
            repo_path (str, [optional]): directory containing data.json;
                a leading "~" is expanded
        '''
        #set up repo and data file paths
        self.repo_path = os.path.expanduser(repo_path)
        self.data_path = os.path.join(self.repo_path, 'data.json')

        #compile archive URL regexes
        self._regex_archive_li = re.compile(REGEX_URL_ARCHIVE_LI)
        self._regex_archive_org = re.compile(REGEX_URL_ARCHIVE_ORG)
        self._regex_url_generic = re.compile(REGEX_URL_GENERIC)

        #load data from file
        self.load()

    def __getitem__(self, key):
        '''
        Allows indexing the self.data attribute as a normal list,
        or looking records up by archive URL, original URL or group name.

        String keys are tried in this order: web.archive.org URL,
        archive.is-style URL, any other http(s) URL (matched against the
        "original" field), then group name.

        Args:
            key (int, slice or str): list index / slice, or one of the
                string forms described above

        Returns:
            The record at that position for an int key; otherwise a
            non-empty list of matching records.

        Raises:
            KeyError: if the lookup produces an empty result
        '''
        if isinstance(key, (int, slice)):
            result = self.data[key]
        elif self._regex_archive_org.match(key):
            #the whole URL is the suffix to look for in "archived"
            result = self._getitem_archive(key)
        else:
            archive_li = self._regex_archive_li.match(key)
            if archive_li:
                #only the key segment is compared, so the record matches
                #whichever of the archive.* domains it was stored under
                result = self._getitem_archive(archive_li.group('key'))
            elif self._regex_url_generic.match(key):
                result = self._getitem_original(key)
            elif key in self.groups:
                result = self._getitem_group(key)
            else:
                result = []

        #explicit check instead of assert, which is stripped under "python -O"
        if not result:
            raise KeyError(key)
        return result

    def add(self, original, archived, group='Miscellaneous', note=None):
        '''
        Add a new URL + archive URL pairing to the clown library

        Args:
            original (str): original URL
            archived (str): archive URL
            group (str, [optional]): URL grouping to use
            note (str, [optional]): Any notes about the URL
        '''
        blob = {'original': original, 'archived': archived, 'group': group}
        #the "note" key is only stored when a note was actually given
        if note is not None:
            blob['note'] = note
        self.data.append(blob)

        #self.groups is a set (built in load), so add() is the right call;
        #it is a no-op when the group is already known
        if group:
            self.groups.add(group)

    def _getitem_archive(self, key):
        '''
        Gets records whose "archived" value ends with the specified value

        Args:
            key (str): key to look up (archive.li / archive.org)

        Returns:
            list: all matching records
        '''
        return [
            record for record in self.data
            if record.get('archived') and record['archived'].endswith(key)
        ]

    def _getitem_original(self, key):
        '''
        Gets records whose "original" value starts with the specified value

        Args:
            key (str): key to look up

        Returns:
            list: all matching records
        '''
        #startswith() also covers exact equality
        return [
            record for record in self.data
            if record.get('original') and record['original'].startswith(key)
        ]

    def _getitem_group(self, key):
        '''
        Gets records whose "group" value matches the specified value,
        ignoring case

        Args:
            key (str): key to look up

        Returns:
            list: all matching records
        '''
        wanted = key.lower()
        #records without a group (missing or None) are compared as ''
        return [
            record for record in self.data
            if (record.get('group') or '').lower() == wanted
        ]

    def load(self):
        '''
        Load data from saved JSON.
        '''
        with open(self.data_path) as handle:
            self.data = json.load(handle)

        #configure group data on load; empty/missing groups are left out,
        #matching what add() does
        self.groups = {
            record['group'] for record in self.data if record.get('group')
        }

    def save(self):
        '''
        Save data to JSON file.
        '''
        #serialise before opening so that a serialisation error cannot
        #leave the existing data file truncated
        payload = json.dumps(self.data, indent=4, sort_keys=True)
        with open(self.data_path, 'w') as handle:
            handle.write(payload)