Files

139 lines
3.5 KiB
Python

import json
import os
import re
# Matches archive.is-family snapshot URLs (archive.is/li/md/ph/today/vn).
# Named groups: "domain", optional "wip" marker, and the snapshot "key".
REGEX_URL_ARCHIVE_LI = r'https{0,1}://(?P<domain>archive\.(is|li|md|ph|today|vn))/(?P<wip>wip/){0,1}(?P<key>[^/]+)/{0,1}'
# Matches web.archive.org snapshot URLs.
# Named groups: "domain", numeric "datestamp", and the "original" URL.
REGEX_URL_ARCHIVE_ORG = r'https{0,1}://(?P<domain>web\.archive\.org)/web/(?P<datestamp>[0-9]+)/(?P<original>.+)'
# Matches any other http(s) URL.
REGEX_URL_GENERIC = r'https{0,1}://.+'
# Directory containing this module; made absolute so that a later change of
# working directory cannot redirect where data.json is looked up.
REPO_PATH = os.path.dirname(os.path.abspath(__file__))


class Clownlib(object):
    '''
    In-memory view of the "data.json" file stored in a repository directory.

    Attributes:
        repo_path (str): directory that holds the data file
        data_path (str): full path of "data.json" inside repo_path
        data (list): records loaded from the data file; each record is a
            dict with "original", "archived", "group" and optionally "note"
        groups (set): group names seen in the loaded/added records
    '''

    def __init__(self, repo_path=REPO_PATH):
        '''
        Set up paths, compile the URL regexes and load the data file.

        Args:
            repo_path (str, [optional]): directory containing "data.json";
                a leading "~" is expanded to the user's home directory

        Raises:
            IOError / OSError: if the data file cannot be opened
            ValueError: if the data file is not valid JSON
        '''
        # set up repo and data file paths
        self.repo_path = os.path.expanduser(repo_path)
        self.data_path = os.path.join(self.repo_path, 'data.json')
        # compile archive URL regexes
        self._regex_archive_li = re.compile(REGEX_URL_ARCHIVE_LI)
        self._regex_archive_org = re.compile(REGEX_URL_ARCHIVE_ORG)
        self._regex_url_generic = re.compile(REGEX_URL_GENERIC)
        # load data from file
        self.load()

    def __getitem__(self, key):
        '''
        Allows indexing the self.data attribute as a normal list,
        or looking records up by URL or group name.

        The lookup performed depends on the key, checked in this order:
          1. int / slice      -> plain list indexing of self.data
          2. web.archive.org URL -> records whose "archived" value ends
             with the full URL
          3. archive.is-family URL -> records whose "archived" value ends
             with the snapshot key extracted from the URL
          4. any other http(s) URL -> records whose "original" value
             starts with the URL
          5. anything else    -> records whose "group" matches the key,
             ignoring case

        Args:
            key (int, slice or str): index, URL or group name

        Returns:
            dict or list: a single record for an int key, otherwise a list
            of matching records

        Raises:
            KeyError: if the lookup produced nothing (empty result)
            IndexError: if an int key is out of range
        '''
        if isinstance(key, (int, slice)):
            result = self.data[key]
        elif self._regex_archive_org.match(key):
            # the whole archive.org URL is the lookup key
            result = self._getitem_archive(key)
        else:
            li_match = self._regex_archive_li.match(key)
            if li_match:
                # only the snapshot key is compared, so every mirror
                # domain resolves to the same records
                result = self._getitem_archive(li_match.group('key'))
            elif self._regex_url_generic.match(key):
                result = self._getitem_original(key)
            else:
                # group matching is case-insensitive; unknown groups
                # simply yield no records and fall through to KeyError
                result = self._getitem_group(key)
        # explicit check rather than assert: asserts vanish under "python -O"
        if not result:
            raise KeyError(key)
        return result

    def add(self, original, archived, group='Miscellaneous', note=None):
        '''
        Add a new URL + archive URL pairing to the clown library.

        The record is only added in memory; call save() to persist it.

        Args:
            original (str): original URL
            archived (str): archive URL
            group (str, [optional]): URL grouping to use
            note (str, [optional]): Any notes about the URL; the "note"
                field is omitted from the record when this is None
        '''
        blob = {'original': original, 'archived': archived, 'group': group}
        if note is not None:
            blob['note'] = note
        self.data.append(blob)
        # self.groups is a set (see load), so membership is handled by add()
        if group:
            self.groups.add(group)

    def _getitem_archive(self, key):
        '''
        Gets records whose "archived" value ends with the specified value.

        Args:
            key (str): key to look up (archive.li / archive.org)

        Returns:
            list: all matching records
        '''
        return [
            record for record in self.data
            if record.get('archived') and record['archived'].endswith(key)
        ]

    def _getitem_original(self, key):
        '''
        Gets records whose "original" value equals or starts with the
        specified value.

        Args:
            key (str): key to look up

        Returns:
            list: all matching records
        '''
        # startswith() also covers the exact-match case
        return [
            record for record in self.data
            if record.get('original') and record['original'].startswith(key)
        ]

    def _getitem_group(self, key):
        '''
        Gets records whose "group" value matches the specified value,
        ignoring case. Records without a group are skipped.

        Args:
            key (str): key to look up

        Returns:
            list: all matching records
        '''
        wanted = key.lower()
        return [
            record for record in self.data
            if record.get('group') and record['group'].lower() == wanted
        ]

    def load(self):
        '''
        Load data from saved JSON and rebuild the set of known groups.
        '''
        with open(self.data_path, encoding='utf-8') as file:
            self.data = json.load(file)
        # configure group data on load; empty/missing groups are ignored,
        # mirroring what add() does
        self.groups = {
            record['group'] for record in self.data if record.get('group')
        }

    def save(self):
        '''
        Save data to JSON file.
        '''
        # serialize before opening the file so that a serialization error
        # cannot leave a truncated data file behind
        payload = json.dumps(self.data, indent=4, sort_keys=True)
        with open(self.data_path, 'w', encoding='utf-8') as file:
            file.write(payload)