import json
import os
import re

# URL patterns used to decide how a lookup key is interpreted.
# NOTE(review): the named-group identifiers were missing from these patterns
# (they read "(?Parchive..." which is not valid regex syntax). Only the name
# "key" is confirmed by its use in Clownlib.__getitem__; "domain", "wip",
# "timestamp" and "url" are reconstructed -- confirm against the upstream file.
REGEX_URL_ARCHIVE_LI = r'https{0,1}://(?P<domain>archive\.(is|li|md|ph|today|vn))/(?P<wip>wip/){0,1}(?P<key>[^/]+)/{0,1}'
REGEX_URL_ARCHIVE_ORG = r'https{0,1}://(?P<domain>web\.archive\.org)/web/(?P<timestamp>[0-9]+)/(?P<url>.+)'
REGEX_URL_GENERIC = r'https{0,1}://.+'
REPO_PATH = os.path.dirname(__file__)


class Clownlib(object):
    '''
    In-memory view of the URL / archive-URL records stored in ``data.json``.

    Records are dicts with the keys "original", "archived", "group" and,
    optionally, "note". ``self.data`` holds the list of records and
    ``self.groups`` the set of group names seen so far.
    '''

    def __init__(self, repo_path=REPO_PATH):
        '''
        Set up paths, compile the URL regexes and load the data file.

        Args:
            repo_path (str, [optional]): directory containing data.json;
                a leading "~" is expanded
        '''
        #set up repo and data file paths
        self.repo_path = os.path.expanduser(repo_path)
        self.data_path = os.path.join(self.repo_path, 'data.json')

        #compile archive URL regexes
        self._regex_archive_li = re.compile(REGEX_URL_ARCHIVE_LI)
        self._regex_archive_org = re.compile(REGEX_URL_ARCHIVE_ORG)
        self._regex_url_generic = re.compile(REGEX_URL_GENERIC)

        #load data from file
        self.load()

    def __getitem__(self, key):
        '''
        Allows indexing the self.data attribute as a normal list, or
        key:value lookups based on the kind of key supplied.

        The key is interpreted by the first rule that applies:
            1. int / slice      -> plain indexing into self.data
            2. web.archive.org  -> records whose "archived" ends with the URL
            3. archive.li et al -> records whose "archived" ends with the
                                   short key extracted from the URL
            4. any other URL    -> records whose "original" starts with it
            5. known group name -> records in that group

        Args:
            key (int, slice or str): index, URL or group name to look up

        Returns:
            dict or list: a single record for an int key, otherwise a list
                of matching records

        Raises:
            KeyError: if the lookup yields nothing (or an empty/falsy result)
        '''
        if isinstance(key, (int, slice)):
            result = self.data[key]
        elif self._regex_archive_org.match(key):
            result = self._getitem_archive(key)
        elif self._regex_archive_li.match(key):
            #only the short key is compared, so any archive.li mirror
            #domain (or a "wip/" URL) finds the same record
            subkey = self._regex_archive_li.match(key).groupdict()['key']
            result = self._getitem_archive(subkey)
        elif self._regex_url_generic.match(key):
            result = self._getitem_original(key)
        elif key in self.groups:
            #NOTE(review): this membership test is case-sensitive although
            #_getitem_group also compares case-insensitively
            result = self._getitem_group(key)
        else:
            result = []

        #explicit check rather than assert, which is stripped under -O
        if not result:
            raise KeyError(key)
        return result

    def add(self, original, archived, group='Miscellaneous', note=None):
        '''
        Add a new URL + archive URL pairing to the clown library

        Args:
            original (str): original URL
            archived (str): archive URL
            group (str, [optional]): URL grouping to use
            note (str, [optional]): Any notes about the URL
        '''
        blob = {'original': original,
                'archived': archived,
                'group': group}
        #the "note" key is only stored when a note was supplied
        if note is not None:
            blob['note'] = note
        self.data.append(blob)

        #self.groups is a set (see load), so use add rather than append
        if group:
            self.groups.add(group)

    def _getitem_archive(self, key):
        '''
        Gets records whose "archived" value matches the specified value

        Args:
            key (str): key to look up (archive.li / archive.org)

        Returns:
            list: all matching records
        '''
        return [record for record in self.data
                if record['archived'] and record['archived'].endswith(key)]

    def _getitem_original(self, key):
        '''
        Gets records whose "original" value matches the specified value

        Args:
            key (str): key to look up

        Returns:
            list: all records whose "original" URL starts with the key
        '''
        #startswith also covers exact equality
        return [record for record in self.data
                if record['original'] and record['original'].startswith(key)]

    def _getitem_group(self, key):
        '''
        Gets records whose "group" value matches the specified value,
        ignoring case

        Args:
            key (str): key to look up

        Returns:
            list: all matching records
        '''
        out = []
        for record in self.data:
            group = record['group']
            if group == key:
                out.append(record)
            #records may carry a non-string group (e.g. None); skip the
            #case-insensitive comparison for those instead of crashing
            elif isinstance(group, str) and group.lower() == key.lower():
                out.append(record)
        return out

    def load(self):
        '''
        Load data from saved JSON.
        '''
        with open(self.data_path, encoding='utf-8') as file:
            self.data = json.load(file)

        #configure group data on load
        self.groups = {record['group'] for record in self.data}

    def save(self):
        '''
        Save data to JSON file.
        '''
        with open(self.data_path, 'w', encoding='utf-8') as file:
            json.dump(self.data, file, indent=4, sort_keys=True)