# clownlib/data.py

import json
import os
import re

# URL on one of the archive.is / .li / .md / .ph / .today / .vn domains.
# Named groups: "domain", optional "wip" path marker, and the snapshot "key".
REGEX_URL_ARCHIVE_LI = (
	r'https{0,1}://'
	r'(?P<domain>archive\.(is|li|md|ph|today|vn))/'
	r'(?P<wip>wip/){0,1}'
	r'(?P<key>[^/]+)/{0,1}'
)

# web.archive.org snapshot URL.
# Named groups: "domain", numeric "datestamp", and the "original" URL.
REGEX_URL_ARCHIVE_ORG = (
	r'https{0,1}://'
	r'(?P<domain>web\.archive\.org)/web/'
	r'(?P<datestamp>[0-9]+)/'
	r'(?P<original>.+)'
)

# Any other http:// or https:// URL.
REGEX_URL_GENERIC = r'https{0,1}://' r'.+'

# Default repository location: the directory containing this module.
REPO_PATH = os.path.dirname(__file__)

class Clownlib(object):
	'''
	Collection of original URL / archive URL records backed by a JSON file.

	Records live in self.data (whatever data.json contains; records created
	by add() are dicts with "original", "archived" and "group" keys plus an
	optional "note" key). self.groups is the set of group names in use.
	'''

	def __init__(self, repo_path=REPO_PATH):
		'''
		Set up the file paths, compile the URL regexes and load the data file.

		Args:
		    repo_path (str, [optional]): directory containing data.json
		        ("~" is expanded); defaults to the directory of this module
		'''
		#set up repo and data file paths
		self.repo_path = os.path.expanduser(repo_path)
		self.data_path = os.path.join(self.repo_path, 'data.json')

		#compile archive URL regexes
		self._regex_archive_li = re.compile(REGEX_URL_ARCHIVE_LI)
		self._regex_archive_org = re.compile(REGEX_URL_ARCHIVE_ORG)
		self._regex_url_generic = re.compile(REGEX_URL_GENERIC)

		#load data from file
		self.load()

	def __getitem__(self, key):
		'''
		Allows indexing the self.data attribute as a normal list,
		or looking records up by archive URL, original URL or group name.

		Args:
		    key (int, slice or str): list index / slice, a web.archive.org
		        URL, an archive.is-family URL, any other http(s) URL, or a
		        group name (matched case-insensitively)

		Returns:
		    dict or list: the record at an int index, otherwise a list of
		        all matching records

		Raises:
		    KeyError: if the lookup produced nothing (or an empty result)
		    IndexError: if an int key is out of range
		    TypeError: if key is not an int, slice or string (raised by the
		        regex match)
		'''
		if isinstance(key, (int, slice)):
			result = self.data[key]
		elif self._regex_archive_org.match(key):
			#archive.org records are matched on the full archive URL
			result = self._getitem_archive(key)
		else:
			archive_li = self._regex_archive_li.match(key)
			if archive_li:
				#match on the snapshot key only, so any mirror domain works
				result = self._getitem_archive(archive_li.group('key'))
			elif self._regex_url_generic.match(key):
				result = self._getitem_original(key)
			else:
				#anything else is treated as a group name; an unknown
				#group yields an empty list and therefore a KeyError below
				result = self._getitem_group(key)

		#explicit check rather than assert, which is stripped under "python -O"
		if not result:
			raise KeyError(key)
		return result

	def add(self, original, archived, group='Miscellaneous', note=None):
		'''
		Add a new URL + archive URL pairing to the clown library
		(in memory only; call save() to persist it).

		Args:
		    original (str): original URL
		    archived (str): archive URL
		    group (str, [optional]): URL grouping to use
		    note (str, [optional]): Any notes about the URL; the "note" key
		        is omitted from the record when this is None
		'''
		blob = {'original': original, 'archived': archived}
		if note is not None:
			blob['note'] = note
		blob['group'] = group
		self.data.append(blob)

		#self.groups is a set (see load), so use add(); re-adding is a no-op
		if group:
			self.groups.add(group)

	def _getitem_archive(self, key):
		'''
		Gets records whose "archived" value ends with the specified value

		Args:
		    key (str): key to look up (archive.li / archive.org)

		Returns:
		    list: all matching records (empty if none)
		'''
		#records with a missing or empty "archived" value are skipped
		return [
			record for record in self.data
			if record.get('archived') and record['archived'].endswith(key)
		]

	def _getitem_original(self, key):
		'''
		Gets records whose "original" value equals or starts with the
		specified value

		Args:
		    key (str): key to look up

		Returns:
		    list: all matching records (empty if none)
		'''
		#startswith() also covers the exact-equality case;
		#records with a missing or empty "original" value are skipped
		return [
			record for record in self.data
			if record.get('original') and record['original'].startswith(key)
		]

	def _getitem_group(self, key):
		'''
		Gets records whose "group" value matches the specified value,
		ignoring case

		Args:
		    key (str): key to look up

		Returns:
		    list: all matching records (empty if none)
		'''
		wanted = key.lower()
		#records with a missing or empty "group" value never match
		return [
			record for record in self.data
			if record.get('group') and record['group'].lower() == wanted
		]

	def load(self):
		'''
		Load data from saved JSON, replacing self.data and self.groups.

		Raises:
		    IOError / OSError: if the data file cannot be opened
		    ValueError: if the file is not valid JSON
		'''
		#JSON text is UTF-8; do not depend on the platform default encoding
		with open(self.data_path, encoding='utf-8') as file:
			self.data = json.load(file)

		#configure group data on load; empty / missing groups are left out,
		#mirroring add()
		self.groups = {
			record['group'] for record in self.data if record.get('group')
		}

	def save(self):
		'''
		Save data to JSON file (4-space indent, sorted keys).
		'''
		#serialise before opening: opening for writing truncates the file,
		#so a serialisation error must not happen after that point
		payload = json.dumps(self.data, indent=4, sort_keys=True)
		with open(self.data_path, 'w', encoding='utf-8') as file:
			file.write(payload)