import json
import os
import time
import urllib

try:  # Python 3: quote lives in urllib.parse
    from urllib.parse import quote as _quote
except ImportError:  # Python 2: quote lives directly in urllib
    from urllib import quote as _quote

from selenium.common.exceptions import NoSuchElementException

from timer import ArcTimer
from driver import get_driver, DEAD_DRIVER_TUPLE


class ArchiveBase(object):
    '''
    Archiver base class.

    Drives a webdriver to an archive service URL built from ``base_url`` and
    polls ``is_done`` until the archive has completed. Subclasses must
    implement ``is_done`` and may override ``neterror_handler``.
    '''

    def __init__(self, base_url=None, new_driver=False, print_sleep=3,
                 request_sleep=0, pageLoadStrategy=None, timeout=None,
                 **kwargs):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format
                string wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or use the
                existing global one
            print_sleep (int): seconds to wait before updating the console
                readout
            request_sleep (int): seconds to wait between ALL requests
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to
                complete before starting the next one
            **kwargs: arbitrary keyword arguments to set as object attributes
        '''
        self.base_url = base_url
        self.print_sleep = print_sleep
        self.request_sleep = request_sleep
        # NOTE(review): the initial driver is created without
        # pageLoadStrategy; it is only passed to get_driver on restart.
        # Confirm against get_driver whether that is intended.
        self.driver = get_driver(new_driver)
        self.timeout = timeout
        self.pageLoadStrategy = pageLoadStrategy
        # List of {'original': <url>, 'archived': <archive url or None>}.
        self.out = []
        for k, v in kwargs.items():
            setattr(self, k, v)

    def archive_all(self, urls=None, file=None, unique=True):
        '''
        Archive multiple URLs. If no URLs are specified, will attempt to load
        a list of URLs to archive from the specified file.

        Args:
            urls (list, [optional]): list of URLs to archive
            file (str, [optional]): path to a JSON to save/load from
            unique (bool, [optional]): if true then ignore URLs that are
                already queued or archived (i.e. in self.out attribute) and
                duplicates within ``urls``; if false, queue every URL given

        Returns:
            list: self.out, a list of dicts with 'original' and 'archived'
                keys

        Raises:
            ValueError: if ``urls`` is None and ``file`` could not be loaded
        '''
        if urls is not None:
            if unique:
                # Preserve the caller's ordering while skipping anything
                # already recorded (or repeated within ``urls``).
                seen = set(entry['original'] for entry in self.out)
                for url in urls:
                    if url not in seen:
                        seen.add(url)
                        self.out.append({'original': url, 'archived': None})
            else:
                self.out += [{'original': url, 'archived': None}
                             for url in urls]
        elif file is not None:
            try:
                self.out = self.load(file)
            except Exception as err:
                raise ValueError(
                    'Could not load archive list from {!r}: {}'.format(
                        file, err))

        # Only entries without an archive URL still need processing.
        todo = [entry for entry in self.out if entry['archived'] is None]
        total = len(todo)
        for index, entry in enumerate(todo):
            result = self.archive_one(entry['original'], index + 1, total)
            entry.update(result)
            if file:
                # Save after every URL so progress survives a crash.
                self.save(self.out, file)
            time.sleep(self.request_sleep)
        return self.out

    def archive_one(self, url, cur=1, max=1, **kwargs):
        '''
        Archive a single URL and print the progress/result to the console.

        Args:
            url (str): URL to archive
            cur (int, [optional]): current index for the console printout
            max (int, [optional]): maximum index for the console printout
            **kwargs: keyword args to use for URL formatting

        Returns:
            dict: dictionary with original URL + archive URL
        '''
        print("Archiving ({cur}/{max}): {url}".format(
            cur=cur, max=max, url=url))
        archive_end_url = self.get_looper(url, **kwargs)
        print("\nORIGINAL: {}\nARCHIVED: {}".format(url, archive_end_url))
        return {'original': url, 'archived': archive_end_url}

    def get(self, url, **kwargs):
        '''
        Get an archive of the specified URL

        Navigates the driver to ``base_url`` formatted with the quoted URL.
        If the loaded page shows a neterror, ``neterror_handler`` is called.
        If the driver has died, it is restarted and the request is retried.

        Args:
            url (str): URL to archive
            **kwargs: keyword args to use for URL formatting
        '''
        target_url = self.base_url.format(url=_quote(url), **kwargs)
        try:
            self.driver.get(target_url)
            page_ok = self.neterror_check()
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            self.get(url, **kwargs)
            return
        # Explicit check instead of ``assert`` so it still runs under -O.
        if not page_ok:
            self.neterror_handler(url=url, **kwargs)

    def get_looper(self, url, timer=None, **kwargs):
        '''
        Makes sure that the driver gets restarted if it dies partway through

        Args:
            url (str): URL to initiate the archive process
            timer (timer.ArcTimer, [optional]): timer object; monitors
                timeouts
            **kwargs: keyword args to use for URL formatting

        Returns:
            str: current page URL
        '''
        if timer is None:
            timer = ArcTimer(sleep=self.print_sleep, timeout=self.timeout)
        try:
            self.get(url, **kwargs)
            while not self.is_done(timer=timer):
                timer.report()
            return self.driver.current_url
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            # Reuse the same timer for the retry and propagate its result;
            # without the return the caller would receive None.
            return self.get_looper(url, timer, **kwargs)

    def is_done(self, **kwargs):
        '''
        Used by get_looper to determine when the archive is done
        '''
        raise NotImplementedError('ERROR: Must implement "is_done" method!')

    def load(self, path):
        '''
        Loads data from a JSON file

        Args:
            path (str): path to the JSON file; "~" is expanded

        Returns:
            The decoded JSON content.
        '''
        path = os.path.expanduser(path)
        with open(path) as handle:
            return json.load(handle)

    def neterror_check(self):
        '''
        Checks that the current page does not contain a "neterror" element.

        Returns:
            bool: True if no element with class "neterror" is present,
                False if one was found.
        '''
        try:
            self.driver.find_element_by_class_name('neterror')
        except NoSuchElementException:
            return True
        return False

    def neterror_handler(self, url, **kwargs):
        '''
        placeholder for class-specific neterror handler
        '''
        pass

    def restart_driver(self):
        '''
        Restarts the webdriver
        '''
        try:
            self.driver.quit()
        except Exception:
            # Best effort: the old driver may already be dead.
            pass
        self.driver = get_driver(new=True,
                                 pageLoadStrategy=self.pageLoadStrategy)

    def save(self, data, path, indent=4):
        '''
        Saves the archived URLs

        Args:
            data (dict/list): JSON-compatible archive data object (e.g.
                dict, list)
            path (str): desired save path for the archive JSON
            indent (int, [optional]): JSON indent to pass into json.dump
        '''
        path = os.path.expanduser(path)
        with open(path, 'w') as handle:
            json.dump(data, handle, indent=indent)
        print('Saved data to file: {}'.format(path))