# arcli/archivers/base.py

import json
import os
import time
import urllib
from selenium.common.exceptions import NoSuchElementException

from timer import ArcTimer
from driver import get_driver, DEAD_DRIVER_TUPLE

class ArchiveBase(object):
    '''
    Archiver base class.

    Drives a webdriver through an archive service: a target URL is substituted
    into ``base_url``, the resulting page is loaded, and ``is_done`` is polled
    until the archive has completed. Subclasses must implement ``is_done`` and
    may override ``neterror_handler``.

    Results are accumulated in ``self.out`` as a list of dicts of the form
    ``{'original': <url>, 'archived': <archive url or None>}``.
    '''

    def __init__(self, base_url=None, new_driver=False, print_sleep=3, request_sleep=0, pageLoadStrategy=None, timeout=None, **kwargs):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format string
                wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console readout
            request_sleep (int): seconds to wait between ALL requests
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to complete
                before starting the next one
            **kwargs: arbitrary keyword arguments to set as object attributes
        '''
        self.base_url = base_url
        self.print_sleep = print_sleep
        self.request_sleep = request_sleep
        # NOTE(review): pageLoadStrategy is only handed to get_driver() by
        # restart_driver(), not here -- confirm whether that is intentional.
        self.driver = get_driver(new_driver)
        self.timeout = timeout
        self.pageLoadStrategy = pageLoadStrategy
        self.out = []
        # Applied last, so extra keyword arguments can override the attributes above.
        for k, v in kwargs.items():
            setattr(self, k, v)

    def archive_all(self, urls=None, file=None, unique=True):
        '''
        Archive multiple URLs. If no URLs are specified, will attempt to load a
        list of URLs to archive from the specified file. If neither is given,
        whatever is still pending in ``self.out`` is processed.

        Args:
            urls (list, [optional]): list of URLs to archive
            file (str, [optional]): path to a JSON to save/load from. When
                ``urls`` is given the file is only written to (after every
                archived URL); otherwise it is loaded first.
            unique (bool, [optional]): if true then ignore URLs that are
                already archived (i.e. in self.out attribute) as well as
                duplicates within ``urls``; if false every URL is queued as-is

        Returns:
            list: ``self.out``, a list of dicts with keys ``'original'`` and
                ``'archived'``

        Raises:
            ValueError: if ``urls`` is None and ``file`` could not be loaded
        '''
        if urls is not None:
            if unique:
                # Filter in input order so URLs are archived in the order given.
                seen = set(entry['original'] for entry in self.out)
                urls_todo = []
                for candidate in urls:
                    if candidate not in seen:
                        seen.add(candidate)
                        urls_todo.append(candidate)
            else:
                urls_todo = list(urls)
            self.out += [{'original': i, 'archived': None} for i in urls_todo]
        elif file is not None:
            try:
                self.out = self.load(file)
            except Exception as err:
                raise ValueError(
                    'Could not load URL list from {!r}: {}'.format(file, err))

        todo = [i for i in self.out if i['archived'] is None]
        total = len(todo)
        for index, dicto in enumerate(todo):
            url = dicto['original']
            this = self.archive_one(url, index + 1, total)
            # dicto is the same object that lives in self.out, so this updates
            # the overall results in place.
            dicto.update(this)
            if file:
                # Save after every URL so progress survives an interruption.
                self.save(self.out, file)
            time.sleep(self.request_sleep)
        return self.out

    def archive_one(self, url, cur=1, max=1, **kwargs):
        '''
        Archive a single URL, printing progress to the console.

        Args:
            url (str): URL to archive
            cur (int, [optional]): current index for the console printout
            max (int, [optional]): maximum index for the console printout
            **kwargs: keyword args to use for URL formatting

        Returns:
            dict: dictionary with original URL + archive URL
        '''
        print("Archiving ({cur}/{max}): {url}".format(cur=cur, max=max, url=url))
        archive_end_url = self.get_looper(url, **kwargs)
        print("\nORIGINAL: {}\nARCHIVED: {}".format(url, archive_end_url))
        return {'original': url, 'archived': archive_end_url}

    def get(self, url, **kwargs):
        '''
        Get an archive of the specified URL

        The URL is percent-quoted and substituted into ``self.base_url``. If
        the loaded page is a neterror page, ``neterror_handler`` is called. If
        the driver has died, it is restarted and the request is retried.

        Args:
            url (str): URL to archive
            **kwargs: keyword args to use for URL formatting
        '''
        # urllib.quote only exists on Python 2; Python 3 moved it to urllib.parse.
        try:
            from urllib import quote
        except ImportError:
            from urllib.parse import quote

        target_url = self.base_url.format(url=quote(url), **kwargs)
        try:
            self.driver.get(target_url)
            # Explicit check instead of `assert`, which is stripped under -O.
            page_ok = self.neterror_check()
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            return self.get(url, **kwargs)
        if not page_ok:
            self.neterror_handler(url=url, **kwargs)

    def get_looper(self, url, timer=None, **kwargs):
        '''
        Makes sure that the driver gets restarted if it dies partway through

        Args:
            url (str): URL to initiate the archive process
            timer (timer.ArcTimer, [optional]): timer object; monitors timeouts
            **kwargs: keyword args to use for URL formatting

        Returns:
            str: current page URL
        '''
        if timer is None:
            timer = ArcTimer(sleep=self.print_sleep, timeout=self.timeout)
        try:
            self.get(url, **kwargs)
            while not self.is_done(timer=timer):
                timer.report()
            return self.driver.current_url
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            # Reuse the same timer for the retry and hand its result back to
            # the caller (otherwise the archived URL is lost after a restart).
            return self.get_looper(url, timer, **kwargs)

    def is_done(self, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Must be implemented by subclasses. get_looper calls it with a ``timer``
        keyword argument and keeps polling until it returns a truthy value.

        Raises:
            NotImplementedError: always, in this base class
        '''
        raise NotImplementedError('ERROR: Must implement "is_done" method!')

    def load(self, path):
        '''
        Loads data from a JSON file

        Args:
            path (str): path to the JSON file; "~" is expanded

        Returns:
            the decoded JSON content
        '''
        path = os.path.expanduser(path)
        with open(path) as handle:
            return json.load(handle)

    def neterror_check(self):
        '''
        Checks that the current page is not a neterror page

        Returns:
            bool: True if the page has no element with class "neterror",
                False if such an element is present
        '''
        try:
            self.driver.find_element_by_class_name('neterror')
        except NoSuchElementException:
            return True
        # The element exists, so the page is a neterror page. Returning False
        # here is what lets get() reach neterror_handler().
        return False

    def neterror_handler(self, url, **kwargs):
        '''
        placeholder for class-specific neterror handler

        Called by get() when neterror_check() reports a neterror page.

        Args:
            url (str): the original (unquoted) URL that was being archived
            **kwargs: keyword args that were used for URL formatting
        '''
        pass

    def restart_driver(self):
        '''
        Restarts the webdriver
        '''
        try:
            self.driver.quit()
        except Exception:
            # Best effort: the old driver may already be dead, which is
            # usually why a restart was requested in the first place.
            pass
        self.driver = get_driver(new=True, pageLoadStrategy=self.pageLoadStrategy)

    def save(self, data, path, indent=4):
        '''
        Saves the archived URLs

        Args:
            data (dict/list): JSON-compatible archive data object (e.g. dict, list)
            path (str): desired save path for the archive JSON
            indent (int, [optional]): JSON indent to pass into json.dumps
        '''
        path = os.path.expanduser(path)
        # Serialise before opening, so a serialisation error cannot truncate
        # an existing save file.
        payload = json.dumps(data, indent=indent)
        with open(path, 'w') as handle:
            handle.write(payload)
        print('Saved data to file: {}'.format(path))