# arcli/archivers/org.py

import re
from base import ArchiveBase

# Prefix of a finished snapshot URL: the web.archive.org "/web/" path followed
# by a 14-digit segment (presumably the capture timestamp). The dots in the
# host name are escaped so they match a literal "." rather than any character.
REGEX_DONE = r'https://web\.archive\.org/web/[0-9]{14}/'
# Save endpoint template; "{url}" is replaced with the target URL to archive.
URL_BASE = 'https://web.archive.org/save/{url}'

class ArchiveOrg(ArchiveBase):
    '''
    Archiver class for Internet Archive (archive.org)

    Compiles REGEX_DONE once per instance and uses it in is_done() to
    recognise when the browser has landed on a finished snapshot URL.
    '''
    def __init__(self, base_url=URL_BASE, new_driver=False, print_sleep=3, request_sleep=10, pageLoadStrategy=None, timeout=None):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format string
                wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console readout
            request_sleep (int): seconds to wait between ALL requests
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to complete
                before starting the next one
        '''
        # Forward every argument to the base class explicitly by keyword.
        # Filtering locals() with an identity test against the literal 'self'
        # only works by accident of string interning, so spell the names out.
        super(ArchiveOrg, self).__init__(
            base_url=base_url,
            new_driver=new_driver,
            print_sleep=print_sleep,
            request_sleep=request_sleep,
            pageLoadStrategy=pageLoadStrategy,
            timeout=timeout,
        )
        self.regex = re.compile(REGEX_DONE)

    def is_done(self, url, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Args:
            url (str): the target URL that was submitted for archiving
            **kwargs: accepted and ignored

        Returns:
            bool: True when the driver's current URL starts with the
                snapshot prefix (REGEX_DONE) and ends with ``url``.
        '''
        # Read the URL once so both checks see the same value.
        # self.driver is not set in this class (presumably provided by
        # ArchiveBase -- confirm).
        current = self.driver.current_url
        return bool(self.regex.match(current)) and current.endswith(url)