35 lines
1.4 KiB
Python
35 lines
1.4 KiB
Python
import re
|
|
from base import ArchiveBase
|
|
|
|
# Prefix that a finished snapshot URL must start with: the /web/ path followed
# by a 14-digit segment (presumably the Wayback Machine snapshot timestamp).
# Dots are escaped so that only the literal host "web.archive.org" matches.
REGEX_DONE = r'https://web\.archive\.org/web/[0-9]{14}/'
# Default archive URL template; "{url}" is replaced with the target URL.
URL_BASE = 'https://web.archive.org/save/{url}'
|
|
|
|
class ArchiveOrg(ArchiveBase):
    '''
    Archiver class for Internet Archive (archive.org)
    '''

    def __init__(self, base_url=URL_BASE, new_driver=False, print_sleep=3,
                 request_sleep=10, pageLoadStrategy=None, timeout=None):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format
                string wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console
                readout
            request_sleep (int): seconds to wait between ALL requests
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to
                complete before starting the next one
        '''
        # Forward every argument by keyword. Spelling them out avoids
        # filtering locals() with an identity test against the string 'self'
        # (`is not`), which only works by accident of string interning and
        # raises a SyntaxWarning on Python 3.8+.
        super(ArchiveOrg, self).__init__(
            base_url=base_url,
            new_driver=new_driver,
            print_sleep=print_sleep,
            request_sleep=request_sleep,
            pageLoadStrategy=pageLoadStrategy,
            timeout=timeout,
        )
        # Compiled once here; is_done() matches it against the driver's URL.
        self.regex = re.compile(REGEX_DONE)

    def is_done(self, url, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Args:
            url (str): the target URL that was submitted for archiving
            **kwargs: accepted and ignored

        Returns:
            bool: True when the driver's current URL starts with the
                snapshot prefix described by REGEX_DONE and ends with
                ``url``; False otherwise.
        '''
        # Read the URL once so both checks see the same value even if the
        # browser navigates between them.
        current_url = self.driver.current_url
        return bool(self.regex.match(current_url)) and current_url.endswith(url)