# Python source, 82 lines, 3.0 KiB
import re
|
|
from base import ArchiveBase
|
|
|
|
REGEX_WIP = r'https://archive\.(li|today|vn|md|[a-z]+)/wip/.+'
|
|
REGEX_DONE = r'https://archive\.(li|today|vn|md|[a-z]+)/(?![\?]run=1|submit/|wip/)'
|
|
URL_BASE = "https://archive.{tld}/?run=1&url={{url}}"
|
|
TLD_ARCHIVE_LI = ['li','today','vn','md','is']
|
|
|
|
|
|
class ArchiveTLD(object):
    '''
    Descriptor class for archive TLDs

    Reading the managed attribute returns the owner's ``_tld``. Writing it
    stores the new value in ``_tld`` and re-renders the owner's ``base_url``
    from its ``_base_url`` template, so the two can never drift apart.

    The owner must have ``_base_url`` set (a ``str.format`` template with a
    ``{tld}`` field) before the attribute is first assigned.
    '''

    def __get__(self, obj, objtype=None):
        '''
        Returns the object's _tld attribute

        Args:
            obj: instance the attribute is read from, or None when the
                attribute is looked up on the class itself.
            objtype: owning class (unused; optional per the descriptor
                protocol).

        Returns:
            The instance's ``_tld`` value, or this descriptor when accessed
            on the class.

        Raises:
            AttributeError: if the instance has no ``_tld`` yet.
        '''
        if obj is None:
            # Class-level access (e.g. ``ArchiveLi.tld``) has no instance to
            # read from; hand back the descriptor instead of failing on
            # ``None._tld``.
            return self
        return obj._tld

    def __set__(self, obj, value):
        '''
        Sets the object's _tld attribute and updates the object's
        base_url attribute so that the URL uses the new TLD

        Args:
            obj: instance being assigned to.
            value (str): new TLD, substituted for ``{tld}`` in
                ``obj._base_url``.

        Raises:
            AttributeError: if ``obj._base_url`` has not been set.
        '''
        obj._tld = value
        # Always render from the pristine template (_base_url), never from
        # the previously rendered base_url, so repeated TLD switches work.
        obj.base_url = obj._base_url.format(tld=value)
|
|
|
|
class ArchiveLi(ArchiveBase):
    '''
    Archiver for archive.li/today/vn/md/is

    The ``tld`` attribute is managed by the ArchiveTLD descriptor: assigning
    it re-renders ``base_url`` from the ``_base_url`` template. On a network
    error, ``neterror_handler`` rotates to the next TLD in TLD_ARCHIVE_LI and
    retries.
    '''

    tld = ArchiveTLD()

    def __init__(self, base_url=URL_BASE, regex=REGEX_DONE,
                 regex_wip=REGEX_WIP, new_driver=False, print_sleep=3,
                 request_sleep=0, tld='li', pageLoadStrategy=None,
                 timeout=60):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format string
                wherever the target URL is supposed to go. With the default
                template it also carries a "{tld}" field that is filled in
                whenever ``tld`` is assigned.
            regex (str): regex string for completed archive URLs
            regex_wip (str): regex string for URLs that indicate the archive is still going
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console readout
            request_sleep (int): seconds to wait between ALL requests
            tld (str, [optional]): top-level domain of the archive mirror to
                use (default 'li'); see TLD_ARCHIVE_LI for the mirrors that
                ``neterror_handler`` rotates through.
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to complete
                before starting the next one
        '''
        # Must exist before the base initializer runs: assigning ``tld``
        # goes through ArchiveTLD.__set__, which reads this template.
        # NOTE(review): this presumes the base initializer assigns ``tld``
        # -- confirm against ArchiveBase.
        self._base_url = base_url
        # Explicit keywords, in signature order (the order the old
        # ``locals()`` snapshot produced on Python 3.7+). This replaces an
        # ``is not 'self'`` identity test on a string literal and stops the
        # implicit ``__class__`` cell from being forwarded on Python 3.
        super(ArchiveLi, self).__init__(
            base_url=base_url,
            regex=regex,
            regex_wip=regex_wip,
            new_driver=new_driver,
            print_sleep=print_sleep,
            request_sleep=request_sleep,
            tld=tld,
            pageLoadStrategy=pageLoadStrategy,
            timeout=timeout,
        )
        # Overwrite whatever the base stored with compiled patterns.
        self.regex = re.compile(regex)
        self.regex_wip = re.compile(regex_wip)
        self.timeout = timeout

    def is_done(self, timer, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Args:
            timer: object exposing ``is_timeout()``; only consulted when the
                browser is not already on a completed-archive URL.
            **kwargs: accepted for interface compatibility; unused here.

        Returns:
            bool: True if the driver's current URL matches the "done"
            pattern, or if the timer has expired while the URL still matches
            the work-in-progress pattern; False otherwise.
        '''
        # Read the URL once so both checks see the same value.
        current_url = self.driver.current_url
        if self.regex.match(current_url):
            return True
        # Still on a work-in-progress page: give up waiting only after the
        # timeout has elapsed.
        return bool(timer.is_timeout() and self.regex_wip.match(current_url))

    def neterror_handler(self, url, **kwargs):
        '''
        Switches to the next archive TLD and retries the request

        The TLDs in TLD_ARCHIVE_LI are tried in order, wrapping around to the
        first after the last. If the current ``tld`` is not in the list, the
        first entry is used.

        Args:
            url (str): URL to pass to ``self.get`` again.
            **kwargs: forwarded unchanged to ``self.get``.
        '''
        try:
            position = TLD_ARCHIVE_LI.index(self.tld)
        except ValueError:
            # Custom TLD that is not part of the rotation: start over.
            next_tld = TLD_ARCHIVE_LI[0]
        else:
            next_tld = TLD_ARCHIVE_LI[(position + 1) % len(TLD_ARCHIVE_LI)]
        # Assignment goes through ArchiveTLD, which also updates base_url.
        self.tld = next_tld

        self.get(url, **kwargs)