193 lines
6.5 KiB
Python
193 lines
6.5 KiB
Python
import json
|
|
import os
|
|
import time
|
|
import urllib
|
|
from selenium.common.exceptions import NoSuchElementException
|
|
|
|
from timer import ArcTimer
|
|
from driver import get_driver, DEAD_DRIVER_TUPLE
|
|
|
|
class ArchiveBase(object):
    '''
    Archiver base class.

    Loads ``base_url`` (with the target URL substituted in) through a
    webdriver, then polls ``is_done`` until the archive has completed.
    Subclasses must implement ``is_done`` and may override
    ``neterror_handler``.

    Results are accumulated in ``self.out`` as a list of dicts of the form
    ``{'original': <url>, 'archived': <archive url or None>}``.
    '''

    def __init__(self, base_url=None, new_driver=False, print_sleep=3,
                 request_sleep=0, pageLoadStrategy=None, timeout=None,
                 **kwargs):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format
                string wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console
                readout (handed to ArcTimer as ``sleep``)
            request_sleep (int): seconds to wait between ALL requests
                (slept after every URL processed by ``archive_all``)
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to
                complete before starting the next one (handed to ArcTimer)
            **kwargs: arbitrary keyword arguments to set as object attributes
        '''
        self.base_url = base_url
        self.print_sleep = print_sleep
        self.request_sleep = request_sleep
        # NOTE(review): pageLoadStrategy is only forwarded to get_driver in
        # restart_driver, not here -- confirm whether the initial driver
        # should receive it as well.
        self.driver = get_driver(new_driver)
        self.timeout = timeout
        self.pageLoadStrategy = pageLoadStrategy
        self.out = []
        for k, v in kwargs.items():
            setattr(self, k, v)

    def archive_all(self, urls=None, file=None, unique=True):
        '''
        Archive multiple URLs. If no URLs are specified, will attempt to load
        a list of URLs to archive from the specified file.

        Every entry of ``self.out`` whose 'archived' value is None is
        archived in turn. When ``file`` is given, ``self.out`` is saved to it
        after each URL so progress survives an interruption.

        Args:
            urls (list, [optional]): list of URLs to archive
            file (str, [optional]): path to a JSON to save/load from
            unique (bool, [optional]): if true then ignore URLs that are
                already archived (i.e. in self.out attribute) as well as
                duplicates inside ``urls``; if false, queue every URL given

        Returns:
            list: ``self.out``, the list of original/archived URL dicts

        Raises:
            ValueError: if ``urls`` is None and ``file`` could not be loaded
        '''
        if urls is not None:
            # Track what is already queued so the caller's order is kept
            # while duplicates are dropped (only when unique is requested).
            seen = set(i['original'] for i in self.out) if unique else None
            for url in urls:
                if seen is not None:
                    if url in seen:
                        continue
                    seen.add(url)
                self.out.append({'original': url, 'archived': None})
        elif file is not None:
            try:
                self.out = self.load(file)
            except Exception as err:
                # Still a ValueError for callers, but now says what failed.
                raise ValueError(
                    'Could not load URLs from file {!r}: {}'.format(file, err))

        todo = [i for i in self.out if i['archived'] is None]
        total = len(todo)
        for index, dicto in enumerate(todo):
            url = dicto['original']
            this = self.archive_one(url, index + 1, total)
            # dicto is the same object stored in self.out, so this updates
            # the accumulated results in place.
            dicto.update(this)
            if file:
                self.save(self.out, file)
            time.sleep(self.request_sleep)
        return self.out

    def archive_one(self, url, cur=1, max=1, **kwargs):
        '''
        Archive a single URL and print its progress to the console.

        Args:
            url (str): URL to archive
            cur (int, [optional]): current index for the console printout
            max (int, [optional]): maximum index for the console printout
            **kwargs: keyword args to use for URL formatting

        Returns:
            dict: dictionary with original URL + archive URL
        '''
        print("Archiving ({cur}/{max}): {url}".format(cur=cur, max=max, url=url))
        # Forward the formatting kwargs; they were previously dropped here.
        archive_end_url = self.get_looper(url, **kwargs)
        print("\nORIGINAL: {}\nARCHIVED: {}".format(url, archive_end_url))
        return {'original': url, 'archived': archive_end_url}

    def get(self, url, **kwargs):
        '''
        Get an archive of the specified URL

        The URL is percent-quoted and substituted into ``self.base_url``
        before being loaded in the webdriver. If the loaded page turns out to
        be a neterror page, ``neterror_handler`` is called. If the driver
        died, it is restarted and the request retried.

        Args:
            url (str): URL to archive
            **kwargs: keyword args to use for URL formatting
        '''
        # urllib.quote only exists on Python 2; Python 3 moved it.
        try:
            from urllib.parse import quote
        except ImportError:
            from urllib import quote

        target_url = self.base_url.format(url=quote(url), **kwargs)
        try:
            self.driver.get(target_url)
            page_ok = self.neterror_check()
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            self.get(url, **kwargs)
            return
        # Plain conditional instead of assert so it still works under -O.
        if not page_ok:
            self.neterror_handler(url=url, **kwargs)

    def get_looper(self, url, timer=None, **kwargs):
        '''
        Makes sure that the driver gets restarted if it dies partway through

        Args:
            url (str): URL to initiate the archive process
            timer (timer.ArcTimer, [optional]): timer object; monitors
                timeouts. The same timer is reused across driver restarts.
            **kwargs: keyword args to use for URL formatting

        Returns:
            str: current page URL
        '''
        if timer is None:
            timer = ArcTimer(sleep=self.print_sleep, timeout=self.timeout)
        # Loop (rather than recurse) so a driver that dies repeatedly cannot
        # exhaust the stack, and so the URL is always returned to the caller.
        while True:
            try:
                self.get(url, **kwargs)
                while not self.is_done(timer=timer):
                    timer.report()
                return self.driver.current_url
            except DEAD_DRIVER_TUPLE:
                self.restart_driver()

    def is_done(self, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Called with a ``timer`` keyword argument. Subclasses must override.

        Raises:
            NotImplementedError: always, in this base class
        '''
        raise NotImplementedError('ERROR: Must implement "is_done" method!')

    def load(self, path):
        '''
        Loads data from a JSON file

        Args:
            path (str): path to the JSON file; "~" is expanded

        Returns:
            the decoded JSON content
        '''
        path = os.path.expanduser(path)
        with open(path) as handle:
            return json.load(handle)

    def neterror_check(self):
        '''
        Makes sure a neterror hasn't ruined the page load

        Returns:
            bool: True if the current page has no element with class
            "neterror", False if such an element is present
        '''
        try:
            self.driver.find_element_by_class_name('neterror')
        except NoSuchElementException:
            return True
        # The element exists, so the browser is showing a network error page.
        return False

    def neterror_handler(self, url, **kwargs):
        '''
        placeholder for class-specific neterror handler

        Called by ``get`` with the original URL and the formatting kwargs
        when ``neterror_check`` reports a neterror page.
        '''
        pass

    def restart_driver(self):
        '''
        Restarts the webdriver
        '''
        try:
            self.driver.quit()
        except Exception:
            # Best effort: the old driver is likely already dead.
            pass
        self.driver = get_driver(new=True, pageLoadStrategy=self.pageLoadStrategy)

    def save(self, data, path, indent=4):
        '''
        Saves the archived URLs

        Args:
            data (dict/list): JSON-compatible archive data object (e.g. dict, list)
            path (str): desired save path for the archive JSON; "~" is expanded
            indent (int, [optional]): JSON indent to pass into json.dumps
        '''
        path = os.path.expanduser(path)
        with open(path, 'w+') as handle:
            handle.write(json.dumps(data, indent=indent))
        print('Saved data to file: {}'.format(path))