Files

193 lines
6.5 KiB
Python

import json
import os
import time
import urllib
from selenium.common.exceptions import NoSuchElementException
from timer import ArcTimer
from driver import get_driver, DEAD_DRIVER_TUPLE
class ArchiveBase(object):
    '''
    Base class for webdriver-backed URL archivers.

    Results accumulate in ``self.out`` as a list of
    ``{'original': <url>, 'archived': <archive url or None>}`` dicts; an entry
    whose ``archived`` value is None has not been processed yet.

    Subclasses must implement ``is_done`` and may override
    ``neterror_handler``.
    '''

    def __init__(self, base_url=None, new_driver=False, print_sleep=3,
                 request_sleep=0, pageLoadStrategy=None, timeout=None,
                 **kwargs):
        '''
        Args:
            base_url (str): Base archive url, should have a "{url}" format
                string wherever the target URL is supposed to go.
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            print_sleep (int): seconds to wait before updating the console
                readout
            request_sleep (int): seconds to wait between ALL requests
            pageLoadStrategy (str): look up pageLoadStrategy in Selenium
            timeout (int): seconds to wait for the current archive to
                complete before starting the next one
            **kwargs: arbitrary keyword arguments to set as object attributes
        '''
        self.base_url = base_url
        self.print_sleep = print_sleep
        self.request_sleep = request_sleep
        # NOTE(review): pageLoadStrategy is only handed to get_driver() when
        # the driver is restarted, not here -- confirm that is intended.
        self.driver = get_driver(new_driver)
        self.timeout = timeout
        self.pageLoadStrategy = pageLoadStrategy
        self.out = []
        # Applied last, so extra keyword arguments can override the above.
        for name, value in kwargs.items():
            setattr(self, name, value)

    def archive_all(self, urls=None, file=None, unique=True):
        '''
        Archive multiple URLs.

        If ``urls`` is given, those URLs are queued. Otherwise, if ``file``
        is given, the queue is loaded from that JSON file. Every queued entry
        that has no archive URL yet is then archived in order.

        Args:
            urls (list, [optional]): list of URLs to archive
            file (str, [optional]): path to a JSON to save/load from; when
                set, progress is saved after every archived URL
            unique (bool, [optional]): if true then ignore URLs that are
                already queued or archived (i.e. in self.out attribute) and
                duplicates within ``urls``; if false, queue every URL as-is
        Returns:
            list: ``self.out``, the list of original/archived dicts
        Raises:
            ValueError: if ``urls`` is None and ``file`` cannot be loaded
        '''
        if urls is not None:
            if unique:
                # De-duplicate while keeping the caller's ordering.
                seen = set(entry['original'] for entry in self.out)
                fresh = []
                for url in urls:
                    if url not in seen:
                        seen.add(url)
                        fresh.append(url)
            else:
                fresh = list(urls)
            self.out += [{'original': url, 'archived': None} for url in fresh]
        elif file is not None:
            try:
                self.out = self.load(file)
            except Exception as err:
                raise ValueError(
                    'Could not load archive list from {!r}: {}'.format(
                        file, err))
        pending = [entry for entry in self.out if entry['archived'] is None]
        total = len(pending)
        for position, entry in enumerate(pending, 1):
            # ``entry`` is the same dict object held in self.out, so updating
            # it here records the result in the overall output.
            entry.update(self.archive_one(entry['original'], position, total))
            if file:
                # Save after every URL so an interrupted run can be resumed.
                self.save(self.out, file)
            time.sleep(self.request_sleep)
        return self.out

    def archive_one(self, url, cur=1, max=1, **kwargs):
        '''
        Archive a single URL and report progress on the console.

        Args:
            url (str): URL to archive
            cur (int, [optional]): current index for the console printout
            max (int, [optional]): maximum index for the console printout
            **kwargs: keyword args to use for URL formatting
        Returns:
            dict: dictionary with original URL + archive URL
        '''
        print("Archiving ({cur}/{max}): {url}".format(
            cur=cur, max=max, url=url))
        archive_end_url = self.get_looper(url, **kwargs)
        print("\nORIGINAL: {}\nARCHIVED: {}".format(url, archive_end_url))
        return {'original': url, 'archived': archive_end_url}

    def get(self, url, **kwargs):
        '''
        Point the driver at the archive page for the specified URL.

        If the loaded page shows a network error, ``neterror_handler`` is
        called. If the driver dies, it is restarted and the request retried.

        Args:
            url (str): URL to archive; it is percent-quoted before being
                substituted into ``self.base_url``
            **kwargs: keyword args to use for URL formatting
        '''
        # ``quote`` lives in different modules on Python 2 and Python 3.
        try:
            from urllib.parse import quote
        except ImportError:
            from urllib import quote
        target_url = self.base_url.format(url=quote(url), **kwargs)
        try:
            self.driver.get(target_url)
            page_ok = self.neterror_check()
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            self.get(url, **kwargs)
            return
        if not page_ok:
            self.neterror_handler(url=url, **kwargs)

    def get_looper(self, url, timer=None, **kwargs):
        '''
        Makes sure that the driver gets restarted if it dies partway through

        Args:
            url (str): URL to initiate the archive process
            timer (timer.ArcTimer, [optional]): timer object; monitors
                timeouts. The same timer is reused across restarts.
            **kwargs: keyword args to use for URL formatting
        Returns:
            str: current page URL
        '''
        if timer is None:
            timer = ArcTimer(sleep=self.print_sleep, timeout=self.timeout)
        try:
            self.get(url, **kwargs)
            while not self.is_done(timer=timer):
                timer.report()
            return self.driver.current_url
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
            # Return the retry's result; otherwise the caller gets None.
            return self.get_looper(url, timer, **kwargs)

    def is_done(self, **kwargs):
        '''
        Used by get_looper to determine when the archive is done

        Called with a ``timer`` keyword argument. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class
        '''
        raise NotImplementedError('ERROR: Must implement "is_done" method!')

    def load(self, path):
        '''
        Loads data from a JSON file

        Args:
            path (str): path to the JSON file; "~" is expanded
        Returns:
            the decoded JSON content
        '''
        path = os.path.expanduser(path)
        with open(path) as handle:
            return json.load(handle)

    def neterror_check(self):
        '''
        Check that the current page is not a network-error page.

        Returns:
            bool: True if no element with class "neterror" is on the page,
                False if one is present
        '''
        # NOTE(review): find_element_by_class_name is not available in newer
        # Selenium releases -- confirm the pinned Selenium version.
        try:
            self.driver.find_element_by_class_name('neterror')
        except NoSuchElementException:
            return True
        return False

    def neterror_handler(self, url, **kwargs):
        '''
        placeholder for class-specific neterror handler

        Args:
            url (str): the URL whose archive page showed a network error
            **kwargs: keyword args that were used for URL formatting
        '''
        pass

    def restart_driver(self):
        '''
        Restarts the webdriver
        '''
        try:
            self.driver.quit()
        except Exception:
            # Best effort: the old driver may already be dead.
            pass
        self.driver = get_driver(
            new=True, pageLoadStrategy=self.pageLoadStrategy)

    def save(self, data, path, indent=4):
        '''
        Saves the archived URLs

        Args:
            data (dict/list): JSON-compatible archive data object
                (e.g. dict, list)
            path (str): desired save path for the archive JSON
            indent (int, [optional]): JSON indent to pass into json.dumps
        '''
        path = os.path.expanduser(path)
        # Serialize before opening so that a serialization error cannot
        # truncate an existing archive file.
        payload = json.dumps(data, indent=indent)
        with open(path, 'w') as handle:
            handle.write(payload)
        print('Saved data to file: {}'.format(path))