def archive_all(self, urls=None, file=None, unique=True):
    '''
    Archive multiple URLs.
    If no URLs are specified, will attempt to load a list of URLs to
    archive from the specified file.

    Args:
        urls (list, [optional]): list of URLs to archive
        file (str, [optional]): path to a JSON to save/load from
        unique (bool, [optional]): if true then ignore URLs that are
            already queued/archived (i.e. in self.out attribute) as well as
            duplicates inside ``urls``; if false every URL is queued as given

    Returns:
        list: self.out, a list of {'original': ..., 'archived': ...} dicts

    Raises:
        ValueError: if ``urls`` is None and ``file`` cannot be loaded
    '''
    if urls is not None:
        # Walk the input in order so the archive order matches what the
        # caller passed in (a set difference would scramble it).
        seen = set(entry['original'] for entry in self.out) if unique else None
        for url in urls:
            if unique:
                if url in seen:
                    continue
                seen.add(url)
            self.out.append({'original': url, 'archived': None})
    elif file is not None:
        try:
            self.out = self.load(file)
        except (IOError, OSError, ValueError) as err:
            # Missing/unreadable file or malformed JSON; keep the ValueError
            # contract but say what actually went wrong.
            raise ValueError(
                'Could not load archive list from {!r}: {}'.format(file, err))

    # Only entries without an archived URL still need work.
    pending = [entry for entry in self.out if entry['archived'] is None]
    total = len(pending)
    for index, entry in enumerate(pending):
        result = self.archive_one(entry['original'], index + 1, total)
        entry.update(result)
        if file:
            # Save after every URL so progress survives a crash.
            self.save(self.out, file)
        time.sleep(self.request_sleep)
    return self.out
def get_looper(self, url, timer=None, **kwargs):
    '''
    Requests an archive of the URL and waits until it is done, restarting
    the driver and starting over if the driver dies partway through.

    Args:
        url (str): URL to initiate the archive process
        timer (timer.ArcTimer, [optional]): timer object; monitors timeouts
        **kwargs: keyword args to use for URL formatting

    Returns:
        str: current page URL once is_done reports completion
    '''
    if timer is None:
        timer = ArcTimer(sleep=self.print_sleep, timeout=self.timeout)
    # Loop instead of recursing: the retry result is always returned, the
    # formatting kwargs survive a restart, and repeated driver deaths
    # cannot exhaust the call stack.
    while True:
        try:
            self.get(url, **kwargs)
            while not self.is_done(timer=timer):
                timer.report()
            return self.driver.current_url
        except DEAD_DRIVER_TUPLE:
            self.restart_driver()
class ArchiveTLD(object):
    '''
    Descriptor class for archive TLDs.

    Stores the value on the owning object's _tld attribute and keeps the
    object's base_url in sync with it.
    '''

    def __get__(self, obj, objtype=None):
        '''
        Returns the object's _tld attribute.

        When accessed on the class itself (obj is None) the descriptor is
        returned instead of raising AttributeError.
        '''
        if obj is None:
            return self
        return getattr(obj, '_tld')

    def __set__(self, obj, value):
        '''
        Sets the object's _tld attribute and rebuilds the object's
        base_url attribute from its _base_url template so that the URL
        uses the new TLD.
        '''
        setattr(obj, '_tld', value)
        new_base = getattr(obj, '_base_url').format(tld=obj._tld)
        setattr(obj, 'base_url', new_base)
def neterror_handler(self, url, **kwargs):
    '''
    Neterror handler: switches to the next TLD in TLD_ARCHIVE_LI
    (wrapping around to the first after the last) and retries the request.

    Args:
        url (str): URL that was being archived when the neterror occurred
        **kwargs: keyword args to use for URL formatting
    '''
    try:
        position = TLD_ARCHIVE_LI.index(self.tld)
    except ValueError:
        # Current TLD is not in the list: start over from the first one.
        position = -1
    # Assigning self.tld goes through the ArchiveTLD descriptor, which
    # also rebuilds base_url for the new TLD.
    self.tld = TLD_ARCHIVE_LI[(position + 1) % len(TLD_ARCHIVE_LI)]

    self.get(url, **kwargs)
def is_done(self, url=None, **kwargs):
    '''
    Used by get_looper to determine when the archive is done.

    Args:
        url (str, [optional]): original URL; when given, the current page
            URL must also end with it. Optional because get_looper calls
            is_done with only a ``timer`` keyword.
        **kwargs: ignored (accepts the ``timer`` passed by get_looper)

    Returns:
        bool: True if the driver is on a finished web.archive.org snapshot
    '''
    current = self.driver.current_url
    if not self.regex.match(current):
        return False
    if url is not None and not current.endswith(url):
        return False
    return True
class Driven(object):
    '''
    Base class for objects that drive a Selenium webdriver.
    '''

    def __init__(self, new_driver=False, **kwargs):
        '''
        Args:
            new_driver (bool): whether to make a new webdriver or
                use the existing global one
            **kwargs: arbitrary keyword arguments to set as object attributes
        '''
        self.driver = get_driver(new_driver)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def get(self, url):
        '''
        Loads the URL, restarting the driver and retrying if it has died.

        Args:
            url (str): URL to load

        Returns:
            str: the driver's current URL after the page load
        '''
        # Loop rather than recurse so the URL is returned on a retry too
        # and repeated driver deaths cannot exhaust the call stack.
        while True:
            try:
                self.driver.get(url)
                return self.driver.current_url
            except DEAD_DRIVER_TUPLE:
                self.restart_driver()

    def restart_driver(self):
        '''
        Restarts the webdriver
        '''
        try:
            self.driver.quit()
        except Exception:
            # Best effort: the old driver is probably dead already.
            pass
        self.driver = get_driver(new=True)
def bungie(source, prefix='http://forums.bungie.org/'):
    '''
    Collects forum links from the table rows of an HTML page.

    For each <tr>, looks at the first <a> inside the first <td> and keeps
    it if its href starts with ``prefix``.

    Args:
        source (str): HTML source to parse
        prefix (str, [optional]): URL prefix a link must start with to be kept

    Returns:
        list: dicts with the link text ('title') and href ('original')
    '''
    soup_urls = []
    soup = BeautifulSoup(source, 'lxml')
    for row in soup.find_all('tr'):
        cell = row.find('td')
        anchor = cell.find('a') if cell is not None else None
        if anchor is None:
            # Row without a cell or without a link in its first cell.
            continue
        href = anchor.get('href')
        if href is not None and href.startswith(prefix):
            soup_urls.append({'title': anchor.text, 'original': href})
    return soup_urls
def get_page_count(self):
    '''
    Reads the subforum's page count from the pagination "page jump"
    element of the currently loaded page and stores it in self.page_count.

    self.page_count is set to 0 when the page has no such element.
    '''
    try:
        count_text = self.driver.find_element_by_class_name('ipsPagination_pageJump').text
    except NoSuchElementException:
        self.page_count = 0
        return
    # Strip the "Page N of " prefix and trailing whitespace, leaving the total.
    self.page_count = int(re.sub(PAGE_REGEX, '', count_text))
def make_page_urls(self):
    '''
    Builds self.urls, the list of page URLs to crawl for this subforum.

    The base URL stands for page 1, so the paginated URLs start at page 2
    (same convention as _parse_thread_pages); this avoids crawling the
    first page twice.
    '''
    self.urls = [self.base_url]
    self.urls.extend(
        self.page_url.format(page=number)
        for number in range(2, self.page_count + 1)
    )
def _get_page_comment_trees(self):
    '''
    Finds the top-level comments on the current page.

    Stores the first element with class "comments" in self.comments_box
    and the matching <li> elements in self.comment_trees.

    Returns:
        list: <li> elements whose class attribute starts with
            'SpeechBubble message message-main'
    '''
    #get container with all comments (from this object's own driver)
    self.comments_box = self.driver.find_elements_by_class_name('comments')[0]

    #get all top-level comments; an <li> without a class attribute is skipped
    self.comment_trees = [
        item for item in self.comments_box.find_elements_by_tag_name('li')
        if (item.get_attribute('class') or '').startswith('SpeechBubble message message-main')
    ]
    return self.comment_trees
class ArcTimer(object):
    '''
    Console timer used while waiting for an archive to complete.
    '''

    def __init__(self, sleep=3, timeout=None):
        '''
        Args:
            sleep (int): seconds to sleep after each report
            timeout (int, [optional]): seconds after which is_timeout
                returns True; None disables the timeout
        '''
        self.start = datetime.datetime.now()
        self.sleep = sleep
        self.timeout = timeout
        # Defined up front so is_timeout/report never hit a missing attribute.
        self.seconds = 0.0

    def count_seconds(self):
        '''
        Updates self.seconds with the time elapsed since the timer started.

        Returns:
            float: elapsed seconds
        '''
        self.seconds = (datetime.datetime.now() - self.start).total_seconds()
        return self.seconds

    def is_timeout(self):
        '''
        Checks whether the timeout has been reached, printing a notice if so.

        Returns:
            bool: True if a timeout is set and has elapsed, else False
        '''
        if self.timeout is None:
            return False
        # Refresh the elapsed time instead of trusting the last report().
        if self.count_seconds() < self.timeout:
            return False
        sys.stdout.write("\rHit timeout when waiting for archive to complete ({} seconds)".format(self.seconds))
        sys.stdout.flush()
        return True

    def report(self):
        '''
        Prints the elapsed time to the console, then sleeps for self.sleep seconds.
        '''
        self.count_seconds()
        sys.stdout.write("\rWaiting for archive to complete ({} seconds)".format(self.seconds))
        sys.stdout.flush()
        time.sleep(self.sleep)