Files

137 lines
4.7 KiB
Python

from chomper import ChomperBase
from crawler import CrawlerBase
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
# URL of a subforum's first listing page; ``{forum}`` is filled with one of
# the slugs from SUBFORUMS.
FORUM_URL_BASE = 'https://prettyuglylittleliar.net/forum/{forum}/'

# Two-stage template: the doubled braces survive the first
# ``.format(forum=...)`` call and become a ``{page}`` placeholder that is
# filled by a second ``.format(page=...)`` call.
FORUM_URL_PAGE = FORUM_URL_BASE + '?page={{page}}'

# Substituted with '' on the pagination text: removes a "Page N of " prefix
# and any trailing whitespace, leaving only the total page count.
PAGE_REGEX = r'Page [0-9]+ of |[\s]+$'

# Not referenced by the code visible in this file; presumably the
# category-level slugs that group the subforums below -- TODO confirm.
CATEGORIES = [
    "1-pretty-ugly-little-liar",
    "4-snowflakes",
    "6-general",
]

# Slugs of every subforum that gets crawled, in crawl order.
SUBFORUMS = [
    "2-news-announcements",
    "3-introduce-yourself",
    "5-dakota-rose-\u30c0\u30b3\u30bf-\u30ed\u30fc\u30ba",
    "7-general-discussion",
    "8-site-feedback",
    "9-kiki-kannibal",
    "10-venus-angelic",
    "11-kanadajin",
    "12-little-snowflakes",
    "13-online-personalities",
    "14-beauty-fashion",
    "15-health-wellbeing",
    "16-love-relationships",
    "20-johanna-herrstedt",
    "21-taylor-r",
    "22-jessica-nigri",
    "24-movies-television",
    "25-music",
    "26-gaming",
    "27-wylona-hayashi",
    "28-skincare",
    "29-reading",
    "32-entertainment",
    "35-yumi-king",
    "36-yandev",
    "38-simply-kenna-cozy-kitsune",
    "39-vic-mignogna",
    "42-pokimane",
]
class PullCrawler(CrawlerBase):
    """Top-level crawler that runs one sub-crawler per entry in ``SUBFORUMS``.

    NOTE(review): ``self.data`` is appended to but never created here;
    presumably ``CrawlerBase.__init__`` initialises it -- confirm.
    """

    def __init__(self, new_driver=False):
        """Initialise the base crawler, forwarding ``new_driver`` unchanged."""
        super(PullCrawler, self).__init__(new_driver=new_driver)

    def crawl(self):
        """Run every subforum crawler in order and collect its dump.

        Prints a progress line per subforum, calls the crawler's ``main()``
        and appends the result of its ``dump()`` to ``self.data``.
        """
        subforum_total = len(self.subforums)
        for position, crawler in enumerate(self.subforums, 1):
            progress = 'Crawling subforum "{}" ({}/{})...'.format(
                crawler.subforum, position, subforum_total
            )
            print(progress)
            crawler.main()
            self.data.append(crawler.dump())

    def make_subforum_crawlers(self):
        """Build ``self.subforums``: one ``PullSubforumCrawler`` per slug."""
        self.subforums = []
        self.subforums.extend(PullSubforumCrawler(slug) for slug in SUBFORUMS)

    def main(self):
        """Entry point: create the subforum crawlers, then crawl them all."""
        self.make_subforum_crawlers()
        self.crawl()
class PullSubforumCrawler(CrawlerBase):
    """Crawl every listing page of a single subforum and record its threads.

    NOTE(review): ``self.driver``, ``self.soup``, ``self.data``, ``self.get``
    and ``self.make_soup`` are used but not defined in this class; presumably
    they are provided by ``CrawlerBase`` -- confirm.
    """

    def __init__(self, subforum, new_driver=False):
        """Set up the crawler for one subforum.

        Args:
            subforum: Subforum slug (an entry of ``SUBFORUMS``) inserted into
                the URL templates.
            new_driver: Forwarded unchanged to ``CrawlerBase.__init__``.
        """
        # Forward the arguments by name instead of filtering ``locals()``:
        # comparing the keys with ``is not 'self'`` relies on string
        # identity, and on Python 3 ``locals()`` in a method that references
        # ``super`` also contains the implicit ``__class__`` cell, which
        # would be passed on as an unexpected keyword argument.
        super(PullSubforumCrawler, self).__init__(
            subforum=subforum, new_driver=new_driver
        )
        self.base_url = FORUM_URL_BASE.format(forum=subforum)
        # Still contains a ``{page}`` placeholder after this first format().
        self.page_url = FORUM_URL_PAGE.format(forum=subforum)

    def crawl(self):
        """Visit each URL in ``self.urls`` and append one record per page.

        Each record appended to ``self.data`` holds the page URL, its 1-based
        page number, the parsed threads, the literal type ``'subforum_page'``
        and the raw page source.
        """
        total = len(self.urls)
        for page_number, url in enumerate(self.urls, 1):
            print('Crawling subforum page {}/{}...'.format(page_number, total))
            # Skip the reload when the driver is already on this page (the
            # first URL has normally just been loaded by main()).
            if self.driver.current_url != url:
                self.get(url)
            page_threads = self.get_page_threads()
            self.data.append({
                'url': url,
                'page_number': page_number,
                'threads': page_threads,
                'type': 'subforum_page',
                'source': self.driver.page_source,
            })

    def dump(self):
        """Return the crawl result as a dict with url, title and pages."""
        return {'url': self.base_url, 'title': self.title, 'pages': self.data}

    def get_page_count(self):
        """Read the total page count from the page-jump control.

        Sets ``self.page_count`` to the parsed number, or to 0 when the
        currently loaded page has no such control. Any other driver error
        propagates unchanged.

        Raises:
            ValueError: If the remaining text is not an integer.
        """
        # NOTE(review): ``find_element_by_class_name`` was removed in
        # Selenium 4.3; confirm the pinned Selenium version before upgrading.
        try:
            page_jump = self.driver.find_element_by_class_name(
                'ipsPagination_pageJump'
            )
        except NoSuchElementException:
            self.page_count = 0
            return
        self.page_count = int(re.sub(PAGE_REGEX, '', page_jump.text))

    def get_page_threads(self):
        """Parse the current page and return one dict per listed thread."""
        self.make_soup()
        thread_items = self.soup.find_all(
            'li', attrs={'itemtype': 'http://schema.org/Article'}
        )
        return [self.parse_thread(item) for item in thread_items]

    def get_subforum_title(self):
        """Store the text of the page-title ``<h1>`` in ``self.title``."""
        self.make_soup()
        heading = self.soup.find('h1', attrs={'class': 'ipsType_pageTitle'})
        self.title = heading.text

    def main(self):
        """Entry point: load the subforum, discover its pages, crawl them."""
        self.get(self.base_url)
        self.get_page_count()
        self.get_subforum_title()
        self.make_page_urls()
        self.crawl()

    def make_page_urls(self):
        """Build ``self.urls``: the base URL followed by pages 2..page_count.

        ``base_url`` already is page 1, so numbered pages start at 2 (the
        same scheme ``_parse_thread_pages`` uses). Starting at 1 would fetch
        the first listing a second time under a different URL and shift every
        later ``page_number`` by one.
        """
        self.urls = [self.base_url]
        self.urls.extend(
            self.page_url.format(page=number)
            for number in range(2, self.page_count + 1)
        )

    def parse_thread(self, thread):
        """Extract URL, title and page count from one thread list item.

        Args:
            thread: Parsed ``<li>`` element for a single thread.

        Returns:
            A dict with ``urls`` (one per thread page), ``title``,
            ``page_count`` and the literal type ``'thread'``.

        Raises:
            IndexError: If the item lacks the expected ``<h4>``, link or
                name elements.
        """
        head = thread.find_all('h4')[0]
        url = head.find_all('a', attrs={'itemprop': 'url'})[0].attrs['href']
        title = head.find_all('span', attrs={'itemprop': 'name'})[-1].text.strip()
        # Threads that fit on one page have no "last page" link.
        last_links = thread.find_all('li', attrs={'class': 'ipsPagination_last'})
        last_page = int(last_links[0].text.strip()) if last_links else 1
        urls = self._parse_thread_pages(url, last_page)
        return {'urls': urls, 'title': title, 'page_count': last_page, 'type': 'thread'}

    def _parse_thread_pages(self, url, last_page):
        """Return the thread URL followed by ``?page=N`` for N in 2..last_page."""
        later_pages = [
            url + '?page={}'.format(number)
            for number in range(2, last_page + 1)
        ]
        return [url] + later_pages