137 lines
4.7 KiB
Python
137 lines
4.7 KiB
Python
from chomper import ChomperBase
|
|
from crawler import CrawlerBase
|
|
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
import re
|
|
|
|
# URL template for a subforum's landing page; ``{forum}`` is filled with a
# slug such as the entries of SUBFORUMS below.
FORUM_URL_BASE = 'https://prettyuglylittleliar.net/forum/{forum}/'

# URL template for a numbered listing page of a subforum.  The doubled braces
# survive the first ``.format(forum=...)`` call as a literal ``{page}``
# placeholder, so this template is meant to be formatted in two passes.
FORUM_URL_PAGE = FORUM_URL_BASE + '?page={{page}}'

# Pattern removed from pagination text like "Page 1 of 25": it matches the
# leading "Page N of " and any trailing whitespace, leaving only the total.
PAGE_REGEX = r'Page [0-9]+ of |[\s]+$'

# Category slugs.  NOTE(review): not referenced by the code visible in this
# file -- confirm whether anything else uses it.
CATEGORIES = [
    '1-pretty-ugly-little-liar',
    '4-snowflakes',
    '6-general',
]

# Subforum slugs, each substituted for ``{forum}`` in the URL templates above.
SUBFORUMS = [
    '2-news-announcements',
    '3-introduce-yourself',
    '5-dakota-rose-\u30c0\u30b3\u30bf-\u30ed\u30fc\u30ba',
    '7-general-discussion',
    '8-site-feedback',
    '9-kiki-kannibal',
    '10-venus-angelic',
    '11-kanadajin',
    '12-little-snowflakes',
    '13-online-personalities',
    '14-beauty-fashion',
    '15-health-wellbeing',
    '16-love-relationships',
    '20-johanna-herrstedt',
    '21-taylor-r',
    '22-jessica-nigri',
    '24-movies-television',
    '25-music',
    '26-gaming',
    '27-wylona-hayashi',
    '28-skincare',
    '29-reading',
    '32-entertainment',
    '35-yumi-king',
    '36-yandev',
    '38-simply-kenna-cozy-kitsune',
    '39-vic-mignogna',
    '42-pokimane',
]
|
|
|
|
class PullCrawler(CrawlerBase):
    """Top-level crawler that runs one subforum crawler per ``SUBFORUMS`` entry.

    NOTE(review): ``self.data`` is appended to here but never created in this
    class; it is presumably initialised by ``CrawlerBase`` -- confirm there.
    """

    def __init__(self, new_driver=False):
        """Forward ``new_driver`` unchanged to ``CrawlerBase.__init__``."""
        super(PullCrawler, self).__init__(new_driver=new_driver)

    def crawl(self):
        """Run every subforum crawler in order and collect what it dumps.

        Prints a progress line per subforum, calls the crawler's ``main()``
        and appends the result of its ``dump()`` to ``self.data``.
        """
        subforum_count = len(self.subforums)
        for position, crawler in enumerate(self.subforums, 1):
            progress = 'Crawling subforum "{}" ({}/{})...'.format(
                crawler.subforum, position, subforum_count)
            print(progress)
            crawler.main()
            self.data.append(crawler.dump())

    def make_subforum_crawlers(self):
        """Build ``self.subforums``: one ``PullSubforumCrawler`` per slug."""
        self.subforums = [PullSubforumCrawler(slug) for slug in SUBFORUMS]

    def main(self):
        """Entry point: create the subforum crawlers, then crawl them all."""
        self.make_subforum_crawlers()
        self.crawl()
|
|
|
|
class PullSubforumCrawler(CrawlerBase):
    """Crawl the thread listing of a single subforum, page by page.

    ``main()`` loads the subforum landing page, reads the pagination total and
    the page title, builds the list of listing-page URLs and visits each one,
    appending one record per page to ``self.data``.

    NOTE(review): ``self.driver``, ``self.data``, ``self.soup``, ``self.get``
    and ``self.make_soup`` are used but not defined in this class; they are
    presumably provided by ``CrawlerBase`` -- confirm there.
    """

    def __init__(self, subforum, new_driver=False):
        """Set up the crawler for one subforum.

        Args:
            subforum: Slug substituted for ``{forum}`` in the URL templates.
            new_driver: Forwarded unchanged to ``CrawlerBase.__init__``.
        """
        # Keyword arguments are passed explicitly rather than by filtering
        # ``locals()``: the old filter compared strings with ``is not``
        # (identity, not equality), and on Python 3 ``locals()`` inside a
        # method that mentions ``super`` can also contain the implicit
        # ``__class__`` cell, which would be forwarded as a stray keyword.
        super(PullSubforumCrawler, self).__init__(
            subforum=subforum, new_driver=new_driver)
        self.base_url = FORUM_URL_BASE.format(forum=subforum)
        # Still contains a literal ``{page}`` placeholder after this call; it
        # is filled in by make_page_urls().
        self.page_url = FORUM_URL_PAGE.format(forum=subforum)

    def crawl(self):
        """Visit every URL in ``self.urls`` and record the threads on it.

        Appends one dict per page to ``self.data`` with the keys ``url``,
        ``page_number`` (1-based position in ``self.urls``), ``threads``,
        ``type`` (always ``'subforum_page'``) and ``source`` (the driver's
        page source).
        """
        total = len(self.urls)
        for page_number, url in enumerate(self.urls, 1):
            print('Crawling subforum page {}/{}...'.format(page_number, total))
            # Skip the reload when the browser already reports this exact URL.
            if self.driver.current_url != url:
                self.get(url)
            page_threads = self.get_page_threads()
            self.data.append({
                'url': url,
                'page_number': page_number,
                'threads': page_threads,
                'type': 'subforum_page',
                'source': self.driver.page_source,
            })

    def dump(self):
        """Return the crawl result as a dict with ``url``, ``title``, ``pages``."""
        return {'url': self.base_url, 'title': self.title, 'pages': self.data}

    def get_page_count(self):
        """Store the number of listing pages in ``self.page_count``.

        Reads the text of the ``ipsPagination_pageJump`` element, strips
        everything matched by ``PAGE_REGEX`` and converts the rest to ``int``.
        When the element is missing, ``page_count`` is set to 0.  Any other
        exception (including ``ValueError`` from ``int``) propagates.
        """
        try:
            count_text = self.driver.find_element_by_class_name(
                'ipsPagination_pageJump').text
        except NoSuchElementException:
            # No pagination widget on the page.
            self.page_count = 0
        else:
            self.page_count = int(re.sub(PAGE_REGEX, '', count_text))

    def get_page_threads(self):
        """Parse the current page and return a list of thread dicts.

        Refreshes ``self.soup`` through ``make_soup()`` and runs
        ``parse_thread`` on every ``<li>`` whose ``itemtype`` is
        ``http://schema.org/Article``.
        """
        self.make_soup()
        threads = self.soup.find_all(
            'li', attrs={'itemtype': 'http://schema.org/Article'})
        return [self.parse_thread(thread) for thread in threads]

    def get_subforum_title(self):
        """Store the text of the ``h1.ipsType_pageTitle`` element in ``self.title``.

        Raises ``AttributeError`` if no such element is found, because
        ``find`` then returns ``None``.
        """
        self.make_soup()
        self.title = self.soup.find(
            'h1', attrs={'class': 'ipsType_pageTitle'}).text

    def main(self):
        """Entry point: load the subforum, gather metadata, crawl every page."""
        self.get(self.base_url)
        self.get_page_count()
        self.get_subforum_title()
        self.make_page_urls()
        self.crawl()

    def make_page_urls(self):
        """Build ``self.urls``: the landing page followed by pages 2..N.

        The landing page (``base_url``) already is page 1, so numbered URLs
        start at 2.  Starting at 1 would fetch and store the first page twice
        and shift every ``page_number`` recorded by ``crawl`` by one.  This
        matches how ``_parse_thread_pages`` numbers thread pages.
        """
        self.urls = [self.base_url]
        self.urls.extend(
            self.page_url.format(page=number)
            for number in range(2, self.page_count + 1))

    def parse_thread(self, thread):
        """Extract URL list, title and page count from one thread list item.

        Args:
            thread: Parsed ``<li>`` element exposing ``find_all`` (presumably
                a BeautifulSoup tag -- confirm against ``make_soup``).

        Returns:
            Dict with ``urls`` (one per thread page), ``title``,
            ``page_count`` and ``type`` (always ``'thread'``).
        """
        head = thread.find_all('h4')[0]
        url = head.find_all('a', attrs={'itemprop': 'url'})[0].attrs['href']
        # The last matching span is used as the title.
        title = head.find_all(
            'span', attrs={'itemprop': 'name'})[-1].text.strip()
        try:
            last_page_obj = thread.find_all(
                'li', attrs={'class': 'ipsPagination_last'})[0]
        except IndexError:
            # No "last page" link found: treat the thread as single-page.
            last_page = 1
        else:
            last_page = int(last_page_obj.text.strip())
        urls = self._parse_thread_pages(url, last_page)
        return {'urls': urls, 'title': title, 'page_count': last_page,
                'type': 'thread'}

    def _parse_thread_pages(self, url, last_page):
        """Return ``url`` followed by ``url?page=K`` for K in 2..``last_page``."""
        return [url] + [
            url + '?page={}'.format(number)
            for number in range(2, last_page + 1)]