111 lines
4.9 KiB
Python
111 lines
4.9 KiB
Python
#scrapes posts from Kiwi Farms thread and saves them to PostData objects
|
|
|
|
from postData import PostData
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
|
|
def collectInput(range):
    #collects user input for multiple choice questions, validates it, and returns response
    #
    #range = upper range for input; valid answers are the whole numbers 1..range inclusive
    #        (the name shadows the builtin range() but is kept so existing callers still work)
    #
    #keeps prompting with ": " until a valid answer is typed, then returns it as an int
    while True:
        choice = input(": ")
        #isdecimal() instead of isdigit(): isdigit() is also True for characters
        #such as "²" that int() cannot parse, which would raise ValueError here
        if choice.isdecimal() and 0 < int(choice) <= range:
            return int(choice)
        print(f"Error: Input must be a number between 1 and {range}. Try again.")
|
|
|
|
def printPost(post):
    #prints a two-line summary of a post (poster, date, post number and score totals)
    #followed by one line tallying each reaction the post received
    #
    #post = object exposing poster, postDate, postNum, totalScore, weightedScore,
    #       positive, negative, neutral and a ratings mapping of reaction -> count
    print(f"""
POST FOUND - User: {post.poster} | Date: {post.postDate} | #{post.postNum}
Total Reactions: {post.totalScore} Weighted: {post.weightedScore} Positive: {post.positive} Negative: {post.negative} Neutral: {post.neutral}"""
    )
    #reactions with a count of zero are left out of the tally line
    tally = "".join(
        name + ": " + str(count) + " "
        for name, count in post.ratings.items()
        if count > 0
    )
    print(tally)
|
|
|
|
def _passesFilter(filter, post, minRating, user, ratingFilter):
    #returns True when an already-populated PostData object satisfies the active filter
    #
    #only the attribute that belongs to the active filter is read, so a post that
    #had just its username collected is never asked for its reaction scores
    mode = filter.value
    if mode == 7: #user
        return post.poster == user
    if mode == 5: #specific rating
        return post.ratings[ratingFilter] >= minRating
    scoreAttr = {
        1: "positive",
        2: "negative",
        3: "neutral",
        4: "totalScore", #total ratings
        6: "weightedScore",
    }.get(mode)
    if scoreAttr is None: #unrecognised filter value: nothing passes
        return False
    return getattr(post, scoreAttr) >= minRating


def _fetchNextPage(href, timeout = 30):
    #downloads and parses the next page of the thread
    #
    # href = the "next" link's href, relative to http://kiwifarms.net
    # timeout = seconds to wait for the server; requests waits indefinitely
    #           when no timeout is given, so one is always passed
    #
    # returns the page as a BeautifulSoup object, or None if the user chose to
    # stop and keep what has been collected; raises SystemExit if the user quits
    url = "http://kiwifarms.net" + href
    while True:
        try:
            #only the network call is guarded; the response gets its own name so
            #a failure can never leave the caller's page object half-replaced
            response = requests.get(url, timeout = timeout)
        except requests.exceptions.RequestException:#this runs when connection can't be made to the next page
            print(f"""
Error: Connection timed out while trying to connect to page {href.split('-')[-1]}.

What would you like to do?
(1) retry (please confirm that the page is working before you do this)
(2) save what you have already
(3) quit
""")
            choice = collectInput(3)
            if choice == 1: #retry connection
                continue
            if choice == 2: #save what's collected so far
                return None
            raise SystemExit #quit
        return BeautifulSoup(response.text, 'lxml')


def collectPosts(pageData, filter, pageNum, minRating = 0, user = "", stopPage = 0, ratingFilter = ""):
    #scrapes for posts that pass the specified filter
    #keeps going until the last page in thread (when there is no "next" button to click),
    #stopPage is reached, or the user chooses to stop after a failed connection
    #
    # pageData = BeautifulSoup object representing first page to search
    # filter = filter being used (from Filter class in scraper.py)
    # pageNum = current page in thread
    # minRating = minimum rating needed to pass filter if filtering by ratings
    # user = user to filter by if filtering by user
    # stopPage = page to stop at; 0 if none specified
    # ratingFilter = rating to filter by if filtering by specific rating
    #
    # returns list of PostData objects pertaining to posts that passed the filter

    savedPosts = [] #posts saved (this will be returned)

    while True:
        #collect all posts in page as list of BS objects and analyze them one by one
        for post in pageData.find_all("article", class_ = "message"):
            indivPost = PostData()
            if filter.value == 7:
                #collect username if filtering by user
                indivPost.getUsername(post)
            else:
                #collect reactions if filtering by reactions
                indivPost.reactionData(post)

            if _passesFilter(filter, indivPost, minRating, user, ratingFilter):
                #if filter passed, save rest of post's data, add to savedPosts, and print that the post was found
                indivPost.takeInfo(post)
                savedPosts.append(indivPost)
                printPost(indivPost)

        if pageNum == stopPage:#stop scraping when user's stop-page has been scraped
            break

        nextLink = pageData.find("a", class_ = "pageNav-jump--next")
        if nextLink is None:#stop scraping when program can't progress to another page
            break

        nextPage = _fetchNextPage(nextLink['href'])
        if nextPage is None:#user chose to keep what has been collected so far
            break
        pageData = nextPage
        pageNum += 1

    print(f"\nScraping finished.\nPosts found: {len(savedPosts)}")

    return savedPosts