Files

111 lines
4.9 KiB
Python

#scrapes posts from a Kiwi Farms thread and saves them as PostData objects
from postData import PostData
from bs4 import BeautifulSoup
import requests
def collectInput(range):
    #collects user input for multiple choice questions, validates it, and returns response
    #
    # range = upper range for input (valid answers are the whole numbers 1..range inclusive)
    #
    # returns the validated answer as an int; keeps prompting until a valid answer is typed
    while True:
        #surrounding whitespace is ignored so that an answer like " 2 " is not rejected
        choice = input(": ").strip()
        #isdecimal() is used instead of isdigit() because isdigit() also accepts characters
        #such as "²" that int() cannot convert, which would crash with ValueError
        if choice.isdecimal() and 0 < int(choice) <= range:
            return int(choice)
        print(f"Error: Input must be a number between 1 and {range}. Try again.")
def printPost(post):
    #prints a short summary of a saved post: who posted it, when, its number in the thread,
    #its score totals, and finally one line listing each reaction it actually received
    #
    # post = object exposing poster, postDate, postNum, totalScore, weightedScore,
    #        positive, negative, neutral, and a ratings mapping of reaction name -> count
    headline = f"\nPOST FOUND - User: {post.poster} | Date: {post.postDate} | #{post.postNum}"
    totals = f"Total Reactions: {post.totalScore} Weighted: {post.weightedScore} Positive: {post.positive} Negative: {post.negative} Neutral: {post.neutral}"
    print(headline + "\n" + totals)
    #reactions with a count of zero were never received, so they are left out of the listing
    received = [
        reaction + ": " + str(num) + " "
        for reaction, num in post.ratings.items()
        if num > 0
    ]
    print("".join(received))
def _passesFilter(indivPost, filterValue, minRating, user, ratingFilter):
    #decides whether a single analyzed post satisfies the active filter
    #
    # indivPost = PostData object whose reaction data (or username, for the user filter) is already collected
    # filterValue = numeric value of the active filter
    # minRating, user, ratingFilter = same meaning as in collectPosts
    #
    # returns True if the post passes, False otherwise (including for unknown filter values)
    if filterValue == 1: #positive
        return indivPost.positive >= minRating
    if filterValue == 2: #negative
        return indivPost.negative >= minRating
    if filterValue == 3: #neutral
        return indivPost.neutral >= minRating
    if filterValue == 4: #total ratings
        return indivPost.totalScore >= minRating
    if filterValue == 5: #specific rating
        return indivPost.ratings[ratingFilter] >= minRating
    if filterValue == 6: #weighted score
        return indivPost.weightedScore >= minRating
    if filterValue == 7: #user
        return indivPost.poster == user
    return False


def collectPosts(pageData, filter, pageNum, minRating = 0, user = "", stopPage = 0, ratingFilter = ""):
    #scrapes for posts that pass the specified filter
    #keeps going until the last page in thread (when there is no "next" button to click),
    #until stopPage has been scraped, or until the user gives up on a page that can't be reached
    #
    # pageData = BeautifulSoup object representing first page to search
    # filter = filter being used (from Filter class in scraper.py); only its .value is read
    # pageNum = current page in thread
    # minRating = minimum rating needed to pass filter if filtering by ratings
    # user = user to filter by if filtering by user
    # stopPage = page to stop at; 0 if none specified
    # ratingFilter = rating to filter by if filtering by specific rating
    #
    # returns list of PostData objects pertaining to posts that passed the filter
    # raises SystemExit if the user chooses to quit after a failed connection
    requestTimeout = 30 #seconds; without a timeout a stalled connection would hang forever instead of prompting
    savedPosts = [] #posts saved (this will be returned)
    while True:
        #analyze every post on the current page
        for post in pageData.find_all("article", class_ = "message"):
            indivPost = PostData()
            if filter.value == 7:
                #filtering by user only needs the username
                indivPost.getUsername(post)
            else:
                #every other filter works on the post's reactions
                indivPost.reactionData(post)
            if _passesFilter(indivPost, filter.value, minRating, user, ratingFilter):
                #filter passed: save rest of post's data, keep the post, and report it
                indivPost.takeInfo(post)
                savedPosts.append(indivPost)
                printPost(indivPost)
        if pageNum == stopPage: #stop scraping when user's stop-page has been scraped
            break
        nextLink = pageData.find("a", class_ = "pageNav-jump--next")
        if nextLink is None: #stop scraping when program can't progress to another page
            break
        #the link is read once, before any request is made, so retries and the error message
        #never depend on a half-updated pageData
        nextHref = nextLink['href']
        nextPage = None
        while nextPage is None:
            try:
                response = requests.get("http://kiwifarms.net" + nextHref, timeout = requestTimeout)
            except requests.exceptions.RequestException:
                #only network failures land here; Ctrl+C and programming errors are no longer swallowed
                print(
                    "\n"
                    f"Error: Connection timed out while trying to connect to page {nextHref.split('-')[-1]}.\n"
                    "What would you like to do?\n"
                    "(1) retry (please confirm that the page is working before you do this)\n"
                    "(2) save what you have already\n"
                    "(3) quit\n"
                )
                choice = collectInput(3)
                if choice == 1: #retry connection
                    continue
                if choice == 2: #save what's collected so far
                    break
                raise SystemExit #quit (same exception quit() raises, without relying on the site module)
            else:
                nextPage = BeautifulSoup(response.text, 'lxml')
        if nextPage is None: #user chose to keep what has been collected so far
            break
        pageData = nextPage
        pageNum += 1
    print(f"\nScraping finished.\nPosts found: {len(savedPosts)}")
    return savedPosts