# Kiwi_Scraper/scraper/collectPosts.py

#scrapes posts from Kiwi Farms thread and saves them to PostData objects

from postData import PostData
from bs4 import BeautifulSoup
import requests

def collectInput(range):
    #collects user input for multiple choice questions, validates it, and returns response
    #keeps prompting until a valid answer is entered
    #
    # range = upper range for input; valid answers are the whole numbers 1..range inclusive
    #         (the name shadows the builtin, but it is kept because callers may pass it by keyword)
    #
    # returns the validated choice as an int
    while True:
        choice = input(": ")
        #isdecimal() rather than isdigit(): isdigit() also accepts characters such as "²"
        #that int() cannot convert, which would crash with ValueError instead of re-prompting
        if choice.isdecimal() and 0 < int(choice) <= range:
            return int(choice)
        print(f"Error: Input must be a number between 1 and {range}. Try again.")

def printPost(post):
    #writes a short summary of a post to stdout:
    #a header line (user, date, post number), the score totals, then one line of per-reaction counts
    #
    # post = object exposing poster, postDate, postNum, totalScore, weightedScore,
    #        positive, negative, neutral and a ratings mapping of reaction name -> count
    summary = f"""
POST FOUND - User: {post.poster} | Date: {post.postDate} | #{post.postNum}
Total Reactions: {post.totalScore}  Weighted: {post.weightedScore}  Positive: {post.positive}  Negative: {post.negative}  Neutral: {post.neutral}"""
    print(summary)

    #only reactions that were actually received are listed; each entry keeps its trailing two spaces
    received = [
        name + ": " + str(count) + "  "
        for name, count in post.ratings.items()
        if count > 0
    ]
    print("".join(received))

def collectPosts(pageData, filter, pageNum, minRating = 0, user = "", stopPage = 0, ratingFilter = "", timeout = 30):
    #scrapes for posts that pass the specified filter
    #keeps going until the last page in thread (when there is no "next" button to click) or stopPage reached
    #
    # pageData = BeautifulSoup object representing first page to search
    # filter = filter being used (from Filter class in scraper.py)
    # pageNum = current page in thread
    # minRating = minimum rating needed to pass filter if filtering by ratings
    # user = user to filter by if filtering by user
    # stopPage = page to stop at; 0 if none specified
    # ratingFilter = rating to filter by if filtering by specific rating
    # timeout = seconds to wait on each request for the next page before treating it as failed
    #
    # returns list of PostData objects pertaining to posts that passed the filter
    #
    # raises SystemExit if the user chooses to quit after a failed connection

    savedPosts = [] #posts saved (this will be returned)

    while True:
        posts = pageData.find_all("article", class_ = "message") #collect all posts in page as list of BS objects

        for post in posts: #analyze individual posts
            indivPost = PostData()
            if filter.value == 7:
                #collect username if filtering by user
                indivPost.getUsername(post)
            else:
                #collect reactions if filtering by reactions
                indivPost.reactionData(post)

            if (#check if post passes filter
            (filter.value == 1 and indivPost.positive >= minRating) #positive
            or (filter.value == 2 and indivPost.negative >= minRating) #negative
            or (filter.value == 3 and indivPost.neutral >= minRating) #neutral
            or (filter.value == 4 and indivPost.totalScore >= minRating) #total ratings
            or (filter.value == 6 and indivPost.weightedScore >= minRating) #weighted score
            or (filter.value == 5 and indivPost.ratings[ratingFilter] >= minRating) #specific rating
            or (filter.value == 7 and indivPost.poster == user) #user
            ):#if filter passed, save rest of post's data, add to savedPosts, and print that the post was found
                indivPost.takeInfo(post)
                savedPosts.append(indivPost)
                printPost(indivPost)

        if pageNum == stopPage:#stop scraping when user's stop-page has been scraped
            break

        nextLink = pageData.find("a", class_ = "pageNav-jump--next")
        if nextLink is None:#stop scraping when program can't progress to another page
            break

        nextUrl = "http://kiwifarms.net" + nextLink['href']
        nextPage = None #stays None if the user gives up on fetching the next page
        while nextPage is None:
            try:
                #the response is kept in its own variable so pageData is only replaced once
                #the next page has been fetched and parsed successfully
                response = requests.get(nextUrl, timeout = timeout)
            except requests.exceptions.RequestException:
                #only network-level failures land here; a bare except would also swallow
                #Ctrl+C and unrelated bugs and misreport them as connection problems
                print(f"""
Error: Connection timed out while trying to connect to page {nextLink['href'].split('-')[-1]}.

What would you like to do?
(1) retry (please confirm that the page is working before you do this)
(2) save what you have already
(3) quit
                    """)
                choice = collectInput(3)
                if choice == 1: #retry connection
                    continue
                elif choice == 2: #save what's collected so far
                    break
                else: #quit
                    raise SystemExit
            else:
                nextPage = BeautifulSoup(response.text, 'lxml')

        if nextPage is None:#user chose to keep what has been collected so far
            break

        pageData = nextPage
        pageNum += 1

    print(f"\nScraping finished.\nPosts found: {len(savedPosts)}")

    return savedPosts