Kiwi_Scraper/scraper/funcs.py

#collection of functions used by scraper.py along with some others
from bs4 import BeautifulSoup
import requests

############# locally used functions ###########################

def collectInput(range):
    #prompts for a multiple choice answer and keeps asking until a valid one is entered
    #
    #range = highest option number the user may pick (options start at 1)
    #
    #returns the chosen option as an int between 1 and range (inclusive)
    while True:
        choice = input(": ").strip()
        #isdecimal is used instead of isdigit: isdigit also accepts characters
        #such as superscript digits, which int() cannot convert (ValueError)
        if choice.isdecimal() and 0 < int(choice) <= range:
            return int(choice)
        print(f"Error: Input must be a number between 1 and {range}. Try again.")


def yesno(question):
    #asks a yes/no question and repeats it until the reply is "y" or "n" (any letter case)
    #
    #question = prompt text shown to the user
    #
    #returns True for yes, False for no
    answer = input(question).lower()
    while answer not in ("y", "n"):
        print("Invalid input. Try again.")
        answer = input(question).lower()
    return answer == "y"

##################### functions used outside file ########################

def getThread():
    #asks the user for a thread link until one yields a page that has a thread title
    #
    #returns the page as a BeautifulSoup object, the thread title, and the link that was entered

    requestHeaders = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

    while True:#make sure provided link is valid by checking for a thread title
        #the prompt sits outside the try block so that Ctrl+C or a closed stdin
        #ends the program instead of being reported as a bad link and looping forever
        threadLink = input("\n: ")
        print("\nFetching thread...")
        try:
            #timeout keeps an unresponsive server from hanging the prompt indefinitely
            response = requests.get(threadLink, headers = requestHeaders, timeout = 30)
            pageData = BeautifulSoup(response.text, "lxml")
            #find() returns None when the page has no title heading, so .text raises AttributeError
            threadTitle = pageData.find("h1", class_ = "p-title-value").text
        except (requests.RequestException, AttributeError, ValueError):
            #ValueError is included so a malformed link is reported here rather than crashing the prompt
            print("Error: There was either trouble reaching the webpage, or the provided link is invalid.\nTry again.")
            continue

        #remove label if it exists: drop everything up to the first non-breaking space
        #NOTE(review): the label lookup searches the whole page, not just the title heading - confirm
        #that a label elsewhere on the page cannot strip text from an unlabelled title
        if pageData.find("span", class_ = "label") is not None:
            threadTitle = " ".join(threadTitle.split("\xa0")[1:])

        return pageData, threadTitle, threadLink


def stop_page():
    #asks whether the user wants to stop at a certain page and, if so, which one
    #
    #returns page number to stop at, or 0 if the user does not ask for a stopping page

    if not yesno("Would you like to stop at a certain page (y/n)?: "):
        return 0

    stopPage = input("What page do you want to stop at?: ").strip()
    #isdecimal is used instead of isdigit: isdigit also accepts characters
    #such as superscript digits, which int() cannot convert (ValueError)
    while not stopPage.isdecimal():
        print("Input must be a number. Try again.")
        stopPage = input("What page do you want to stop at?: ").strip()

    return int(stopPage)


def getFilter():
    #shows the menu of available filters and reads the user's pick
    #
    #returns the number (1-7) of the chosen filter

    filterMenu = """------------------------------------------
How would you like the thread to be filtered?

(1) positive ratings
(2) negative ratings
(3) neutral ratings
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
(7) specific user
    """
    optionCount = 7 #number of entries in the menu above

    print(filterMenu)
    return collectInput(optionCount)


def reactionSelect():
    #shows a numbered list of reactions and lets the user pick one
    #
    #returns the name of the picked reaction as a string

    #reaction names in the same order as the numbers shown in the menu
    reactionNames = (
        "Like", "Dislike", "Agree", "Disagree", "Winner", "Informative", "Thunk-Provoking",
        "Feels", "Islamic Content", "Lunacy", "Autistic", "Horrifying", "Optimistic", "TMI",
        "Late", "Dumb", "Mad at the Internet", "Semper Fidelis", "Deviant", "Achievement", "DRINK!",
    )
    reactionMenu = """------------------------------------------
Which rating do you want filtered?

(1)   Like                 (2)   Dislike
(3)   Agree                (4)   Disagree
(5)   Winner               (6)   Informative
(7)   Thunk-Provoking      (8)   Feels
(9)   Islamic Content      (10)  Lunacy
(11)  Autistic             (12)  Horrifying
(13)  Optimistic           (14)  TMI
(15)  Late                 (16)  Dumb
(17)  Mad at the Internet  (18)  Semper Fidelis
(19)  Deviant              (20)  Achievement
(21)  DRINK!
    """

    print(reactionMenu)
    picked = collectInput(21)
    #menu numbers start at 1, tuple indexes start at 0
    return reactionNames[picked - 1]


def getUsername():
    #asks for a username and has the user confirm it, asking again until it is confirmed
    #
    #returns the confirmed username exactly as it was typed

    while True:
        name = input("Enter username to filter by: ")#user input for name to filter by
        #verify choice with user
        confirmed = yesno(f"User '{name}' selected. Is this correct? (Make sure you spelled it correctly!)\n: ")
        if confirmed:
            return name


def getRatingsThreshold(filter, reaction = ""):
    #asks the user for a ratings threshold, wording the prompt according to the filter in use
    #
    #filter = Filter enum (from scraper.py); only its .value and .name are read here
    #reaction = reaction to filter by if filtering by specific reaction
    #
    #returns user's designated ratings threshold as a non-negative int

    if filter.value == 6: #weighted score
        print("Enter a minimum score for posts.")
    elif filter.value == 5: #specific reaction
        print(f"Enter a minimum number of '{reaction}' ratings.")
    else: #other filters
        print(f"Enter a minimum number of {filter.name} ratings.")

    minRating = input(": ").strip()
    #isdecimal is used instead of isdigit: isdigit also accepts characters
    #such as superscript digits, which int() cannot convert (ValueError)
    #signed input such as "-5" is rejected, so the threshold is never negative
    while not minRating.isdecimal():
        print("Error: Input must be a number.")
        minRating = input(": ").strip()
    return int(minRating)