147 lines
5.2 KiB
Python
147 lines
5.2 KiB
Python
#collection of functions used by scraper.py along with some others
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
|
|
############# locally used functions ###########################
|
|
|
|
def collectInput(range):
    #collects user input for a numbered multiple choice question, re-prompting
    #until the answer is valid, and returns the response
    #
    #range = upper range for input (valid answers are 1..range inclusive)
    #        note: this name shadows the builtin range() inside the function;
    #        it is kept unchanged so existing callers are unaffected
    #
    #returns the validated choice as an int
    while True:
        choice = input(": ").strip()
        #isdecimal() instead of isdigit(): isdigit() is also true for
        #characters such as superscript digits ("²") which int() cannot
        #parse, so they would raise ValueError instead of re-prompting
        if choice.isdecimal():
            number = int(choice)
            if 0 < number <= range:
                return number
        print(f"Error: Input must be a number between 1 and {range}. Try again.")
|
|
|
|
|
|
def yesno(question):
    #collects and verifies yes/no input, re-prompting until the answer is
    #"y" or "n" (case-insensitive, surrounding whitespace ignored)
    #
    #question = question to ask (used as the input prompt)
    #
    #returns True if yes, False if no
    choice = input(question).strip().lower()
    while choice not in ("y", "n"):
        print("Invalid input. Try again.")
        choice = input(question).strip().lower()
    return choice == "y"
|
|
|
|
##################### functions used outside file ########################
|
|
|
|
def getThread():
    #asks for a thread link until one can be fetched and parsed, then
    #retrieves the thread's title from the page
    #
    #returns BS object for the page, thread title, and link to thread
    requestHeaders = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

    while True: #make sure provided link is valid by checking for a thread title
        #input() is outside the try so Ctrl-C / end of input stop the
        #program instead of being reported as a bad link and looping forever
        threadLink = input("\n: ")
        print("\nFetching thread...")
        try:
            #timeout so an unresponsive server cannot hang the prompt forever;
            #a timeout surfaces as a RequestException and is handled below
            response = requests.get(threadLink, headers = requestHeaders, timeout = 30)
            pageData = BeautifulSoup(response.text, "lxml")
            #find() returns None when the page has no thread title heading,
            #so .text raises AttributeError for pages that are not threads
            titleTag = pageData.find("h1", class_ = "p-title-value")
            threadTitle = titleTag.text
        except (requests.exceptions.RequestException, AttributeError):
            print("Error: There was either trouble reaching the webpage, or the provided link is invalid.\nTry again.")
            continue

        #remove label if it exists; only the title heading itself is searched
        #so a label elsewhere on the page cannot cause the title to be cut
        #NOTE(review): assumes the label is separated from the title text by a
        #non-breaking space, as the original code did - confirm against the site's markup
        if titleTag.find("span", class_ = "label") is not None:
            threadTitle = " ".join(threadTitle.split("\xa0")[1:])

        return pageData, threadTitle, threadLink
|
|
|
|
|
|
def stop_page():
    #collect page number to stop at if user has a preference
    #
    #returns page number to stop at or 0 if user wants all pages archived
    if not yesno("Would you like to stop at a certain page (y/n)?: "):
        return 0

    stopPage = input("What page do you want to stop at?: ").strip()
    #isdecimal() instead of isdigit(): isdigit() is also true for characters
    #such as superscript digits ("²") which int() cannot parse, so they
    #would raise ValueError instead of re-prompting
    while not stopPage.isdecimal():
        print("Input must be a number. Try again.")
        stopPage = input("What page do you want to stop at?: ").strip()
    return int(stopPage)
|
|
|
|
|
|
def getFilter():
    #shows the menu of available filters and asks the user to pick one
    #
    #returns the number (1-7) of the chosen filter
    filterMenu = """------------------------------------------
How would you like the thread to be filtered?

(1) positive ratings
(2) negative ratings
(3) neutral ratings
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
(7) specific user
"""
    print(filterMenu)
    optionCount = 7 #number of entries in the menu above
    return collectInput(optionCount)
|
|
|
|
|
|
def reactionSelect():
    #shows the numbered list of reactions and asks the user to pick one
    #
    #returns the name of the chosen reaction as a string
    reactionNames = (
        "Like", "Dislike", "Agree", "Disagree", "Winner", "Informative",
        "Thunk-Provoking", "Feels", "Islamic Content", "Lunacy", "Autistic",
        "Horrifying", "Optimistic", "TMI", "Late", "Dumb",
        "Mad at the Internet", "Semper Fidelis", "Deviant", "Achievement",
        "DRINK!",
    )
    reactionMenu = """------------------------------------------
Which rating do you want filtered?

(1) Like (2) Dislike
(3) Agree (4) Disagree
(5) Winner (6) Informative
(7) Thunk-Provoking (8) Feels
(9) Islamic Content (10) Lunacy
(11) Autistic (12) Horrifying
(13) Optimistic (14) TMI
(15) Late (16) Dumb
(17) Mad at the Internet (18) Semper Fidelis
(19) Deviant (20) Achievement
(21) DRINK!
"""
    print(reactionMenu)
    #menu numbers start at 1, tuple indices at 0
    selected = collectInput(len(reactionNames))
    return reactionNames[selected - 1]
|
|
|
|
|
|
def getUsername():
    #collects the username to filter by from user input, asking the user to
    #confirm it and starting over whenever they reject what they typed
    #
    #returns the confirmed username as a string
    while True: #loop instead of recursing so repeated retries cannot grow the call stack
        user = input("Enter username to filter by: ")
        #verify choice with user
        if yesno(f"User '{user}' selected. Is this correct? (Make sure you spelled it correctly!)\n: "):
            return user
|
|
|
|
|
|
def getRatingsThreshold(filter, reaction = ""):
    #collects ratings threshold based on filter value
    #
    #filter = Filter enum (from scraper.py); only its .value and .name are read here
    #         note: this name shadows the builtin filter() inside the function;
    #         it is kept unchanged so existing callers are unaffected
    #reaction = reaction to filter by if filtering by specific reaction
    #
    #returns user's designated ratings threshold as a non-negative int
    if filter.value == 6: #weighted score
        print("Enter a minimum score for posts.")
    elif filter.value == 5: #specific reaction
        print(f"Enter a minumum number of '{reaction}' ratings.")
    else: #other filters
        print(f"Enter a minumum number of {filter.name} ratings.")

    minRating = input(": ").strip()
    #isdecimal() instead of isdigit(): isdigit() is also true for characters
    #such as superscript digits ("²") which int() cannot parse, so they
    #would raise ValueError instead of re-prompting
    while not minRating.isdecimal():
        print("Error: Input must be a number.")
        minRating = input(": ").strip()
    return int(minRating)