80 lines
3.2 KiB
Python
80 lines
3.2 KiB
Python
#Scrapes kiwifarms.net threads for posts above a certain ratings threshold.
|
|
#This filter can be set by the user, with several available options.
|
|
|
|
#REQUIRED PACKAGES: bs4, requests
|
|
from bs4 import BeautifulSoup
|
|
import enum
|
|
from postData import PostData
|
|
from prepOutput import outputSelect
|
|
from funcs import getThread, stop_page, getFilter, reactionSelect, getRatingsThreshold, getUsername
|
|
from collectPosts import collectPosts
|
|
|
|
@enum.unique
class Filter(enum.Enum):
    """Enumeration of the filter types a user can select.

    The script builds a member from the integer returned by ``getFilter()``
    (``Filter(getFilter())``), so the numeric values below are part of the
    contract and must not be renumbered. ``enum.unique`` makes an accidental
    duplicate value fail at import time instead of silently creating an alias.
    """

    NA = 0  # not yet assigned
    positive = 1  # positive ratings
    negative = 2  # negative ratings
    neutral = 3  # neutral ratings
    total = 4  # total ratings
    specific = 5  # specific rating
    weighted = 6  # weighted rating
    user = 7  # specific user
|
|
|
|
###### Variables ######
# Module-level defaults. The script further down reassigns most of these
# once the user has answered the prompts.

# --- thread data ---
threadLink = ""   # link to thread
threadTitle = ""  # title of thread
pageData = ""     # placeholder; later holds the BeautifulSoup object storing the threadpage's HTML
pageNum = 0       # current page number
savedPosts = []   # PostData objects for the posts to be saved

# --- filter toggles ---
filterSelection = Filter.NA  # type of filter being used
ratingSelection = ""         # specific reaction to sort by (only used with the "specific" filter)
usernameFilter = ""          # username to filter by when applicable
minRating = 0                # minimum number of ratings/score for the filter
stopPage = 0                 # page number to stop searching at; 0 means no page specified

######################
|
|
|
|
# ---- Main script flow ----
# 1. ask for the thread link, 2. collect the filter preferences,
# 3. scrape the matching posts, 4. hand them to the output step.

# collect link to thread
print("Welcome to the Kiwi Scraper!\n")
print("Please provide the link to the thread you want analyzed below.")
print("\nPlease note that this program will start searching at the first threadpage that you link to,",
      "so if you'd like the thread analyzed starting at the frst page, please link to the first page of the thread; otherwise,",
      "provide a link to the first page you want scraped.")

pageData, threadTitle, threadLink = getThread()

print("------------------------------------------\nThread:", threadTitle)
print("------------------------------------------")

# check if user would like to stop at a certain page
stopPage = stop_page()

# collect user preference for filtering
# collect filter type (getFilter() supplies the value of a Filter member)
filterSelection = Filter(getFilter())

# if filtering by specific rating, ask which rating to use
if filterSelection == Filter.specific:
    ratingSelection = reactionSelect()

# if filtering by specific user, collect username
if filterSelection == Filter.user:
    usernameFilter = getUsername()

# collect ratings threshold if user is filtering by ratings;
# the username filter needs no threshold, so minRating keeps its default
print("------------------------------------------")
if filterSelection == Filter.specific:
    minRating = getRatingsThreshold(filterSelection, ratingSelection)
elif filterSelection != Filter.user:
    minRating = getRatingsThreshold(filterSelection)

# get current page number
# find() returns None when the element is absent (presumably the case for
# threads without page navigation), which surfaces as AttributeError; a
# non-numeric label surfaces as ValueError. Only those two fall back to page 1,
# so Ctrl+C / SystemExit are no longer swallowed as they were by a bare except.
try:
    pageNum = int(pageData.find("li", class_="pageNav-page--current").find("a").text)
except (AttributeError, ValueError):
    pageNum = 1

# scrape for posts
print("\nGrab some popcorn, this might take a while...\n")
savedPosts = collectPosts(
    pageData=pageData,
    filter=filterSelection,
    pageNum=pageNum,
    minRating=minRating,
    user=usernameFilter,
    stopPage=stopPage,
    ratingFilter=ratingSelection,
)

# ask user where to save json file and what to name it
print("------------------------------------------")

outputSelect(savedPosts)