# Files
#
# 201 lines
# 8.0 KiB
# Python

#class and functions used for collecting information about posts
from bs4 import BeautifulSoup
import requests
import re
def getText(post):
    """Return the plain text of a post's message body.

    Args:
        post: BeautifulSoup element for a single post (any object exposing a
            compatible ``find`` method works).

    Returns:
        The text of the first ``div`` with class ``bbWrapper`` inside the
        post, or an empty string when the post has no such element.
    """
    wrapper = post.find("div", class_="bbWrapper")
    # find() yields None when nothing matches; guard instead of letting
    # the attribute access blow up with AttributeError.
    if wrapper is None:
        return ""
    return wrapper.text
def getContent(post):
    """Return the full content of a post (text plus media markers).

    Not implemented yet: an empty string is returned for every post.
    """
    # TODO: add functionality for printing out the post's text along with
    # special markings for:
    #   - links
    #   - spoilers
    #   - pictures
    #   - reply quotes
    #   - quoted text
    #   - embedded video (youtube)
    #   - video
    fullContent = ""
    return fullContent
def getAttachments(post):
    """Return absolute links to the attachments found in a post.

    Args:
        post: BeautifulSoup element for a single post (any object exposing a
            compatible ``find`` method works). ``None`` is tolerated.

    Returns:
        A list of URL strings, one per attachment entry that carries a link,
        in document order. Empty when the post has no attachment list.
    """
    if post is None:
        return []
    attachContainer = post.find("ul", class_="attachmentList")
    if attachContainer is None:  # the post has no attachments at all
        return []

    attachments = []
    for entry in attachContainer.find_all("li", class_="attachment"):
        anchor = entry.find("a")
        if anchor is None:
            # Skip a malformed entry instead of discarding every link
            # collected so far.
            continue
        try:
            href = anchor["href"]
        except KeyError:  # anchor without a target
            continue
        attachments.append("http://www.kiwifarms.net" + href)
    return attachments
def getRatings(post):
    """Return a post's ratings as a ``{reaction name: count}`` dictionary.

    The post's reactions page is downloaded with ``requests`` and parsed with
    BeautifulSoup. When the download fails, the user is asked interactively
    whether to retry or to skip the post.

    Args:
        post: BeautifulSoup element for a single post (any object exposing a
            compatible ``find`` method works).

    Returns:
        A dictionary mapping each reaction name listed on the reactions page
        to its integer count. Empty when the post has no reactions link, when
        the user chooses to skip, or when the page lists no reactions.
    """
    try:
        reactionsLink = "http://kiwifarms.net" + post.find("a", class_="reactionsBar-link")["href"]
    except (AttributeError, TypeError, KeyError):
        # No reactions bar (or no usable link) means there is nothing to fetch.
        return {}

    while True:
        try:
            reactionsPage = requests.get(reactionsLink).text
            break
        except requests.RequestException:
            # Only network-level failures trigger the prompt; Ctrl-C and
            # programming errors are no longer swallowed here.
            if _promptRetryOrSkip(post) == 2:
                return {}

    try:
        reactionLinks = (
            BeautifulSoup(reactionsPage, "lxml")
            .find("span", class_="hScroller-scroll")
            .find_all("a")
        )
    except AttributeError:  # the page has no reactions scroller
        return {}

    reactionsFound = {}  # reaction name -> number of times it was given
    for link in reactionLinks:
        try:
            # Labels look like "Name (count)"; anything else is skipped.
            name, _, tail = link.find("span", class_="reaction-text").text.partition(" (")
            reactionsFound[name] = int(tail[:-1])
        except (AttributeError, ValueError):
            continue
    return reactionsFound


def _promptRetryOrSkip(post):
    """Ask whether to retry or skip a failed reactions download; return 1 or 2."""
    try:
        label = (
            post.find("ul", class_="message-attribution-opposite--list")
            .find_all("li")[-1]
            .find("a")
            .text
        )
        postNum = "".join(re.split("\n|\t", label)).split("#")[1]
    except (AttributeError, IndexError):
        # The post number is only used in the message; don't fail over it.
        postNum = "unknown"
    print("Error: Unable to retrieve reactions for post # " + postNum + ".")
    print("What would you like to do?")
    print("(1) retry (please wait a few minutes before you do this)")
    print("(2) skip")
    choice = input(": ").strip()
    # input() returns a string, so compare against strings before converting.
    while choice not in ("1", "2"):
        print("Error: Input must be a number between 1 and 2.")
        choice = input(": ").strip()
    return int(choice)
class PostData:
    """Stores the metadata, content and rating statistics of one forum post."""

    # Reaction names grouped by how they affect the post's scores.
    _POSITIVE = frozenset([
        "Like", "Agree", "Winner", "Informative", "Thunk-Provoking",
        "Feels", "Semper Fidelis", "Achievement",
    ])
    _NEUTRAL = frozenset([
        "Islamic Content", "Lunacy", "Horrifying", "Optimistic",
        "Deviant", "DRINK!",
    ])
    _NEGATIVE = frozenset([
        "Dislike", "Disagree", "Autistic", "TMI", "Late", "Dumb",
        "Mad at the Internet",
    ])

    def __init__(self):
        """Create an empty record with every rating count at zero."""
        self.reactionsEvaluated = False  # True once the post's reactions have been evaluated
        self.usernameRetrieved = False  # True once the poster's name has been retrieved
        # metadata
        self.postLink = ""  # link to post
        self.postNum = 0  # post's number in thread
        self.poster = ""  # user making post
        self.postDate = ""  # time of post
        self.edited = False  # True if post was edited
        self.editDate = "n/a"  # date of last edit
        self.ratings = {  # ratings received
            "Like": 0,
            "Dislike": 0,
            "Agree": 0,
            "Disagree": 0,
            "Winner": 0,
            "Informative": 0,
            "Thunk-Provoking": 0,
            "Feels": 0,
            "Islamic Content": 0,
            "Lunacy": 0,
            "Autistic": 0,
            "Horrifying": 0,
            "Optimistic": 0,
            "TMI": 0,
            "Late": 0,
            "Dumb": 0,
            "Mad at the Internet": 0,
            "Semper Fidelis": 0,
            "Deviant": 0,
            "Achievement": 0,
            "DRINK!": 0,
        }
        # content
        self.rawDat = ""  # raw HTML data from post
        self.rawText = ""  # text from post not including pictures/media
        self.content = ""  # all text/media from post
        self.attachedMedia = []  # links to any attached media
        # rating data
        self.positive = 0  # positive ratings
        self.neutral = 0  # neutral ratings
        self.negative = 0  # negative ratings
        self.weightedScore = 0  # positives minus negatives (neutral ratings do not count)
        self.totalScore = 0  # total number of ratings received

    def addRatings(self, newRatings):
        """Record the ratings in ``newRatings`` and update the score totals.

        Args:
            newRatings: dictionary mapping a reaction name to its count.

        Each count is stored in ``self.ratings`` under its name (replacing
        the previous value) and added to the positive/neutral/negative
        totals, the weighted score and the total score. Names that belong to
        no category only affect ``totalScore``.
        """
        for key, value in newRatings.items():
            try:
                # NOTE: the per-reaction count is overwritten while the
                # totals below accumulate.
                self.ratings[key] = value
                if key in self._POSITIVE:
                    self.positive += value
                    self.weightedScore += value
                elif key in self._NEUTRAL:
                    self.neutral += value
                elif key in self._NEGATIVE:
                    self.negative += value
                    self.weightedScore -= value
                self.totalScore += value
            except TypeError:
                # A non-numeric count cannot be added to the totals; skip it.
                continue

    def reactionData(self, infoBS):
        """Fetch and store a post's ratings, returning the summary figures.

        Useful for checking reactions before saving the rest of a post's data.

        Args:
            infoBS: BeautifulSoup element for the post.

        Returns:
            A dictionary with the keys "positive", "neutral", "negative",
            "weighted score" and "total score".
        """
        self.addRatings(getRatings(infoBS))
        # Set the flag on the instance (not a module global) so takeInfo()
        # does not fetch and count the same ratings a second time.
        self.reactionsEvaluated = True
        return {
            "positive": self.positive,
            "neutral": self.neutral,
            "negative": self.negative,
            "weighted score": self.weightedScore,
            "total score": self.totalScore,
        }

    def getUsername(self, infoBS):
        """Retrieve the poster's username, store it in ``self.poster`` and return it.

        Useful for getting the poster's name without saving the rest of the
        post's data.

        Args:
            infoBS: BeautifulSoup element for the post.

        Raises:
            AttributeError: if neither username element is present.
        """
        nameTag = infoBS.find("span", class_="username")
        if nameTag is None:
            # If the user is a guest, the username is stored in a different tag.
            nameTag = infoBS.find("a", class_="username")
        self.poster = nameTag.text
        self.usernameRetrieved = True
        return self.poster

    def takeInfo(self, infoBS):
        """Populate this record from the BeautifulSoup element of one post.

        Fills in the metadata, the content fields and, unless they were
        already evaluated through reactionData(), the rating data.

        Args:
            infoBS: BeautifulSoup element representing the post's HTML.
        """
        # get metadata
        self.postLink = "http://kiwifarms.net" + infoBS.find("a", class_="u-concealed")["href"]
        numberLabel = (
            infoBS.find("ul", class_="message-attribution-opposite--list")
            .find_all("li")[-1]
            .find("a")
            .text
        )
        # The label looks like "#1,234" surrounded by newlines/tabs.
        self.postNum = int("".join(re.split("\n|\t", numberLabel)).split("#")[1].replace(",", ""))
        if not self.usernameRetrieved:
            self.getUsername(infoBS)
        self.postDate = infoBS.find("time", class_="u-dt")["data-date-string"]

        # find() returns None (it does not raise) when the post was never
        # edited, so test the result explicitly.
        editNote = infoBS.find("div", class_="message-lastEdit")
        if editNote is None:
            self.edited = False
            self.editDate = "n/a"
        else:
            self.edited = True
            try:
                # Assumes the edit note embeds the same <time class="u-dt">
                # element that carries the post date - TODO confirm.
                self.editDate = editNote.find("time", class_="u-dt")["data-date-string"]
            except (AttributeError, TypeError, KeyError):
                self.editDate = editNote.text.strip()

        # get content
        self.rawDat = str(infoBS)
        self.rawText = getText(infoBS)
        self.content = getContent(infoBS)
        self.attachedMedia = getAttachments(infoBS)

        # get rating data
        if not self.reactionsEvaluated:
            self.reactionData(infoBS)