201 lines
8.0 KiB
Python
201 lines
8.0 KiB
Python
#class and functions used for collecting information about posts
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
import re
|
|
|
|
def getText(post):
    """Return the plain text of a post's message body.

    Args:
        post: BeautifulSoup element for a single post.

    Returns:
        The text of the post's ``div.bbWrapper`` element, or an empty string
        when the post contains no such element.
    """
    body = post.find("div", class_="bbWrapper")
    # find() returns None on a miss; return "" instead of failing on None.text,
    # in line with the other helpers that return an empty value when the
    # element they look for is absent.
    if body is None:
        return ""
    return body.text
|
|
|
|
|
|
def getContent(post):
    """Placeholder for rendering a post's full content; always returns "".

    TODO: add functionality for printing out the post's text along with
    special markings for:
        - links
        - spoilers
        - pictures
        - reply quotes
        - quoted text
        - embedded video (youtube)
        - video

    Args:
        post: BeautifulSoup element for a single post (currently unused).

    Returns:
        An empty string until the functionality above is implemented.
    """
    return ""
|
|
|
|
|
|
def getAttachments(post):
    """Return absolute links to every attachment on a post.

    Args:
        post: BeautifulSoup element for a single post.

    Returns:
        A list of URLs, one per attachment that has a link. The list is empty
        when the post has no attachment container.
    """
    # Look for the attachments container; a post without one has no attachments.
    container = post.find("ul", class_="attachmentList") if post is not None else None
    if container is None:
        return []

    links = []
    for item in container.find_all("li", class_="attachment"):
        anchor = item.find("a")
        href = anchor.get("href") if anchor is not None else None
        # Skip an entry that has no usable link rather than discarding the
        # attachments that were already collected.
        if href:
            links.append("http://www.kiwifarms.net" + href)
    return links
|
|
|
|
|
|
def getRatings(post):
    """Return a dictionary with all of a post's ratings.

    The post's reactions page is downloaded and every listed reaction is
    parsed from its "<name> (<count>)" label. If the page cannot be
    downloaded, the user is asked on the console whether to retry or skip.

    Args:
        post: BeautifulSoup element for a single post.

    Returns:
        A dict mapping reaction name to its integer count. The dict is empty
        when the post has no reactions link, when the user chooses to skip
        after a failed download, or when the page lists no reactions.

    Raises:
        Errors raised while constructing the parser (for example when the
        "lxml" parser is not installed) propagate to the caller, as does
        KeyboardInterrupt.
    """
    # Try to get the link to the reactions page; without one there is nothing to fetch.
    try:
        href = post.find("a", class_="reactionsBar-link")["href"]
    except (AttributeError, KeyError, TypeError):
        return {}
    reactionsLink = "http://kiwifarms.net" + href

    page = None
    while page is None:
        try:
            # A timeout keeps a stalled connection from hanging the scraper forever.
            page = requests.get(reactionsLink, timeout=30).text
        except requests.RequestException:
            # Unable to reach the reactions page: ask the user whether to retry or skip.
            try:
                numberText = post.find("ul", class_="message-attribution-opposite--list").find_all("li")[-1].find("a").text
                postNumber = "".join(re.split("\n|\t", numberText)).split("#")[1]
            except (AttributeError, IndexError):
                postNumber = "?"  # post number could not be read from the markup
            print("Error: Unable to retrieve reactions for post # " + postNumber + ".")
            print("What would you like to do?")
            print("(1) retry (please wait a few minutes before you do this)")
            print("(2) skip")
            # input() returns a string, so compare against string choices.
            choice = input(": ").strip()
            while choice not in ("1", "2"):
                print("Error: Input must be a number between 1 and 2.")
                choice = input(": ").strip()
            if choice == "2":
                return {}
            # choice == "1": loop around and request the page again

    scroller = BeautifulSoup(page, "lxml").find("span", class_="hScroller-scroll")
    if scroller is None:
        return {}

    reactionsFound = {}  # dictionary of found reactions with their numbers
    for anchor in scroller.find_all("a"):
        label = anchor.find("span", class_="reaction-text")
        if label is None:
            continue
        # Labels look like "<name> (<count>)"; entries that do not match are skipped.
        parts = label.text.split(" (")
        try:
            reactionsFound[parts[0]] = int(parts[1][:-1])
        except (IndexError, ValueError):
            continue
    return reactionsFound
|
|
|
|
|
|
class PostData:
    """Stores data about a single forum post.

    An instance starts out empty. takeInfo() fills every field from the
    BeautifulSoup element of a post; reactionData() and getUsername() can be
    called beforehand to retrieve only the ratings or only the poster's name,
    and takeInfo() will then not retrieve them a second time.
    """

    # Rating names grouped by sentiment; addRatings() uses them to update the tallies.
    _POSITIVE = frozenset(("Like", "Agree", "Winner", "Informative", "Thunk-Provoking", "Feels", "Semper Fidelis", "Achievement"))
    _NEUTRAL = frozenset(("Islamic Content", "Lunacy", "Horrifying", "Optimistic", "Deviant", "DRINK!"))
    _NEGATIVE = frozenset(("Dislike", "Disagree", "Autistic", "TMI", "Late", "Dumb", "Mad at the Internet"))

    def __init__(self):
        self.reactionsEvaluated = False  #true when the post's reactions have already been evaluated
        self.usernameRetrieved = False  #true when the poster's name has already been retrieved

        #metadata
        self.postLink = ""  #link to post
        self.postNum = 0  #post's number in thread
        self.poster = ""  #user making post
        self.postDate = ""  #time of post
        self.edited = False  #true if post was edited
        self.editDate = "n/a"  #date of last edit
        self.ratings = {  #ratings received
            "Like": 0,
            "Dislike": 0,
            "Agree": 0,
            "Disagree": 0,
            "Winner": 0,
            "Informative": 0,
            "Thunk-Provoking": 0,
            "Feels": 0,
            "Islamic Content": 0,
            "Lunacy": 0,
            "Autistic": 0,
            "Horrifying": 0,
            "Optimistic": 0,
            "TMI": 0,
            "Late": 0,
            "Dumb": 0,
            "Mad at the Internet": 0,
            "Semper Fidelis": 0,
            "Deviant": 0,
            "Achievement": 0,
            "DRINK!": 0,
        }

        #content
        self.rawDat = ""  #raw HTML data from post
        self.rawText = ""  #text from post not including pictures/media
        self.content = ""  #all text/media from post
        self.attachedMedia = []  #links to any attached media

        #rating data
        self.positive = 0  #positive ratings
        self.neutral = 0  #neutral ratings
        self.negative = 0  #negative ratings
        self.weightedScore = 0  #positive ratings minus negative ratings
        self.totalScore = 0  #total number of ratings received

    def addRatings(self, newRatings):
        """Record the ratings in newRatings and update the rating tallies.

        Each count is stored under its name in self.ratings and added to the
        positive, neutral or negative tally for that name; every count is
        added to totalScore. Positive counts raise weightedScore and negative
        counts lower it. Names outside the three groups only affect
        self.ratings and totalScore.

        Args:
            newRatings: dict mapping rating name to its numeric count.
                Non-numeric values are ignored.
        """
        for key, value in newRatings.items():
            # Ignore a malformed count up front instead of storing it and
            # failing partway through the tally updates.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            self.ratings[key] = value  #adjust ratings dictionary appropriately
            if key in self._POSITIVE:
                self.positive += value
                self.weightedScore += value
            elif key in self._NEUTRAL:
                self.neutral += value
            elif key in self._NEGATIVE:
                self.negative += value
                self.weightedScore -= value
            self.totalScore += value

    def reactionData(self, infoBS):
        """Retrieve a post's ratings and return a summary of them.

        Useful for checking reactions before saving the rest of a post's data.

        Args:
            infoBS: BeautifulSoup element for a single post.

        Returns:
            A dict with the keys "positive", "neutral", "negative",
            "weighted score" and "total score".
        """
        self.addRatings(getRatings(infoBS))
        # Set the flag on the instance so takeInfo() does not fetch and count
        # the same ratings a second time.
        self.reactionsEvaluated = True
        return {
            "positive": self.positive,
            "neutral": self.neutral,
            "negative": self.negative,
            "weighted score": self.weightedScore,
            "total score": self.totalScore,
        }

    def getUsername(self, infoBS):
        """Retrieve the poster's username, save it to self.poster and return it.

        Useful for retrieving the username of the poster without having to
        save the rest of the post's data.

        Args:
            infoBS: BeautifulSoup element for a single post.

        Returns:
            The username found in the post.

        Raises:
            AttributeError: if neither username element is present.
        """
        nameTag = infoBS.find("span", class_="username")
        if nameTag is None:
            #if user is a guest, username data is stored in a different tag
            nameTag = infoBS.find("a", class_="username")
        self.poster = nameTag.text
        # Set the flag on the instance so takeInfo() can skip this lookup.
        self.usernameRetrieved = True
        return self.poster

    def takeInfo(self, infoBS):
        """Sort the data of a post's HTML into this object's fields.

        The username and ratings are only retrieved here when getUsername()
        or reactionData() has not already done so for this instance.

        Args:
            infoBS: BeautifulSoup element for a single post.
        """
        #get metadata
        self.postLink = "http://kiwifarms.net" + infoBS.find("a", class_="u-concealed")["href"]
        # The last entry of the attribution list holds the post number as
        # "#<n>", surrounded by whitespace and with thousands separators.
        numberText = infoBS.find("ul", class_="message-attribution-opposite--list").find_all("li")[-1].find("a").text
        self.postNum = int("".join(re.split("\n|\t", numberText)).split("#")[1].replace(",", ""))
        if not self.usernameRetrieved:
            self.getUsername(infoBS)
        self.postDate = infoBS.find("time", class_="u-dt")["data-date-string"]

        # find() returns None rather than raising when there is no edit
        # notice, so test the result explicitly.
        lastEdit = infoBS.find("div", class_="message-lastEdit")
        self.edited = lastEdit is not None
        if lastEdit is None:
            self.editDate = "n/a"
        else:
            # NOTE(review): presumably the edit notice wraps a <time> element
            # like the post header does; confirm against real markup. The
            # notice's own text is used when it does not.
            editTime = lastEdit.find("time")
            if editTime is not None and editTime.get("data-date-string"):
                self.editDate = editTime["data-date-string"]
            else:
                self.editDate = lastEdit.text.strip()

        #get content
        self.rawDat = str(infoBS)
        self.rawText = getText(infoBS)
        self.content = getContent(infoBS)
        self.attachedMedia = getAttachments(infoBS)

        #get rating data
        if not self.reactionsEvaluated:
            self.reactionData(infoBS)