Kiwi_Scraper/scraper/postData.py

#class and functions used for collecting information about posts

from bs4 import BeautifulSoup
import requests
import re

def getText(post):
    #looks up the post's "bbWrapper" div and hands back the text inside it
    #
    #post: parsed post element exposing a BeautifulSoup-style find()
    #raises AttributeError when the post has no "bbWrapper" div (find() gives back None)

    bodyWrapper = post.find("div", class_ = "bbWrapper")
    return bodyWrapper.text


def getContent(post):
    #placeholder: currently ignores the post and always gives back an empty string
    #
    #TODO: print out the post's text along with special markings for:
    # -links
    # -spoilers
    # -pictures
    # -reply quotes
    # -quoted text
    # -embedded video (youtube)
    # -video
    content = ""
    return content


def getAttachments(post):
    #returns links to any attachments from post
    #
    #post: parsed post element exposing BeautifulSoup-style find()/find_all()
    #returns a list of absolute attachment URLs; the list is empty when the post
    #has no "attachmentList" container
    #
    #an attachment entry without a usable <a href> is skipped on its own instead
    #of throwing away every link that was already collected

    try:
        attachContainer = post.find("ul", class_ = "attachmentList") #look for attachments container
        entries = attachContainer.find_all("li", class_ = "attachment") #retrieve individual attachments
    except AttributeError: #find() gave back None: no attachments were found
        return []

    attachments = []
    for entry in entries: #retrieve attachment links
        try:
            href = entry.find("a")["href"]
        except (AttributeError, TypeError, KeyError): #entry has no link, or link has no href
            continue
        attachments.append("http://www.kiwifarms.net" + href)

    return attachments


def _postNumberLabel(post):
    #returns the post's number (text after "#") for use in messages, or "?" if it can't be read

    try:
        label = post.find("ul", class_ = "message-attribution-opposite--list").find_all("li")[-1].find("a").text
        return "".join(re.split("\n|\t", label)).split("#")[1]
    except (AttributeError, IndexError, TypeError):
        return "?"


def _askRetryOrSkip():
    #asks the user whether to retry or skip, repeating until the answer is valid
    #returns 1 for retry or 2 for skip

    print("What would you like to do?")
    print("(1) retry (please wait a few minutes before you do this)")
    print("(2) skip")
    choice = input(": ").strip()
    while choice not in ("1", "2"): #input() gives back a string, so compare against strings
        print("Error: Input must be a number between 1 and 2.")
        choice = input(": ").strip()
    return int(choice)


def getRatings(post, timeout = 30):
    #returns a dictionary with all of a post's ratings
    #
    #post: parsed post element exposing BeautifulSoup-style find()/find_all()
    #timeout: seconds to wait for the reactions page before treating the request as failed
    #returns {reaction name : count}; the dictionary is empty when the post has no
    #reactions link, when the user chooses to skip after a failed request, or when
    #the reactions page does not contain the expected reaction list

    try:
        reactionsLink = "http://kiwifarms.net" + post.find("a", class_ = "reactionsBar-link")["href"] #try to get link to reactions page
    except (AttributeError, TypeError, KeyError): #if no reactions link found, return an empty dictionary
        return {}

    while True:
        try:
            reactionsPage = requests.get(reactionsLink, timeout = timeout).text
            break
        except requests.RequestException: #if program is unable to reach reactions page, ask user if they want to skip or retry
            print("Error: Unable to retrieve reactions for post # " + _postNumberLabel(post) + ".")
            if _askRetryOrSkip() == 2:
                return {} #skip: report no ratings for this post
            #otherwise loop around and retry the request

    reactionsPage = BeautifulSoup(reactionsPage, "lxml")
    try:
        reactions = reactionsPage.find("span", class_ = "hScroller-scroll").find_all("a")
    except AttributeError: #page has no reaction list
        return {}

    reactionsFound = {} #dictionary of found reactions with their numbers
    for r in reactions:
        try: #for each reaction listed on the reaction page, make a relevant addition to reactionsFound
            foundReaction = r.find("span", class_ = "reaction-text").text.split(" (")
            #text is split at " (" into the name and "count)"; [:-1] drops the closing bracket
            reactionsFound[foundReaction[0]] = int(foundReaction[1][:-1])
        except (AttributeError, IndexError, ValueError): #entry is not a "name (count)" reaction
            continue

    return reactionsFound


class PostData:
    #stores data about a single post: metadata, content, and rating totals

    #rating names grouped by how they count towards the score totals
    _POSITIVE = ("Like", "Agree", "Winner", "Informative", "Thunk-Provoking", "Feels", "Semper Fidelis", "Achievement")
    _NEUTRAL = ("Islamic Content", "Lunacy", "Horrifying", "Optimistic", "Deviant", "DRINK!")
    _NEGATIVE = ("Dislike", "Disagree", "Autistic", "TMI", "Late", "Dumb", "Mad at the Internet")

    def __init__(self):
        self.reactionsEvaluated = False #true when the post's reactions have already been evaluated
        self.usernameRetrieved = False #true when the poster's name has already been retrieved

        #metadata
        self.postLink = "" #link to post
        self.postNum = 0 #post's number in thread
        self.poster = "" #user making post
        self.postDate = "" #time of post
        self.edited = False #true if post was edited
        self.editDate = "n/a" #date of last edit
        self.ratings = { #ratings received
            "Like" : 0,
            "Dislike" : 0,
            "Agree" : 0,
            "Disagree" : 0,
            "Winner" : 0,
            "Informative" : 0,
            "Thunk-Provoking" : 0,
            "Feels" : 0,
            "Islamic Content" : 0,
            "Lunacy" : 0,
            "Autistic" : 0,
            "Horrifying" : 0,
            "Optimistic" : 0,
            "TMI" : 0,
            "Late" : 0,
            "Dumb" : 0,
            "Mad at the Internet" : 0,
            "Semper Fidelis" : 0,
            "Deviant" : 0,
            "Achievement" : 0,
            "DRINK!" : 0
        }
        #content
        self.rawDat = "" #raw HTML data from post
        self.rawText = "" #text from post not including pictures/media
        self.content = "" #all text/media from post
        self.attachedMedia = [] #links to any attached media
        #rating data
        self.positive = 0 #positive ratings
        self.neutral = 0 #neutral ratings
        self.negative = 0 #negative ratings
        self.weightedScore = 0 #score accounting for positive and negative ratings, where negatives count as negative points
        self.totalScore = 0 #total number of ratings received

    def addRatings(self, newRatings):
        #takes in new ratings from the passed in dictionary ({rating name : count})
        #and adjusts both the ratings and ratings data accordingly
        #
        #the count for each name overwrites its entry in self.ratings, while the
        #positive/neutral/negative/weighted/total figures are added to
        #names outside the three groups only count towards totalScore
        #entries whose count can't be added to the totals are skipped

        for key, value in newRatings.items():
            try:
                self.ratings[key] = value #adjust ratings dictionary appropriately
                if key in self._POSITIVE: #rating is positive
                    self.positive += value
                    self.weightedScore += value
                elif key in self._NEUTRAL: #rating is neutral
                    self.neutral += value
                elif key in self._NEGATIVE: #rating is negative
                    self.negative += value
                    self.weightedScore -= value
                self.totalScore += value
            except TypeError: #count is not a number
                continue

    def reactionData(self, infoBS):
        #takes passed in post info from BeautifulSoup, processes it into the class,
        #and returns a dictionary with its basic reactions data (no specifics)
        #
        #useful for checking reactions before saving the rest of a post's data
        #
        #marks the reactions as evaluated on this instance so that takeInfo
        #does not fetch and add them a second time

        self.addRatings(getRatings(infoBS))
        self.reactionsEvaluated = True
        return {"positive" : self.positive, "neutral" : self.neutral, "negative" : self.negative, "weighted score" : self.weightedScore, "total score" : self.totalScore}

    def getUsername(self, infoBS):
        #retrieves the username of the poster and saves it to self.poster
        #returns username retrieved from post
        #
        #useful for retrieving username of poster without having to save the rest of the post's data
        #
        #marks the username as retrieved on this instance so that takeInfo
        #does not look it up again

        try:
            self.poster = infoBS.find("span", class_ = "username").text
        except AttributeError:#if user is a guest, username data is stored in a different tag
            self.poster = infoBS.find("a", class_ = "username").text
        self.usernameRetrieved = True
        return self.poster


    def takeInfo(self, infoBS):
        #intakes BeautifulSoup object representing the HTML for a post
        #and sorts its data into the proper variables
        #
        #the username and reactions are only collected here if getUsername /
        #reactionData have not already been run on this instance

        #get metadata
        self.postLink = "http://kiwifarms.net" + infoBS.find("a", class_ = "u-concealed")["href"]
        attribution = infoBS.find("ul", class_ = "message-attribution-opposite--list")
        numberLabel = attribution.find_all("li")[-1].find("a").text
        #strip newlines/tabs, keep what follows "#", and drop thousands separators
        self.postNum = int("".join(re.split("\n|\t", numberLabel)).split("#")[1].replace(',',''))
        if not self.usernameRetrieved:
            self.getUsername(infoBS)
        self.postDate = infoBS.find("time", class_ = "u-dt")["data-date-string"]
        #find() gives back None rather than raising when there is no edit marker
        lastEdit = infoBS.find("div", class_ = "message-lastEdit")
        if lastEdit is None:
            self.edited = False
            self.editDate = "n/a"
        else:
            self.edited = True
            self.editDate = lastEdit #NOTE(review): this stores the element itself, not a date string
        #get content
        self.rawDat = str(infoBS)
        self.rawText = getText(infoBS)
        self.content = getContent(infoBS)
        self.attachedMedia = getAttachments(infoBS)
        #get rating data
        if not self.reactionsEvaluated:
            self.reactionData(infoBS)