Files
2019-09-04 00:22:24 -07:00

291 lines
7.4 KiB
Go

package main
import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/rs/zerolog/log"
)
// errDataMissing is returned when a decoded response does not contain the
// field a caller was looking for (for example "account" or "id").
var errDataMissing = errors.New("data missing from response")

// errDomainFailed is returned by checkBadConn once the bad-response counter
// for the domain exceeds its threshold.
var errDomainFailed = errors.New("domain failed too many times")
// DomainWorker holds the per-domain state used by the request methods in this
// file: the target host, request header/timeout settings, rate-limit
// bookkeeping and a counter of bad HTTP responses.
type DomainWorker struct {
// hub is the WorkerHub handed to NewDomainWorker. It is stored but not
// referenced by the code visible here.
hub *WorkerHub
// mu is locked by getHTTPBody and getGoQueryDoc for the duration of a
// request, and by checkBadConn while it decrements badConnCount.
mu sync.RWMutex
// rateLimit is the minimum time that must have elapsed since lastReq before
// the next request is sent; shorter gaps are slept away.
rateLimit time.Duration
// lastReq is the timestamp the rate limiter measures from. It is written by
// getHTTPBody and getGoQueryDoc.
lastReq time.Time
// domain is the host name interpolated into "https://<domain>/..." URLs.
domain string
// userAgent is sent as the User-Agent header on every request.
userAgent string
// httpTimeout is used as the http.Client Timeout for each request.
httpTimeout time.Duration
// badConnCount is incremented when a response has status 401, 403, 410 or
// 500. checkBadConn decrements it after the cooldown and reports
// errDomainFailed while it is greater than 3.
badConnCount uint
// badConnLast is the time of the most recent response that incremented
// badConnCount.
badConnLast time.Time
// badConnCooldown is how long after badConnLast checkBadConn waits before
// it starts decrementing badConnCount. NewDomainWorker sets it to 4 minutes.
badConnCooldown time.Duration
}
// NewDomainWorker returns a DomainWorker bound to domain.
//
// rateLimit is interpreted as a number of milliseconds and httpTimeout as a
// number of seconds. The bad-connection cooldown is fixed at four minutes.
// The construction is logged at info level with the domain name.
func NewDomainWorker(hub *WorkerHub, domain string, userAgent string, rateLimit uint, httpTimeout uint) *DomainWorker {
	log.Info().Str("domain", domain).Msg("NewDomainWorker")

	worker := new(DomainWorker)
	worker.hub = hub
	worker.domain = domain
	worker.userAgent = userAgent
	worker.rateLimit = time.Duration(rateLimit) * time.Millisecond
	worker.httpTimeout = time.Duration(httpTimeout) * time.Second
	worker.badConnCooldown = 4 * time.Minute
	return worker
}
// GetUserID fetches the status identified by statusID and returns the "id"
// string found inside its "account" object.
//
// It returns the error from checkBadConn or GetStatus unchanged, and
// errDataMissing when "account" is not a JSON object or its "id" is not a
// string.
func (o *DomainWorker) GetUserID(statusID string) (string, error) {
	if err := o.checkBadConn(); err != nil {
		return "", err
	}

	status, err := o.GetStatus(statusID)
	if err != nil {
		return "", err
	}

	if author, isObject := status["account"].(map[string]interface{}); isObject {
		if id, isString := author["id"].(string); isString {
			return id, nil
		}
	}
	return "", errDataMissing
}
// GetAccount requests https://<domain>/api/v1/accounts/<userID> and decodes
// the response body as a JSON object.
//
// Errors from checkBadConn and getHTTPBody are returned with a nil map. A
// JSON decoding error is returned together with whatever the decoder left in
// the map.
func (o *DomainWorker) GetAccount(userID string) (map[string]interface{}, error) {
	if err := o.checkBadConn(); err != nil {
		return nil, err
	}

	endpoint := fmt.Sprintf("https://%s/api/v1/accounts/%s", o.domain, userID)
	raw, err := o.getHTTPBody(endpoint)
	if err != nil {
		return nil, err
	}

	account := make(map[string]interface{})
	decodeErr := json.Unmarshal(raw, &account)
	return account, decodeErr
}
// GetStatus requests https://<domain>/api/v1/statuses/<statusID> and decodes
// the response body as a JSON object.
//
// Errors from checkBadConn and getHTTPBody are returned with a nil map. A
// JSON decoding error is returned together with whatever the decoder left in
// the map.
func (o *DomainWorker) GetStatus(statusID string) (map[string]interface{}, error) {
	if err := o.checkBadConn(); err != nil {
		return nil, err
	}

	raw, err := o.getHTTPBody(fmt.Sprintf("https://%s/api/v1/statuses/%s", o.domain, statusID))
	if err != nil {
		return nil, err
	}

	status := make(map[string]interface{})
	decodeErr := json.Unmarshal(raw, &status)
	return status, decodeErr
}
// GetTimeline requests https://<domain>/api/v1/accounts/<userID>/statuses
// with params appended verbatim to the URL, and decodes the response body as
// a JSON array of objects. The final URL is logged at info level.
//
// Errors from checkBadConn and getHTTPBody are returned with a nil slice. A
// JSON decoding error is returned together with whatever the decoder left in
// the slice.
func (o *DomainWorker) GetTimeline(userID string, params string) ([]map[string]interface{}, error) {
	if err := o.checkBadConn(); err != nil {
		return nil, err
	}

	endpoint := fmt.Sprintf("https://%s/api/v1/accounts/%s/statuses%s", o.domain, userID, params)
	log.Info().Str("URL", endpoint).Msg("GetTimeline")

	raw, err := o.getHTTPBody(endpoint)
	if err != nil {
		return nil, err
	}

	statuses := make([]map[string]interface{}, 0)
	decodeErr := json.Unmarshal(raw, &statuses)
	return statuses, decodeErr
}
// GetInstance requests https://<domain>/api/v1/instance and decodes the
// response body as a JSON object.
//
// Errors from checkBadConn and getHTTPBody are returned with a nil map. A
// JSON decoding error is returned together with whatever the decoder left in
// the map.
func (o *DomainWorker) GetInstance() (map[string]interface{}, error) {
	if err := o.checkBadConn(); err != nil {
		return nil, err
	}

	endpoint := fmt.Sprintf("https://%s/api/v1/instance", o.domain)
	raw, err := o.getHTTPBody(endpoint)
	if err != nil {
		return nil, err
	}

	instance := make(map[string]interface{})
	decodeErr := json.Unmarshal(raw, &instance)
	return instance, decodeErr
}
// FindUserID resolves username to an account ID on the worker's domain.
//
// Strategies are tried in order, and the first one that yields a string ID
// wins:
//
//  1. GetAccount with the username in place of an ID (Pleroma resolves it).
//  2. GetAccountByUsername (endpoint added by the Gab fork).
//  3. Scraping https://<domain>/@<username>: first a <link rel="salmon">
//     element whose href ends in the ID, then the first status permalink
//     whose href contains username; that status is fetched and its
//     account.id is returned.
//
// Errors from strategies 1 and 2 are intentionally ignored so that the next
// strategy gets a chance. The function returns the error from checkBadConn or
// from fetching the profile page unchanged, and errDataMissing when no
// strategy produced an ID.
func (o *DomainWorker) FindUserID(username string) (string, error) {
	log.Debug().Str("username", username).Msg("FindUserID")
	if err := o.checkBadConn(); err != nil {
		return "", err
	}

	/* Pleroma resolves usernames to IDs on the regular accounts endpoint */
	if acct, err := o.GetAccount(username); err == nil {
		if userID, ok := acct["id"].(string); ok {
			return userID, nil
		}
	}

	/* Gab implemented a REST call for resolving usernames to IDs */
	if acct, err := o.GetAccountByUsername(username); err == nil {
		if userID, ok := acct["id"].(string); ok {
			return userID, nil
		}
	}

	/* Mastodon: fall back to scraping the public profile page */
	doc, err := o.getGoQueryDoc(fmt.Sprintf("https://%s/@%s", o.domain, username))
	if err != nil {
		return "", err
	}

	// lastSegment returns everything after the final "/" of href (the whole
	// string when there is no "/").
	lastSegment := func(href string) string {
		return href[strings.LastIndex(href, "/")+1:]
	}

	/* Look for salmon link (older Mastodon?) */
	var userID string
	// EachWithBreak is used instead of Each: returning from an Each callback
	// only ends that one callback, it does not stop the iteration.
	doc.Find("link").EachWithBreak(func(_ int, s *goquery.Selection) bool {
		if rel, _ := s.Attr("rel"); rel != "salmon" {
			return true
		}
		href, ok := s.Attr("href")
		if !ok {
			return true
		}
		userID = lastSegment(href)
		// Keep scanning only if this href ended in "/" and gave no ID.
		return userID == ""
	})
	if userID != "" {
		return userID, nil
	}

	/* Pull from statuses */
	var statusID string
	doc.Find("div.entry.h-entry div.status div.status__info a.status__relative-time").EachWithBreak(func(_ int, s *goquery.Selection) bool {
		href, ok := s.Attr("href")
		if !ok || !strings.Contains(href, username) {
			return true
		}
		statusID = lastSegment(href)
		return statusID == ""
	})
	if statusID == "" {
		return "", errDataMissing
	}

	status, err := o.GetStatus(statusID)
	if err != nil {
		// The fetch error is dropped on purpose: the caller only needs to
		// know that no ID could be determined.
		return "", errDataMissing
	}
	if account, ok := status["account"].(map[string]interface{}); ok {
		if id, ok := account["id"].(string); ok {
			return id, nil
		}
	}
	return "", errDataMissing
}
// GetAccountByUsername requests
// https://<domain>/api/v1/account_by_username/<username> (an endpoint that
// exists on the Gab fork only) and decodes the response body as a JSON object.
//
// Errors from checkBadConn and getHTTPBody are returned with a nil map. A
// JSON decoding error is returned together with whatever the decoder left in
// the map.
func (o *DomainWorker) GetAccountByUsername(username string) (map[string]interface{}, error) {
	if err := o.checkBadConn(); err != nil {
		return nil, err
	}

	raw, err := o.getHTTPBody(fmt.Sprintf("https://%s/api/v1/account_by_username/%s", o.domain, username))
	if err != nil {
		return nil, err
	}

	account := make(map[string]interface{})
	decodeErr := json.Unmarshal(raw, &account)
	return account, decodeErr
}
// getHTTPBody performs a rate-limited GET of url and returns the raw response
// body.
//
// o.mu is held for the whole call, including the rate-limit sleep and the
// HTTP round trip, so requests made through this worker are serialized.
//
// Any status other than 200 yields an error of the form "HTTP Status: <status>".
// Statuses 401, 403, 410 and 500 additionally bump badConnCount and refresh
// badConnLast. Request construction, transport and body-read errors are
// returned unchanged.
func (o *DomainWorker) getHTTPBody(url string) ([]byte, error) {
	o.mu.Lock()
	defer o.mu.Unlock()

	if elapsed := time.Since(o.lastReq); elapsed < o.rateLimit {
		sleepDur := o.rateLimit - elapsed
		log.Debug().Str("domain", o.domain).Dur("sleep", sleepDur).Msg("Ratelimit Hit")
		time.Sleep(sleepDur)
		log.Debug().Str("domain", o.domain).Dur("sleep", sleepDur).Msg("Ratelimit Resume")
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", o.userAgent)

	client := &http.Client{Timeout: o.httpTimeout}
	resp, err := client.Do(req)
	// Stamp every attempt, not only successful ones: otherwise a failing
	// server is hit again immediately, with no rate limiting at all.
	o.lastReq = time.Now()
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK:
		return ioutil.ReadAll(resp.Body)
	case http.StatusUnauthorized, http.StatusForbidden, http.StatusGone, http.StatusInternalServerError:
		o.badConnCount++
		o.badConnLast = time.Now()
	}
	return nil, errors.New("HTTP Status: " + resp.Status)
}
// getGoQueryDoc performs a rate-limited GET of url and parses the response
// body into a goquery document.
//
// o.mu is held for the whole call, including the rate-limit sleep and the
// HTTP round trip, so requests made through this worker are serialized.
//
// Any status other than 200 yields an error of the form "HTTP Status: <status>".
// Statuses 401, 403, 410 and 500 additionally bump badConnCount and refresh
// badConnLast. Request construction, transport and parse errors are returned
// unchanged.
func (o *DomainWorker) getGoQueryDoc(url string) (*goquery.Document, error) {
	o.mu.Lock()
	defer o.mu.Unlock()

	if elapsed := time.Since(o.lastReq); elapsed < o.rateLimit {
		sleepDur := o.rateLimit - elapsed
		log.Debug().Str("domain", o.domain).Dur("sleep", sleepDur).Msg("Ratelimit Hit")
		time.Sleep(sleepDur)
		log.Debug().Str("domain", o.domain).Dur("sleep", sleepDur).Msg("Ratelimit Resume")
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", o.userAgent)

	client := &http.Client{Timeout: o.httpTimeout}
	resp, err := client.Do(req)
	// Stamp every attempt, not only successful ones: otherwise a failing
	// server is hit again immediately, with no rate limiting at all.
	o.lastReq = time.Now()
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK:
		return goquery.NewDocumentFromReader(resp.Body)
	case http.StatusUnauthorized, http.StatusForbidden, http.StatusGone, http.StatusInternalServerError:
		o.badConnCount++
		o.badConnLast = time.Now()
	}
	return nil, errors.New("HTTP Status: " + resp.Status)
}
// checkBadConn decides whether the domain may still be contacted.
//
// When at least one bad response is on record and the most recent one is
// older than badConnCooldown, one is forgiven (badConnCount is decremented).
// It then returns errDomainFailed if more than three remain, and nil
// otherwise.
func (o *DomainWorker) checkBadConn() error {
	// Cheap unlocked pre-check so the common path does not wait on o.mu.
	if o.badConnCount > 0 && time.Since(o.badConnLast) > o.badConnCooldown {
		o.mu.Lock()
		// Re-check under the lock. Two goroutines can both pass the unlocked
		// test while the count is 1; without this second test both would
		// decrement and the unsigned counter would wrap around to its
		// maximum, marking the domain as failed for good.
		if o.badConnCount > 0 && time.Since(o.badConnLast) > o.badConnCooldown {
			log.Debug().Str("domain", o.domain).Msg("removed count")
			o.badConnCount--
		}
		o.mu.Unlock()
	}
	if o.badConnCount > 3 { // skipping mutex lock because race condition insensitive
		return errDomainFailed
	}
	return nil
}