mirror of
https://github.com/jaytaylor/archive.is.git
synced 2026-05-26 06:04:09 +00:00
Initial commit.
This commit is contained in:
+67
@@ -0,0 +1,67 @@
|
||||
##
|
||||
# Vim swap/working files.
|
||||
*.sw[opa]
|
||||
|
||||
##
|
||||
# SublimeText files.
|
||||
*.sublime-project
|
||||
*.sublime-workspace
|
||||
|
||||
##
|
||||
# IDEA IntelliJ files.
|
||||
*.idea
|
||||
*.iml
|
||||
|
||||
##
|
||||
# Visual Studio Code files.
|
||||
.vscode
|
||||
|
||||
##
|
||||
# Mac OS-X miscellany.
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
._*
|
||||
|
||||
##
|
||||
# Windows image file caches
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
|
||||
# Folder config file
|
||||
Desktop.ini
|
||||
|
||||
##
|
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects).
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
|
||||
##
|
||||
# Folders.
|
||||
_obj
|
||||
_test
|
||||
|
||||
##
|
||||
# Architecture specific extensions/prefixes.
|
||||
*.[568vq]
|
||||
[568vq].out
|
||||
|
||||
*.cgo1.go
|
||||
*.cgo2.c
|
||||
_cgo_defun.c
|
||||
_cgo_gotypes.go
|
||||
_cgo_export.*
|
||||
|
||||
_testmain.go
|
||||
|
||||
*.exe
|
||||
*.test
|
||||
*.prof
|
||||
|
||||
##
|
||||
# Certificate files.
|
||||
id_[rd]sa
|
||||
*.pem
|
||||
*.crt
|
||||
*.key
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
language: go
|
||||
|
||||
go:
|
||||
- tip
|
||||
- "1.10"
|
||||
- 1.9
|
||||
- 1.8
|
||||
- 1.7
|
||||
- 1.6
|
||||
- 1.5
|
||||
|
||||
script:
|
||||
- go test ./...
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: change
|
||||
on_failure: always
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2018 Jay Taylor
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,25 @@
|
||||
# archiveis
|
||||
|
||||
[Documentation](https://godoc.org/github.com/jaytaylor/archiveis)
|
||||
[Build Status](https://travis-ci.org/jaytaylor/archiveis)
|
||||
[Report Card](https://goreportcard.com/report/github.com/jaytaylor/archiveis)
|
||||
|
||||
### About
|
||||
|
||||
archiveis is a golang package for archiving web pages via [archive.is](https://archive.is).
|
||||
|
||||
Please be mindful and responsible, and go easy on them; we want archive.is to last forever!
|
||||
|
||||
Created by [Jay Taylor](https://jaytaylor.com/).
|
||||
|
||||
### Requirements
|
||||
|
||||
* Go version 1.5 or newer
|
||||
|
||||
### Running the test suite
|
||||
|
||||
go test ./...
|
||||
|
||||
#### License
|
||||
|
||||
Permissive MIT license, see the [LICENSE](LICENSE) file for more information.
|
||||
+104
@@ -0,0 +1,104 @@
|
||||
package archiveis
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/gigawattio/errorlib"
|
||||
"github.com/parnurzeal/gorequest"
|
||||
)
|
||||
|
||||
const (
|
||||
baseURL = "https://archive.is"
|
||||
userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
|
||||
)
|
||||
|
||||
var jsLocationExpr = regexp.MustCompile(`document\.location\.replace\(["']([^"']+)`)
|
||||
|
||||
// Capture archives the provided URL using the archive.is service.
|
||||
func Capture(u string) (string, error) {
|
||||
submitID, err := newSubmitID()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// return id, nil
|
||||
|
||||
content := fmt.Sprintf("submitid=%v&url=%v", url.QueryEscape(submitID), url.QueryEscape(u))
|
||||
fmt.Printf("content=%v\n", content)
|
||||
resp, body, errs := newRequest().Post(baseURL+"/submit/").Send(content).Set("content-type", "application/x-www-form-urlencoded").EndBytes()
|
||||
if err := errorlib.Merge(errs); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if resp.StatusCode/100 != 2 {
|
||||
return "", fmt.Errorf("form submit received unhappy response status-code=%v", resp.StatusCode)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("constructing goquery doc from submission response: %s", err)
|
||||
}
|
||||
|
||||
if script := doc.Find("script").First(); script != nil {
|
||||
js := strings.Trim(script.Text(), "\r\n\t ")
|
||||
if match := jsLocationExpr.FindStringSubmatch(js); len(match) > 1 {
|
||||
return match[1], nil
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("body: %+v\n", string(body))
|
||||
fmt.Printf("headers: %+v\n", resp.Header)
|
||||
fmt.Printf("trailers: %+v\n", resp.Trailer)
|
||||
|
||||
input := doc.Find("input[name=id]").First()
|
||||
if input == nil {
|
||||
return "", errors.New("page archive ID not found in submission response content")
|
||||
}
|
||||
id, exists := input.Attr("value")
|
||||
if !exists {
|
||||
return "", errors.New("no page archive ID value available")
|
||||
}
|
||||
|
||||
final := fmt.Sprintf("%v/%v", baseURL, id)
|
||||
return final, nil
|
||||
}
|
||||
|
||||
// newSubmitID gets the index page and extracts the form submission identifier.
|
||||
func newSubmitID() (string, error) {
|
||||
resp, body, errs := newRequest().Get(baseURL).EndBytes()
|
||||
if err := errorlib.Merge(errs); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if resp.StatusCode/100 != 2 {
|
||||
return "", fmt.Errorf("index retrieval received unhappy response status-code=%v", resp.StatusCode)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("constructing goquery doc from index: %s", err)
|
||||
}
|
||||
|
||||
input := doc.Find("input[name=submitid]").First()
|
||||
if input == nil {
|
||||
return "", errors.New("no submitid element found")
|
||||
}
|
||||
id, exists := input.Attr("value")
|
||||
if !exists {
|
||||
return "", errors.New("no submitid value available")
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func newRequest() *gorequest.SuperAgent {
|
||||
r := gorequest.New().
|
||||
Set("host", strings.Split(baseURL, "://")[1]).
|
||||
Set("user-agent", userAgent).
|
||||
Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8").
|
||||
Set("referer", baseURL+"/")
|
||||
return r
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
package archiveis
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
const page = "https://yro.slashdot.org/story/18/03/21/2112247/russia-secretly-helped-venezuela-launch-a-cryptocurrency-to-evade-us-sanctions#comments"
|
||||
|
||||
func TestCapture1(t *testing.T) {
|
||||
// Link which has been submitted before.
|
||||
url, err := Capture(page)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("Resolved URL=%q", url)
|
||||
}
|
||||
|
||||
func TestCapture2(t *testing.T) {
|
||||
// Link which has likely not been submitted before.
|
||||
url, err := Capture(fmt.Sprintf("%v?%v", page, time.Now().Unix()))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("Resolved URL=%q", url)
|
||||
}
|
||||
Reference in New Issue
Block a user