Files
dss/bin/scan
T
2020-12-22 16:52:52 -08:00

64 lines
2.0 KiB
JavaScript
Executable File

#!/usr/bin/env node
// scan a category (or tag) for articles
require('dotenv').config()
const program = require('commander')
const Database = require('better-sqlite3')
const Bluebird = require('bluebird')
const ds = require('../index')
// Handle returned by the project's own `ds.db()` helper; it is passed to
// ds.articleExists() and ds.insertArticle() below.
// NOTE(review): presumably a better-sqlite3 connection -- confirm in ../index.
const db = ds.db()
// Site root; the taxonomy name ("section" or "tag") is appended to it to
// build the listing URL that gets scanned.
const baseURL = 'https://dailystormer.su'
/**
 * CLI entry point: scan one category (or tag) listing and store every article
 * that is not already in the database.
 *
 * Usage: scan [options] <category>
 *   -t, --taxonomy <TYPE>  "section" or "tag" (default "section")
 *   -s, --skip <ITEMS>     drop this many leading items from the listing
 *   -k, --keep <ITEMS>     keep only this many leading items (applied after --skip)
 *   -p, --pages <P>        scrape this many listing pages
 *
 * Exits with status 1 when the category argument is missing or a numeric
 * option is not an integer.
 *
 * @returns {Promise<void>} resolves once every selected article has been
 *   processed; rejects if scanning, fetching or inserting throws.
 */
async function main() {
  program.option('-t, --taxonomy <TYPE>', `"section" or "tag"`, 'section')
  program.option('-s, --skip <ITEMS>', `skip leading items`, 0)
  program.option('-k, --keep <ITEMS>', `keep leading items`, 0)
  program.option('-p, --pages <P>', `scrape this many listing pages`, 0)
  program.parse(process.argv)

  const category = program.args[0]
  if (!category) {
    console.warn('category required')
    process.exit(1)
  }

  // Parse a numeric option in base 10; a non-numeric value is a usage error
  // rather than something to pass on silently as NaN.
  const toCount = (flag, raw) => {
    const value = Number.parseInt(raw, 10)
    if (Number.isNaN(value)) {
      console.warn(`--${flag} must be an integer`)
      process.exit(1)
    }
    return value
  }

  const options = {}
  if (program.pages) {
    options.pages = toCount('pages', program.pages)
  }

  const taxonomyBaseURL = `${baseURL}/${program.taxonomy}`
  const partialArticles = await ds.scanCategory(taxonomyBaseURL, category, options)

  // Apply --skip first and --keep to what remains, so the two options compose
  // instead of --keep discarding the effect of --skip.
  let ps = partialArticles
  if (program.skip) {
    ps = ps.slice(toCount('skip', program.skip))
  }
  if (program.keep) {
    ps = ps.slice(0, toCount('keep', program.keep))
  }

  // Process articles one at a time, in listing order. Awaiting each step keeps
  // main() pending until all work is done and lets failures reject main().
  const total = ps.length
  for (const [i, art] of ps.entries()) {
    const position = `[${i + 1}/${total}]`

    const exists = await ds.articleExists(db, art.slug)
    if (exists) {
      console.log(`= ${position} "${art.title}"`)
      continue
    }

    // Merge the listing-level fields with whatever fetchArticle returns;
    // fetched fields win on key collisions.
    const more = await ds.fetchArticle(art.link)
    const article = Object.assign({}, art, more)
    const res = await ds.insertArticle(db, article)

    // "+" marks a newly stored article; "=" marks one that was not inserted.
    const marker = res.success ? '+' : '='
    console.log(`${marker} ${position} "${art.title}"`)
  }
}
// Run the CLI. Report any rejection from main() and exit with a non-zero
// status instead of leaving the promise unhandled.
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})