Files
dss/index.js
T
2020-12-20 11:11:49 -08:00

266 lines
7.6 KiB
JavaScript

const axios = require('axios').default
const cheerio = require('cheerio')
const flatten = require('lodash.flatten')
const RSSParser = require('rss-parser')
const Bluebird = require('bluebird')
const url = require('urlite')
const luxon = require('luxon')
const DateTime = luxon.DateTime
// Module-wide RSS parser instance; getFeed() uses it to parse the fetched feed body.
const parser = new RSSParser()
/**
 * Download a feed over HTTP and hand the response body to the RSS parser.
 * @param {String} feedURL - URL of the feed to fetch
 * @returns {Promise<Object>} whatever the parser produces for the feed body
 */
async function getFeed(feedURL='https://dailystormer.su/feed/') {
  const { data: body } = await axios.get(feedURL)
  return await parser.parseString(body)
}
/**
 * Fetch an RSS feed and convert each of its items into an article object.
 * @param {String} feedURL - URL of the feed
 * @returns {Promise<Object[]>} one object per feed item with the keys
 *   slug, title, author, content, categories, tags and published_date
 */
async function getArticlesFromFeed(feedURL='https://dailystormer.su/feed/') {
  const feed = await getFeed(feedURL)
  return feed.items.map((item) => ({
    // the slug is the path component of the item's link
    slug: url.parse(item.link).pathname,
    title: item.title,
    author: item.creator,
    content: item['content:encoded'],
    // NOTE(review): the parser appears to omit `categories` for items that
    // carry none; treat that as an empty list instead of throwing
    categories: (item.categories || []).map((cat) => cat.toLowerCase()),
    // tags are always left empty here
    tags: [],
    published_date: item.isoDate
  }))
}
/**
 * Scan every listing page of a category and collect the article summaries
 * returned by scanCategoryPage().
 * @param {String} baseURL - site root; the category is appended as `${baseURL}/${category}`
 * @param {String} category - category path segment
 * @param {Object} [options]
 * @param {Number} [options.concurrency=4] - how many listing pages are fetched at once
 * @returns {Promise<Object[]>} the per-page results flattened into one list
 */
async function scanCategory(baseURL, category, options={}) {
  const { concurrency = 4 } = options || {}
  const categoryURL = `${baseURL}/${category}`
  console.log(categoryURL)
  // scan first page of category to find out how many pages we have.
  const res = await axios.get(categoryURL)
  const $ = cheerio.load(res.data)
  // the 4th whitespace-separated token of the pager text is taken as the
  // page count (presumably the text reads "Page 1 of N" — TODO confirm)
  const lastPage = Number.parseInt($('.pagination .pages').text().split(/\s/)[3], 10)
  // without a parsable pager there is still the first page to scan;
  // falling through with NaN would silently return no articles at all
  const pageCount = Number.isNaN(lastPage) ? 1 : lastPage
  const pages = Array.from({ length: pageCount }, (_, i) => i + 1)
  const scans = await Bluebird.map(pages, (n) => scanCategoryPage(baseURL, category, n), { concurrency })
  return flatten(scans)
}
/**
 * Fetch one listing page of a category and extract a summary of every
 * `article.item-list` element on it.
 * @param {String} baseURL - site root
 * @param {String} category - category path segment
 * @param {Number} page - page number; page 1 is the bare category URL
 * @returns {Promise<Object[]>} objects with link, slug, title, author and
 *   published_date (ISO string parsed from 'LLLL d, yyyy' text in the UTC zone)
 */
async function scanCategoryPage(baseURL, category, page) {
  // only pages after the first carry a /page/N suffix
  const pageURL = page === 1
    ? `${baseURL}/${category}`
    : `${baseURL}/${category}/page/${page}`
  const response = await axios.get(pageURL)
  console.log(pageURL)
  const $ = cheerio.load(response.data)
  return $('article.item-list').map((index, element) => {
    const $item = $(element)
    const $titleLink = $item.find('h2.post-box-title a')
    const link = $titleLink.attr('href')
    const slug = url.parse(link).pathname
    const title = $titleLink.text().trim()
    const author = $item.find('p.post-meta .post-meta-author').text().trim()
    const dateText = $item.find('p.post-meta .tie-date').text().trim()
    const published_date = DateTime.fromFormat(dateText, 'LLLL d, yyyy', { zone: 'UTC' }).toISO()
    return { link, slug, title, author, published_date }
  }).toArray()
}
/**
 * Drop everything up to the first hyphen of a word and turn the remaining
 * hyphen-separated parts into a lower-case, space-separated phrase.
 * @param {String} word - e.g. "tag-Foo-Bar"
 * @returns {String} e.g. "foo bar"; empty when the word has no hyphen
 */
function unhyphen(word) {
  const [, ...parts] = word.split('-')
  return parts.map((part) => part.toLowerCase()).join(' ')
}
/**
 * Collect taxonomy terms encoded in an element's CSS class names.
 * Every class that starts with `type` is passed through unhyphen(), so for
 * type "tag" a class like `tag-foo-bar` yields "foo bar".
 * @param {Object} $article - selection whose `class` attribute is inspected
 * @param {String} type - class-name prefix; it is interpolated into a regular
 *   expression as-is
 * @returns {String[]} the extracted terms, empty when there is no class attribute
 */
function extractTaxonomyFromArticle($article, type) {
  // no class attribute (e.g. the selection matched nothing): nothing to extract
  const classAttr = $article.attr('class')
  if (!classAttr) {
    return []
  }
  // compile the prefix pattern once instead of once per class name
  const prefix = new RegExp(`^${type}`)
  return classAttr
    .split(/\s+/)
    .filter((word) => prefix.test(word))
    .map(unhyphen)
}
/**
 * Fetch an article page and pull out its taxonomy and body markup.
 * @param {String} url - address of the article page
 * @returns {Promise<Object>} `tags` and `categories` as extracted from the
 *   `article` element's classes, and `content` as the inner HTML of its
 *   `div.entry`
 */
async function getArticle(url) {
  const { data: html } = await axios.get(url)
  const $article = cheerio.load(html)('article')
  return {
    tags: extractTaxonomyFromArticle($article, 'tag'),
    categories: extractTaxonomyFromArticle($article, 'category'),
    content: $article.find('div.entry').html()
  }
}
/**
 * Look up a single category row by its name.
 * @param {Database} db - articles database
 * @param {String} name - category name to match exactly
 * @returns {Promise<Object>} the row (`id`, `name`) as returned by the
 *   statement's get(); callers in this file treat a falsy result as "not found"
 */
async function getCategory(db, name) {
  const lookup = db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1')
  return await lookup.get(name)
}
/**
 * Link an article to a category, creating the category row first when no
 * category with that name exists yet.
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - category name
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   when the lookup, the category insert or the link insert fails
 */
async function assocCategory(db, articleId, name) {
  try {
    const category = await getCategory(db, name)
    let categoryId
    if (category) {
      categoryId = category.id
    } else {
      const inserted = await insertCategory(db, name)
      // insertCategory reports failure through its result instead of throwing;
      // surface that error rather than linking with an undefined id
      if (inserted.success === false) {
        return { success: false, error: inserted.error }
      }
      categoryId = inserted.lastInsertRowid
    }
    await db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)')
      .run(articleId, categoryId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
/**
 * Look up a single tag row by its name.
 * @param {Database} db - articles database
 * @param {String} name - tag name to match exactly
 * @returns {Promise<Object>} the row (`id`, `name`) as returned by the
 *   statement's get(); callers in this file treat a falsy result as "not found"
 */
async function getTag(db, name) {
  const lookup = db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1')
  return await lookup.get(name)
}
/**
 * Link an article to a tag, creating the tag row first when no tag with
 * that name exists yet.
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - tag name
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   when the lookup, the tag insert or the link insert fails
 */
async function assocTag(db, articleId, name) {
  try {
    const tag = await getTag(db, name)
    let tagId
    if (tag) {
      tagId = tag.id
    } else {
      const inserted = await insertTag(db, name)
      // insertTag reports failure through its result instead of throwing;
      // surface that error rather than linking with an undefined id
      if (inserted.success === false) {
        return { success: false, error: inserted.error }
      }
      tagId = inserted.lastInsertRowid
    }
    await db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)')
      .run(articleId, tagId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
/**
 * Insert a new category row.
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @returns {Promise<Object>} the statement's run() result with `success: true`
 *   set on it, or `{ success: false, error }` when the insert throws
 */
async function insertCategory(db, name) {
  try {
    const statement = db.prepare(`INSERT INTO category (name) VALUES (?)`)
    const info = statement.run(name) // used synchronously, nothing to await
    info.success = true
    return info
  } catch (error) {
    return { success: false, error }
  }
}
/**
 * Insert a new tag row.
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @returns {Promise<Object>} the statement's run() result with `success: true`
 *   set on it, or `{ success: false, error }` when the insert throws
 */
async function insertTag(db, name) {
  try {
    const statement = db.prepare(`INSERT INTO tag (name) VALUES (?)`)
    const info = statement.run(name) // used synchronously, nothing to await
    info.success = true
    return info
  } catch (error) {
    return { success: false, error }
  }
}
/**
 * Store an article, add it to the full-text search table and link it to its
 * categories and tags.
 *
 * All associations are attempted and awaited before the result is returned,
 * so the caller never continues while links are still being written.
 * @param {Database} db - articles database
 * @param {Object} article - needs slug, title, author, content and
 *   published_date; `categories` and `tags` (arrays of names) are optional
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   when an insert throws or an association reports a failure (the first
 *   reported association error is returned)
 */
async function insertArticle(db, article) {
  try {
    // TODO wrap in a transaction?
    // insert into article
    const insert = db.prepare('INSERT INTO article (slug, title, author, content, published_date) VALUES (@slug, @title, @author, @content, @published_date)')
    const res = await insert.run(article)
    const articleId = res.lastInsertRowid
    // insert into article_search for full-text search; the search table
    // receives the text of the content with the markup stripped
    const insertArticleSearch = db.prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)')
    const articleForSearch = Object.assign({}, article, {
      content: cheerio.load(article.content).text()
    })
    await insertArticleSearch.run(articleForSearch)

    // forEach() does not wait for async callbacks, so the links are written
    // in plain loops; a failed link does not stop the remaining ones
    const failures = []
    // associate categories
    for (const name of article.categories || []) {
      const linked = await assocCategory(db, articleId, name)
      if (linked.success === false) {
        failures.push(linked)
      }
    }
    // associate tags
    for (const name of article.tags || []) {
      const linked = await assocTag(db, articleId, name)
      if (linked.success === false) {
        failures.push(linked)
      }
    }
    if (failures.length > 0) {
      return { success: false, error: failures[0].error }
    }
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
/**
 * Run a full-text query against the article_search table.
 * @param {Database} db - articles database
 * @param {String} query - search query in SQLite3 fts5 syntax
 * @param {Number} limit - value bound to LIMIT
 * @param {Number} offset - value bound to OFFSET
 * @returns {Promise<Object[]>} matching rows (rank, author, title, slug and a
 *   highlighted snippet), ordered by rank
 */
async function search(db, query, limit, offset) {
  const sql = `
SELECT rank, author, title, slug,
snippet(article_search, 2, '<b>', '</b>', '...', 32) as snippet
FROM article_search
WHERE article_search MATCH @query
ORDER BY rank
LIMIT @limit OFFSET @offset
`
  return db.prepare(sql).all({ query, limit, offset })
}
/**
 * Count the rows of article_search that match a full-text query.
 * @param {Database} db - articles database
 * @param {String} query - search query, bound to MATCH
 * @returns {Promise<Number>} the `c` column of the count query's single row
 */
async function searchCount(db, query) {
  const { c: total } = db
    .prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`)
    .get({ query })
  return total
}
// TODO write a function to insert an article into the database
// TODO write a function to insert a category into the database
// TODO write a function to associate an article with categories in a database
// TODO write a function to scan a category for articles
// TODO write a function to scrape an article into an insertable object
// Public API of this module: feed/category/article scraping helpers, the
// database insert and association helpers, and the full-text search queries.
module.exports = {
getFeed,
getArticlesFromFeed,
scanCategory,
scanCategoryPage,
unhyphen,
extractTaxonomyFromArticle,
getArticle,
getCategory,
assocCategory,
getTag,
assocTag,
insertCategory,
insertTag,
insertArticle,
search,
searchCount
}