Files
dss/index.js
T
seger672 1ed7e6a5f6 Optimize tag cloud query
Still does a sort that ought to be indexable, but fixing that would be considerably more work.
2021-01-06 18:20:30 +00:00

454 lines
12 KiB
JavaScript

const axios = require('axios').default
const Database = require('better-sqlite3')
const cheerio = require('cheerio')
const flatten = require('lodash.flatten')
const RSSParser = require('rss-parser')
const Bluebird = require('bluebird')
const url = require('urlite')
const ucfirst = require('ucfirst')
const luxon = require('luxon')
const DateTime = luxon.DateTime
function db(options={}) {
const dbPath = process.env.DSS_DB || './main.db'
return new Database(dbPath, options)
}
async function fetchFeed(feedURL='https://dailystormer.su/feed/') {
const parser = new RSSParser()
const res = await axios.get(feedURL)
const feed = await parser.parseString(res.data)
return feed
}
/**
* Fetch the RSS feed and return its parsed contents.
* @param {String} feedURL - URL of the feed
* @returns {Object} parsed RSS feed
*/
async function fetchArticlesFromFeed(feedURL='https://dailystormer.su/feed/') {
const feed = await fetchFeed(feedURL);
return feed.items.map((a) => {
const slug = url.parse(a.link).pathname
const article = {
slug,
title: a.title,
author: a.creator,
content: a['content:encoded'],
categories: a.categories.map((cat) => cat.toLowerCase()),
tags: [],
published_date: a.isoDate,
//published_date: DateTime.fromRFC2822(a.pubDate).toISO()
}
return article
})
}
async function scanCategory(baseURL, category, options={}) {
const categoryURL = `${baseURL}/${category}`
console.log(categoryURL)
// scan first page of category to find out how many pages we have.
const res = await axios.get(categoryURL)
const $ = cheerio.load(res.data)
let p
p = (options.pages) ? options.pages : parseInt($('.pagination .pages').text().split(/\s/)[3])
const pages = []
for (let i = 1; i <= p; i++) {
pages.push(i)
}
const scans = await Bluebird.map(pages, (n) => { return scanCategoryPage(baseURL, category, n) }, { concurrency: 8 })
const flatScans = flatten(scans)
return flatScans
}
async function scanCategoryPage(baseURL, category, page) {
var categoryURL
if (page === 1) {
categoryURL = `${baseURL}/${category}`
} else {
categoryURL = `${baseURL}/${category}/page/${page}`
}
const res = await axios.get(categoryURL)
console.log(categoryURL)
const $ = cheerio.load(res.data)
const articles = $('article.item-list').map((i, article) => {
const h2_a = $(article).find('h2.post-box-title a')
const link = h2_a.attr('href')
const slug = url.parse(link).pathname
const title = h2_a.text().trim()
const author = $(article).find('p.post-meta .post-meta-author').text().trim()
const publishedDateText = $(article).find('p.post-meta .tie-date').text().trim()
const publishedDate = DateTime.fromFormat(publishedDateText, 'LLLL d, yyyy', { zone: 'UTC' }).toISO()
return {
link,
slug,
title,
author,
published_date: publishedDate
}
}).toArray()
return articles
}
function unhyphen(word) {
const ws = word.split(/-/)
return ws.slice(1).map((s) => s.toLowerCase()).join(" ")
}
function extractTaxonomyFromArticle($article, type) {
const cs = $article.attr('class')
.split(/\s+/)
.filter((word) => word.match(new RegExp(`^${type}`)))
const ts = cs.map(unhyphen)
return ts
}
async function articleExists(db, slug) {
const res = db.prepare(`SELECT count(*) c FROM article WHERE slug = @slug`).get({slug})
return res.c > 0
}
// TODO rename to fetchArticle
async function fetchArticle(url) {
const res = await axios.get(url)
const $ = cheerio.load(res.data)
const $article = $('article')
// get tags
const tags = extractTaxonomyFromArticle($article, 'tag')
// get categories
const categories = extractTaxonomyFromArticle($article, 'category')
// get content
const content = $article.find('div.entry').html()
return {
tags,
categories,
content
}
}
async function getCategory(db, name) {
const category = await db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1').get(name)
return category
}
async function getTag(db, name) {
const tag = await db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1').get(name)
return tag
}
async function assocCategory(db, articleId, name) {
try {
var categoryId
const category = await getCategory(db, name)
if (!category) {
const res = await insertCategory(db, name)
categoryId = res.lastInsertRowid
} else {
categoryId = category.id
}
const res2 = await db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)')
.run(articleId, categoryId)
} catch (e) {
return { success: false, error: e }
}
return { success: true }
}
async function assocTag(db, articleId, name) {
try {
var tagId
const tag = await getTag(db, name)
if (!tag) {
const res = await insertTag(db, name)
tagId = res.lastInsertRowid
} else {
tagId = tag.id
}
const res2 = await db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)')
.run(articleId, tagId)
} catch (e) {
return { success: false, error: e }
}
return { success: true }
}
async function insertCategory(db, name) {
try {
const res = db.prepare(`INSERT INTO category (name) VALUES (?)`).run(name) // not async apparently
res.success = true
return res
} catch (e) {
return {
success: false,
error: e
}
}
}
async function insertTag(db, name) {
try {
const res = db.prepare(`INSERT INTO tag (name) VALUES (?)`).run(name) // not async apparently
res.success = true
return res
} catch (e) {
return {
success: false,
error: e
}
}
}
async function insertArticle(db, article) {
try {
// TODO wrap in a transaction?
// insert into article
const [year, month] = article.published_date.split(/-/)
article.year = year
article.month = month
const insert = db.prepare('INSERT INTO article (slug, title, author, content, published_date, year, month) VALUES (@slug, @title, @author, @content, @published_date, @year, @month)')
const res = await insert.run(article)
const articleId = res.lastInsertRowid
// insert into article_search for full-text search
const insertArticleSearch = db.prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)')
const articleForSearch = Object.assign({}, article)
articleForSearch.content = cheerio.load(article.content).text()
await insertArticleSearch.run(articleForSearch)
// associate categories
article.categories.forEach((async (name) => {
await assocCategory(db, articleId, name)
}))
// associate tags
if (article.tags) {
article.tags.forEach((async (name) => {
await assocTag(db, articleId, name)
}))
}
}
catch (e) {
return { success: false, error: e }
}
return { success: true }
}
/**
* Perform a full-text search of the db.
* @param {Database} db - articles database
* @param {String} query - search query in SQLite3 fts5 syntax
*/
async function search(db, query, limit, offset) {
const stmt = db.prepare(`
SELECT rank, author, title, slug,
snippet(article_search, 2, '<b>', '</b>', '...', 32) as snippet
FROM article_search
WHERE article_search MATCH @query
ORDER BY rank
LIMIT @limit OFFSET @offset
`)
const res = stmt.all({ query, limit, offset })
return res
}
async function searchCount(db, query) {
const stmt = db.prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`)
const res = stmt.get({ query })
return res.c
}
// TODO write a function to insert an article into the database
// TODO write a function to insert a category into the database
// TODO write a function to associate an article with categories in a database
// TODO write a function to scan a category for articles
// TODO write a function to scrape an article into an insertable object
// summaries for home page
async function getYears() {
const firstYear = 2013
const currentYear = (new Date()).getYear() + 1900
const years = []
for (let i = firstYear; i <= currentYear; i++) {
years.unshift(i)
}
return years
}
async function getCategories(db) {
const cats = db.prepare('SELECT name FROM category ORDER by id').all()
const cats2 = cats.map((c) => {
let capitalized = c.name.split(/\s+/).map(ucfirst).join(' ')
if (capitalized == 'Us') capitalized = 'US'
return {
name: c.name,
capitalized
}
})
return cats2
}
async function getTagCloud(db, limit, offset=0) {
const tags = db.prepare(`
SELECT name, count FROM (
SELECT tag_id, COUNT(1) AS count
FROM article__tag
GROUP BY tag_id
ORDER BY count DESC
LIMIT @limit
OFFSET @offset
)
INNER JOIN tag on tag_id = tag.id
`).all({ limit, offset })
return tags
}
// SECTION: Archives
async function getArticleCounts(db, year) {
const months = {
'01': 'January',
'02': 'February',
'03': 'March',
'04': 'April',
'05': 'May',
'06': 'June',
'07': 'July',
'08': 'August',
'09': 'September',
'10': 'October',
'11': 'November',
'12': 'December'
}
const counts = db.prepare(`
SELECT
count(*) c,
printf('%d-%02d', year, month) AS year_month,
printf('%02d', month) AS m
FROM article
WHERE year = @year
GROUP BY year, month
ORDER BY year_month
`).all({ year })
return counts.map((c) => {
c.human = months[c.m]
return c
})
}
async function getArticlesByYearMonth(db, year, month) {
const articles = db.prepare(`
SELECT
printf('%d-%02d', year, month) AS year_month,
author, title, slug, published_date
FROM article
WHERE year = @year
AND month = @month
ORDER BY published_date, id
`).all({ year, month })
const articlesWithDay = articles.map((art) => {
const d = DateTime.fromISO(art.published_date).setZone('UTC')
art.day = d.toFormat('MMMM d, y')
return art
})
return articlesWithDay
}
// SECTION: Categories
async function getArticlesByCategory(db, name, limit, offset) {
const articles = db.prepare(`
SELECT a.slug, a.title, a.author, a.published_date
FROM article a
JOIN article__category ac ON a.id = ac.article_id
JOIN category c ON c.id = ac.category_id
WHERE c.name = @name
ORDER BY a.published_date DESC, a.id DESC
LIMIT @limit
OFFSET @offset
`).all({ name, limit, offset })
const articlesWithMonth = articles.map((art) => {
const d = DateTime.fromISO(art.published_date).setZone('UTC')
art.month = d.toFormat('MMMM y')
return art
})
return articlesWithMonth
}
async function countArticlesByCategory(db, name) {
const count = db.prepare(`
SELECT count(*) AS c
FROM article a
JOIN article__category ac ON a.id = ac.article_id
JOIN category c ON c.id = ac.category_id
WHERE c.name = @name
`).get({ name })
return count.c
}
// SECTION: Tags
async function getArticlesByTag(db, name, limit, offset) {
const articles = db.prepare(`
SELECT a.slug, a.title, a.author, a.published_date
FROM article a
JOIN article__tag a_t ON a.id = a_t.article_id
JOIN tag c ON c.id = a_t.tag_id
WHERE c.name = @name
ORDER BY a.published_date DESC, a.id DESC
LIMIT @limit
OFFSET @offset
`).all({ name, limit, offset })
const articlesWithMonth = articles.map((art) => {
const d = DateTime.fromISO(art.published_date).setZone('UTC')
art.month = d.toFormat('MMMM y')
return art
})
return articlesWithMonth
}
async function countArticlesByTag(db, name) {
const count = db.prepare(`
SELECT count(*) AS c
FROM article a
JOIN article__tag a_t ON a.id = a_t.article_id
JOIN tag c ON c.id = a_t.tag_id
WHERE c.name = @name
`).get({ name })
return count.c
}
async function countTags(db) {
const count = db.prepare(`SELECT count(*) c FROM tag`).get()
return count.c
}
module.exports = {
db,
fetchFeed,
fetchArticlesFromFeed,
scanCategory,
scanCategoryPage,
unhyphen,
extractTaxonomyFromArticle,
articleExists,
fetchArticle,
getCategory,
getTag,
assocCategory,
assocTag,
insertCategory,
insertTag,
insertArticle,
search,
searchCount,
getYears,
getCategories,
getTagCloud,
getArticleCounts,
getArticlesByYearMonth,
getArticlesByCategory,
countArticlesByCategory,
getArticlesByTag,
countArticlesByTag,
countTags
}