1ed7e6a5f6
Still does a sort that ought to be indexable, but fixing that would be considerably more work.
454 lines
12 KiB
JavaScript
454 lines
12 KiB
JavaScript
const axios = require('axios').default
|
|
const Database = require('better-sqlite3')
|
|
const cheerio = require('cheerio')
|
|
const flatten = require('lodash.flatten')
|
|
const RSSParser = require('rss-parser')
|
|
const Bluebird = require('bluebird')
|
|
const url = require('urlite')
|
|
const ucfirst = require('ucfirst')
|
|
const luxon = require('luxon')
|
|
const DateTime = luxon.DateTime
|
|
|
|
/**
 * Open the SQLite database that holds the article archive.
 * The file path comes from the DSS_DB environment variable and falls back to
 * ./main.db when that variable is unset or empty.
 * @param {Object} options - passed straight through to the Database constructor
 * @returns {Database} database handle
 */
function db(options={}) {
  const dbPath = process.env.DSS_DB || './main.db'
  return new Database(dbPath, options)
}
|
|
|
|
/**
 * Download an RSS feed and parse it.
 * @param {String} feedURL - URL of the feed
 * @returns {Promise<Object>} the feed as parsed by RSSParser#parseString
 */
async function fetchFeed(feedURL='https://dailystormer.su/feed/') {
  const res = await axios.get(feedURL)
  const parser = new RSSParser()
  return parser.parseString(res.data)
}
|
|
|
|
/**
 * Fetch the RSS feed and convert every item into an article object.
 * @param {String} feedURL - URL of the feed
 * @returns {Promise<Object[]>} one object per feed item with the keys
 *   slug, title, author, content, categories, tags and published_date
 */
async function fetchArticlesFromFeed(feedURL='https://dailystormer.su/feed/') {
  const feed = await fetchFeed(feedURL)
  return feed.items.map((item) => ({
    // the slug is the path component of the item's link
    slug: url.parse(item.link).pathname,
    title: item.title,
    author: item.creator,
    content: item['content:encoded'],
    // tolerate items that carry no categories array at all
    categories: (item.categories || []).map((cat) => cat.toLowerCase()),
    // tags are not read from the feed; always start empty
    tags: [],
    published_date: item.isoDate
  }))
}
|
|
|
|
/**
 * Scan every listing page of a category and collect the article summaries.
 * @param {String} baseURL - site root, without trailing slash
 * @param {String} category - category path segment appended to baseURL
 * @param {Object} options
 * @param {Number} [options.pages] - number of pages to scan; when omitted the
 *   count is read from the pagination text of the first listing page
 * @returns {Promise<Object[]>} flattened results of scanCategoryPage for each page
 */
async function scanCategory(baseURL, category, options={}) {
  const categoryURL = `${baseURL}/${category}`
  console.log(categoryURL)
  let pageCount = options.pages
  if (!pageCount) {
    // scan first page of category to find out how many pages we have.
    const res = await axios.get(categoryURL)
    const $ = cheerio.load(res.data)
    // the page total is the fourth whitespace-separated token of the pagination text
    const parsed = parseInt($('.pagination .pages').text().split(/\s/)[3], 10)
    // without a readable pagination block, still scan the one page that exists
    pageCount = Number.isNaN(parsed) ? 1 : parsed
  }
  const pages = Array.from({ length: pageCount }, (_, i) => i + 1)
  const scans = await Bluebird.map(
    pages,
    (n) => scanCategoryPage(baseURL, category, n),
    { concurrency: 8 }
  )
  return flatten(scans)
}
|
|
|
|
/**
 * Scrape one listing page of a category.
 * @param {String} baseURL - site root, without trailing slash
 * @param {String} category - category path segment
 * @param {Number} page - 1-based page number
 * @returns {Promise<Object[]>} one object per `article.item-list` element with
 *   the keys link, slug, title, author and published_date
 */
async function scanCategoryPage(baseURL, category, page) {
  // page 1 is the bare category URL; later pages live under /page/N
  const categoryURL = (page === 1)
    ? `${baseURL}/${category}`
    : `${baseURL}/${category}/page/${page}`
  const res = await axios.get(categoryURL)
  console.log(categoryURL)
  const $ = cheerio.load(res.data)
  return $('article.item-list').map((i, article) => {
    const $item = $(article)
    const titleLink = $item.find('h2.post-box-title a')
    const link = titleLink.attr('href')
    const publishedDateText = $item.find('p.post-meta .tie-date').text().trim()
    return {
      link,
      slug: url.parse(link).pathname,
      title: titleLink.text().trim(),
      author: $item.find('p.post-meta .post-meta-author').text().trim(),
      // the listing shows a date only (e.g. "March 4, 2019"); it is parsed in the UTC zone
      published_date: DateTime.fromFormat(publishedDateText, 'LLLL d, yyyy', { zone: 'UTC' }).toISO()
    }
  }).toArray()
}
|
|
|
|
/**
 * Turn a hyphenated word such as "tag-Foo-Bar" into "foo bar": the first
 * hyphen-separated segment is dropped, the remaining segments are lower-cased
 * and joined with single spaces. A word without a hyphen yields "".
 * @param {String} word
 * @returns {String}
 */
function unhyphen(word) {
  const [, ...rest] = word.split('-')
  return rest.map((part) => part.toLowerCase()).join(' ')
}
|
|
|
|
/**
 * Read taxonomy names out of an element's CSS classes.
 * Classes of the form `<type>-some-name` are selected and converted with
 * unhyphen(), e.g. type "tag" and class "tag-foo-bar" gives "foo bar".
 * @param {Object} $article - cheerio selection whose class attribute is inspected
 * @param {String} type - class prefix to look for, e.g. 'tag' or 'category'
 * @returns {String[]} extracted names; empty when there is no class attribute
 */
function extractTaxonomyFromArticle($article, type) {
  // require the hyphen so that unrelated classes which merely begin with the
  // same letters (e.g. "tagged") are not picked up
  const prefix = `${type}-`
  const classNames = ($article.attr('class') || '').split(/\s+/)
  return classNames
    .filter((word) => word.startsWith(prefix))
    .map((word) => unhyphen(word))
}
|
|
|
|
/**
 * Check whether an article with the given slug is already stored.
 * @param {Database} db - articles database
 * @param {String} slug - article slug to look up
 * @returns {Promise<Boolean>} true when at least one matching row exists
 */
async function articleExists(db, slug) {
  const row = db.prepare(`SELECT count(*) c FROM article WHERE slug = @slug`).get({ slug })
  return row.c > 0
}
|
|
|
|
/**
 * Download an article page and extract the parts that a category scan does
 * not provide.
 * @param {String} url - full URL of the article page (this parameter shadows
 *   the module-level `url` helper inside the function)
 * @returns {Promise<Object>} `{ tags, categories, content }` where tags and
 *   categories come from the CSS classes of the `article` element and content
 *   is the inner HTML of its `div.entry`
 */
async function fetchArticle(url) {
  const res = await axios.get(url)
  const $ = cheerio.load(res.data)
  const $article = $('article')
  return {
    tags: extractTaxonomyFromArticle($article, 'tag'),
    categories: extractTaxonomyFromArticle($article, 'category'),
    content: $article.find('div.entry').html()
  }
}
|
|
|
|
/**
 * Look up a category row by name.
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @returns {Promise<Object|undefined>} whatever the statement's get() yields
 *   for the `id, name` query (no row is reported as a falsy value)
 */
async function getCategory(db, name) {
  return db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1').get(name)
}
|
|
|
|
/**
 * Look up a tag row by name.
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @returns {Promise<Object|undefined>} whatever the statement's get() yields
 *   for the `id, name` query (no row is reported as a falsy value)
 */
async function getTag(db, name) {
  return db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1').get(name)
}
|
|
|
|
/**
 * Link an article to a category, creating the category row when it does not
 * exist yet.
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - category name
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   when the lookup, the category insert or the link insert fails
 */
async function assocCategory(db, articleId, name) {
  try {
    let categoryId
    const category = await getCategory(db, name)
    if (category) {
      categoryId = category.id
    } else {
      const inserted = await insertCategory(db, name)
      // insertCategory reports failure through its return value rather than
      // by throwing, so pass that error on instead of linking to an undefined id
      if (!inserted.success) {
        return { success: false, error: inserted.error }
      }
      categoryId = inserted.lastInsertRowid
    }
    db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)')
      .run(articleId, categoryId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Link an article to a tag, creating the tag row when it does not exist yet.
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - tag name
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   when the lookup, the tag insert or the link insert fails
 */
async function assocTag(db, articleId, name) {
  try {
    let tagId
    const tag = await getTag(db, name)
    if (tag) {
      tagId = tag.id
    } else {
      const inserted = await insertTag(db, name)
      // insertTag reports failure through its return value rather than by
      // throwing, so pass that error on instead of linking to an undefined id
      if (!inserted.success) {
        return { success: false, error: inserted.error }
      }
      tagId = inserted.lastInsertRowid
    }
    db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)')
      .run(articleId, tagId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Insert a new category row.
 * Never throws: failures are reported through the returned object.
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @returns {Promise<Object>} the statement's run() result with `success: true`
 *   added, or `{ success: false, error }` when the insert throws
 */
async function insertCategory(db, name) {
  try {
    // run() returns its result directly, no await needed
    const res = db.prepare(`INSERT INTO category (name) VALUES (?)`).run(name)
    res.success = true
    return res
  } catch (e) {
    return { success: false, error: e }
  }
}
|
|
|
|
/**
 * Insert a new tag row.
 * Never throws: failures are reported through the returned object.
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @returns {Promise<Object>} the statement's run() result with `success: true`
 *   added, or `{ success: false, error }` when the insert throws
 */
async function insertTag(db, name) {
  try {
    // run() returns its result directly, no await needed
    const res = db.prepare(`INSERT INTO tag (name) VALUES (?)`).run(name)
    res.success = true
    return res
  } catch (e) {
    return { success: false, error: e }
  }
}
|
|
|
|
/**
 * Store an article: the article row, its full-text-search row, and its
 * category and tag associations.
 *
 * Side effect: `year` and `month` (taken from published_date) are written
 * onto the passed-in article object before it is inserted.
 *
 * @param {Database} db - articles database
 * @param {Object} article - needs slug, title, author, content and
 *   published_date; categories and tags are optional arrays of names
 * @returns {Promise<Object>} `{ success: true }`, or `{ success: false, error }`
 *   for the first step that fails. Steps are not wrapped in a transaction, so
 *   earlier steps stay applied when a later one fails.
 */
async function insertArticle(db, article) {
  try {
    // TODO wrap in a transaction?

    // insert into article
    const [year, month] = article.published_date.split(/-/)
    article.year = year
    article.month = month
    const res = db
      .prepare('INSERT INTO article (slug, title, author, content, published_date, year, month) VALUES (@slug, @title, @author, @content, @published_date, @year, @month)')
      .run(article)
    const articleId = res.lastInsertRowid

    // insert into article_search for full-text search; the search row gets the
    // text of the content rather than its HTML
    const articleForSearch = Object.assign({}, article, {
      content: cheerio.load(article.content).text()
    })
    db
      .prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)')
      .run(articleForSearch)

    // associate categories; awaited one by one so that every link is written
    // before this function resolves and a failure is reported to the caller
    for (const name of article.categories || []) {
      const linked = await assocCategory(db, articleId, name)
      if (!linked.success) {
        return { success: false, error: linked.error }
      }
    }

    // associate tags
    for (const name of article.tags || []) {
      const linked = await assocTag(db, articleId, name)
      if (!linked.success) {
        return { success: false, error: linked.error }
      }
    }
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Perform a full-text search of the db.
 * @param {Database} db - articles database
 * @param {String} query - search query in SQLite3 fts5 syntax
 * @param {Number} limit - maximum number of rows to return
 * @param {Number} offset - number of rows to skip
 * @returns {Promise<Object[]>} rows with rank, author, title, slug and
 *   snippet, ordered by rank
 */
async function search(db, query, limit, offset) {
  // snippet() excerpts column index 2 of article_search, marking matches
  // with <b>…</b> (presumably the content column; confirm against the schema)
  const stmt = db.prepare(`
    SELECT rank, author, title, slug,
      snippet(article_search, 2, '<b>', '</b>', '...', 32) as snippet
    FROM article_search
    WHERE article_search MATCH @query
    ORDER BY rank
    LIMIT @limit OFFSET @offset
  `)
  return stmt.all({ query, limit, offset })
}
|
|
|
|
/**
 * Count the rows a full-text query matches.
 * @param {Database} db - articles database
 * @param {String} query - search query in SQLite3 fts5 syntax
 * @returns {Promise<Number>} number of matching article_search rows
 */
async function searchCount(db, query) {
  const row = db
    .prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`)
    .get({ query })
  return row.c
}
|
|
|
|
// TODO write a function to insert an article into the database
|
|
// TODO write a function to insert a category into the database
|
|
// TODO write a function to associate an article with categories in a database
|
|
// TODO write a function to scan a category for articles
|
|
// TODO write a function to scrape an article into an insertable object
|
|
|
|
// summaries for home page
|
|
|
|
/**
 * List the years shown on the home page, newest first.
 * @returns {Promise<Number[]>} every year from the current one down to 2013
 */
async function getYears() {
  const firstYear = 2013
  // getFullYear() replaces the deprecated getYear() + 1900
  const currentYear = new Date().getFullYear()
  const years = []
  for (let year = currentYear; year >= firstYear; year--) {
    years.push(year)
  }
  return years
}
|
|
|
|
/**
 * List all categories with a display form of their name.
 * @param {Database} db - articles database
 * @returns {Promise<Object[]>} `{ name, capitalized }` per category, in id
 *   order; capitalized has the first letter of every word upper-cased
 */
async function getCategories(db) {
  const rows = db.prepare('SELECT name FROM category ORDER by id').all()
  return rows.map(({ name }) => {
    let capitalized = name.split(/\s+/).map((word) => ucfirst(word)).join(' ')
    // "us" is an abbreviation, not a word
    if (capitalized === 'Us') capitalized = 'US'
    return { name, capitalized }
  })
}
|
|
|
|
/**
 * Fetch one page of the most-used tags.
 * @param {Database} db - articles database
 * @param {Number} limit - maximum number of tags to return
 * @param {Number} offset - number of tags to skip (default 0)
 * @returns {Promise<Object[]>} `{ name, count }` rows, most-used first
 */
async function getTagCloud(db, limit, offset=0) {
  // The inner query picks the page. The outer ORDER BY is needed because the
  // row order of the join result is otherwise unspecified; tag_id breaks ties
  // so that LIMIT/OFFSET paging is stable.
  const tags = db.prepare(`
    SELECT name, count FROM (
      SELECT tag_id, COUNT(1) AS count
      FROM article__tag
      GROUP BY tag_id
      ORDER BY count DESC, tag_id
      LIMIT @limit
      OFFSET @offset
    )
    INNER JOIN tag on tag_id = tag.id
    ORDER BY count DESC, tag_id
  `).all({ limit, offset })
  return tags
}
|
|
|
|
// SECTION: Archives
|
|
|
|
/**
 * Count the articles of one year, month by month.
 * @param {Database} db - articles database
 * @param {String|Number} year - value matched against article.year
 * @returns {Promise<Object[]>} one row per month that has articles, ordered by
 *   month, with c (count), year_month ("YYYY-MM"), m ("MM") and human
 *   (English month name)
 */
async function getArticleCounts(db, year) {
  // keyed by the zero-padded month the query returns as `m`
  const months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December'
  }
  const counts = db.prepare(`
    SELECT
      count(*) c,
      printf('%d-%02d', year, month) AS year_month,
      printf('%02d', month) AS m
    FROM article
    WHERE year = @year
    GROUP BY year, month
    ORDER BY year_month
  `).all({ year })
  return counts.map((row) => {
    row.human = months[row.m]
    return row
  })
}
|
|
|
|
/**
 * List the articles of one month, oldest first.
 * @param {Database} db - articles database
 * @param {String|Number} year - value matched against article.year
 * @param {String|Number} month - value matched against article.month
 * @returns {Promise<Object[]>} rows with year_month, author, title, slug,
 *   published_date and day (published_date formatted as 'MMMM d, y' in UTC)
 */
async function getArticlesByYearMonth(db, year, month) {
  const articles = db.prepare(`
    SELECT
      printf('%d-%02d', year, month) AS year_month,
      author, title, slug, published_date
    FROM article
    WHERE year = @year
      AND month = @month
    ORDER BY published_date, id
  `).all({ year, month })
  return articles.map((art) => {
    art.day = DateTime.fromISO(art.published_date).setZone('UTC').toFormat('MMMM d, y')
    return art
  })
}
|
|
|
|
// SECTION: Categories
|
|
|
|
/**
 * Fetch one page of the articles in a category, newest first.
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @param {Number} limit - maximum number of rows to return
 * @param {Number} offset - number of rows to skip
 * @returns {Promise<Object[]>} rows with slug, title, author, published_date
 *   and month (published_date formatted as 'MMMM y' in UTC)
 */
async function getArticlesByCategory(db, name, limit, offset) {
  const articles = db.prepare(`
    SELECT a.slug, a.title, a.author, a.published_date
    FROM article a
    JOIN article__category ac ON a.id = ac.article_id
    JOIN category c ON c.id = ac.category_id
    WHERE c.name = @name
    ORDER BY a.published_date DESC, a.id DESC
    LIMIT @limit
    OFFSET @offset
  `).all({ name, limit, offset })
  return articles.map((art) => {
    art.month = DateTime.fromISO(art.published_date).setZone('UTC').toFormat('MMMM y')
    return art
  })
}
|
|
|
|
/**
 * Count the articles in a category.
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @returns {Promise<Number>} number of articles linked to that category
 */
async function countArticlesByCategory(db, name) {
  const row = db.prepare(`
    SELECT count(*) AS c
    FROM article a
    JOIN article__category ac ON a.id = ac.article_id
    JOIN category c ON c.id = ac.category_id
    WHERE c.name = @name
  `).get({ name })
  return row.c
}
|
|
|
|
// SECTION: Tags
|
|
|
|
/**
 * Fetch one page of the articles carrying a tag, newest first.
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @param {Number} limit - maximum number of rows to return
 * @param {Number} offset - number of rows to skip
 * @returns {Promise<Object[]>} rows with slug, title, author, published_date
 *   and month (published_date formatted as 'MMMM y' in UTC)
 */
async function getArticlesByTag(db, name, limit, offset) {
  const articles = db.prepare(`
    SELECT a.slug, a.title, a.author, a.published_date
    FROM article a
    JOIN article__tag a_t ON a.id = a_t.article_id
    JOIN tag c ON c.id = a_t.tag_id
    WHERE c.name = @name
    ORDER BY a.published_date DESC, a.id DESC
    LIMIT @limit
    OFFSET @offset
  `).all({ name, limit, offset })
  return articles.map((art) => {
    art.month = DateTime.fromISO(art.published_date).setZone('UTC').toFormat('MMMM y')
    return art
  })
}
|
|
|
|
/**
 * Count the articles carrying a tag.
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @returns {Promise<Number>} number of articles linked to that tag
 */
async function countArticlesByTag(db, name) {
  const row = db.prepare(`
    SELECT count(*) AS c
    FROM article a
    JOIN article__tag a_t ON a.id = a_t.article_id
    JOIN tag c ON c.id = a_t.tag_id
    WHERE c.name = @name
  `).get({ name })
  return row.c
}
|
|
|
|
/**
 * Count all tags.
 * @param {Database} db - articles database
 * @returns {Promise<Number>} number of rows in the tag table
 */
async function countTags(db) {
  const row = db.prepare(`SELECT count(*) c FROM tag`).get()
  return row.c
}
|
|
|
|
module.exports = {
|
|
db,
|
|
fetchFeed,
|
|
fetchArticlesFromFeed,
|
|
scanCategory,
|
|
scanCategoryPage,
|
|
unhyphen,
|
|
extractTaxonomyFromArticle,
|
|
articleExists,
|
|
fetchArticle,
|
|
getCategory,
|
|
getTag,
|
|
assocCategory,
|
|
assocTag,
|
|
insertCategory,
|
|
insertTag,
|
|
insertArticle,
|
|
search,
|
|
searchCount,
|
|
getYears,
|
|
getCategories,
|
|
getTagCloud,
|
|
getArticleCounts,
|
|
getArticlesByYearMonth,
|
|
getArticlesByCategory,
|
|
countArticlesByCategory,
|
|
getArticlesByTag,
|
|
countArticlesByTag,
|
|
countTags
|
|
}
|