const config = require('./config') const axios = require('axios').default const Database = require('better-sqlite3') const cheerio = require('cheerio') const flatten = require('lodash.flatten') const RSSParser = require('rss-parser') const Bluebird = require('bluebird') const url = require('urlite') const ucfirst = require('ucfirst') const luxon = require('luxon') const DateTime = luxon.DateTime function db(options={}) { const dbPath = process.env.DSS_DB || './main.db' return new Database(dbPath, options) } async function fetchFeed(feedURL) { const parser = new RSSParser() const res = await axios.get(feedURL || `https://${config.domain}/feed/`) const feed = await parser.parseString(res.data) return feed } /** * Fetch the RSS feed and return its parsed contents. * @param {String} feedURL - URL of the feed * @returns {Object} parsed RSS feed */ async function fetchArticlesFromFeed(feedURL) { const feed = await fetchFeed(feedURL || `https://${config.domain}/feed/`); return feed.items.map((a) => { const slug = url.parse(a.link).pathname const article = { slug, title: a.title, author: a.creator, content: a['content:encoded'], categories: a.categories.map((cat) => cat.toLowerCase()), tags: [], published_date: a.isoDate, //published_date: DateTime.fromRFC2822(a.pubDate).toISO() } return article }) } async function scanCategory(baseURL, category, options={}) { const categoryURL = `${baseURL}/${category}` console.log(categoryURL) // scan first page of category to find out how many pages we have. const res = await axios.get(categoryURL) const $ = cheerio.load(res.data) let p p = (options.pages) ? options.pages : parseInt($('.pagination .pages').text().split(/\s/)[3]) const pages = [] for (let i = 1; i <= p; i++) { pages.push(i) } const scans = await Bluebird.map(pages, (n) => { return scanCategoryPage(baseURL, category, n) }, { concurrency: 8 }) const flatScans = flatten(scans) return flatScans } async function scanCategoryPage(baseURL, category, page) { var categoryURL if (page === 1) { categoryURL = `${baseURL}/${category}` } else { categoryURL = `${baseURL}/${category}/page/${page}` } const res = await axios.get(categoryURL) console.log(categoryURL) const $ = cheerio.load(res.data) const articles = $('article.item-list').map((i, article) => { const h2_a = $(article).find('h2.post-box-title a') const link = h2_a.attr('href') const slug = url.parse(link).pathname const title = h2_a.text().trim() const author = $(article).find('p.post-meta .post-meta-author').text().trim() const publishedDateText = $(article).find('p.post-meta .tie-date').text().trim() const publishedDate = DateTime.fromFormat(publishedDateText, 'LLLL d, yyyy', { zone: 'UTC' }).toISO() return { link, slug, title, author, published_date: publishedDate } }).toArray() return articles } function unhyphen(word) { const ws = word.split(/-/) return ws.slice(1).map((s) => s.toLowerCase()).join(" ") } function extractTaxonomyFromArticle($article, type) { const cs = $article.attr('class') .split(/\s+/) .filter((word) => word.match(new RegExp(`^${type}`))) const ts = cs.map(unhyphen) return ts } async function articleExists(db, slug) { const res = db.prepare(`SELECT count(*) c FROM article WHERE slug = @slug`).get({slug}) return res.c > 0 } async function fetchArticle(url) { const res = await axios.get(url) const $ = cheerio.load(res.data) const $article = $('article') // get tags const tags = extractTaxonomyFromArticle($article, 'tag') // get categories const categories = extractTaxonomyFromArticle($article, 'category') // get content const content = $article.find('div.entry').html() return { tags, categories, content } } async function getCategory(db, name) { const category = await db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1').get(name) return category } async function getTag(db, name) { const tag = await db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1').get(name) return tag } async function assocCategory(db, articleId, name) { try { var categoryId const category = await getCategory(db, name) if (!category) { const res = await insertCategory(db, name) categoryId = res.lastInsertRowid } else { categoryId = category.id } const res2 = await db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)') .run(articleId, categoryId) } catch (e) { return { success: false, error: e } } return { success: true } } async function assocTag(db, articleId, name) { try { var tagId const tag = await getTag(db, name) if (!tag) { const res = await insertTag(db, name) tagId = res.lastInsertRowid } else { tagId = tag.id } const res2 = await db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)') .run(articleId, tagId) } catch (e) { return { success: false, error: e } } return { success: true } } async function insertCategory(db, name) { try { const res = db.prepare(`INSERT INTO category (name) VALUES (?)`).run(name) // not async apparently res.success = true return res } catch (e) { return { success: false, error: e } } } async function insertTag(db, name) { try { const res = db.prepare(`INSERT INTO tag (name) VALUES (?)`).run(name) // not async apparently res.success = true return res } catch (e) { return { success: false, error: e } } } async function insertArticle(db, article) { try { // TODO wrap in a transaction? // insert into article const [year, month] = article.published_date.split(/-/) article.year = year article.month = month const insert = db.prepare('INSERT INTO article (slug, title, author, content, published_date, year, month) VALUES (@slug, @title, @author, @content, @published_date, @year, @month)') const res = await insert.run(article) const articleId = res.lastInsertRowid // insert into article_search for full-text search const insertArticleSearch = db.prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)') const articleForSearch = Object.assign({}, article) articleForSearch.content = cheerio.load(article.content).text() await insertArticleSearch.run(articleForSearch) // associate categories article.categories.forEach((async (name) => { await assocCategory(db, articleId, name) })) // associate tags if (article.tags) { article.tags.forEach((async (name) => { await assocTag(db, articleId, name) })) } } catch (e) { return { success: false, error: e } } return { success: true } } /** * Perform a full-text search of the db. * @param {Database} db - articles database * @param {String} query - search query in SQLite3 fts5 syntax */ async function search(db, query, limit, offset) { const stmt = db.prepare(` SELECT rank, author, title, slug, snippet(article_search, 2, '', '', '...', 32) as snippet FROM article_search WHERE article_search MATCH @query ORDER BY rank LIMIT @limit OFFSET @offset `) const res = stmt.all({ query, limit, offset }) return res } async function searchCount(db, query) { const stmt = db.prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`) const res = stmt.get({ query }) return res.c } // summaries for home page async function getYears() { const firstYear = 2013 const currentYear = (new Date()).getYear() + 1900 const years = [] for (let i = firstYear; i <= currentYear; i++) { years.unshift(i) } return years } async function getCategories(db) { const cats = db.prepare('SELECT name FROM category ORDER by id').all() const cats2 = cats.map((c) => { let capitalized = c.name.split(/\s+/).map(ucfirst).join(' ') if (capitalized == 'Us') capitalized = 'US' return { name: c.name, capitalized } }) return cats2 } async function getCategoriesForArticle(db, id) { const categories = db.prepare(` SELECT c.name FROM category c JOIN article__category ac ON ac.category_id = c.id WHERE ac.article_id = @id `).all({ id }).map((c) => c.name) return categories } async function getTagCloud(db, limit, offset=0) { const tags = db.prepare(` SELECT name, c FROM ( SELECT tag_id, COUNT(1) AS c FROM article__tag GROUP BY tag_id ORDER BY c DESC LIMIT @limit OFFSET @offset ) INNER JOIN tag on tag_id = tag.id `).all({ limit, offset }) return tags } // SECTION: Articles async function getArticleBySlug(db, slug) { const article = db.prepare('SELECT * FROM article WHERE slug = @slug').get({ slug }) if (article) { const tagsById = await getTagsForArticles(db, [ article.id ]) article.tags = tagsById[article.id] article.categories = await getCategoriesForArticle(db, article.id) delete article.year_month } return article } // SECTION: Archives async function getArticleCounts(db, year) { const months = { '01': 'January', '02': 'February', '03': 'March', '04': 'April', '05': 'May', '06': 'June', '07': 'July', '08': 'August', '09': 'September', '10': 'October', '11': 'November', '12': 'December' } const counts = db.prepare(` SELECT count(*) c, printf('%d-%02d', year, month) AS year_month, printf('%02d', month) AS m FROM article WHERE year = @year GROUP BY year, month ORDER BY year_month `).all({ year }) return counts.map((c) => { c.human = months[c.m] return c }) } async function getArticlesByYearMonth(db, year, month) { const articles = db.prepare(` SELECT id, printf('%d-%02d', year, month) AS year_month, author, title, slug, published_date FROM article WHERE year = @year AND month = @month ORDER BY published_date, id `).all({ year, month }) const articlesWithDay = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.day = d.toFormat('MMMM d, y') return art }) return articlesWithDay } // SECTION: Categories async function getArticlesByCategory(db, name, limit, offset) { const articles = db.prepare(` SELECT a.id, a.slug, a.title, a.author, a.published_date FROM article a JOIN article__category ac ON a.id = ac.article_id JOIN category c ON c.id = ac.category_id WHERE c.name = @name ORDER BY a.published_date DESC, a.id DESC LIMIT @limit OFFSET @offset `).all({ name, limit, offset }) const articlesWithMonth = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.month = d.toFormat('MMMM y') return art }) return articlesWithMonth } async function countArticlesByCategory(db, name) { const count = db.prepare(` SELECT count(*) AS c FROM article a JOIN article__category ac ON a.id = ac.article_id JOIN category c ON c.id = ac.category_id WHERE c.name = @name `).get({ name }) return count.c } // SECTION: Tags async function getArticlesByTag(db, name, limit, offset) { const articles = db.prepare(` SELECT a.id, a.slug, a.title, a.author, a.published_date FROM article a JOIN article__tag a_t ON a.id = a_t.article_id JOIN tag c ON c.id = a_t.tag_id WHERE c.name = @name ORDER BY a.published_date DESC, a.id DESC LIMIT @limit OFFSET @offset `).all({ name, limit, offset }) const articlesWithMonth = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.month = d.toFormat('MMMM y') return art }) return articlesWithMonth } async function countArticlesByTag(db, name) { const count = db.prepare(` SELECT count(*) AS c FROM article a JOIN article__tag a_t ON a.id = a_t.article_id JOIN tag c ON c.id = a_t.tag_id WHERE c.name = @name `).get({ name }) return count.c } async function countTags(db) { const count = db.prepare(`SELECT count(*) c FROM tag`).get() return count.c } function questionMarks(list){ let s = '' for (let i = 0; i < list.length; i++) { s += '?, ' } return s.replace(/, $/, '') } /** * Return a list of tags for the given list of article ids * @param {Database} db - SQLite3 database instance * @param {Array} ids - a list of article ids * @returns {Object} tags grouped by article.id */ async function getTagsForArticles(db, ids) { const tags = db.prepare(` SELECT a.id, t.name FROM article a JOIN article__tag a_t ON a.id = a_t.article_id JOIN tag t ON t.id = a_t.tag_id WHERE a.id IN (${questionMarks(ids)}) ORDER BY a.id, t.name `).all(...ids).reduce((m, a) => { if (!m[a.id]) { m[a.id] = [] } m[a.id].push(a.name) return m }, {}) return tags } function applyTagsToArticles(articles, tags) { return articles.map((a) => { a.tags = tags[a.id] return a }) } module.exports = { db, fetchFeed, fetchArticlesFromFeed, scanCategory, scanCategoryPage, unhyphen, extractTaxonomyFromArticle, articleExists, fetchArticle, getCategory, getCategoriesForArticle, getTag, assocCategory, assocTag, insertCategory, insertTag, insertArticle, search, searchCount, getYears, getCategories, getTagCloud, getArticleBySlug, getArticleCounts, getArticlesByYearMonth, getArticlesByCategory, countArticlesByCategory, getArticlesByTag, countArticlesByTag, countTags, getTagsForArticles, applyTagsToArticles }