const axios = require('axios').default const Database = require('better-sqlite3') const cheerio = require('cheerio') const flatten = require('lodash.flatten') const RSSParser = require('rss-parser') const Bluebird = require('bluebird') const url = require('urlite') const ucfirst = require('ucfirst') const luxon = require('luxon') const DateTime = luxon.DateTime function db(options={}) { const dbPath = process.env.DSS_DB || './main.db' return new Database(dbPath, options) } async function fetchFeed(feedURL='https://dailystormer.su/feed/') { const parser = new RSSParser() const res = await axios.get(feedURL) const feed = await parser.parseString(res.data) return feed } /** * Fetch the RSS feed and return its parsed contents. * @param {String} feedURL - URL of the feed * @returns {Object} parsed RSS feed */ async function fetchArticlesFromFeed(feedURL='https://dailystormer.su/feed/') { const feed = await fetchFeed(feedURL); return feed.items.map((a) => { const slug = url.parse(a.link).pathname const article = { slug, title: a.title, author: a.creator, content: a['content:encoded'], categories: a.categories.map((cat) => cat.toLowerCase()), tags: [], published_date: a.isoDate, //published_date: DateTime.fromRFC2822(a.pubDate).toISO() } return article }) } async function scanCategory(baseURL, category, options={}) { const categoryURL = `${baseURL}/${category}` console.log(categoryURL) // scan first page of category to find out how many pages we have. const res = await axios.get(categoryURL) const $ = cheerio.load(res.data) let p p = (options.pages) ? options.pages : parseInt($('.pagination .pages').text().split(/\s/)[3]) const pages = [] for (let i = 1; i <= p; i++) { pages.push(i) } const scans = await Bluebird.map(pages, (n) => { return scanCategoryPage(baseURL, category, n) }, { concurrency: 8 }) const flatScans = flatten(scans) return flatScans } async function scanCategoryPage(baseURL, category, page) { var categoryURL if (page === 1) { categoryURL = `${baseURL}/${category}` } else { categoryURL = `${baseURL}/${category}/page/${page}` } const res = await axios.get(categoryURL) console.log(categoryURL) const $ = cheerio.load(res.data) const articles = $('article.item-list').map((i, article) => { const h2_a = $(article).find('h2.post-box-title a') const link = h2_a.attr('href') const slug = url.parse(link).pathname const title = h2_a.text().trim() const author = $(article).find('p.post-meta .post-meta-author').text().trim() const publishedDateText = $(article).find('p.post-meta .tie-date').text().trim() const publishedDate = DateTime.fromFormat(publishedDateText, 'LLLL d, yyyy', { zone: 'UTC' }).toISO() return { link, slug, title, author, published_date: publishedDate } }).toArray() return articles } function unhyphen(word) { const ws = word.split(/-/) return ws.slice(1).map((s) => s.toLowerCase()).join(" ") } function extractTaxonomyFromArticle($article, type) { const cs = $article.attr('class') .split(/\s+/) .filter((word) => word.match(new RegExp(`^${type}`))) const ts = cs.map(unhyphen) return ts } async function articleExists(db, slug) { const res = db.prepare(`SELECT count(*) c FROM article WHERE slug = @slug`).get({slug}) return res.c > 0 } // TODO rename to fetchArticle async function fetchArticle(url) { const res = await axios.get(url) const $ = cheerio.load(res.data) const $article = $('article') // get tags const tags = extractTaxonomyFromArticle($article, 'tag') // get categories const categories = extractTaxonomyFromArticle($article, 'category') // get content const content = $article.find('div.entry').html() return { tags, categories, content } } async function getCategory(db, name) { const category = await db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1').get(name) return category } async function getTag(db, name) { const tag = await db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1').get(name) return tag } async function assocCategory(db, articleId, name) { try { var categoryId const category = await getCategory(db, name) if (!category) { const res = await insertCategory(db, name) categoryId = res.lastInsertRowid } else { categoryId = category.id } const res2 = await db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)') .run(articleId, categoryId) } catch (e) { return { success: false, error: e } } return { success: true } } async function assocTag(db, articleId, name) { try { var tagId const tag = await getTag(db, name) if (!tag) { const res = await insertTag(db, name) tagId = res.lastInsertRowid } else { tagId = tag.id } const res2 = await db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)') .run(articleId, tagId) } catch (e) { return { success: false, error: e } } return { success: true } } async function insertCategory(db, name) { try { const res = db.prepare(`INSERT INTO category (name) VALUES (?)`).run(name) // not async apparently res.success = true return res } catch (e) { return { success: false, error: e } } } async function insertTag(db, name) { try { const res = db.prepare(`INSERT INTO tag (name) VALUES (?)`).run(name) // not async apparently res.success = true return res } catch (e) { return { success: false, error: e } } } async function insertArticle(db, article) { try { // TODO wrap in a transaction? // insert into article const insert = db.prepare('INSERT INTO article (slug, title, author, content, published_date) VALUES (@slug, @title, @author, @content, @published_date)') const res = await insert.run(article) const articleId = res.lastInsertRowid // insert into article_search for full-text search const insertArticleSearch = db.prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)') const articleForSearch = Object.assign({}, article) articleForSearch.content = cheerio.load(article.content).text() await insertArticleSearch.run(articleForSearch) // associate categories article.categories.forEach((async (name) => { await assocCategory(db, articleId, name) })) // associate tags if (article.tags) { article.tags.forEach((async (name) => { await assocTag(db, articleId, name) })) } } catch (e) { return { success: false, error: e } } return { success: true } } /** * Perform a full-text search of the db. * @param {Database} db - articles database * @param {String} query - search query in SQLite3 fts5 syntax */ async function search(db, query, limit, offset) { const stmt = db.prepare(` SELECT rank, author, title, slug, snippet(article_search, 2, '', '', '...', 32) as snippet FROM article_search WHERE article_search MATCH @query ORDER BY rank LIMIT @limit OFFSET @offset `) const res = stmt.all({ query, limit, offset }) return res } async function searchCount(db, query) { const stmt = db.prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`) const res = stmt.get({ query }) return res.c } // TODO write a function to insert an article into the database // TODO write a function to insert a category into the database // TODO write a function to associate an article with categories in a database // TODO write a function to scan a category for articles // TODO write a function to scrape an article into an insertable object // summaries for home page async function getYears() { // TODO replace with query? ...or at least somethign that doesn't have to be manually updated. return [ 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, ] } async function getCategories(db) { const cats = db.prepare('SELECT name FROM category ORDER by id').all() const cats2 = cats.map((c) => { let capitalized = c.name.split(/\s+/).map(ucfirst).join(' ') if (capitalized == 'Us') capitalized = 'US' return { name: c.name, capitalized } }) return cats2 } async function getTagCloud(db, limit, offset=0) { const tags = db.prepare(` SELECT count(t.id) as c, t.name FROM article__tag a_t LEFT JOIN article a ON a_t.article_id = a.id LEFT JOIN tag t ON a_t.tag_id = t.id GROUP BY t.id ORDER BY c DESC LIMIT @limit OFFSET @offset `).all({ limit, offset }) return tags } // SECTION: Archives async function getArticleCounts(db, year) { const months = { '01': 'January', '02': 'February', '03': 'March', '04': 'April', '05': 'May', '06': 'June', '07': 'July', '08': 'August', '09': 'September', '10': 'October', '11': 'November', '12': 'December' } const counts = db.prepare(` SELECT count(*) c, strftime('%Y-%m', published_date) AS year_month, strftime('%m', published_date) AS m FROM article GROUP BY year_month HAVING year_month LIKE @year ORDER BY year_month `).all({ year: `${year}%` }) return counts.map((c) => { c.human = months[c.m] return c }) } async function getArticlesByYearMonth(db, year, month) { const articles = db.prepare(` SELECT strftime('%Y-%m', published_date) AS year_month, author, title, slug, published_date FROM article WHERE year_month = @period ORDER BY published_date `).all({ period: `${year}-${month}`}) const articlesWithDay = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.day = d.toFormat('MMMM d, y') return art }) return articlesWithDay } // SECTION: Categories async function getArticlesByCategory(db, name, limit, offset) { const articles = db.prepare(` SELECT a.slug, a.title, a.author, a.published_date FROM article a JOIN article__category ac ON a.id = ac.article_id JOIN category c ON c.id = ac.category_id WHERE c.name = @name ORDER BY published_date DESC LIMIT @limit OFFSET @offset `).all({ name, limit, offset }) const articlesWithMonth = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.month = d.toFormat('MMMM y') return art }) return articlesWithMonth } async function countArticlesByCategory(db, name) { const count = db.prepare(` SELECT count(*) AS c FROM article a JOIN article__category ac ON a.id = ac.article_id JOIN category c ON c.id = ac.category_id WHERE c.name = @name `).get({ name }) return count.c } // SECTION: Tags async function getArticlesByTag(db, name, limit, offset) { const articles = db.prepare(` SELECT a.slug, a.title, a.author, a.published_date FROM article a JOIN article__tag ac ON a.id = ac.article_id JOIN tag c ON c.id = ac.tag_id WHERE c.name = @name ORDER BY published_date DESC LIMIT @limit OFFSET @offset `).all({ name, limit, offset }) const articlesWithMonth = articles.map((art) => { const d = DateTime.fromISO(art.published_date).setZone('UTC') art.month = d.toFormat('MMMM y') return art }) return articlesWithMonth } async function countArticlesByTag(db, name) { const count = db.prepare(` SELECT count(*) AS c FROM article a JOIN article__tag at ON a.id = at.article_id JOIN tag c ON c.id = at.tag_id WHERE c.name = @name `).get({ name }) return count.c } async function countTags(db) { const count = db.prepare(`SELECT count(*) c FROM tag`).get() return count.c } module.exports = { db, fetchFeed, fetchArticlesFromFeed, scanCategory, scanCategoryPage, unhyphen, extractTaxonomyFromArticle, articleExists, fetchArticle, getCategory, getTag, assocCategory, assocTag, insertCategory, insertTag, insertArticle, search, searchCount, getYears, getCategories, getTagCloud, getArticleCounts, getArticlesByYearMonth, getArticlesByCategory, countArticlesByCategory, getArticlesByTag, countArticlesByTag, countTags }