266 lines
7.6 KiB
JavaScript
266 lines
7.6 KiB
JavaScript
const axios = require('axios').default
|
|
const cheerio = require('cheerio')
|
|
const flatten = require('lodash.flatten')
|
|
const RSSParser = require('rss-parser')
|
|
const Bluebird = require('bluebird')
|
|
const url = require('urlite')
|
|
const luxon = require('luxon')
|
|
const DateTime = luxon.DateTime
|
|
|
|
const parser = new RSSParser()
|
|
|
|
/**
 * Download an RSS feed over HTTP and hand the response body to the
 * module-level RSS parser.
 *
 * @param {String} feedURL - URL of the feed
 * @returns {Promise<Object>} whatever parser.parseString produces for the body
 */
async function getFeed(feedURL='https://dailystormer.su/feed/') {
  const { data: xml } = await axios.get(feedURL)
  const parsed = await parser.parseString(xml)
  return parsed
}
|
|
|
|
/**
 * Fetch an RSS feed and convert each of its items into an article record.
 *
 * @param {String} feedURL - URL of the feed
 * @returns {Promise<Object[]>} one object per feed item with the keys
 *   slug, title, author, content, categories, tags and published_date
 */
async function getArticlesFromFeed(feedURL='https://dailystormer.su/feed/') {
  const feed = await getFeed(feedURL)
  return feed.items.map((item) => ({
    // the slug is the path component of the item's link
    slug: url.parse(item.link).pathname,
    title: item.title,
    author: item.creator,
    content: item['content:encoded'],
    // an item may come without a `categories` list; treat that as "none"
    // instead of throwing on `.map` of undefined
    categories: (item.categories || []).map((cat) => cat.toLowerCase()),
    // tags are not read from the feed; this is always an empty list here
    tags: [],
    published_date: item.isoDate,
  }))
}
|
|
|
|
/**
 * Scan every page of a category listing and collect the article stubs found.
 *
 * The first page is fetched once here only to read the pagination text; it
 * is fetched again by scanCategoryPage when page 1 is scanned.
 *
 * @param {String} baseURL - prefix the category path is appended to
 * @param {String} category - category path segment
 * @param {Object} [options] - accepted but currently unused
 * @returns {Promise<Object[]>} flattened scanCategoryPage results for all pages
 */
async function scanCategory(baseURL, category, options={}) {
  const categoryURL = `${baseURL}/${category}`
  console.log(categoryURL)
  // scan first page of category to find out how many pages we have.
  const res = await axios.get(categoryURL)
  const $ = cheerio.load(res.data)
  // The page count is the 4th whitespace-separated token of the pagination
  // text (presumably "Page 1 of N" - confirm against the live markup).
  const parsed = Number.parseInt($('.pagination .pages').text().split(/\s/)[3], 10)
  // Without a usable pagination text the count is NaN; the listing itself
  // still exists, so scan it as a single page instead of scanning nothing.
  const pageCount = Number.isNaN(parsed) || parsed < 1 ? 1 : parsed
  const pages = Array.from({ length: pageCount }, (_, i) => i + 1)
  const scans = await Bluebird.map(pages, (n) => scanCategoryPage(baseURL, category, n), { concurrency: 4 })
  return flatten(scans)
}
|
|
|
|
/**
 * Fetch one page of a category listing and extract a stub for every
 * `article.item-list` element on it.
 *
 * @param {String} baseURL - prefix the category path is appended to
 * @param {String} category - category path segment
 * @param {Number} page - page number; page 1 is requested without a /page/N suffix
 * @returns {Promise<Object[]>} objects with link, slug, title, author and published_date
 */
async function scanCategoryPage(baseURL, category, page) {
  const categoryURL = page === 1
    ? `${baseURL}/${category}`
    : `${baseURL}/${category}/page/${page}`
  const res = await axios.get(categoryURL)
  console.log(categoryURL)
  const $ = cheerio.load(res.data)

  // Turn one listing element into a plain article stub.
  const toStub = (i, article) => {
    const $item = $(article)
    const $titleLink = $item.find('h2.post-box-title a')
    const link = $titleLink.attr('href')
    const slug = url.parse(link).pathname
    const title = $titleLink.text().trim()
    const author = $item.find('p.post-meta .post-meta-author').text().trim()
    const dateText = $item.find('p.post-meta .tie-date').text().trim()
    // the listing date is parsed as "<full month name> <day>, <year>" in UTC
    const published = DateTime.fromFormat(dateText, 'LLLL d, yyyy', { zone: 'UTC' })
    return { link, slug, title, author, published_date: published.toISO() }
  }

  return $('article.item-list').map(toStub).toArray()
}
|
|
|
|
/**
 * Drop everything up to the first hyphen, then join the remaining
 * hyphen-separated segments with single spaces, each lower-cased.
 * Example: "tag-Foo-Bar" becomes "foo bar".
 *
 * @param {String} word - hyphenated word
 * @returns {String} the trailing segments, lower-cased and space-separated
 */
function unhyphen(word) {
  const [, ...segments] = word.split('-')
  return segments.map((segment) => segment.toLowerCase()).join(' ')
}
|
|
|
|
/**
 * Read taxonomy terms out of an element's CSS classes.
 *
 * Classes of the form `<type>-<term>` are selected and converted with
 * unhyphen, e.g. type "tag" and class "tag-foo-bar" give "foo bar".
 *
 * @param {Object} $article - wrapped element exposing `.attr('class')`
 * @param {String} type - class prefix to look for, e.g. 'tag' or 'category'
 * @returns {String[]} the matching terms; empty when the element has no class attribute
 */
function extractTaxonomyFromArticle($article, type) {
  // `.attr('class')` is undefined when nothing matched or no class is set
  const classAttr = $article.attr('class') || ''
  // require the hyphen so that e.g. "tagged-x" is not mistaken for a tag
  const prefix = `${type}-`
  return classAttr
    .split(/\s+/)
    .filter((word) => word.startsWith(prefix))
    .map(unhyphen)
}
|
|
|
|
/**
 * Download an article page and pull out its tags, categories and body HTML.
 *
 * Note: the `url` parameter shadows the module-level `url` import inside
 * this function.
 *
 * @param {String} url - address of the article page
 * @returns {Promise<Object>} { tags, categories, content } where content is
 *   the inner HTML of the article's `div.entry`
 */
async function getArticle(url) {
  const { data: html } = await axios.get(url)
  const $ = cheerio.load(html)
  const $article = $('article')
  return {
    tags: extractTaxonomyFromArticle($article, 'tag'),
    categories: extractTaxonomyFromArticle($article, 'category'),
    content: $article.find('div.entry').html(),
  }
}
|
|
|
|
/**
 * Look up a category row by its name.
 *
 * @param {Database} db - articles database
 * @param {String} name - category name to match exactly
 * @returns {Promise<Object>} whatever the statement's get() yields for that name
 */
async function getCategory(db, name) {
  const lookup = db.prepare('SELECT id, name FROM category WHERE name = ? LIMIT 1')
  const row = await lookup.get(name)
  return row
}
|
|
|
|
/**
 * Link an article to a category, creating the category row first when no
 * category with that name exists yet.
 *
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - category name
 * @returns {Promise<Object>} { success: true }, or { success: false, error }
 *   when the lookup, the category insert or the link insert fails
 */
async function assocCategory(db, articleId, name) {
  try {
    let categoryId
    const category = await getCategory(db, name)
    if (category) {
      categoryId = category.id
    } else {
      const inserted = await insertCategory(db, name)
      // insertCategory reports failure through its return value rather than
      // by throwing; stop here so no link row is written with a missing id
      if (!inserted.success) {
        return { success: false, error: inserted.error }
      }
      categoryId = inserted.lastInsertRowid
    }
    await db.prepare('INSERT INTO article__category (article_id, category_id) VALUES (?, ?)')
      .run(articleId, categoryId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Look up a tag row by its name.
 *
 * @param {Database} db - articles database
 * @param {String} name - tag name to match exactly
 * @returns {Promise<Object>} whatever the statement's get() yields for that name
 */
async function getTag(db, name) {
  const lookup = db.prepare('SELECT id, name FROM tag WHERE name = ? LIMIT 1')
  const row = await lookup.get(name)
  return row
}
|
|
|
|
/**
 * Link an article to a tag, creating the tag row first when no tag with
 * that name exists yet.
 *
 * @param {Database} db - articles database
 * @param {Number} articleId - id of the article row
 * @param {String} name - tag name
 * @returns {Promise<Object>} { success: true }, or { success: false, error }
 *   when the lookup, the tag insert or the link insert fails
 */
async function assocTag(db, articleId, name) {
  try {
    let tagId
    const tag = await getTag(db, name)
    if (tag) {
      tagId = tag.id
    } else {
      const inserted = await insertTag(db, name)
      // insertTag reports failure through its return value rather than by
      // throwing; stop here so no link row is written with a missing id
      if (!inserted.success) {
        return { success: false, error: inserted.error }
      }
      tagId = inserted.lastInsertRowid
    }
    await db.prepare('INSERT INTO article__tag (article_id, tag_id) VALUES (?, ?)')
      .run(articleId, tagId)
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Insert a new category row.
 *
 * @param {Database} db - articles database
 * @param {String} name - category name
 * @returns {Promise<Object>} the object returned by run() with `success: true`
 *   added to it, or { success: false, error } when the insert throws
 */
async function insertCategory(db, name) {
  let outcome
  try {
    const statement = db.prepare(`INSERT INTO category (name) VALUES (?)`)
    // run() is used without await: the call is not async
    outcome = statement.run(name)
    outcome.success = true
  } catch (err) {
    outcome = { success: false, error: err }
  }
  return outcome
}
|
|
|
|
/**
 * Insert a new tag row.
 *
 * @param {Database} db - articles database
 * @param {String} name - tag name
 * @returns {Promise<Object>} the object returned by run() with `success: true`
 *   added to it, or { success: false, error } when the insert throws
 */
async function insertTag(db, name) {
  let outcome
  try {
    const statement = db.prepare(`INSERT INTO tag (name) VALUES (?)`)
    // run() is used without await: the call is not async
    outcome = statement.run(name)
    outcome.success = true
  } catch (err) {
    outcome = { success: false, error: err }
  }
  return outcome
}
|
|
|
|
/**
 * Store an article: the article row, its full-text search row, and its
 * category and tag associations.
 *
 * Associations are written one after another and awaited, so the returned
 * promise settles only once all of them are done. The first association
 * that fails is reported to the caller; rows written before that point are
 * not rolled back.
 *
 * @param {Database} db - articles database
 * @param {Object} article - record with slug, title, author, content,
 *   published_date and optional categories / tags arrays
 * @returns {Promise<Object>} { success: true } or { success: false, error }
 */
async function insertArticle(db, article) {
  try {
    // TODO wrap in a transaction?

    // insert into article
    const insert = db.prepare('INSERT INTO article (slug, title, author, content, published_date) VALUES (@slug, @title, @author, @content, @published_date)')
    const res = await insert.run(article)
    const articleId = res.lastInsertRowid

    // insert into article_search for full-text search; the search row
    // stores the text of the content rather than its HTML
    const insertArticleSearch = db.prepare('INSERT INTO article_search (author, title, content, slug) VALUES (@author, @title, @content, @slug)')
    const articleForSearch = Object.assign({}, article)
    articleForSearch.content = cheerio.load(article.content).text()
    await insertArticleSearch.run(articleForSearch)

    // associate categories (for...of so that each await is honoured)
    for (const name of article.categories || []) {
      const linked = await assocCategory(db, articleId, name)
      if (!linked.success) {
        return { success: false, error: linked.error }
      }
    }

    // associate tags
    for (const name of article.tags || []) {
      const linked = await assocTag(db, articleId, name)
      if (!linked.success) {
        return { success: false, error: linked.error }
      }
    }
  } catch (e) {
    return { success: false, error: e }
  }
  return { success: true }
}
|
|
|
|
/**
 * Perform a full-text search of the db.
 *
 * @param {Database} db - articles database
 * @param {String} query - search query in SQLite3 fts5 syntax
 * @param {Number} [limit=-1] - maximum number of rows; a negative LIMIT
 *   means "no upper bound" in SQLite
 * @param {Number} [offset=0] - number of leading rows to skip
 * @returns {Promise<Object[]>} rows with rank, author, title, slug and a
 *   snippet of the content column with matches wrapped in <b>...</b>
 */
async function search(db, query, limit=-1, offset=0) {
  const stmt = db.prepare(`
    SELECT rank, author, title, slug,
      snippet(article_search, 2, '<b>', '</b>', '...', 32) as snippet
    FROM article_search
    WHERE article_search MATCH @query
    ORDER BY rank
    LIMIT @limit OFFSET @offset
  `)
  return stmt.all({ query, limit, offset })
}
|
|
|
|
/**
 * Count the rows of article_search that match a full-text query.
 *
 * @param {Database} db - articles database
 * @param {String} query - search query passed to MATCH
 * @returns {Promise<Number>} value of the `c` column of the count row
 */
async function searchCount(db, query) {
  const { c: total } = db
    .prepare(`SELECT count(*) as c FROM article_search WHERE article_search MATCH @query`)
    .get({ query })
  return total
}
|
|
|
|
// Status of the original TODO list:
// DONE write a function to insert an article into the database (insertArticle)
// DONE write a function to insert a category into the database (insertCategory)
// DONE write a function to associate an article with categories in a database (assocCategory)
// DONE write a function to scan a category for articles (scanCategory, scanCategoryPage)
// TODO write a function to scrape an article into an insertable object
//      (getArticle currently returns only tags, categories and content)
|
|
|
|
module.exports = {
|
|
getFeed,
|
|
getArticlesFromFeed,
|
|
scanCategory,
|
|
scanCategoryPage,
|
|
unhyphen,
|
|
extractTaxonomyFromArticle,
|
|
getArticle,
|
|
getCategory,
|
|
assocCategory,
|
|
getTag,
|
|
assocTag,
|
|
insertCategory,
|
|
insertTag,
|
|
insertArticle,
|
|
search,
|
|
searchCount
|
|
}
|