meowmix.org asset scrapers

This commit is contained in:
Yotsubaaa
2020-09-22 22:56:16 +00:00
parent b36de6b714
commit ea2ac551dd
2 changed files with 48 additions and 0 deletions
+23
View File
@@ -0,0 +1,23 @@
# A quick Ruby script to grab all of the Meowmix embedded video URLs
# from its sitemap and download them to the local directory, one by one.
# 09/19/2020
# Using the Nokogiri gem for URL/XML parsing
require 'nokogiri'
require 'open-uri'
# Read in the sitemap XML.
SITEMAP_URL = 'https://meowmix.org/video-sitemap-1.xml'

# The block form of URI.open closes the response stream once parsing is done.
sitemap_xml = URI.open(SITEMAP_URL) { |io| Nokogiri::XML(io) }
# Drop XML namespaces so the un-prefixed XPath below can match the elements.
sitemap_xml.remove_namespaces!

# Get the video location URLs.
# Each <content_loc> node is read on its own: calling .text on a whole node set
# would concatenate several URLs into one when a <url> entry lists more than
# one video, and would yield '' for entries that have no video at all.
urls = sitemap_xml.xpath('//url/video/content_loc')
                  .map { |node| node.text.strip }
                  .reject(&:empty?)
                  .uniq

# Download the video files to this local directory, one by one.
urls.each do |video_url|
  # Name the file after the URL's path component only, so a query string or
  # fragment does not end up in the local filename.
  filename = File.basename(URI.parse(video_url).path)
  if filename.empty? || filename == '/'
    warn "Skipping #{video_url}: cannot derive a local filename"
    next
  end

  # Open the remote stream first so a failed request does not leave an empty
  # file behind, then stream it to disk instead of holding the whole video in
  # memory. Both handles are closed by their blocks.
  URI.open(video_url) do |remote|
    File.open(filename, 'wb') { |file| IO.copy_stream(remote, file) }
  end
end
+25
View File
@@ -0,0 +1,25 @@
# Grab all URLs for https://meowmix.org and write them to a file.
# 09/12/2020
# Nokogiri for the URL/XML parsing
require 'nokogiri'
require 'open-uri'
BASE_URL = 'https://meowmix.org'
SITEMAP = 'sitemap-1.xml'
URL_NODE_XPATH = '//url'
URL_TEXT_XPATH = './loc'

# Fetch and parse the sitemap, then drop XML namespaces so the un-prefixed
# XPath expressions above can be used as-is.
sitemap_doc = Nokogiri::XML(URI.open([BASE_URL, SITEMAP].join('/')))
sitemap_doc.remove_namespaces!

# One entry per <url> node: the text of its <loc> child.
page_urls = sitemap_doc.xpath(URL_NODE_XPATH).map do |page_node|
  page_node.xpath(URL_TEXT_XPATH).text
end

# Write the URLs newline-separated (no trailing newline) to the output file.
File.write('Meowmix_urls.txt', page_urls.join("\n"))