meowmix.org asset scrapers

2020-09-22 22:56:16 +00:00
parent b36de6b714
commit ea2ac551dd
2 changed files with 48 additions and 0 deletions
@@ -0,0 +1,23 @@
+# A quick Ruby script to grab all of the Meowmix embedded video URLs
+# from its sitemap and download them to the local directory, one by one.
+# 09/19/2020
+
+# Using the Nokogiri gem for URL/XML parsing
+require 'nokogiri'
+require 'open-uri'
+
+# Read in the sitemap XML
+SITEMAP_URL = 'https://meowmix.org/video-sitemap-1.xml'
+sitemap_xml = Nokogiri::XML(URI.open(SITEMAP_URL)).remove_namespaces!
+
+# Get the video location URLs
+urls = sitemap_xml.xpath('//url').map do |url|
+  url.xpath('./video/content_loc').text
+end
+
+# Download the video files to this local directory
+urls.each do |video_url|
+  File.open(File.basename(video_url), 'wb') do |file|
+    file.write(URI.open(video_url).read)
+  end
+end
@@ -0,0 +1,25 @@
+# Grab all URLs for https://meowmix.org and write them to a file.
+# 09/12/2020
+
+# Nokogiri for the URL/XML parsing
+require 'nokogiri'
+require 'open-uri'
+
+BASE_URL = 'https://meowmix.org'
+SITEMAP = 'sitemap-1.xml'
+URL_NODE_XPATH = '//url'
+URL_TEXT_XPATH = './loc'
+
+sitemap_xml = Nokogiri::XML(URI.open(BASE_URL + '/' + SITEMAP))
+sitemap_xml.remove_namespaces!
+
+urls = []
+website_pages = sitemap_xml.xpath(URL_NODE_XPATH)
+website_pages.each do |url_node|
+  url_text = url_node.xpath(URL_TEXT_XPATH).text
+  urls << url_text
+end
+
+File.open('Meowmix_urls.txt', 'w') do |file|
+  file.write(urls.join("\n"))
+end