meowmix.org asset scrapers
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
# A quick Ruby script to grab all of the Meowmix embedded video URLs
|
||||
# from its sitemap and download them to the local directory, one by one.
|
||||
# 09/19/2020
|
||||
|
||||
# Nokogiri parses the sitemap XML; open-uri fetches the remote URLs
|
||||
require 'nokogiri'
|
||||
require 'open-uri'
|
||||
|
||||
# Read in the sitemap XML. The block form of URI.open closes the response
# stream as soon as Nokogiri has finished parsing it. Namespaces are stripped
# so the XPath below can use plain element names.
SITEMAP_URL = 'https://meowmix.org/video-sitemap-1.xml'
sitemap_xml = URI.open(SITEMAP_URL) { |response| Nokogiri::XML(response) }
sitemap_xml.remove_namespaces!

# Get the video location URLs. Selecting the content_loc nodes directly gives
# exactly one entry per video: a <url> without a video is skipped instead of
# producing an empty string, and a <url> with several videos no longer has
# their URLs concatenated into one.
urls = sitemap_xml.xpath('//url/video/content_loc')
                  .map { |node| node.text.strip }
                  .reject(&:empty?)
                  .uniq

# Download the video files to this local directory, one by one.
urls.each do |video_url|
  # Fetch first and open the local file second, so a failed request does not
  # leave an empty file behind. Both handles are closed by their blocks.
  URI.open(video_url) do |remote|
    File.open(File.basename(video_url), 'wb') do |file|
      # Stream the body to disk rather than reading the whole video into memory.
      IO.copy_stream(remote, file)
    end
  end
end
|
||||
@@ -0,0 +1,25 @@
|
||||
# Grab all URLs for https://meowmix.org and write them to a file.
|
||||
# 09/12/2020
|
||||
|
||||
# Nokogiri parses the sitemap XML; open-uri fetches it over HTTP
|
||||
require 'nokogiri'
|
||||
require 'open-uri'
|
||||
|
||||
# Location of the sitemap and the XPath expressions used to walk it. The
# XPaths carry no namespace prefixes because namespaces are stripped from the
# document right after parsing.
BASE_URL = 'https://meowmix.org'
SITEMAP = 'sitemap-1.xml'
URL_NODE_XPATH = '//url'
URL_TEXT_XPATH = './loc'

# Fetch and parse the sitemap. The block form of URI.open closes the response
# stream once Nokogiri has finished reading it.
sitemap_xml = URI.open("#{BASE_URL}/#{SITEMAP}") { |response| Nokogiri::XML(response) }
sitemap_xml.remove_namespaces!

# Collect the <loc> text of every <url> entry, in document order.
urls = sitemap_xml.xpath(URL_NODE_XPATH).map do |url_node|
  url_node.xpath(URL_TEXT_XPATH).text
end

# Write the URLs to Meowmix_urls.txt, newline-separated with no trailing
# newline.
File.write('Meowmix_urls.txt', urls.join("\n"))
|
||||
Reference in New Issue
Block a user