Sha256: 43d4325ce24d0866b847ce6f9030758a782367201b3951699371f00ae3024c7d
Contents?: true
Size: 1.85 KB
Versions: 1
Compression:
Stored size: 1.85 KB
Contents
require "wriggler/version"
require "nokogiri"
require "find"
require "utf8_utils"

# Crawls a directory tree for HTML and XML files and collects, for every
# requested tag, the text found in each file.
#
# All state is kept in module-level instance variables (@content and
# @directory) which are reset on every call to Wriggler.crawl.
module Wriggler
  # Instance-level readers for classes that include this module. They do not
  # expose the module-level @content/@directory that Wriggler.crawl assigns.
  attr_reader :content, :directory

  # Walks +directory+ recursively and gathers the text of every requested tag.
  #
  # @param tags [Array, String, nil] CSS selectors / tag names to collect
  # @param directory [String] top-level directory to crawl; relative paths
  #   (including the default "") are resolved against the current directory
  # @return [Hash] each tag mapped to an Array that holds one Array of
  #   Strings per crawled file
  def self.crawl(tags = [], directory = "")
    @content = Array(tags).each_with_object({}) { |tag, collected| collected[tag] = [] }
    # An absolute path keeps Find.find correct after the working directory
    # has been switched to that same directory.
    @directory = File.expand_path(directory)
    navigate_directory
    @content
  end

  # NOTE: the helpers below are public singleton methods. A bare `private`
  # does not apply to `def self.` definitions, so they were always callable
  # from outside; they are left public to keep existing callers working.

  # Runs the crawl with the working directory set to @directory. The block
  # form of Dir.chdir restores the previous working directory afterwards.
  def self.navigate_directory
    Dir.chdir(@directory) { gather_files }
  end

  # Visits every regular file below @directory and hands the ones with a
  # recognised extension to open_next_file.
  def self.gather_files
    Find.find(@directory) do |file|
      # Find.find also yields directories; only regular files can be parsed.
      next unless File.file?(file)

      open_next_file(file) if is_XML?(file) || is_HTML?(file) || is_TXT?(file)
    end
  end

  # Opens +file+ and passes the handle to the parser matching its extension.
  # The block form of File.open closes the handle once parsing is done.
  # Files without a matching parser (e.g. .txt) are skipped without opening.
  def self.open_next_file(file)
    if is_HTML?(file)
      File.open(file) { |handle| set_HTML(handle) }
    elsif is_XML?(file)
      File.open(file) { |handle| set_XML(handle) }
    end
  end

  # @param file [String] path to test
  # @return [Boolean] true when the path's extension is .html (any case)
  def self.is_HTML?(file)
    File.extname(file.to_s).downcase == ".html"
  end

  # @param file [String] path to test
  # @return [Boolean] true when the path's extension is .xml (any case)
  def self.is_XML?(file)
    File.extname(file.to_s).downcase == ".xml"
  end

  # gather_files has always referenced this predicate, but it was never
  # defined. No text-file parser exists in this file, so matching files are
  # recognised here and then skipped by open_next_file.
  #
  # @param file [String] path to test
  # @return [Boolean] true when the path's extension is .txt (any case)
  def self.is_TXT?(file)
    File.extname(file.to_s).downcase == ".txt"
  end

  # Parses +file+ as HTML with Nokogiri and crawls the resulting document.
  def self.set_HTML(file)
    crawl_file(Nokogiri::HTML(file))
  end

  # Parses +file+ as XML with Nokogiri and crawls the resulting document.
  def self.set_XML(file)
    crawl_file(Nokogiri::XML(file))
  end

  # Appends one Array per requested tag to @content for the given document:
  # the sanitized text of every matching node, the whole serialized document
  # when the tag is "html"/"xml" and nothing matched, or [""] otherwise.
  #
  # @param doc [Nokogiri::XML::Document] parsed document to search
  def self.crawl_file(doc)
    @content.each_key do |key|
      selector = key.to_s
      nodes = doc.css(selector)
      texts =
        if nodes.any?
          nodes.map { |node| sanitize(node.text) }
        elsif %w[html xml].include?(selector)
          [doc.to_s]
        else
          [""]
        end
      @content.fetch(key) << texts
    end
  end

  # Cleans the text extracted from a node. crawl_file has always called this
  # helper, but it was never defined in this file.
  # NOTE(review): presumably the intent was utf8_utils' String#tidy_bytes
  # (hence the require above) - confirm. String#scrub is the stdlib fallback.
  #
  # @param text [String] raw node text
  # @return [String] text with invalid byte sequences repaired
  def self.sanitize(text)
    text = text.to_s
    text.respond_to?(:tidy_bytes) ? text.tidy_bytes : text.scrub
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
wriggler-1.5.0 | lib/wriggler.rb |