Sha256: bce82b29d851b362ceb8fe0e2bb0486cb15e4d4fa46c7f2783c57ce3fa45bcce

Contents?: true

Size: 1.64 KB

Versions: 5

Compression:

Stored size: 1.64 KB

Contents

require 'nokogiri'

module Mida

  # Class that holds the extracted Microdata
  class Document

    # An Array of Mida::Item objects.  These are all top-level
    # and hence not properties of other Items
    attr_reader :items

    # Create a new Microdata object
    #
    # [target] The string containing the html that you want to parse
    # [page_url] The url of target, used to form absolute urls. This must
    #            include the filename, e.g. index.html.
    def initialize(target, page_url=nil)
      @doc = Nokogiri(target)
      @page_url = page_url
      @items = extract_items
    end

    # Returns an array of matching Mida::Item objects. Each given item is
    # tested first, then the items nested in its property values, so a
    # parent always appears before the items it contains.
    #
    # [vocabulary] A regexp to match the item types against
    # [items] The items to search. Defaults to the document's top-level items
    def search(vocabulary, items=@items)
      found_items = []
      items.each do |item|
        # item.type can be nil; match such items against the empty string
        # so that a regexp which accepts "" still finds them
        subject = item.type.nil? ? "" : item.type
        found_items << item if subject =~ vocabulary
        found_items.concat(search_values(item.properties.values, vocabulary))
      end
      found_items
    end

  private

    # Builds a Mida::Item for every element that has an itemscope attribute
    # but no itemprop attribute (i.e. is not a property of another item).
    # Always returns an Array, so +items+ and +search+ never see nil.
    def extract_items
      items_doc = @doc.search('//*[@itemscope and not(@itemprop)]')
      return [] unless items_doc

      items_doc.map { |item_doc| Item.new(item_doc, @page_url) }
    end

    # Collects the matching items found inside a list of property values.
    # Mida::Item values are searched directly, Arrays are descended into,
    # and any other kind of value is ignored.
    def search_values(values, vocabulary)
      items = []
      values.each do |value|
        case value
        when Mida::Item then items.concat(search(vocabulary, [value]))
        when Array then items.concat(search_values(value, vocabulary))
        end
      end
      items
    end

  end

end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
mida-0.1.3 lib/mida/document.rb
mida-0.1.2 lib/mida/document.rb
mida-0.1.1 lib/mida/document.rb
mida-0.1.0 lib/mida/document.rb
mida-0.0.0 lib/mida/document.rb