Sha256: 3121f5efdf23202c5db176d4636632124c4aa5fbd1b28623835d5378ade96d8b

Contents?: true

Size: 1.19 KB

Versions: 2

Compression:

Stored size: 1.19 KB

Contents

# encoding: UTF-8

module SemanticCrawler
  module Websites

    # Extracts the microdata items embedded in a web page and exposes them as
    # a nested Hash that can be serialised to JSON.
    #
    # The resulting structure maps the first type of every item to an Array of
    # property hashes; each property hash maps the property name (String) to
    # either an Array of literal strings or, for nested items, another Hash of
    # the same shape.
    class MicroData

      # Location that was handed to the constructor.
      attr_accessor :url
      # Hash produced by #extract_microdata for the fetched page.
      attr_accessor :microdata

      # Fetches +url+, parses the document and stores the extracted microdata.
      #
      # @param url [String] location of the page to analyse
      def initialize(url)
        self.url = url
        # The block form closes the stream as soon as the page has been parsed.
        # NOTE(review): Kernel#open only fetches remote URLs when open-uri has
        # patched it, and it is unsafe for untrusted input -- consider
        # URI.open once the supported Ruby versions allow it.
        html = open(url) { |io| Nokogiri::HTML(io).to_s }
        document = Microdata::Document.new(html)
        self.microdata = extract_microdata(document.extract_items)
      end

      # Serialises the extracted microdata as JSON.
      #
      # Any arguments are forwarded unchanged, so instances can also be
      # embedded in larger structures passed to JSON.generate, which hands its
      # generator state to #to_json.
      #
      # @return [String] JSON document
      def to_json(*args)
        microdata.to_json(*args)
      end

      # Returns the extracted microdata structure itself.
      #
      # NOTE(review): this returns the Hash rather than a String; kept as is
      # because callers may rely on it.
      def to_s
        microdata
      end

      private

      # Converts a list of microdata items (or literal property values) into
      # plain Ruby data.
      #
      # The decision between "literal values" and "items" is made by looking
      # at the first element only.
      # NOTE(review): lists mixing strings and items are therefore not
      # normalised -- confirm whether the parser can produce them.
      #
      # @param items [Array] either strings or objects responding to +type+
      #   and +properties+
      # @return [Hash, Array] the array itself when it holds literal strings,
      #   an empty Hash for an empty list, otherwise a Hash grouping the
      #   property hashes of the items by their first type
      # @raise [RuntimeError] if +items+ is not an Array or starts with nil
      def extract_microdata(items)
        raise "Not implemented!" unless items.kind_of?(Array)
        # A page (or property) without any entries simply yields no data.
        return {} if items.empty?

        first = items.first
        raise "Not implemented!" unless first
        return items if first.kind_of?(String)

        items.each_with_object({}) do |item, grouped|
          properties = {}
          item.properties.each do |name, values|
            properties[name.to_s] = extract_microdata(values)
          end
          # Array() tolerates a missing type; presumably that happens for
          # items without an itemtype attribute -- verify against the parser.
          # Such items are grouped under the nil key.
          type = Array(item.type).first
          # Create the bucket here so items without properties are kept too.
          (grouped[type] ||= []) << properties
        end
      end

    end

  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
semantic-crawler-0.7.1 lib/semantic_crawler/websites/micro_data.rb
semantic-crawler-0.7.0 lib/semantic_crawler/websites/micro_data.rb