Sha256: ce5a048244ff881c10a77cfe4f597afe9c4d660c51ec7ef0c4a763e01ab6abfd

Contents?: true

Size: 1.53 KB

Versions: 2

Compression:

Stored size: 1.53 KB

Contents

module Pageflow
  module Chart
    # Fetches the resources of a scraped site through a downloader and
    # assigns the results to the site's file attributes.
    class ScrapeSiteJob
      extend StateMachineJob
      # NOTE(review): presumably the queue name picked up by the job
      # backend - confirm against StateMachineJob.
      @queue = :scraping

      attr_reader :downloader

      # downloader - object used by `perform`; it has to respond to `load`
      #              and `load_all`.
      def initialize(downloader)
        @downloader = downloader
      end

      # Loads the page at `scraped_site.url`, passes its contents to a
      # Scraper and assigns the scraped HTML to `scraped_site.html_file`.
      # The files yielded by `load_all` for the scraper's javascript and
      # stylesheet URLs are assigned to `javascript_file` and
      # `stylesheet_file`. Afterwards the file loaded from
      # `scraped_site.csv_url` is assigned to `csv_file`.
      #
      # Returns :ok.
      def perform(scraped_site)
        downloader.load(scraped_site.url) do |page|
          scraper = Scraper.new(page.read, Chart.config.scraper_options)

          html_attributes = { file_name: 'file.html', content_type: 'text/html' }
          scraped_site.html_file = StringIOWithContentType.new(scraper.html, html_attributes)

          downloader.load_all(
            scraper.javascript_urls,
            extension: '.js',
            separator: "\n;"
          ) do |javascript_file|
            scraped_site.javascript_file = javascript_file
          end

          downloader.load_all(
            scraper.stylesheet_urls,
            extension: '.css',
            separator: "\n;"
          ) do |stylesheet_file|
            scraped_site.stylesheet_file = stylesheet_file
          end
        end

        downloader.load(scraped_site.csv_url) { |csv_file| scraped_site.csv_file = csv_file }

        :ok
      end

      # Builds the downloader that is handed to `initialize` (using the
      # site's URL as `base_url`) and runs `perform` with it. The `options`
      # argument is accepted but not used here.
      def self.perform_with_result(scraped_site, options = {})
        site_downloader = Downloader.new(base_url: scraped_site.url)
        new(site_downloader).perform(scraped_site)
      end
    end

    # StringIO that additionally reports a content type and an original
    # file name, both looked up in the options given at construction.
    class StringIOWithContentType < StringIO
      # string  - initial buffer contents, passed on to StringIO.
      # options - Hash expected to hold :content_type and :file_name. The
      #           keys are only looked up when the readers below are called.
      def initialize(string, options)
        @upload_options = options
        super(string)
      end

      # Returns the :content_type option. Hash#fetch raises KeyError when
      # the key is missing.
      def content_type
        fetch_option(:content_type)
      end

      # Returns the :file_name option. Hash#fetch raises KeyError when the
      # key is missing.
      def original_filename
        fetch_option(:file_name)
      end

      private

      # Strict lookup of a single option.
      def fetch_option(key)
        @upload_options.fetch(key)
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
pageflow-chart-0.1.1 app/jobs/pageflow/chart/scrape_site_job.rb
pageflow-chart-0.1.0 app/jobs/pageflow/chart/scrape_site_job.rb