Sha256: 9a9844d9e0ec5adf5409d55bf218ff3ac6ce576e06d6a206e305ba0afe698cc9

Contents?: true

Size: 1.93 KB

Versions: 2

Compression:

Stored size: 1.93 KB

Contents

module Pageflow
  module Chart
    # Downloads the page of a scraped site together with the scripts,
    # stylesheets and CSV data it refers to, and assigns each result to
    # the matching file attribute of the scraped site.
    class ScrapeSiteJob
      extend StateMachineJob
      @queue = :scraping

      attr_reader :downloader

      # Class-level entry point: this is where the downloader passed to
      # `initialize` is created, using the site's URL as its base URL.
      # `options` is accepted but not used by this method.
      def self.perform_with_result(scraped_site, options = {})
        site_downloader = Downloader.new(base_url: scraped_site.url)
        new(site_downloader).perform(scraped_site)
      end

      # downloader - object used for all downloads; this class calls
      #              `load` and `load_all` on it.
      def initialize(downloader)
        @downloader = downloader
      end

      # Loads the site's page, scrapes it and stores the HTML, the
      # JavaScript and the stylesheets on `scraped_site`; afterwards
      # loads the CSV file. Returns :ok.
      def perform(scraped_site)
        downloader.load(scraped_site.url) do |page|
          scraper = Scraper.new(page.read, Chart.config.scraper_options)

          attach_html(scraped_site, scraper)
          attach_javascript(scraped_site, scraper)
          attach_stylesheets(scraped_site, scraper)
        end

        downloader.load(scraped_site.csv_url) { |csv| scraped_site.csv_file = csv }

        :ok
      end

      # JavaScript fragment handed to the downloader as `before_each`.
      # The leading semicolon terminates any preceding statement.
      def begin_try_catch
        ";try {\n"
      end

      # JavaScript fragment handed to the downloader as `after_each`;
      # closes the `try` opened by `begin_try_catch` and logs the error.
      def end_try_catch
        "\n} catch(e) { console.log('Datawrapper raised: ' + (e.message || e)); }\n"
      end

      private

      # Stores the scraper's HTML as an IO carrying a file name and a
      # content type.
      def attach_html(scraped_site, scraper)
        scraped_site.html_file = StringIOWithContentType.new(
          scraper.html,
          file_name: 'file.html',
          content_type: 'text/html'
        )
      end

      # Loads all scripts found by the scraper and assigns whatever the
      # downloader yields as the site's JavaScript file. The try/catch
      # fragments are passed as `before_each`/`after_each`; presumably
      # the downloader emits them around each script (confirm in
      # Downloader).
      def attach_javascript(scraped_site, scraper)
        downloader.load_all(
          scraper.javascript_urls,
          extension: '.js',
          before_each: begin_try_catch,
          after_each: end_try_catch
        ) do |bundle|
          scraped_site.javascript_file = bundle
        end
      end

      # Loads all stylesheets found by the scraper, passing "\n;" as the
      # separator, and assigns whatever the downloader yields as the
      # site's stylesheet file.
      def attach_stylesheets(scraped_site, scraper)
        downloader.load_all(
          scraper.stylesheet_urls,
          extension: '.css',
          separator: "\n;"
        ) do |bundle|
          scraped_site.stylesheet_file = bundle
        end
      end
    end

    # StringIO that additionally answers `content_type` and
    # `original_filename` from an options hash supplied at construction.
    class StringIOWithContentType < StringIO
      # string  - content of the IO.
      # options - Hash expected to contain :content_type and :file_name.
      #           Keys are looked up lazily, so a missing key only
      #           raises once the corresponding reader is called.
      def initialize(string, options)
        super(string)
        @metadata = options
      end

      # Value stored under :content_type; raises KeyError when absent.
      def content_type
        metadata_value(:content_type)
      end

      # Value stored under :file_name; raises KeyError when absent.
      def original_filename
        metadata_value(:file_name)
      end

      private

      # Strict lookup in the options given to the constructor.
      def metadata_value(key)
        @metadata.fetch(key)
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
pageflow-chart-0.2.1 app/jobs/pageflow/chart/scrape_site_job.rb
pageflow-chart-0.2.0 app/jobs/pageflow/chart/scrape_site_job.rb