Sha256: 505ef0d59516d3eb55a3d569f2cba8e302b8f022f68333b0de407564d02b79ef

Contents?: true

Size: 1.39 KB

Versions: 1

Compression:

Stored size: 1.39 KB

Contents

namespace :transcribable do
  # Harvests the documents of one DocumentCloud project into the table
  # named by Transcribable.table.
  #
  # Reads `email`, `password` and `project` from config/documentcloud.yml,
  # fetches the account's project list, picks the configured project and
  # creates one record per document, keyed on the document's URL.
  # Records that already exist are left untouched, so the task can be
  # rerun without resetting their `verified` flag.
  desc "Harvest documents to transcribe from DocumentCloud"
  task :harvest => :environment do
    require 'rest-client'
    klass = Kernel.const_get(Transcribable.table.classify)
    dc = YAML.load(File.read("#{Rails.root.to_s}/config/documentcloud.yml"))
    response = JSON.parse(RestClient.get("https://#{CGI::escape(dc['email'])}:#{CGI::escape(dc['password'])}@www.documentcloud.org/api/projects.json"))

    # `project` may be written in the YAML either as a bare number (parsed as
    # an Integer) or as a string that starts with the numeric id. Coerce to a
    # String first so both forms work; calling a String method directly on an
    # Integer raised NoMethodError.
    project_id = dc['project'].to_s[/\A\d+/].to_i

    dc_project = Array(response['projects']).find { |project| project['id'] == project_id }
    abort "DocumentCloud project #{dc['project'].inspect} was not found for this account" unless dc_project

    Array(dc_project['document_ids']).each do |doc_id|
      begin
        dc_doc = JSON.parse(RestClient.get("https://www.documentcloud.org/api/documents/#{doc_id}.json"))['document']
      # The document request is unauthenticated; a 404 is treated as a
      # non-public document and skipped.
      rescue RestClient::ResourceNotFound
        next
      end

      obj = klass.find_or_initialize_by(url: "https://www.documentcloud.org/documents/#{dc_doc['id']}")

      # don't plow over verified docs if rerunning the script
      obj.verified = false if obj.new_record?

      # Only report success when the record was actually persisted.
      if obj.save
        puts "== added #{obj.url}"
      else
        warn "== could not save #{obj.url}: #{obj.errors.full_messages.join(', ')}"
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
transcribable-0.0.6 lib/tasks/harvester.rake