actions/process_pdfs.rb in documentcloud-cloud-crowd-0.0.5 vs actions/process_pdfs.rb in documentcloud-cloud-crowd-0.0.6

- old
+ new

@@ -4,12 +4,12 @@ # it all back into a tar archive for convenient download. # # See <tt>examples/process_pdfs_example.rb</tt> for more information. class ProcessPdfs < CloudCrowd::Action - # Split up a large pdf into single-page pdfs. - # The double pdftk shuffle fixes the document xrefs. + # Split up a large pdf into single-page pdfs. Batch them into 'batch_size' + # chunks for processing. The double pdftk shuffle fixes the document xrefs. def split `pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"` FileUtils.rm input_path pdfs = Dir["*.pdf_temp"] pdfs.each {|pdf| `pdftk #{pdf} output #{File.basename(pdf, '.pdf_temp')}.pdf`} @@ -39,10 +39,10 @@ # Merge all of the resulting images, all of the resulting text files, and # the concatenated merge of the full-text into a single tar archive, ready # for download. def merge - JSON.parse(input).each do |batch_url| + input.each do |batch_url| batch_path = File.basename(batch_url) download(batch_url, batch_path) `tar -xzf #{batch_path}` FileUtils.rm batch_path end