process_pdfs.rb in documentcloud-cloud-crowd-0.0.6

- old
+ new

@@ -4,12 +4,12 @@
 # it all back into a tar archive for convenient download.
 #
 # See <tt>examples/process_pdfs_example.rb</tt> for more information.
 class ProcessPdfs < CloudCrowd::Action
   
-  # Split up a large pdf into single-page pdfs.
-  # The double pdftk shuffle fixes the document xrefs.
+  # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
+  # chunks for processing. The double pdftk shuffle fixes the document xrefs.
   def split
     `pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
     FileUtils.rm input_path
     pdfs = Dir["*.pdf_temp"]
     pdfs.each {|pdf| `pdftk #{pdf} output #{File.basename(pdf, '.pdf_temp')}.pdf`}
@@ -39,10 +39,10 @@
   
   # Merge all of the resulting images, all of the resulting text files, and
   # the concatenated merge of the full-text into a single tar archive, ready
   # for download.
   def merge
-    JSON.parse(input).each do |batch_url|
+    input.each do |batch_url|
       batch_path = File.basename(batch_url)
       download(batch_url, batch_path)
       `tar -xzf #{batch_path}`
       FileUtils.rm batch_path
     end