actions/process_pdfs.rb in documentcloud-cloud-crowd-0.0.5 vs actions/process_pdfs.rb in documentcloud-cloud-crowd-0.0.6
- old
+ new
@@ -4,12 +4,12 @@
# it all back into a tar archive for convenient download.
#
# See <tt>examples/process_pdfs_example.rb</tt> for more information.
class ProcessPdfs < CloudCrowd::Action
- # Split up a large pdf into single-page pdfs.
- # The double pdftk shuffle fixes the document xrefs.
+ # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
+ # chunks for processing. The double pdftk shuffle fixes the document xrefs.
def split
`pdftk #{input_path} burst output "#{file_name}_%05d.pdf_temp"`
FileUtils.rm input_path
pdfs = Dir["*.pdf_temp"]
pdfs.each {|pdf| `pdftk #{pdf} output #{File.basename(pdf, '.pdf_temp')}.pdf`}
@@ -39,10 +39,10 @@
# Merge all of the resulting images, all of the resulting text files, and
# the concatenated merge of the full-text into a single tar archive, ready
# for download.
def merge
- JSON.parse(input).each do |batch_url|
+ input.each do |batch_url|
batch_path = File.basename(batch_url)
download(batch_url, batch_path)
`tar -xzf #{batch_path}`
FileUtils.rm batch_path
end