#!/usr/bin/env jruby
# encoding: utf-8
#
# Command-line front end for Tabula: parses the options, extracts the
# characters of the requested PDF pages, builds a table from each page and
# writes it in the chosen output format.

require 'trollop'

require_relative '../lib/tabula'

# Output formats accepted by --format. Each name is also used as the name of
# the Tabula::Writers method that gets called (see #main).
FORMATS = ['CSV', 'TSV', 'HTML', 'JSON'].freeze

# One entry of --pages: a single page number ("3") or an inclusive range
# ("1-3"). Anchored so trailing garbage such as "1a" or "1-2-3" is rejected
# instead of being silently truncated by String#to_i.
PAGE_RANGE_PATTERN = /\A(\d+)\s*(?:-\s*(\d+))?\z/

# One coordinate of --area: a non-negative integer or decimal number.
AREA_COORDINATE_PATTERN = /\A\d+\.?\d*\z/

# Expands a --pages argument into a sorted list of page numbers.
#
# pages_arg - String with comma separated pages and/or inclusive ranges,
#             e.g. "1-3,5-7" or "3".
#
# Returns a sorted Array of Integers, or nil when the argument is empty,
# an entry is malformed, or a range is reversed (start greater than end).
def parse_pages_arg(pages_arg)
  pages = []

  pages_arg.split(',').map(&:strip).each do |range|
    match = PAGE_RANGE_PATTERN.match(range)
    return nil if match.nil?

    first = match[1].to_i
    last = match[2].nil? ? first : match[2].to_i
    return nil if first > last

    pages.concat((first..last).to_a)
  end

  # An empty argument selects nothing; report it as invalid rather than
  # handing the caller an empty page list.
  return nil if pages.empty?

  pages.sort
end

# Parses and validates the command line.
#
# Exits through Trollop::die when an option is invalid, when no filename is
# given, or when the given file does not exist.
#
# Returns a two-element Array: the Trollop options Hash and the PDF filename.
def parse_command_line
  opts = Trollop::options do
    version "tabula #{Tabula::VERSION} (c) 2012-2013 Manuel Aristarán"
    banner <<-EOS
Tabula helps you extract tables from PDFs

Usage:
       tabula [options] <pdf_file>
where [options] are:
EOS

    opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
    opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
    opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
    opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
  end

  # Validate here so that a bad --pages value is reported as a usage error
  # instead of reaching the extractor as nil.
  Trollop::die :pages, "is invalid" if parse_pages_arg(opts[:pages]).nil?

  unless opts[:area].nil?
    # Limit of -1 keeps trailing empty fields, so "1,2,3,4," is rejected.
    coordinates = opts[:area].split(',', -1).map(&:strip)
    area_valid = coordinates.size == 4 &&
                 coordinates.all? { |coordinate| coordinate =~ AREA_COORDINATE_PATTERN }
    Trollop::die :area, "is invalid" unless area_valid
  end

  Trollop::die :format, "is unknown" unless FORMATS.include?(opts[:format])
  Trollop::die "need one filename" if ARGV.empty?

  pdf_filename = ARGV.shift
  Trollop::die 'file does not exist' unless File.exist?(pdf_filename)

  [opts, pdf_filename]
end

# Entry point: extracts the requested pages and writes one table per page to
# the selected output (STDOUT when --outfile is '-').
def main
  opts, filename = parse_command_line

  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
  to_stdout = opts[:outfile] == '-'
  out = to_stdout ? $stdout : File.new(opts[:outfile], 'w')

  begin
    extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
    extractor.extract.each do |page|
      text = page.get_text(area)
      # opts[:format] was checked against FORMATS in parse_command_line, so
      # only those method names can be dispatched here.
      Tabula::Writers.send(opts[:format].to_sym, Tabula.make_table(text), out)
    end
  ensure
    # Close only a file opened here; $stdout belongs to the process.
    out.close unless to_stdout
  end
end

main