Sha256: 89d8d88e289671b80bd89e318960ba461a1a712d1898f3af9bee8a7bf43f2cfd

Contents?: true

Size: 1.88 KB

Versions: 3

Compression:

Stored size: 1.88 KB

Contents

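# Runs the tabular-page heuristic against a labelled list of files (each filename
# paired with the correct answer) and reports how many were classified correctly.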
require_relative '../lib/tabula'


# Labelled test set: every file under heuristic-test-set/spreadsheet is paired
# with `true` (expected to be tabular) and every file under
# heuristic-test-set/original is paired with `false`.
test_set_dir = File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set")

# Dir.glob makes no ordering promise on older Rubies, so sort the paths to keep
# the report order stable across runs and platforms.
should_use_spreadsheet = Dir.glob(File.join(test_set_dir, "spreadsheet", "*")).sort.map { |path| [path, true] }
should_use_original = Dir.glob(File.join(test_set_dir, "original", "*")).sort.map { |path| [path, false] }

# Result buckets; each classified file ends up in exactly one of them.
correct = []
misclassified_as_original = []
misclassified_as_spreadsheet = []



# The classification heuristic being evaluated: delegates to the page's own
# `is_tabular?` check. The loop below treats a truthy result as "this page
# should use the spreadsheet extraction algorithm".
#
# @param page the object produced by `extractor.extract.first` in the loop
#   below; it must respond to `is_tabular?`
# @return whatever `page.is_tabular?` returns
def heuristic(page)
  page.is_tabular?
end

# Run the heuristic on the first extracted page of every labelled file and put
# the file's path into exactly one of the three result buckets.
(should_use_spreadsheet + should_use_original).each do |pdf_path, expected_tabular|
  # NOTE(review): [1] presumably limits extraction to page 1 -- confirm against ObjectExtractor.
  pdf_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [1])
  first_page = pdf_extractor.extract.first

  # Ruling lines are gathered before the heuristic is consulted; presumably
  # is_tabular? relies on them -- TODO confirm.
  first_page.get_ruling_lines!
  # Debug aid: puts "#{File.basename(pdf_path)} | #{expected_tabular}"
  looks_tabular = heuristic(first_page)

  # Pick the bucket from the (actual, expected) pair: agreement is correct,
  # otherwise the bucket is named after what the heuristic wrongly chose.
  bucket =
    if looks_tabular
      expected_tabular ? correct : misclassified_as_spreadsheet
    else
      expected_tabular ? misclassified_as_original : correct
    end
  bucket << pdf_path
end

# Summary: overall totals first, then one itemised section for each kind of
# misclassification that actually occurred.
total_misclassified = misclassified_as_original.size + misclassified_as_spreadsheet.size
puts "#{correct.size} PDFs were correctly classified"
puts "#{total_misclassified} PDFs were incorrectly classified"

[
  [misclassified_as_spreadsheet,
   "PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"],
  [misclassified_as_original,
   "PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"]
].each do |bucket, complaint|
  # Empty buckets produce no section at all.
  next if bucket.empty?

  puts "#{bucket.size} #{complaint}"
  bucket.each { |path| puts " - #{File.basename(path)}" }
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
tabula-extractor-0.7.2-java test/heuristic.rb
tabula-extractor-0.7.1-java test/heuristic.rb
tabula-extractor-0.7.0-java test/heuristic.rb