Sha256: 14d03b16ad57e20cb15f5be545c27c301f6d8510b128800def4f469637fdf08b

Contents?: true

Size: 1.59 KB

Versions: 8

Compression:

Stored size: 1.59 KB

Contents

require 'algorithms'
module Tabula
  module Whitespace

    # Detect whitespace in a document (not yet used in Tabula)
    # Described in "Two Geometric Algorithms for layout analysis" (Thomas Breuer)
    # http://pdf.aminer.org/000/140/219/two_geometric_algorithms_for_layout_analysis.pdf

    def self.find_closest(text_elements, x, y)
      text_elements.sort_by { |te|
        Math.sqrt((x - te.midpoint[0]) ** 2 + (y - te.midpoint[1]) ** 2)
      }.first
    end


    def self.find_whitespace(text_elements, bounds)
      queue = Containers::PriorityQueue.new
      queue.push([bounds, text_elements], bounds.width * bounds.height)
      rv = []


      while !queue.empty?
        r, obstacles = queue.pop
        if obstacles.empty?
          return r
        end

        pivot = self.find_closest(obstacles, *r.midpoint)

        subrectangles = [
                         ZoneEntity.new(r.top, pivot.right, r.right - pivot.right, pivot.top - r.top),
                         ZoneEntity.new(r.top, r.left, pivot.left - r.left, pivot.top - r.top),
                         ZoneEntity.new(pivot.bottom, r.left, pivot.left - r.left, r.bottom - pivot.bottom),
                         ZoneEntity.new(pivot.bottom, pivot.right, r.right - pivot.right, r.bottom - pivot.bottom)
                        ]
        subrectangles.each do |sub_r|
          obs = obstacles.select { |s|
            s.overlaps?(sub_r)
          }
          if obs.empty?
            rv << sub_r
          else
            queue.push([sub_r, obs], sub_r.width * sub_r.height)
          end
        end
      end
      return rv
    end
  end
end

Version data entries

8 entries across 8 versions & 1 rubygems

Version Path
tabula-extractor-0.6.6-java lib/tabula/whitespace.rb
tabula-extractor-0.6.5-java lib/tabula/whitespace.rb
tabula-extractor-0.6.4-java lib/tabula/whitespace.rb
tabula-extractor-0.6.3-java lib/tabula/whitespace.rb
tabula-extractor-0.6.1-java lib/tabula/whitespace.rb
tabula-extractor-0.5.1-java lib/tabula/whitespace.rb
tabula-extractor-0.5.0-java lib/tabula/whitespace.rb
tabula-extractor-0.0.1-java lib/tabula/whitespace.rb