module Tabula
  # A single page of a document: its text elements, its ruling lines, and the
  # tables/spreadsheets that can be derived from them.
  class Page < ZoneEntity
    include Tabula::HasCells

    attr_reader :rotation, :number_one_indexed, :file_path
    attr_writer :min_char_width, :min_char_height
    attr_accessor :cells

    # file_path        - path of the source document (stored as-is).
    # width, height    - page dimensions; the page zone is anchored at (0, 0).
    # rotation         - page rotation (stored as-is).
    # number           - ONE-indexed page number.
    # texts            - text elements on the page.
    # ruling_lines     - ruling lines on the page.
    # min_char_width,
    # min_char_height  - optional precomputed minimums; computed lazily from
    #                    the texts when nil.
    # spatial_index    - optional prebuilt index of text elements; when nil a
    #                    new TextElementIndex is filled from the page's texts.
    #
    # Raises ArgumentError if number is less than 1.
    def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
      super(0, 0, width, height)
      @rotation = rotation
      if number < 1
        raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid."
      end
      @ruling_lines = ruling_lines
      @file_path = file_path
      @number_one_indexed = number
      @cells = []
      @spreadsheets = nil
      @min_char_width = min_char_width
      @min_char_height = min_char_height

      # texts must be assigned before the bounding box below is computed:
      # computing it snaps ruling points, which reads the minimum char size.
      self.texts = texts

      # Add the edges of the rulings' bounding box as extra rulings. Non-finite
      # lines (produced when the box has infinite coordinates) are dropped.
      bounding_box_rulings = minimal_bounding_box_of_ruling_lines.to_lines.map do |l|
        Ruling.new(l.getY1, l.getX1, l.getX2 - l.getX1, l.getY2 - l.getY1)
      end
      @ruling_lines += bounding_box_rulings.select(&:finite?)

      if spatial_index.nil?
        @spatial_index = TextElementIndex.new
        self.texts.each { |te| @spatial_index << te }
      else
        @spatial_index = spatial_index
      end
    end

    # Returns a java.awt.geom.Rectangle2D::Float enclosing the ruling lines.
    # The horizontal extent is taken from the horizontal rulings only and the
    # vertical extent from the vertical rulings only. When there are no
    # rulings the minimums stay at Infinity, so the rectangle is not finite.
    def minimal_bounding_box_of_ruling_lines
      max_x = 0
      max_y = 0
      min_x = ::Float::INFINITY
      min_y = ::Float::INFINITY

      horizontal_ruling_lines.each do |t|
        min_x = t.left if t.left < min_x
        max_x = t.right if t.right > max_x
      end

      vertical_ruling_lines.each do |t|
        min_y = t.top if t.top < min_y
        max_y = t.bottom if t.bottom > max_y
      end

      java.awt.geom.Rectangle2D::Float.new(min_x, min_y, max_x - min_x, max_y - min_y)
    end

    # is there a scenario under which we'd prefer to use this over `minimal_bounding_box_of_ruling_lines`?
    # if so, what is it? If there are no ruling lines on the page _at all_, then adding this bounding box is
    # useless.
    #
    # Returns a java.awt.geom.Rectangle2D::Float spanning the x/y positions of
    # the text elements.
    # NOTE(review): only each element's x and y are considered, not its
    # width/height, so the box may stop short of the last glyph -- confirm
    # whether that is intended.
    def minimal_bounding_box_of_text_elements
      max_x = 0
      max_y = 0
      min_x = ::Float::INFINITY
      min_y = ::Float::INFINITY

      @texts.each do |t|
        min_x = t.x if t.x < min_x
        min_y = t.y if t.y < min_y
        max_x = t.x if t.x > max_x
        max_y = t.y if t.y > max_y
      end

      java.awt.geom.Rectangle2D::Float.new(min_x, min_y, max_x - min_x, max_y - min_y)
    end

    # Smallest width among the page's text elements (memoized once non-nil).
    # Returns nil when the page has no text elements.
    def get_min_char_width
      @min_char_width ||= texts.map(&:width).min
    end

    # Smallest height among the page's text elements (memoized once non-nil).
    # Returns nil when the page has no text elements.
    def get_min_char_height
      @min_char_height ||= texts.map(&:height).min
    end

    # Builds a PageArea for a sub-region of this page.
    #
    # area - either an Array [top, left, bottom, right] or an object that
    #        responds to width/height and is accepted by get_text and
    #        Ruling.crop_rulings_to_area.
    #
    # The PageArea receives the texts inside the area, the rulings cropped to
    # it, and this page's spatial index.
    def get_area(area)
      if area.is_a?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
      end

      texts = self.get_text(area)
      PageArea.new(file_path,
                   area.width,
                   area.height,
                   rotation,
                   number,
                   texts,
                   Ruling.crop_rulings_to_area(@ruling_lines, area),
                   texts.map(&:width).min,
                   texts.map(&:height).min,
                   @spatial_index)
    end

    #returns a Table object
    #
    # options - Hash; :vertical_rulings (default []) supplies explicit column
    #           separators. The whole hash is also passed to
    #           TextElement.merge_words.
    def get_table(options={})
      options = {:vertical_rulings => []}.merge(options)
      if texts.empty?
        return Tabula::Table.new(0, [])
      end

      texts = self.texts.sort
      text_chunks = TextElement.merge_words(texts, options)

      lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)

      columns = unless options[:vertical_rulings].empty?
                  options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
                else
                  TextChunk.column_positions(lines).sort
                end

      table = Table.new(lines.count, columns)
      lines.each_with_index do |line, i|
        line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
          # first column boundary at or to the right of the element's left
          # edge; elements past every boundary go in the last column.
          j = columns.find_index { |s| te.left <= s } || columns.count
          table.add_text_element(te, i, j)
        end
      end

      # fixes up the table a little bit, replacing nils with empty TextElements
      # and sorting the lines.
      # table.rows.each do |l|
      #   l.text_elements = l.text_elements.map do |te|
      #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
      #   end
      # end
      # table.rows.sort_by!(&:top)
      table
    end

    #for API backwards-compatibility reasons, this returns an array of arrays.
    def make_table(options={})
      get_table(options).rows
    end

    # returns the Spreadsheets; creating them if they're not memoized
    #
    # Because the result is memoized, options only have an effect on the
    # first call. With options[:fill_in_cells] set, cell texts are resolved
    # right after the spreadsheets are created.
    def spreadsheets(options={})
      unless @spreadsheets.nil?
        return @spreadsheets
      end

      @spreadsheets = spreadsheet_areas(options).map do |rect|
        spr = Spreadsheet.new(rect.y, rect.x,
                              rect.width, rect.height,
                              self,
                              #TODO: keep track of the cells, instead of getting them again inefficiently.
                              [],
                              vertical_ruling_lines.select{|vl| rect.intersectsLine(vl) },
                              horizontal_ruling_lines.select{|hl| rect.intersectsLine(hl) }
                             )
        spr.cells = @cells.select{|c| spr.overlaps?(c) }
        spr.add_spanning_cells!
        spr
      end
      if options[:fill_in_cells]
        fill_in_cells!
      end
      # @spreadsheets is set by now, so this returns the memoized value.
      spreadsheets
    end

    # Finds the cells on the page and returns the bounding rectangle of each
    # spreadsheet area derived from them.
    def spreadsheet_areas (options={})
      get_ruling_lines!(options)
      self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)

      spreadsheet_java_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.

      #transform each spreadsheet area into a rectangle
      # and get the cells contained within it.
      # getBounds2D is theoretically better than getBounds, but it returns a Rectangle2D.Double, which doesn't have our Ruby sugar on it.
      spreadsheet_java_areas.map{|a| a.getBounds }
    end

    # Assigns merged text to every cell of every spreadsheet on this page and
    # marks each spreadsheet's cells as resolved.
    def fill_in_cells!(options={})
      spreadsheets(options).each do |spreadsheet|
        spreadsheet.cells.each do |cell|
          # The page itself is the receiver: this class defines get_cell_text,
          # and no `page` method is defined in this file.
          cell.text_elements = get_cell_text(cell)
        end
        spreadsheet.cells_resolved = true
      end
    end

    # Page number. Pass :zero_indexed for a zero-based number; any other
    # value (default :one_indexed) returns the one-based number.
    def number(indexing_base=:one_indexed)
      if indexing_base == :zero_indexed
        return @number_one_indexed - 1
      else
        return @number_one_indexed
      end
    end

    # TODO no need for this, let's choose one name
    def ruling_lines
      get_ruling_lines!
    end

    # Collapsed horizontal rulings, or [] when the page has none.
    def horizontal_ruling_lines
      get_ruling_lines!
      @horizontal_ruling_lines.nil? ? [] : @horizontal_ruling_lines
    end

    # Collapsed vertical rulings, or [] when the page has none.
    def vertical_ruling_lines
      get_ruling_lines!
      @vertical_ruling_lines.nil? ? [] : @vertical_ruling_lines
    end

    #returns ruling lines, memoizes them in
    # @vertical_ruling_lines and @horizontal_ruling_lines.
    #
    # Returns [] when the page has no rulings. Otherwise snaps ruling
    # endpoints (on every call), drops zero-sized rulings, and returns the
    # vertical rulings followed by the horizontal ones. `options` is accepted
    # but not used here.
    def get_ruling_lines!(options={})
      if @ruling_lines.nil? || @ruling_lines.empty?
        return []
      end
      self.snap_points!

      @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }

      @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
      @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
      @vertical_ruling_lines + @horizontal_ruling_lines
    end

    ##
    # get text inside an area
    # area can be an Array ([top, left, bottom, right])
    # or a Rectangle2D
    # With no area (nil), all of the page's texts are returned; otherwise the
    # spatial index is queried for the area.
    def get_text(area=nil)
      # is_a? (rather than instance_of?) keeps this consistent with get_area.
      if area.is_a?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
      end
      if area.nil?
        texts
      else
        @spatial_index.contains(area)
      end
    end

    # Distributes the page's texts over the given areas: each text goes to the
    # first area that contains it (texts contained by none are skipped), then
    # every area's text elements are merged into words.
    def fill_in_cell_texts!(areas)
      texts.each do |t|
        area = areas.find{|a| a.contains(t) }
        area.text_elements << t unless area.nil?
      end
      areas.each do |area|
        area.text_elements = TextElement.merge_words(area.text_elements)
      end
    end

    # Text inside `area` (see get_text), merged into words.
    def get_cell_text(area=nil)
      TextElement.merge_words(self.get_text(area))
    end

    # JSON with the page's width, height, one-indexed number, rotation and texts.
    def to_json(options={})
      { :width => self.width,
        :height => self.height,
        :number => self.number,
        :rotation => self.rotation,
        :texts => self.texts
      }.to_json(options)
    end

    # Snaps ruling endpoints that are nearly aligned onto a shared coordinate.
    #
    # On each axis the points are sorted and grouped: a point joins the
    # current group while it is closer than the minimum char size (width for
    # x, height for y) to the group's first point. Every point in a group is
    # then moved to the average of the group's distinct coordinates, and
    # finally each ruling is rebuilt from its (mutated) endpoints.
    def snap_points!
      lines_to_points = {}
      points = []
      @ruling_lines.each do |line|
        # for a given line, each call to #p1 and #p2 creates a new
        # Point2D::Float object, rather than returning the same one over and
        # over again.
        # so we fetch each once and store the SAME objects both in the
        # per-line hash and in the flat array: mutating a point through the
        # array is then visible through the hash.
        point1 = line.p1
        point2 = line.p2
        lines_to_points[line] = [point1, point2]
        points.push(point1, point2)
      end

      # lines are stored separately from their constituent points, so the
      # points are modified first and the lines rebuilt from them afterwards.
      [[:x, :x=, self.get_min_char_width], [:y, :y=, self.get_min_char_height]].each do |getter, setter, cell_size|
        # With no text elements the minimum char size is nil; there is no
        # tolerance to snap with, and comparing a Float against nil would
        # raise, so leave this axis untouched.
        next if cell_size.nil?

        sorted_points = points.sort_by(&getter)
        first_point = sorted_points.shift
        grouped_points = sorted_points.inject([[first_point]]) do |memo, next_point|
          last = memo.last

          if (next_point.send(getter) - last.first.send(getter)).abs < cell_size
            last << next_point
          else
            memo << [next_point]
          end
          memo
        end
        grouped_points.each do |group|
          uniq_locs = group.map(&getter).uniq
          avg_loc = uniq_locs.sum / uniq_locs.size
          group.each{|p| p.send(setter, avg_loc) }
        end
      end

      lines_to_points.each do |l, p1_p2|
        # java_send with an explicit signature selects the
        # setLine(Point2D, Point2D) overload.
        l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
      end
    end
  end
end