lib/pdf/reader/page_layout.rb in pdf-reader-2.5.0 vs lib/pdf/reader/page_layout.rb in pdf-reader-2.6.0

- old
+ new

@@ -1,9 +1,10 @@ # coding: utf-8 # frozen_string_literal: true require 'pdf/reader/overlapping_runs_filter' +require 'pdf/reader/zero_width_runs_filter' class PDF::Reader # Takes a collection of TextRun objects and renders them into a single # string that best approximates the way they'd appear on a rendered PDF page. @@ -15,14 +16,16 @@ DEFAULT_FONT_SIZE = 12 def initialize(runs, mediabox) raise ArgumentError, "a mediabox must be provided" if mediabox.nil? - @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs)) + runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs) + runs = OverlappingRunsFilter.exclude_redundant_runs(runs) + @runs = merge_runs(runs) @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0 - @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0 + @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0 @page_width = (mediabox[2] - mediabox[0]).abs @page_height = (mediabox[3] - mediabox[1]).abs @x_offset = @runs.map(&:x).sort.first || 0 lowest_y = @runs.map(&:y).sort.first || 0 @y_offset = lowest_y > 0 ? 0 : lowest_y @@ -65,11 +68,11 @@ def row_count @row_count ||= (@page_height / @mean_font_size).floor end def col_count - @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor + @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor end def row_multiplier @row_multiplier ||= @page_height.to_f / row_count.to_f end @@ -84,15 +87,15 @@ else collection.inject(0) { |accum, v| accum + v} / collection.size.to_f end end - def each_line(&block) - @runs.sort.group_by { |run| - run.y.to_i - }.map { |y, collection| - yield y, collection - } + def median(collection) + if collection.size == 0 + 0 + else + collection.sort[(collection.size * 0.5).floor] + end end # take a collection of TextRun objects and merge any that are in close # proximity def merge_runs(runs)