lib/pdf/reader/page_layout.rb in pdf-reader-2.5.0 vs lib/pdf/reader/page_layout.rb in pdf-reader-2.6.0
- old
+ new
@@ -1,9 +1,10 @@
# coding: utf-8
# frozen_string_literal: true
require 'pdf/reader/overlapping_runs_filter'
+require 'pdf/reader/zero_width_runs_filter'
class PDF::Reader
# Takes a collection of TextRun objects and renders them into a single
# string that best approximates the way they'd appear on a rendered PDF page.
@@ -15,14 +16,16 @@
DEFAULT_FONT_SIZE = 12
def initialize(runs, mediabox)
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
- @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
+ @runs = merge_runs(runs)
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
@page_width = (mediabox[2] - mediabox[0]).abs
@page_height = (mediabox[3] - mediabox[1]).abs
@x_offset = @runs.map(&:x).sort.first || 0
lowest_y = @runs.map(&:y).sort.first || 0
@y_offset = lowest_y > 0 ? 0 : lowest_y
@@ -65,11 +68,11 @@
# Number of text rows on the page: the page height divided by the mean font
# size, rounded down. The value is computed once and cached in @row_count.
def row_count
  @row_count ||= begin
    rows_on_page = @page_height / @mean_font_size
    rows_on_page.floor
  end
end
# Number of character columns on the page: the page width divided by a
# representative glyph width, scaled by 1.05 and rounded down. The value is
# computed once and cached in @col_count.
#
# This change swaps the divisor from @mean_glyph_width to @median_glyph_width.
#
# NOTE(review): the initializer falls back to a glyph width of 0 when there
# are no runs, and a zero width would make this computation fail. Confirm
# callers never reach this method with an empty set of runs.
def col_count
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
+ @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
end
# Page height divided by the row count, with both operands converted to
# floats first. The value is computed once and cached in @row_multiplier.
def row_multiplier
  @row_multiplier ||= begin
    height = @page_height.to_f
    rows = row_count.to_f
    height / rows
  end
end
@@ -84,15 +87,15 @@
else
collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
end
end
- def each_line(&block)
- @runs.sort.group_by { |run|
- run.y.to_i
- }.map { |y, collection|
- yield y, collection
- }
# Returns a median-like value for the collection: the element at index
# floor(size * 0.5) of the sorted collection, or 0 when the collection is
# empty. For an even number of elements this is the upper of the two middle
# values; the two are not averaged. The input collection is not modified
# (sort returns a new array).
+ def median(collection)
+ if collection.size == 0
+ 0
+ else
+ collection.sort[(collection.size * 0.5).floor]
+ end
end
# take a collection of TextRun objects and merge any that are in close
# proximity
def merge_runs(runs)