Sha256: cff53e8871c26744a45db02f9f24f763b00c038187af8f85281f27cec3eb93df

Contents?: true

Size: 1.88 KB

Versions: 6

Compression:

Stored size: 1.88 KB

Contents

# typed: true
# coding: utf-8
# frozen_string_literal: true

class PDF::Reader
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
  # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
  class OverlappingRunsFilter

    # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
    # have identical characters) then one will be discarded
    OVERLAPPING_THRESHOLD = 0.5

    def self.exclude_redundant_runs(runs)
      sweep_line_status = Array.new
      event_point_schedule = Array.new
      to_exclude = []

      runs.each do |run|
        event_point_schedule << EventPoint.new(run.x, run)
        event_point_schedule << EventPoint.new(run.endx, run)
      end

      event_point_schedule.sort! { |a,b| a.x <=> b.x }

      event_point_schedule.each do |event_point|
        run = event_point.run

        if event_point.start?
          if detect_intersection(sweep_line_status, event_point)
            to_exclude << run
          end
          sweep_line_status.push(run)
        else
          sweep_line_status.delete(run)
        end
      end
      runs - to_exclude
    end

    def self.detect_intersection(sweep_line_status, event_point)
      sweep_line_status.each do |open_text_run|
        if open_text_run.text == event_point.run.text &&
            event_point.x >= open_text_run.x &&
            event_point.x <= open_text_run.endx &&
            open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
          return true
        end
      end
      return false
    end
  end

  # Utility class used to avoid modifying the underlying TextRun objects while we're
  # looking for duplicates
  class EventPoint

    attr_reader :x

    attr_reader :run

    def initialize(x, run)
      @x = x
      @run = run
    end

    def start?
      @x == @run.x
    end
  end

end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
pdf-reader-2.10.0 lib/pdf/reader/overlapping_runs_filter.rb
pdf-reader-2.9.2 lib/pdf/reader/overlapping_runs_filter.rb
pdf-reader-2.9.1 lib/pdf/reader/overlapping_runs_filter.rb
pdf-reader-2.9.0 lib/pdf/reader/overlapping_runs_filter.rb
pdf-reader-2.8.0 lib/pdf/reader/overlapping_runs_filter.rb
pdf-reader-2.7.0 lib/pdf/reader/overlapping_runs_filter.rb