Sha256: 1a66aed3ee954a66b5ced8e7b39e6eba375f9e04d620d7d79d8b1bc991572c40

Contents?: true

Size: 1.86 KB

Versions: 5

Compression:

Stored size: 1.86 KB

Contents

# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
# probably makes sense only in cases where an upstream filter already has removed some blocks.

module Boilerpipe::Filters
  class BlockProximityFusion
    def initialize(max_blocks_distance, content_only, same_tag_level_only)
      @max_blocks_distance = max_blocks_distance
      @content_only = content_only
      @same_tag_level_only = same_tag_level_only
    end

    MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
    MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)

    def process(doc)
      text_blocks = doc.text_blocks
      return false if text_blocks.size < 2

      prev_block = if @content_only
                     text_blocks.find { |tb| tb.is_content? }
                   else
                     text_blocks.first
                   end

      return false if prev_block.nil?

      offset = text_blocks.index(prev_block) + 1
      blocks = text_blocks[offset..-1]

      blocks_to_remove = []

      blocks.each do |tb|
        if tb.is_not_content?
          prev_block = tb
          next
        end

        diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
        if diff_blocks <= @max_blocks_distance
          ok = true
          ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
          ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only

          if ok
            prev_block.merge_next(tb)
            blocks_to_remove << tb
          else
            prev_block = tb
          end
        end
      end
      doc.replace_text_blocks!(text_blocks - blocks_to_remove)
      doc
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/block_proximity_fusion.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/block_proximity_fusion.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/block_proximity_fusion.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/block_proximity_fusion.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/block_proximity_fusion.rb