Sha256: 53c6d633cb6d696d89764566be5175d3d682e578d621a19b6d6f41079daaffb7
Contents?: true
Size: 1.88 KB
Versions: 5
Compression:
Stored size: 1.88 KB
Contents
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
# probably makes sense only in cases where an upstream filter already has removed some blocks.

module Boilerpipe::Filters
  class BlockProximityFusion
    # max_blocks_distance - largest gap, counted in blocks, that two blocks may
    #                       have between them and still be fused.
    # content_only        - when true, the scan starts at the first block that
    #                       reports is_content? and a block is never fused into
    #                       a predecessor that reports is_not_content?.
    # same_tag_level_only - when true, two blocks are fused only if their
    #                       tag_level values are equal.
    def initialize(max_blocks_distance, content_only, same_tag_level_only)
      @max_blocks_distance = max_blocks_distance
      @content_only = content_only
      @same_tag_level_only = same_tag_level_only
    end

    # Preconfigured filters, all with a maximum distance of one block.
    MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
    MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)

    # Walks the document's text blocks and merges each content block into the
    # preceding block when the two are close enough and pass the configured
    # checks. Merged blocks are dropped from the document.
    #
    # doc - object exposing text_blocks and replace_text_blocks!.
    #
    # Returns false when the document has fewer than two text blocks or no
    # starting block could be found; otherwise returns doc.
    def process(doc)
      text_blocks = doc.text_blocks
      return false if text_blocks.size < 2

      prev_block = if @content_only
                     text_blocks.find { |tb| tb.is_content? }
                   else
                     text_blocks.first
                   end

      return false if prev_block.nil?

      # Start scanning right after the anchor block.
      offset = text_blocks.index(prev_block) + 1
      blocks_to_remove = []

      text_blocks[offset..-1].each do |tb|
        # Non-content blocks are never merged into a predecessor; they only
        # become the new anchor for whatever follows.
        if tb.is_not_content?
          prev_block = tb
          next
        end

        diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1

        if diff_blocks <= @max_blocks_distance && fusable?(prev_block, tb)
          prev_block.merge_next(tb)
          blocks_to_remove << tb
        else
          # Always advance the anchor when no merge happened, including when
          # the gap was too large. Otherwise every later block would be
          # measured against a stale anchor and adjacent blocks would never
          # be fused.
          prev_block = tb
        end
      end

      doc.replace_text_blocks!(text_blocks - blocks_to_remove)
      doc
    end

    private

    # True when the configured content-only and tag-level restrictions allow
    # tb to be merged into prev_block.
    def fusable?(prev_block, tb)
      return false if @content_only && (prev_block.is_not_content? || tb.is_not_content?)
      return false if @same_tag_level_only && prev_block.tag_level != tb.tag_level

      true
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems