Sha256: def03f5809dbf2836bd7001da2a435c02e24381f24e57730e9b7eef683e7d868

Contents?: true

Size: 959 Bytes

Versions: 5

Compression:

Stored size: 959 Bytes

Contents

# Splits TextBlocks at paragraph boundaries.
#
# NOTE: This is not fully supported (i.e., it will break highlighting support via
# #getContainedTextElements()), but it is probably necessary for some other filters.
#
# see MinClauseWordsFilter

module Boilerpipe::Filters
  # Filter that breaks multi-line text blocks apart, one new block per line run.
  class SplitParagraphBlocksFilter
    # Splits each text block of +doc+ on runs of newline / carriage-return characters.
    #
    # A block whose text yields fewer than two pieces is kept as-is (same object).
    # Any other block is replaced, in place order, by one new TextBlock per piece.
    # Each new block is built from the piece's text alone and then receives the
    # source block's content flag and labels. The document's block list is only
    # replaced when at least one block was actually split.
    #
    # @param doc document responding to +text_blocks+ and +replace_text_blocks!+
    # @return the same +doc+ object that was passed in
    def self.process(doc)
      rebuilt = []
      split_happened = false

      doc.text_blocks.each do |block|
        pieces = block.text.split(/[\n\r]+/)

        if pieces.size >= 2
          # Read the source block's state once; every fragment inherits it.
          content_flag = block.is_content?
          inherited_labels = block.labels

          fragments = pieces.map do |piece|
            ::Boilerpipe::Document::TextBlock.new(piece).tap do |fragment|
              fragment.content = content_flag
              fragment.add_labels(inherited_labels)
            end
          end

          rebuilt.concat(fragments)
          split_happened = true
        else
          # Nothing to split: keep the original block untouched.
          rebuilt << block
        end
      end

      doc.replace_text_blocks!(rebuilt) if split_happened
      doc
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb