Sha256: b1117393eadee8408e779d43e85e45dd918b44d7d901e3967bdc048ab74b0532

Contents?: true

Size: 962 Bytes

Versions: 1

Compression:

Stored size: 962 Bytes

Contents

# Splits TextBlocks at paragraph boundaries.
#
# NOTE: Splitting is not fully supported (it breaks highlighting support that relies on
# #getContainedTextElements()), but it is probably necessary for some other filters.
#
# see MinClauseWordsFilter

module Boilerpipe::Filters
  class SplitParagraphBlocksFilter

    # Walks every text block of +doc+ and breaks each one apart wherever its
    # text contains a run of newline / carriage-return characters.
    #
    # A block whose text yields fewer than two pieces is carried over as the
    # very same object. A block that yields two or more pieces is replaced by
    # one new TextBlock per piece; each new block receives the parent's
    # content flag and labels.
    #
    # The document's block list is only replaced when at least one block was
    # actually split. The document itself is returned in every case.
    def self.process(doc)
      rebuilt = []
      split_happened = false

      doc.text_blocks.each do |block|
        pieces = block.text.split(/[\n\r]+/)

        if pieces.size >= 2
          split_happened = true
          rebuilt.concat(fragments_for(block, pieces))
        else
          rebuilt << block
        end
      end

      doc.replace_text_blocks!(rebuilt) if split_happened
      doc
    end

    # Builds one TextBlock per entry in +pieces+, each constructed from the
    # piece's text alone and then given +block+'s content flag and labels.
    def self.fragments_for(block, pieces)
      content_flag = block.is_content?
      inherited_labels = block.labels

      pieces.map do |piece|
        ::Boilerpipe::Document::TextBlock.new(piece).tap do |fragment|
          fragment.content = content_flag
          fragment.add_labels(inherited_labels)
        end
      end
    end
    private_class_method :fragments_for

  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/split_paragraph_blocks_filter.rb