Sha256: def03f5809dbf2836bd7001da2a435c02e24381f24e57730e9b7eef683e7d868
Contents?: true
Size: 959 Bytes
Versions: 5
Compression:
Stored size: 959 Bytes
Contents
# Splits TextBlocks at paragraph boundaries.
#
# NOTE: This is not fully supported (i.e., it will break highlighting support via
# #getContainedTextElements()), but this one probably is necessary for some other filters.
#
# see MinClauseWordsFilter
module Boilerpipe
  module Filters
    class SplitParagraphBlocksFilter
      # A run of one or more line-feed / carriage-return characters separates two paragraphs.
      PARAGRAPH_BOUNDARY = /[\n\r]+/.freeze

      # Replaces every text block of +doc+ whose text spans several paragraphs with
      # one new text block per paragraph.
      #
      # Each new block is built from the paragraph text alone; only the content flag
      # and the labels of the original block are carried over to it. Blocks whose text
      # splits into fewer than two pieces are kept as the very same object.
      #
      # @param doc [#text_blocks, #replace_text_blocks!] document whose blocks are inspected;
      #   each block must respond to +text+, +is_content?+ and +labels+
      # @return [Object] the same +doc+ that was passed in; +replace_text_blocks!+ is only
      #   invoked on it when at least one block was actually split
      def self.process(doc)
        changed = false

        new_blocks = doc.text_blocks.flat_map do |block|
          paragraphs = block.text.split(PARAGRAPH_BOUNDARY)
          # Nothing to split: keep the original block (wrapped so flat_map appends it as-is).
          next [block] if paragraphs.size < 2

          changed = true
          # Read once per source block so every paragraph gets the same flag and labels.
          is_content = block.is_content?
          labels = block.labels

          paragraphs.map do |paragraph|
            ::Boilerpipe::Document::TextBlock.new(paragraph).tap do |split_block|
              split_block.content = is_content
              split_block.add_labels(labels)
            end
          end
        end

        doc.replace_text_blocks!(new_blocks) if changed
        doc
      end
    end
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems