Sha256: 5b88c100c0b1f4ac1e5b483f7d7f8a59fd04254f1e007764a5af49c731210405

Contents?: true

Size: 852 Bytes

Versions: 1

Compression:

Stored size: 852 Bytes

Contents

#
# Keeps only blocks that have at least one segment fragment ("clause") with at least k
# words (default: 5).
#
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
#
# See also: SplitParagraphBlocksFilter

module Boilerpipe
  module Filters
    # Marks as non-content every content text block that does not contain at
    # least one clause of +min_words+ or more words.
    #
    # A "clause" is a fragment matched by CLAUSE_DELIMITER: a run of letters,
    # digits, spaces or non-breaking spaces terminated by punctuation. Text
    # that is not terminated by such punctuation is never treated as a clause,
    # so a block without any delimiter is marked as non-content.
    class MinClauseWordsFilter
      # One candidate clause: word characters and spaces, then one or more
      # punctuation marks, then whitespace or the end of a line.
      CLAUSE_DELIMITER = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/.freeze

      # Separator between the words of a clause.
      WHITESPACE = /[ \n\r]+/.freeze

      # Clears the content flag of blocks that have no sufficiently long clause.
      #
      # Blocks already flagged as non-content are skipped untouched.
      #
      # @param doc an object whose +text_blocks+ yields blocks responding to
      #   +is_not_content?+, +text+ and +content=+
      # @param min_words [Integer] minimum number of words a clause needs
      # @return the same +doc+, with its blocks updated in place
      def self.process(doc, min_words=5)
        doc.text_blocks.each do |tb|
          next if tb.is_not_content?

          # Decide once per block, after looking at every candidate: a short
          # fragment that precedes a qualifying clause must not disqualify
          # the block.
          has_clause = tb.text.to_s.scan(CLAUSE_DELIMITER).any? do |possible_clause|
            is_clause?(possible_clause, min_words)
          end

          tb.content = false unless has_clause
        end

        doc
      end

      # Tells whether +text+ holds at least +min_words+ whitespace-separated
      # words.
      #
      # Words are counted directly (rather than counting whitespace runs) so
      # the result does not depend on whether the fragment carries leading or
      # trailing whitespace.
      #
      # @param text [String, nil] candidate clause
      # @param min_words [Integer] minimum number of words required
      # @return [Boolean] false for nil, otherwise whether the word count
      #   reaches +min_words+
      def self.is_clause?(text, min_words=5)
        return false if text.nil?

        # A leading separator makes split yield an empty first element; it is
        # not a word.
        text.split(WHITESPACE).count { |token| !token.empty? } >= min_words
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.4.0 lib/boilerpipe/filters/min_clause_words_filter.rb