Sha256: b6974237d47f91a1ced869291e5e9b5a6ffdf42b1c52ce69f89858d097e34dbb

Contents?: true

Size: 858 Bytes

Versions: 5

Compression:

Stored size: 858 Bytes

Contents

#
# Keeps only blocks that have at least one segment fragment ("clause") with at least k
# words (default: 5).
#
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
#
# See also: SplitParagraphBlocksFilter

module Boilerpipe
  module Filters
    # Marks content text blocks as non-content unless their text contains at
    # least one clause-like fragment that is long enough.
    #
    # A fragment is any substring matched by CLAUSE_PATTERN. Its length is
    # measured by counting the whitespace runs inside it (see .is_clause?).
    class MinClauseWordsFilter
      # A run of letters, digits, spaces or non-breaking spaces, followed by
      # one or more of , . : ; ! ? and then either whitespace or a line end.
      CLAUSE_PATTERN = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/.freeze

      # A run of spaces and/or line breaks; used as the word separator.
      WHITESPACE_PATTERN = /[ \n\r]+/.freeze

      # Applies the filter to every text block of +doc+ in place.
      #
      # Blocks for which +is_not_content?+ is true are skipped. Every other
      # block gets +content = false+ when none of its fragments satisfies
      # .is_clause? for the given threshold.
      #
      # @param doc [#text_blocks] document whose blocks respond to
      #   +is_not_content?+, +text+ and +content=+
      # @param min_words [Integer] threshold forwarded to .is_clause?
      # @return the same +doc+ object that was passed in
      def self.process(doc, min_words = 5)
        doc.text_blocks.each do |tb|
          next if tb.is_not_content?

          # Forward min_words so a caller-supplied threshold is honoured
          # instead of silently falling back to the default of 5.
          has_clause = tb.text.scan(CLAUSE_PATTERN).any? do |fragment|
            is_clause?(fragment, min_words)
          end

          tb.content = false unless has_clause
        end

        doc
      end

      # Tells whether +text+ is long enough to count as a clause.
      #
      # The measure is the number of whitespace runs in +text+. For fragments
      # produced by CLAUSE_PATTERN this includes the trailing whitespace
      # after the punctuation, when present.
      #
      # @param text [String, nil] candidate fragment
      # @param min_words [Integer] minimum number of whitespace runs required
      # @return [Boolean] false for nil, otherwise whether the count reaches
      #   +min_words+
      def self.is_clause?(text, min_words = 5)
        return false if text.nil?

        text.scan(WHITESPACE_PATTERN).size >= min_words
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
boilerpipe-ruby-0.5.0 lib/boilerpipe/filters/min_clause_words_filter.rb
boilerpipe-ruby-0.4.4 lib/boilerpipe/filters/min_clause_words_filter.rb
boilerpipe-ruby-0.4.3 lib/boilerpipe/filters/min_clause_words_filter.rb
boilerpipe-ruby-0.4.2 lib/boilerpipe/filters/min_clause_words_filter.rb
boilerpipe-ruby-0.4.1 lib/boilerpipe/filters/min_clause_words_filter.rb