Sha256: 5b88c100c0b1f4ac1e5b483f7d7f8a59fd04254f1e007764a5af49c731210405
Contents?: true
Size: 852 Bytes
Versions: 1
Compression:
Stored size: 852 Bytes
Contents
#
# Keeps only blocks that have at least one segment fragment ("clause") with at
# least k words (default: 5). Blocks without such a clause are marked as
# non-content.
#
# A candidate clause is a run of letters, digits, spaces and non-breaking
# spaces that ends in one or more of , . : ; ! ? followed by whitespace or the
# end of a line.
#
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
#

module Boilerpipe
  module Filters
    class MinClauseWordsFilter
      # Matches one candidate clause including its closing punctuation and any
      # whitespace that follows it.
      CLAUSE_DELIMITER = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

      # Separator used to split a clause into words.
      WHITESPACE = /[ \n\r]+/

      # Marks every content block that has no sufficiently long clause as
      # non-content. Blocks that are already non-content are left untouched.
      #
      # @param doc [#text_blocks] document whose blocks respond to
      #   +is_not_content?+, +text+ and +content=+
      # @param min_words [Integer] minimum number of words a clause needs
      # @return [Object] the same +doc+, with its blocks updated in place
      def self.process(doc, min_words = 5)
        doc.text_blocks.each do |tb|
          next if tb.is_not_content?

          # Decide once per block, after looking at all candidates, so a short
          # leading fragment cannot disqualify a block that also has a long clause.
          has_clause = tb.text.to_s.scan(CLAUSE_DELIMITER).any? do |possible_clause|
            is_clause?(possible_clause, min_words)
          end
          tb.content = false unless has_clause
        end
        doc
      end

      # Tells whether +text+ contains at least +min_words+ words.
      #
      # Words are the non-empty pieces between runs of spaces and newlines, so
      # leading or trailing whitespace does not affect the count.
      #
      # @param text [String, nil] candidate clause
      # @param min_words [Integer] minimum number of words required
      # @return [Boolean] false for nil, otherwise whether the word count suffices
      def self.is_clause?(text, min_words = 5)
        return false if text.nil?

        text.split(WHITESPACE).count { |word| !word.empty? } >= min_words
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
boilerpipe-ruby-0.4.0 | lib/boilerpipe/filters/min_clause_words_filter.rb |