lib/boilerpipe/document/text_block.rb in boilerpipe-ruby-0.4.0 vs lib/boilerpipe/document/text_block.rb in boilerpipe-ruby-0.4.1

- old
+ new

@@ -1,20 +1,17 @@ -require 'set' - module Boilerpipe module Document class TextBlock + # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999) - #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999) - attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text, :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density, :link_density, :labels, :tag_level, :num_full_text_words attr_accessor :content - def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0) + def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0) @labels = Set.new @text = text @num_words = num_words @num_words_in_anchor_text = num_words_in_anchor_text @num_words_in_wrapped_lines = num_words_in_wrapped_lines @@ -30,13 +27,13 @@ def self.empty_start new('', 0, 0, 0, 0, -1) end - def set_tag_level(level) - @tag_level = level - end + def set_tag_level(level) + @tag_level = level + end def is_content? @content end @@ -66,12 +63,12 @@ @text = "#{@text}\n#{other.text}" @num_words += other.num_words @num_words_in_anchor_text += other.num_words_in_anchor_text @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines @num_wrapped_lines += other.num_wrapped_lines - @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min - @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max + @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min + @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max init_densities @content |= other.is_content? @num_full_text_words += other.num_full_text_words @@ -85,22 +82,23 @@ @tag_level = [@tag_level, other.tag_level].min end def to_s - #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText(); + # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText(); labels = 'null' if !@labels.empty? - labels ="[#{ @labels.to_a.join(',')}]" + labels = "[#{@labels.to_a.join(',')}]" end "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}" end def clone throw NotImplementedError end private + def init_densities if @num_words_in_wrapped_lines == 0 @num_words_in_wrapped_lines = @num_words @num_wrapped_lines = 1 end