lib/boilerpipe/document/text_block.rb in boilerpipe-ruby-0.4.0 vs lib/boilerpipe/document/text_block.rb in boilerpipe-ruby-0.4.1
- old
+ new
@@ -1,20 +1,17 @@
-require 'set'
-
module Boilerpipe
module Document
class TextBlock
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
-
attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
:num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
:link_density, :labels, :tag_level, :num_full_text_words
attr_accessor :content
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
@labels = Set.new
@text = text
@num_words = num_words
@num_words_in_anchor_text = num_words_in_anchor_text
@num_words_in_wrapped_lines = num_words_in_wrapped_lines
@@ -30,13 +27,13 @@
# Builds the "start" sentinel block: empty text, every count passed as zero,
# and an offset_blocks argument of -1.
#
# @return [TextBlock] a freshly constructed sentinel instance
def self.empty_start
  blank_text = ''
  start_offset = -1
  new(blank_text, 0, 0, 0, 0, start_offset)
end
# Stores +level+ in @tag_level (exposed through the tag_level reader).
#
# @param level the value to record as this block's tag level
# @return the value that was assigned
def set_tag_level(level)
  @tag_level = level
end
# Reports the content flag held in @content.
#
# The raw value is returned without coercion, so a block whose flag was
# never assigned answers nil rather than false.
#
# @return the current value of @content
def is_content?
  @content
end
@@ -66,12 +63,12 @@
@text = "#{@text}\n#{other.text}"
@num_words += other.num_words
@num_words_in_anchor_text += other.num_words_in_anchor_text
@num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
@num_wrapped_lines += other.num_wrapped_lines
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
init_densities
@content |= other.is_content?
@num_full_text_words += other.num_full_text_words
@@ -85,22 +82,23 @@
@tag_level = [@tag_level, other.tag_level].min
end
# Renders a two-line debug summary of the block.
#
# Line one is
#   [<start>-<end>;tl=<tag level>; nw=<words>;nwl=<wrapped lines>;ld=<link density>]
# followed by a tab, CONTENT or BOILERPLATE (per is_content?), a comma, and
# the labels as "[a,b,...]" -- or the literal "null" when no label is set.
# Line two is the block text.
#
# @return [String] the formatted summary
def to_s
  labels = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
  status = is_content? ? 'CONTENT' : 'BOILERPLATE'
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{status},#{labels}\n#{text}"
end
# Cloning a block is not supported; calling this always raises.
#
# `raise` is used rather than `throw`: `throw` belongs to Ruby's catch/throw
# control flow, and with no enclosing `catch` it surfaces as an
# UncaughtThrowError instead of the intended NotImplementedError.
#
# Note that NotImplementedError descends from ScriptError, so a bare
# `rescue` (StandardError) will not intercept it.
#
# @raise [NotImplementedError] always
def clone
  raise NotImplementedError, "#{self.class}#clone is not implemented"
end
private
+
def init_densities
if @num_words_in_wrapped_lines == 0
@num_words_in_wrapped_lines = @num_words
@num_wrapped_lines = 1
end