lib/boilerpipe/sax/html_content_handler.rb in boilerpipe-ruby-0.4.0 vs lib/boilerpipe/sax/html_content_handler.rb in boilerpipe-ruby-0.4.1

- old
+ new

@@ -1,13 +1,10 @@ -require 'nokogiri' -require 'set' - module Boilerpipe::SAX class HTMLContentHandler < Nokogiri::XML::SAX::Document attr_reader :in_ignorable_element, :label_stacks, :last_start_tag - attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack + attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack ANCHOR_TEXT_START = "$\ue00a<" ANCHOR_TEXT_END = ">\ue00a$" def initialize @label_stacks = [] @@ -32,11 +29,10 @@ def start_element(name, attrs = []) @label_stacks << nil tag = name.upcase.intern - tag_action = @tag_actions[tag] if tag_action @tag_level += 1 if tag_action.changes_tag_level? @flush = tag_action.start(self, name, attrs) | @flush else @@ -49,19 +45,19 @@ end def characters(text) flush_block if @flush - return if @in_ignorable_element != 0 + return if in_ignorable_element? return if text.empty? # replace all whitespace with simple space text.gsub!(/\s+/, ' ') # trim whitespace - started_with_whitespace = text =~ /^\s/ - ended_with_whitespace = text =~ /\s$/ + started_with_whitespace = text =~ /^\s/ + ended_with_whitespace = text =~ /\s$/ text.strip! # add a single space if the block was only whitespace if text.empty? append_space @@ -156,14 +152,14 @@ else num_words_in_wrapped_lines = num_words - num_words_current_line end text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip, - num_words, - num_linked_words, - num_words_in_wrapped_lines, - num_wrapped_lines, @offset_blocks) + num_words, + num_linked_words, + num_words_in_wrapped_lines, + num_wrapped_lines, @offset_blocks) @offset_blocks += 1 clear_buffers text_block.set_tag_level(@block_tag_level) add_text_block(text_block) @@ -185,27 +181,28 @@ # \p{Nd} -- a decimal digit # \p{Nl} -- a letterlike numeric character # \p{No} -- a numeric character of other type def is_word?(word) - word =~ VALID_WORD_CHARACTER + word =~ VALID_WORD_CHARACTER end - #public void flushBlock() { + # public void flushBlock() { # int numWords = 0; # int numLinkedWords = 0; # int numWrappedLines = 0; # int currentLineLength = -1; // don't count the first space # final int maxLineLength = 80; # int numTokens = 0; # int numWordsCurrentLine = 0; - #} + # } def increase_in_ignorable_element! @in_ignorable_element += 1 end + # should we prevent less than zero here? def decrease_in_ignorable_element! @in_ignorable_element -= 1 end def increase_in_body! @@ -222,11 +219,10 @@ def in_anchor_tag? @in_anchor_tag > 0 end - def add_text_block(text_block) @label_stacks.each do |stack| next unless stack stack.each do |label_action| @@ -237,9 +233,10 @@ end # append space if last character wasn't already one def append_space return if @sb_last_was_whitespace + @sb_last_was_whitespace = true @text_buffer << ' ' @token_buffer << ' ' end