lib/boilerpipe/sax/html_content_handler.rb in boilerpipe-ruby-0.4.0 vs lib/boilerpipe/sax/html_content_handler.rb in boilerpipe-ruby-0.4.1
- old
+ new
@@ -1,13 +1,10 @@
-require 'nokogiri'
-require 'set'
-
module Boilerpipe::SAX
class HTMLContentHandler < Nokogiri::XML::SAX::Document
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
- attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
+ attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
ANCHOR_TEXT_START = "$\ue00a<"
ANCHOR_TEXT_END = ">\ue00a$"
def initialize
@label_stacks = []
@@ -32,11 +29,10 @@
def start_element(name, attrs = [])
@label_stacks << nil
tag = name.upcase.intern
-
tag_action = @tag_actions[tag]
if tag_action
@tag_level += 1 if tag_action.changes_tag_level?
@flush = tag_action.start(self, name, attrs) | @flush
else
@@ -49,19 +45,19 @@
end
def characters(text)
flush_block if @flush
- return if @in_ignorable_element != 0
+ return if in_ignorable_element?
return if text.empty?
# replace all whitespace with simple space
text.gsub!(/\s+/, ' ')
# trim whitespace
- started_with_whitespace = text =~ /^\s/
- ended_with_whitespace = text =~ /\s$/
+ started_with_whitespace = text =~ /^\s/
+ ended_with_whitespace = text =~ /\s$/
text.strip!
# add a single space if the block was only whitespace
if text.empty?
append_space
@@ -156,14 +152,14 @@
else
num_words_in_wrapped_lines = num_words - num_words_current_line
end
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
- num_words,
- num_linked_words,
- num_words_in_wrapped_lines,
- num_wrapped_lines, @offset_blocks)
+ num_words,
+ num_linked_words,
+ num_words_in_wrapped_lines,
+ num_wrapped_lines, @offset_blocks)
@offset_blocks += 1
clear_buffers
text_block.set_tag_level(@block_tag_level)
add_text_block(text_block)
@@ -185,27 +181,28 @@
# \p{Nd} -- a decimal digit
# \p{Nl} -- a letterlike numeric character
# \p{No} -- a numeric character of other type
# Tests +word+ against VALID_WORD_CHARACTER using =~ and returns that result.
# NOTE(review): VALID_WORD_CHARACTER is defined outside this view; presumably a
# Regexp built from the character classes listed above, in which case the
# result is the match offset (truthy) or nil -- confirm.
def is_word?(word)
- word =~ VALID_WORD_CHARACTER
+ word =~ VALID_WORD_CHARACTER
end
- #public void flushBlock() {
+ # public void flushBlock() {
# int numWords = 0;
# int numLinkedWords = 0;
# int numWrappedLines = 0;
# int currentLineLength = -1; // don't count the first space
# final int maxLineLength = 80;
# int numTokens = 0;
# int numWordsCurrentLine = 0;
- #}
+ # }
# Adds one to the @in_ignorable_element counter.
#
# @return [Integer] the counter value after the increment
def increase_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element + 1
end
+ # TODO: should this guard against the counter dropping below zero?
# Subtracts one from the @in_ignorable_element counter. No lower bound is
# enforced, so the counter may become negative.
#
# @return [Integer] the counter value after the decrement
def decrease_in_ignorable_element!
  @in_ignorable_element = @in_ignorable_element - 1
end
def increase_in_body!
@@ -222,11 +219,10 @@
# Reports whether the @in_anchor_tag counter is greater than zero.
#
# @return [Boolean] true when @in_anchor_tag is positive, false otherwise
def in_anchor_tag?
  @in_anchor_tag.positive?
end
-
def add_text_block(text_block)
@label_stacks.each do |stack|
next unless stack
stack.each do |label_action|
@@ -237,9 +233,10 @@
end
# Appends a single space to both @text_buffer and @token_buffer, unless the
# @sb_last_was_whitespace flag says the last appended character was already
# whitespace. Sets that flag, so further calls do nothing until it is cleared
# elsewhere.
def append_space
# already ended on whitespace: do not add a second space
return if @sb_last_was_whitespace
+
@sb_last_was_whitespace = true
@text_buffer << ' '
@token_buffer << ' '
end