lib/aranha/parsers/html/node/default.rb in aranha-parsers-0.1.1 vs lib/aranha/parsers/html/node/default.rb in aranha-parsers-0.2.0
- old
+ new
@@ -7,16 +7,32 @@
module Html
module Node
class Default < ::Aranha::Parsers::Html::Node::Base
# Returns the cleaned-up text of the first node matched by +xpath+ under
# +node+, or an empty string when +at_xpath+ finds nothing.
#
# The removed line did the cleanup inline (non-breaking spaces replaced by
# regular spaces, then +strip+); the added line delegates the same cleanup
# to +sanitize_string+.
def string_value(node, xpath)
if node.at_xpath(xpath)
- node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
+ sanitize_string(node.at_xpath(xpath).text)
else
''
end
end
# Looks up the first node matched by +xpath+ under +node+ and returns the
# string that +string_recursive+ builds from it.
#
# When no node is found, or the built string is blank, the outcome depends
# on +required+: a falsy value makes the method return nil, otherwise it
# raises (a bare +raise+ with a String message, i.e. RuntimeError) with
# "No node found" or "String blank" respectively, including the XPath.
#
# NOTE(review): +blank?+ is not core Ruby; presumably supplied by
# ActiveSupport -- confirm it is loaded wherever this class is used.
+ def string_recursive_value(node, xpath, required = true)
+ root = node.at_xpath(xpath)
+ if root.blank?
+ return nil unless required
+ raise "No node found (Xpath: #{xpath})"
+ end
+ result = string_recursive(root)
+ return result unless result.blank?
+ return nil unless required
+ raise "String blank (Xpath: #{xpath})"
+ end
+
# Non-raising variant of +string_recursive_value+: passes +required+ as
# false, so a missing node or a blank string yields nil instead of an error.
+ def string_recursive_optional_value(node, xpath)
+ string_recursive_value(node, xpath, false)
+ end
+
def quoted_value(node, xpath)
s = string_value(node, xpath)
return '' unless s
m = /\"([^\"]+)\"/.match(s)
@@ -79,13 +95,27 @@
# Extracts a Float from the text returned by +string_value+ for +xpath+.
#
# The regex takes the first run of digits, optionally followed by ONE '.' or
# ',' and more digits. When nothing matches, the method raises if +required+
# is truthy and otherwise falls through, returning nil.
#
# Conversion differs between the two sides of this diff:
# * removed line: the first ',' becomes '.', so both '.' and ',' act as the
#   decimal separator ("3.14" -> 3.14, "3,14" -> 3.14);
# * added line: every '.' is deleted and ',' becomes '.', so only ',' is a
#   decimal separator and '.' is dropped ("3.14" -> 314.0, "3,14" -> 3.14).
#
# NOTE(review): because the regex allows a single separator, an input such
# as "1.234,56" matches only "1.234" and the added line yields 1234.0, losing
# the ",56" part. If '.' is meant as a thousands separator, the pattern
# probably needs to accept repeated '.' groups plus a ',' fraction -- confirm.
def parse_float(node, xpath, required)
s = string_value(node, xpath)
m = /\d+(?:[\.\,](\d+))?/.match(s)
if m
- m[0].sub(',', '.').to_f
+ m[0].delete('.').tr(',', '.').to_f
elsif required
raise "Float value not found in \"#{s}\""
end
+ end
+
# Normalizes +obj+ for text extraction: converts it with +to_s+, replaces
# every non-breaking space (U+00A0) with a regular space, and strips leading
# and trailing whitespace. nil therefore becomes ''.
+ def sanitize_string(obj)
+ obj.to_s.tr("\u00A0", ' ').strip
+ end
+
# Builds one string from the text found beneath +node+.
#
# A Nokogiri text node returns its sanitized text directly. Any other node
# recurses into each child and appends ' ' plus the child's string whenever
# that string is present, so empty pieces are skipped. The final
# +sanitize_string+ call strips the leading space left by the first append.
#
# NOTE(review): +present?+ is not core Ruby; presumably supplied by
# ActiveSupport -- confirm it is loaded wherever this class is used.
+ def string_recursive(node)
+ return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
+ s = ''
+ node.children.each do |child|
+ child_s = string_recursive(child)
+ s += ' ' + child_s if child_s.present?
+ end
+ sanitize_string(s)
end
end
end
end
end