lib/aranha/parsers/html/node/default.rb in aranha-parsers-0.1.1 vs lib/aranha/parsers/html/node/default.rb in aranha-parsers-0.2.0

- old
+ new

# Unified diff of the Default HTML node parser between the two gem versions named
# in the header above, re-broken into one diff line per source line.
# '-' lines exist only in the old file, '+' lines only in the new file, and
# unprefixed lines are context shared by both.
# The flattened text carried no indentation or blank lines; both are reconstructed
# here. The hunk counts (16 old / 32 new lines) imply two blank context lines in
# the first hunk; their exact positions are an assumption.
# The hunk headers show that old lines 23-78 / new lines 39-94 are NOT part of
# this view, so the class body below is incomplete.
#
# Hunk 1
#   string_value(node, xpath)
#     Returns the text of the first node matched by xpath, or '' when nothing
#     matches. The new side routes the text through sanitize_string instead of
#     the inline to_s/tr/strip chain.
#     NOTE(review): node.at_xpath(xpath) is evaluated twice (condition + body).
#   string_recursive_value(node, xpath, required = true)   [added]
#     Looks up the first node matched by xpath and returns string_recursive of it.
#     When the node is blank, or the collected string is blank, it returns nil
#     if required is falsy and raises (a plain `raise` with a message) otherwise.
#     NOTE(review): blank?/present? are not core Ruby - presumably ActiveSupport
#     is loaded elsewhere; confirm.
#     NOTE(review): `required` is a positional boolean; call sites read as
#     (node, xpath, false).
#   string_recursive_optional_value(node, xpath)           [added]
#     Same lookup with required = false, so it returns nil instead of raising.
#   quoted_value(node, xpath)
#     Only its first statements are visible; the hunk ends before the method does.
@@ -7,16 +7,32 @@
     module Html
       module Node
         class Default < ::Aranha::Parsers::Html::Node::Base
           def string_value(node, xpath)
             if node.at_xpath(xpath)
-              node.at_xpath(xpath).text.to_s.tr("\u00A0", ' ').strip
+              sanitize_string(node.at_xpath(xpath).text)
             else
               ''
             end
           end

+          def string_recursive_value(node, xpath, required = true)
+            root = node.at_xpath(xpath)
+            if root.blank?
+              return nil unless required
+              raise "No node found (Xpath: #{xpath})"
+            end
+            result = string_recursive(root)
+            return result unless result.blank?
+            return nil unless required
+            raise "String blank (Xpath: #{xpath})"
+          end
+
+          def string_recursive_optional_value(node, xpath)
+            string_recursive_value(node, xpath, false)
+          end
+
           def quoted_value(node, xpath)
             s = string_value(node, xpath)
             return '' unless s

             m = /\"([^\"]+)\"/.match(s)
# Hunk 2
#   parse_float(node, xpath, required)
#     Takes the first run of digits in string_value(node, xpath), optionally
#     followed by ONE '.' or ',' and more digits, and converts it with to_f.
#     Returns nil when nothing matches and required is falsy; raises when
#     nothing matches and required is truthy.
#     Old side: ',' -> '.' (first occurrence) before to_f.
#     New side: every '.' is deleted, then ',' -> '.', i.e. '.' is presumably
#     treated as a thousands separator and ',' as the decimal mark.
#     NOTE(review): "1.5" now yields 15.0 (the old side gave 1.5).
#     NOTE(review): the regex still admits only one separator group, so for
#     "1.234,56" m[0] is "1.234" and the new side returns 1234.0 - the ",56" is
#     dropped. A pattern such as /\d+(?:\.\d+)*(?:,\d+)?/ would capture both
#     parts; confirm the intended number format before changing it.
#   sanitize_string(obj)                                    [added]
#     obj.to_s with U+00A0 (no-break space) replaced by ' ', then stripped.
#   string_recursive(node)                                  [added]
#     For a Nokogiri::XML::Text node: its sanitized text. For any other node:
#     the recursive strings of its children, keeping only the present? ones,
#     each prefixed with a single space, and the whole result sanitized (the
#     final strip removes the leading space).
#     NOTE(review): `s += ...` builds a new String on every iteration.
#   The '+ end' right after the `if` block closes parse_float in the new file;
#   the first trailing context `end` (formerly parse_float's) now closes
#   string_recursive. The remaining `end`s close the class and enclosing modules.
@@ -79,13 +95,27 @@
           def parse_float(node, xpath, required)
             s = string_value(node, xpath)
             m = /\d+(?:[\.\,](\d+))?/.match(s)
             if m
-              m[0].sub(',', '.').to_f
+              m[0].delete('.').tr(',', '.').to_f
             elsif required
               raise "Float value not found in \"#{s}\""
             end
+          end
+
+          def sanitize_string(obj)
+            obj.to_s.tr("\u00A0", ' ').strip
+          end
+
+          def string_recursive(node)
+            return sanitize_string(node.text) if node.is_a?(::Nokogiri::XML::Text)
+            s = ''
+            node.children.each do |child|
+              child_s = string_recursive(child)
+              s += ' ' + child_s if child_s.present?
+            end
+            sanitize_string(s)
           end
         end
       end
     end
   end