# coding: utf-8
# frozen_string_literal: true

module Nokogiri
  module CSS
    # When translating CSS selectors to XPath queries with Nokogiri::CSS.xpath_for, the XPathVisitor
    # class allows for changing some of the behaviors related to builtin xpath functions and quirks
    # of HTML5.
    class XPathVisitor
      WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:

      # Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
      module BuiltinsConfig
        # Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
        # the default when calling Nokogiri::CSS.xpath_for directly.
        NEVER = :never

        # Always use Nokogiri builtin functions whenever possible. This is probably only useful for testing.
        ALWAYS = :always

        # Only use Nokogiri builtin functions when they will be faster than vanilla XPath. This is
        # the behavior chosen when searching for CSS selectors on a Nokogiri document, fragment, or
        # node.
        OPTIMAL = :optimal

        # :nodoc: array of values for validation
        VALUES = [NEVER, ALWAYS, OPTIMAL].freeze
      end

      # Enum to direct XPathVisitor when to tweak the XPath query to suit the nature of the document
      # being searched. Note that searches for CSS selectors from a Nokogiri document, fragment, or
      # node will choose the correct option automatically.
      module DoctypeConfig
        # The document being searched is an XML document. This is the default.
        XML = :xml

        # The document being searched is an HTML4 document.
        HTML4 = :html4

        # The document being searched is an HTML5 document.
        HTML5 = :html5

        # :nodoc: array of values for validation
        VALUES = [XML, HTML4, HTML5].freeze
      end

      # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
      attr_reader :builtins

      # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
      attr_reader :doctype

      # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
      attr_reader :prefix

      # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
      attr_reader :namespaces

      # :call-seq:
      #   new() → XPathVisitor
      #   new(builtins:, doctype:) → XPathVisitor
      #   new(builtins:, doctype:, prefix:, namespaces:) → XPathVisitor
      #
      # [Parameters]
      # - +builtins:+ (BuiltinsConfig) Determine when to use Nokogiri's built-in xpath functions for performance improvements.
      # - +doctype:+ (DoctypeConfig) Make document-type-specific accommodations for CSS queries.
      # - +prefix:+ The search prefix recorded on the visitor and reported by #prefix and #config.
      #   Defaults to Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX.
      # - +namespaces:+ (Hash, nil) Namespace declarations. When the hash has an "xmlns" key,
      #   unprefixed element names are emitted with an "xmlns:" prefix. Defaults to +nil+.
      #
      # [Returns] XPathVisitor
      #
      # [Raises] ArgumentError if +builtins:+ or +doctype:+ is not one of the values listed in
      # BuiltinsConfig::VALUES or DoctypeConfig::VALUES respectively.
      #
      def initialize(
        builtins: BuiltinsConfig::NEVER,
        doctype: DoctypeConfig::XML,
        prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
        namespaces: nil
      )
        unless BuiltinsConfig::VALUES.include?(builtins)
          raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
        end
        unless DoctypeConfig::VALUES.include?(doctype)
          raise(ArgumentError, "Invalid values #{doctype.inspect} for doctype: keyword parameter")
        end

        @builtins = builtins
        @doctype = doctype
        @prefix = prefix
        @namespaces = namespaces
      end

      # :call-seq: config() → Hash
      #
      # [Returns]
      #   a Hash representing the configuration of the XPathVisitor, suitable for use as
      #   part of the CSS cache key.
      def config
        { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
      end

      # :stopdoc:

      # Translate a FUNCTION node into an XPath expression.
      #
      # A public method named +visit_function_<name>+ (function token with "(" removed), if one
      # exists, takes precedence over the builtin translations below.
      def visit_function(node)
        msg = :"visit_function_#{node.value.first.delete("(")}"
        return send(msg, node) if respond_to?(msg)

        case node.value.first
        when /^text\(/
          "child::text()"
        when /^self\(/
          "self::#{node.value[1]}"
        when /^eq\(/
          "position()=#{node.value[1]}"
        when /^(nth|nth-of-type)\(/
          if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
            nth(node.value[1])
          else
            "position()=#{node.value[1]}"
          end
        when /^nth-child\(/
          if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
            nth(node.value[1], child: true)
          else
            "count(preceding-sibling::*)=#{node.value[1].to_i - 1}"
          end
        when /^nth-last-of-type\(/
          if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
            nth(node.value[1], last: true)
          else
            index = node.value[1].to_i - 1
            index == 0 ? "position()=last()" : "position()=last()-#{index}"
          end
        when /^nth-last-child\(/
          if node.value[1].is_a?(Nokogiri::CSS::Node) && (node.value[1].type == :NTH)
            nth(node.value[1], last: true, child: true)
          else
            "count(following-sibling::*)=#{node.value[1].to_i - 1}"
          end
        when /^(first|first-of-type)\(/
          "position()=1"
        when /^(last|last-of-type)\(/
          "position()=last()"
        when /^contains\(/
          "contains(.,#{node.value[1]})"
        when /^gt\(/
          "position()>#{node.value[1]}"
        when /^only-child\(/
          "last()=1"
        when /^comment\(/
          "comment()"
        when /^has\(/
          is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
          ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
        else
          validate_xpath_function_name(node.value.first)

          # xpath function call, let's marshal those arguments
          args = ["."]
          args += node.value[1..-1].map do |n|
            n.is_a?(Nokogiri::CSS::Node) ? n.accept(self) : n
          end
          # the function token still carries its opening "(", so only the closing one is added
          "nokogiri:#{node.value.first}#{args.join(",")})"
        end
      end

      # Translate a NOT node; a bare element name is tested against the context node via self::.
      def visit_not(node)
        child = node.value.first
        if child.type == :ELEMENT_NAME
          "not(self::#{child.accept(self)})"
        else
          "not(#{child.accept(self)})"
        end
      end

      # Translate an ID node ("#foo") into an @id comparison.
      def visit_id(node)
        node.value.first =~ /^#(.*)$/
        "@id='#{Regexp.last_match(1)}'"
      end

      # Translate an attribute condition ([attr], [attr=value], [attr~=value], ...) into XPath.
      def visit_attribute_condition(node)
        attribute = node.value.first.accept(self)
        return attribute if node.value.length == 1

        value = node.value.last
        value = "'#{value}'" unless /^['"]/.match?(value)

        # quoted values - see test_attribute_value_with_quotes in test/css/test_parser.rb
        if (value[0] == value[-1]) && %q{"'}.include?(value[0])
          str_value = value[1..-2]
          if str_value.include?(value[0])
            value = 'concat("' + str_value.split('"', -1).join(%q{",'"',"}) + '","")'
          end
        end

        case node.value[1]
        when :equal
          attribute + "=" + value.to_s
        when :not_equal
          attribute + "!=" + value.to_s
        when :substring_match
          "contains(#{attribute},#{value})"
        when :prefix_match
          "starts-with(#{attribute},#{value})"
        when :dash_match
          "#{attribute}=#{value} or starts-with(#{attribute},concat(#{value},'-'))"
        when :includes
          value = value[1..-2] # strip quotes
          css_class(attribute, value)
        when :suffix_match
          "substring(#{attribute},string-length(#{attribute})-string-length(#{value})+1,string-length(#{value}))=#{value}"
        else
          attribute + " #{node.value[1]} " + value.to_s
        end
      end

      # Translate a PSEUDO_CLASS node into XPath.
      #
      # Functional pseudo-classes are delegated to the wrapped FUNCTION node. Otherwise a public
      # method named +visit_pseudo_class_<name>+, if one exists, takes precedence over the builtin
      # translations below.
      def visit_pseudo_class(node)
        if node.value.first.is_a?(Nokogiri::CSS::Node) && (node.value.first.type == :FUNCTION)
          node.value.first.accept(self)
        else
          msg = :"visit_pseudo_class_#{node.value.first.delete("(")}"
          return send(msg, node) if respond_to?(msg)

          case node.value.first
          when "first" then "position()=1"
          when "first-child" then "count(preceding-sibling::*)=0"
          when "last" then "position()=last()"
          when "last-child" then "count(following-sibling::*)=0"
          when "first-of-type" then "position()=1"
          when "last-of-type" then "position()=last()"
          when "only-child" then "count(preceding-sibling::*)=0 and count(following-sibling::*)=0"
          when "only-of-type" then "last()=1"
          when "empty" then "not(node())"
          when "parent" then "node()"
          when "root" then "not(parent::*)"
          else
            validate_xpath_function_name(node.value.first)
            "nokogiri:#{node.value.first}(.)"
          end
        end
      end

      # Translate a class condition (".foo") into a test against the @class attribute.
      def visit_class_condition(node)
        css_class("@class", node.value.first)
      end

      # Join two conditions. An *-of-type pseudo-class on the right-hand side is emitted as a
      # separate predicate ("][") rather than and-ed into the current one.
      def visit_combinator(node)
        if is_of_type_pseudo_class?(node.value.last)
          "#{node.value.first&.accept(self)}][#{node.value.last.accept(self)}"
        else
          "#{node.value.first&.accept(self)} and #{node.value.last.accept(self)}"
        end
      end

      # Define one visit_* method per selector kind; each joins the (optional) left-hand side
      # and the right-hand side with the XPath axis/step text given in the table.
      {
        "direct_adjacent_selector" => "/following-sibling::*[1]/self::",
        "following_selector" => "/following-sibling::",
        "descendant_selector" => "//",
        "child_selector" => "/",
      }.each do |k, v|
        class_eval <<~RUBY, __FILE__, __LINE__ + 1
          def visit_#{k} node
            "\#{node.value.first.accept(self) if node.value.first}#{v}\#{node.value.last.accept(self)}"
          end
        RUBY
      end

      # Translate "selector[condition]" by wrapping the condition in an XPath predicate.
      def visit_conditional_selector(node)
        node.value.first.accept(self) + "[" + node.value.last.accept(self) + "]"
      end

      # Translate an element name, applying HTML5 and default-namespace accommodations.
      def visit_element_name(node)
        if @doctype == DoctypeConfig::HTML5 && html5_element_name_needs_namespace_handling(node)
          # HTML5 has namespaces that should be ignored in CSS queries
          # https://github.com/sparklemotion/nokogiri/issues/2376
          if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
            if WILDCARD_NAMESPACES
              "*:#{node.value.first}"
            else
              "*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
            end
          else
            "*[local-name()='#{node.value.first}']"
          end
        elsif node.value.length == 2 # has a namespace prefix
          if node.value.first.nil? # namespace prefix is empty
            node.value.last
          else
            node.value.join(":")
          end
        elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
          "xmlns:#{node.value.first}"
        else
          node.value.first
        end
      end

      # Translate an attribute name into an XPath attribute reference.
      def visit_attrib_name(node)
        "@#{node.value.first}"
      end

      # Entry point: have +node+ dispatch back to the matching visit_* method on this visitor.
      def accept(node)
        node.accept(self)
      end

      private

      # Reject function names beginning with "-".
      #
      # Raises Nokogiri::CSS::SyntaxError for such names.
      def validate_xpath_function_name(name)
        if name.start_with?("-")
          raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
        end
      end

      def html5_element_name_needs_namespace_handling(node)
        # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
        node.value.length == 1 &&
          # if this is the wildcard selector "*", use it as normal
          node.value.first != "*"
      end

      # Translate an :NTH node (the four tokens of a CSS "an+b" expression) into an XPath
      # predicate body.
      #
      # +options+ may contain +:child+ (derive the position from sibling counts instead of
      # position()) and +:last+ (count from the end instead of from the start).
      #
      # Raises ArgumentError unless the node holds exactly 4 tokens.
      def nth(node, options = {})
        unless node.value.size == 4
          raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
        end

        a, b = read_a_and_positive_b(node.value)
        position = if options[:child]
          options[:last] ? "(count(following-sibling::*)+1)" : "(count(preceding-sibling::*)+1)"
        else
          options[:last] ? "(last()-position()+1)" : "position()"
        end

        if a.zero?
          # "0n+b" selects exactly the b-th element. Emitting "mod 0" here would evaluate to NaN
          # in XPath 1.0 and never match.
          "#{position}=#{b}"
        elsif b.zero? && a > 0
          "(#{position} mod #{a})=0"
        else
          # Also covers a < 0 with b == 0 (e.g. "-n"), which must select nothing: the "<=0"
          # bound below can never hold for a 1-based position.
          compare = a < 0 ? "<=" : ">="
          if a.abs == 1
            "#{position}#{compare}#{b}"
          else
            "(#{position}#{compare}#{b}) and (((#{position}-#{b}) mod #{a.abs})=0)"
          end
        end
      end

      # Read the integer +a+ and +b+ from the tokens of an "an+b" expression.
      #
      # For "an-b" with a non-zero +a+, +b+ is shifted by a multiple of +a+ so the same set of
      # positions is described with an added offset. When +a+ is zero there is no step to shift
      # by, so +b+ is returned as the negated offset (which can never equal a position).
      #
      # Raises ArgumentError if the operator token is neither "+" nor "-".
      def read_a_and_positive_b(values)
        op = values[2].strip
        unless ["+", "-"].include?(op)
          raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{op.inspect}"
        end

        a = values[0].to_i
        offset = values[3].to_i
        return [a, offset] if op == "+"

        # guard the modulo: Integer#% raises ZeroDivisionError when a is 0
        b = a.zero? ? -offset : a - (offset % a)
        [a, b]
      end

      # Truthy when +node+ is a PSEUDO_CLASS whose name (or wrapped FUNCTION name) matches one of
      # the nth/first/last/only "-of-type" pseudo-classes.
      def is_of_type_pseudo_class?(node) # rubocop:disable Naming/PredicateName
        if node.type == :PSEUDO_CLASS
          if node.value[0].is_a?(Nokogiri::CSS::Node) && (node.value[0].type == :FUNCTION)
            node.value[0].value[0]
          else
            node.value[0]
          end =~ /(nth|first|last|only)-of-type(\()?/
        end
      end

      # Build an XPath test that the space-separated token list in +hay+ contains +needle+.
      def css_class(hay, needle)
        if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
          # use the builtin implementation
          "nokogiri-builtin:css-class(#{hay},'#{needle}')"
        else
          # use only ordinary xpath functions
          "contains(concat(' ',normalize-space(#{hay}),' '),' #{needle} ')"
        end
      end
    end
  end
end