lib/hairaito/nokogiri/xml/node.rb in hairaito-0.0.2 vs lib/hairaito/nokogiri/xml/node.rb in hairaito-0.0.3

- old
+ new

module Hairaito
  module Nokogiri
    module XML
      # Text-search and highlighting helpers.
      #
      # The methods rely on +traverse+, +document+, +text+, +name+, +ancestors+
      # and +replace+ being available on self, so the module is presumably mixed
      # into ::Nokogiri::XML::Node (confirm against the code that includes it).
      module Node

        # @return [Nokogiri::XML::NodeSet] every text node yielded by +traverse+
        #   on self, in the order +traverse+ visits them
        def text_nodes
          found = []
          traverse { |node| found << node if node.text? }
          ::Nokogiri::XML::NodeSet.new(document, found)
        end

        # @return [Nokogiri::XML::Node, nil] first text node yielded by
        #   +traverse+ on self, or nil when there is none
        def first_text_node
          traverse do |node|
            return node if node.text?
          end
          nil
        end

        # @param start_node [Nokogiri::XML::Node] left boundary
        # @param end_node [Nokogiri::XML::Node] right boundary
        # @return [Nokogiri::XML::NodeSet, Array] text nodes located strictly
        #   between the boundaries; a plain empty Array when the boundaries are
        #   equal, adjacent, or given in reverse order
        # @raise [ArgumentError] if either boundary is not among self's text nodes
        def text_nodes_between(start_node, end_node)
          nodes = text_nodes
          start_index = nodes.index(start_node)
          end_index = nodes.index(end_node)
          if start_index.nil? || end_index.nil?
            raise ArgumentError, 'Node must contain both start and end nodes!'
          end
          # Start and end nodes are equal or are neighbours
          return [] if end_index - start_index < 2
          between = nodes.slice((start_index + 1)..(end_index - 1))
          ::Nokogiri::XML::NodeSet.new(document, between)
        end

        # @param base [Nokogiri::XML::Node] root element for search
        # @return [Nokogiri::XML::Node, nil] text node preceding self's first
        #   text node within base, or nil if it doesn't exist
        # @raise [ArgumentError] if base's text nodes do not include self's first text node
        def previous_text(base = document)
          anchor = text_nodes.first
          base_text_nodes = base.text_nodes
          index = base_text_nodes.index(anchor)
          raise ArgumentError, 'Base must contain self node!' if index.nil?
          return if index == 0
          base_text_nodes[index - 1]
        end

        # @param base [Nokogiri::XML::Node] root element for search
        # @return [Nokogiri::XML::Node, nil] text node following self's last
        #   text node within base, or nil if it doesn't exist
        # @raise [ArgumentError] if base's text nodes do not include self's last text node
        def next_text(base = document)
          anchor = text_nodes.last
          base_text_nodes = base.text_nodes
          index = base_text_nodes.index(anchor)
          raise ArgumentError, 'Base must contain self node!' if index.nil?
          return if index == base_text_nodes.count - 1
          base_text_nodes[index + 1]
        end

        # Yields for each match of specified string in child nodes recursively
        #
        # @yieldparam node [Nokogiri::XML::Node] child node contains specified string
        # @yieldparam offset [Array] child text inner offset, as [start, end)
        # @param string [String] text for matching (matched literally)
        # @param options [Hash] overrides for the defaults built in #traverse_by_text_defaults
        # @return [Nokogiri::XML::Node] self node for chaining
        def traverse_by_text(string, options = {}, &block)
          traverse_by_text_defaults(options)
          traverse do |current_node|
            next if current_node.text?

            offset_types = @tbt_opts[:whole_words_only] ? [:inner_word, :boundary_word] : [:simple]
            # With a single type only inner_offsets is assigned; boundary_offsets stays nil
            inner_offsets, boundary_offsets = current_node.matched_offsets(string, offset_types, @tbt_opts)

            # Check words bordered with current inline tag if current node has boundary words
            # abc<span>def</span> or <span>def</span>ghi or abc<span>def</span>ghi
            #
            # NOTE(review): matched_offsets returns one array per requested type, so the
            # two `.present?` checks below are true whenever the neighbouring text node
            # exists, even if that array is empty - confirm this is the intended rule.
            if current_node.name.in?(@tbt_opts[:inline_tags]) && self != current_node
              if boundary_offsets.try(:first).try(:first) == 0
                previous_node = current_node.previous_text(self)
                boundary_offsets.shift if previous_node.try(:matched_offsets, :any, :ending_word, @tbt_opts).present?
              end
              if boundary_offsets.try(:last).try(:first) == 0
                next_node = current_node.next_text(self)
                boundary_offsets.pop if next_node.try(:matched_offsets, :any, :beginning_word, @tbt_opts).present?
              end
            end

            offsets = (inner_offsets + (boundary_offsets || [])).sort_by { |offset| offset.first }
            next unless offsets.any?

            offsets.each { |offset| yield(current_node, offset) } if block_given?
            next if current_node == self

            # Excludes processed offsets from current node and all its ancestors
            ([current_node] + current_node.ancestors).each do |node|
              pos = node.position_by_text_node(current_node.first_text_node)
              # Shifts all offsets according to node inner position and excludes from future processing
              node.exclude_offsets(offsets.map { |offset| [offset.first + pos, offset.last + pos] })
              # Reaches highlighting base
              break if node == self
            end
          end
          self
        end

        # @param text_node [Nokogiri::XML::Node] one of self's text nodes
        # @return [Integer] total length of the text of all self's text nodes
        #   that come before text_node
        # @raise [ArgumentError] if text_node is not among self's text nodes
        def position_by_text_node(text_node)
          nodes = text_nodes
          index = nodes.index(text_node)
          # A failed lookup gives nil, which cannot be compared with `<`
          raise ArgumentError, 'Self node must contain text_node!' if index.nil? || index < 0
          return 0 if index == 0
          nodes[0..index - 1].map { |node| node.text }.join('').length
        end

        # @param in_text_position [Integer] character position within self's text nodes
        # @return [Array] the text node covering the position and the position
        #   relative to that node
        # @raise [ArgumentError] if the position is beyond the last text node
        def text_node_by_position(in_text_position)
          text_nodes.each do |node|
            # Node does not contain parent_index
            if node.text.length - 1 < in_text_position
              in_text_position -= node.text.length
              # NOTE(review): the two lines closing this branch were elided by the
              # diff hunk boundary; `next` / `end` is a reconstruction - confirm
              # against the released file.
              next
            end
            return node, in_text_position
          end
          raise ArgumentError, 'Inner index is out of range!'
        end

        # Replaces self with markup in which every given range of self's text
        # is wrapped into the configured snippet part element.
        #
        # @param ranges [Array<Hash>] items with :range (range within self's text)
        #   and optional :starting flag
        # @param options [Hash] reads options[:snippet][:part_wrapper],
        #   [:part_wrapper_class] and [:starting_part_class]
        # @return whatever +replace+ returns
        # @raise [ArgumentError] if the wrapper tag is not specified
        def highlight_by_ranges(ranges, options)
          if options[:snippet][:part_wrapper].blank?
            raise ArgumentError, 'Snippet part wrapper tag is not specified!'
          end
          parts = []
          ranges = ranges.sort_by { |r| r[:range].first }
          last_index = ranges.count - 1
          ranges.each_with_index do |range_data, index|
            range = range_data[:range]
            # Text before the first range
            parts << escape_text_for_markup(text[0...range.first]) if index == 0
            snippet_class = range_data[:starting] ? "#{options[:snippet][:starting_part_class]}" : ''
            wrapper = document.create_element("#{options[:snippet][:part_wrapper]}", class: "#{options[:snippet][:part_wrapper_class]} #{snippet_class}")
            wrapper.content = text[range]
            parts << wrapper.to_s
            if index < last_index
              # Text between this range and the next one
              parts << escape_text_for_markup(text[(range.last + 1)..(ranges[index + 1][:range].first - 1)])
            else
              # Text after the last range
              parts << escape_text_for_markup(text[(range.last + 1)..-1])
            end
          end
          replace(parts.join(''))
        end

        # @param index [Integer] position within self's text
        # @param demand_length [Integer, nil] requested length
        # @return [Range] index..(index + demand_length - 1) clipped to the last
        #   character when demand_length is given, otherwise 0..index
        def text_range_by_index(index, demand_length = nil)
          if demand_length.present?
            index..[text.length - 1, index + demand_length - 1].min
          else
            0..index
          end
        end

        # @return [Array] self node offsets were already processed
        def excluded_offsets
          @excluded_offsets ||= []
        end

        # @param offsets [Array] self node offsets to be excluded in the future processing
        # @return [Array] the updated list of excluded offsets
        def exclude_offsets(offsets)
          @excluded_offsets = excluded_offsets + offsets
        end

        # Finds offsets of the string in self's text for every requested match type.
        #
        # @param string [String, Symbol] literal text for matching, or :any for any text
        # @param types [Symbol, Array<Symbol>] match types understood by #build_regexp
        # @param options [Hash] reads options[:word_parts]
        # @return [Array<Array>] one array per requested type, each holding
        #   [start, end) offsets that do not overlap #excluded_offsets
        def matched_offsets(string, types, options)
          types = [types] unless types.is_a?(Array)
          types.map do |type|
            pattern = build_regexp(string, type, options)
            text.to_enum(:scan, pattern).map do
              offset = Regexp.last_match.offset(:text)
              # Only one highlighting per position
              offset unless overlapped_offsets?(excluded_offsets, offset)
            end.compact
          end
        end

        private

        # Stores the effective #traverse_by_text options in @tbt_opts.
        #
        # @param options [Hash] overrides merged over the defaults
        def traverse_by_text_defaults(options)
          @tbt_opts = {
            whole_words_only: true,
            inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt),
            word_parts: '[а-яА-ЯёЁa-zA-Z\d]',
          }.deep_merge(options).with_indifferent_access
        end

        # @param string [String, Symbol] literal text for matching, or :any for any text
        # @param type [Symbol] :simple, :inner_word, :beginning_word, :ending_word or :boundary_word
        # @param options [Hash] options[:word_parts] is a regexp fragment matching one word character
        # @return [Regexp] pattern whose +text+ named group captures the match
        # @raise [ArgumentError] if type is not one of the supported types
        def build_regexp(string, type = :simple, options)
          # The search text is matched literally; only the :any wildcard is a pattern
          pattern = string == :any ? '.+' : Regexp.escape(string.to_s)
          word = options[:word_parts]
          case type.to_sym
          when :simple
            /(?<text>#{pattern})/
          when :inner_word
            /(?<!#{word}|\A)(?<text>#{pattern})(?!#{word}|\Z)/
          when :beginning_word
            /\A(?<text>#{pattern})(?!#{word})/
          when :ending_word
            /(?<!#{word})(?<text>#{pattern})\Z/
          when :boundary_word
            /(\A(?<text>#{pattern})(?!#{word}))|((?<!#{word})(?<text>#{pattern})\Z)|(\A(?<text>#{pattern})\Z)/
          else
            raise ArgumentError, "Unknown match type: #{type.inspect}"
          end
        end

        # @param offsets_collection [Array<Array>] known [start, end) offsets
        # @param offset_for_check [Array] [start, end) offset to test
        # @return [Boolean] true if offset_for_check overlaps any known offset
        def overlapped_offsets?(offsets_collection, offset_for_check)
          checked = offset_for_check.first...offset_for_check.last
          offsets_collection.any? do |offset|
            checked.overlaps?(offset.first...offset.last)
          end
        end

        # Serializes plain text through a text node so that markup characters in
        # it are escaped before the result is parsed again by +replace+.
        #
        # @param string [String, nil] plain text (nil is treated as empty)
        # @return [String] markup-safe representation of the text
        def escape_text_for_markup(string)
          document.create_text_node(string.to_s).to_s
        end
      end
    end
  end
end