module Traject
  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
  # objects as source records in the traject pipeline.
  #
  # It does process the entire input document with Nokogiri::XML.parse, DOM-parsing,
  # so will take RAM for the entire input document, until iteration completes.
  # (There is a separate `ExperimentalStreamingNokogiriReader` available, but it is
  # experimental, half-finished, problematic, may disappear or change in backwards-incompatible
  # ways at any time, and is not recommended for production use.)
  #
  # You can have it yield the _entire_ input XML as a single traject source record
  # (default), or you can use setting `nokogiri.each_record_xpath` to split
  # the source up into separate records to yield into traject pipeline -- each one
  # will be its own Nokogiri::XML::Document.
  #
  # ## Settings
  # * nokogiri.namespaces: Set namespace prefixes that can be used in
  #   other settings, including `extract_xpath` from NokogiriMacros.
  # * nokogiri.each_record_xpath: if set to a string xpath, will take all matching nodes
  #   from the input doc, and yield them individually as source records to the pipeline.
  #   If you need to use namespaces here, you need to have them registered with
  #   `nokogiri.namespaces`. If your source docs use namespaces, you DO need
  #   to use them in your each_record_xpath.
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
  #
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
  #
  # What if you want to use each_record_xpath to yield certain nodes as source documents, but
  # there is some additional info in other parts of the input document you need? This came up
  # when developing the OaiPmhNokogiriReader, which yields "//oai:record" as pipeline source documents,
  # but also needed to look at "//oai:resumptionToken" to scrape the entire results.
  #
  # There is a semi-finished/in-progress feature that meets that use case -- unclear if it will meet
  # all use cases for this general issue.
  #
  # Setting `nokogiri_reader.extra_xpath_hooks` can be set to a Hash where the keys are xpaths (if using
  # namespaces, they must be registered with `nokogiri.namespaces`), and the value is a lambda/
  # proc/callable object, taking two arguments.
  #
  #     provide "nokogiri_reader.extra_xpath_hooks", {
  #       "//oai:resumptionToken" =>
  #         lambda do |node, clipboard|
  #           clipboard[:resumption_token] = node.text
  #         end
  #     }
  #
  # The first arg is the matching node. What's this clipboard? Well, what are you
  # gonna _do_ with what you get out of there, that you can do in a thread-safe way
  # in the middle of nokogiri processing? The second arg is a thread-safe Hash "clipboard"
  # that you can store things in, and later access via reader.clipboard.
  #
  # There's no great thread-safe way to get reader.clipboard in a normal nokogiri pipeline though,
  # (the reader can change in multi-file handling so there can be a race condition if you try naively,
  # don't!) Which is why this feature needs some work for general applicability. The OaiPmhReader
  # manually creates its readers outside the usual nokogiri flow, so can use it.
  class NokogiriReader
    include Enumerable

    attr_reader :settings, :input_stream, :clipboard, :path_tracker

    # Builds a reader over `input_stream` and eagerly validates the namespace-related
    # settings, so misconfiguration fails at construction time rather than mid-iteration.
    #
    # @param input_stream the XML input later handed to Nokogiri::XML.parse
    # @param settings settings hash, wrapped in a Traject::Indexer::Settings
    # @raise [ArgumentError] if `nokogiri.namespaces` is not a Hash, or if
    #   `nokogiri.each_record_xpath` or any key of `nokogiri_reader.extra_xpath_hooks`
    #   uses a namespace prefix that is not registered in `nokogiri.namespaces`
    def initialize(input_stream, settings)
      @settings = Traject::Indexer::Settings.new settings
      @input_stream = input_stream
      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new

      default_namespaces # trigger validation
      validate_xpath(each_record_xpath, key_name: "each_record_xpath") if each_record_xpath
      extra_xpath_hooks.each_pair do |xpath, _callable|
        validate_xpath(xpath, key_name: "extra_xpath_hooks")
      end
    end

    # @return [String, nil] value of setting `nokogiri.each_record_xpath`, nil if unset
    def each_record_xpath
      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
    end

    # @return [Hash] value of setting `nokogiri_reader.extra_xpath_hooks` (xpath => callable),
    #   or an empty Hash if unset
    def extra_xpath_hooks
      @extra_xpath_hooks ||= settings["nokogiri_reader.extra_xpath_hooks"] || {}
    end

    # @return [Hash] value of setting `nokogiri.namespaces` (prefix => namespace URI),
    #   or an empty Hash if unset
    # @raise [ArgumentError] if the setting is present but not a Hash
    def default_namespaces
      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
        unless ns.kind_of?(Hash)
          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
        end
      }
    end

    # DOM-parses the whole input, then yields source records:
    # * with `each_record_xpath` set, one new Nokogiri::XML::Document per matching node;
    # * otherwise the entire parsed document, once.
    #
    # After all records have been yielded, the extra xpath hooks are run against
    # the parsed input document.
    def each
      whole_input_doc = Nokogiri::XML.parse(input_stream)

      if each_record_xpath
        whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
          # We want to take the matching node, and make it into root in a new Nokogiri document.
          # This is tricky to do as performant as possible (we want to re-use the existing libxml node),
          # while preserving namespaces properly (especially in jruby). Some uses of noko api that seem
          # like they should work don't, esp in jruby.
          child_doc = Nokogiri::XML::Document.new

          reparent_node_to_root(child_doc, matching_node)

          yield child_doc

          child_doc = nil # hopefully make things easier on the GC.
        end
      else
        # caller wants whole doc as a traject source record
        yield whole_input_doc
      end

      run_extra_xpath_hooks(whole_input_doc)

    ensure
      # hopefully make things easier on the GC.
      whole_input_doc = nil
    end

    private


    # We simply do `new_parent_doc.root = node`
    # It seemed maybe safer to dup the node as well as remove the original from the original doc,
    # but I believe this will result in double memory usage, as unlinked nodes aren't GC'd until
    # their doc is.  I am hoping this pattern results in less memory usage.
    # https://github.com/sparklemotion/nokogiri/issues/1703
    #
    # We used to have to do something different in Jruby to work around bug:
    # https://github.com/sparklemotion/nokogiri/issues/1774
    #
    # But as of nokogiri 1.9, that does not work, and is not necessary if we accept
    # that Jruby nokogiri may put xmlns declarations on different elements than MRI,
    # although it should be semantically equivalent for a namespace-aware parser.
    # https://github.com/sparklemotion/nokogiri/issues/1875
    #
    # This as a separate method now exists largely as a historical artifact, and for this
    # documentation.
    #
    # @return the `new_parent_doc` passed in, now with `node` as its root
    def reparent_node_to_root(new_parent_doc, node)
      new_parent_doc.root = node

      new_parent_doc
    end

    # Crude check that every namespace prefix used in `xpath` is registered in
    # `nokogiri.namespaces`. The xpath is split on '/', and within each step the text
    # before the first ':' is treated as a namespace prefix; the expression is not
    # really parsed beyond that.
    #
    # @param xpath [String] the xpath expression to check
    # @param key_name [String] name of the setting the xpath came from, used in the error message
    # @raise [ArgumentError] if a prefix found in `xpath` is not a key of default_namespaces
    def validate_xpath(xpath, key_name:)
      # Validate the xpath we were handed, not each_record_xpath: the latter may be nil
      # (hooks configured without each_record_xpath), and hook xpaths need checking too.
      xpath.split('/').each do |component|
        prefix, element = component.split(':')
        # No ':' in this step means no namespace prefix to look up.
        next unless element
        next unless default_namespaces[prefix].nil?

        raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
      end
    end

    # For each configured hook, calls the callable once per node of `noko_doc` matching
    # the hook's xpath, passing the matching node and the reader's clipboard.
    def run_extra_xpath_hooks(noko_doc)
      extra_xpath_hooks.each_pair do |xpath, callable|
        noko_doc.xpath(xpath, default_namespaces).each do |matching_node|
          callable.call(matching_node, clipboard)
        end
      end
    end
  end
end