#!/usr/bin/env ruby

REQUIRED_VERSION = Gem::Version.new('2.5')
if Gem::Version.new(RUBY_VERSION) < REQUIRED_VERSION
  raise "Ruby version #{RUBY_VERSION} not supported. " \
          "Please upgrade to #{REQUIRED_VERSION} or above."
end

require 'json' unless defined?(JSON)
require 'ripper'

class RipperJS < Ripper
  private

  # Scanner events occur when the lexer hits a new token, like a keyword or an
  # end. These nodes always contain just one argument which is a string
  # representing the content. For the most part these can just be printed
  # directly, which very few exceptions.
  SCANNER_EVENTS.each do |event|
    define_method(:"on_#{event}") do |body|
      { type: :"@#{event}", body: body, start: lineno, end: lineno }
    end
  end

  # Parser events represent nodes in the ripper abstract syntax tree. The event
  # is reported after the children of the node have already been built.
  PARSER_EVENTS.each do |event|
    define_method(:"on_#{event}") do |*body|
      min = body.map { |part| part.is_a?(Hash) ? part[:start] : lineno }.min
      { type: event, body: body, start: min || lineno, end: lineno }
    end
  end

  # Some nodes are lists that come back from the parser. They always start with
  # a `*_new` node (or in the case of string, `*_content`) and each additional
  # node in the list is a `*_add` node. This module takes those nodes and turns
  # them into one node with an array body.
  #
  # For example, the statement `[a, b, c]` would be parsed as:
  #
  # [:args_add,
  #   [:args_add,
  #     [:args_add,
  #       [:args_new],
  #       [:vcall, [:@ident, "a", [1, 1]]]
  #     ],
  #     [:vcall, [:@ident, "b", [1, 4]]]
  #   ],
  #   [:vcall, [:@ident, "c", [1, 7]]]
  # ]
  #
  # But after this module is applied that is instead parsed as:
  #
  # [:args,
  #   [
  #     [:vcall, [:@ident, "a", [1, 1]]],
  #     [:vcall, [:@ident, "b", [1, 4]]],
  #     [:vcall, [:@ident, "c", [1, 7]]]
  #   ]
  # ]
  #
  # This makes it a lot easier to join things with commas, and ends up resulting
  # in a much flatter `prettier` tree once it has been converted. Note that
  # because of this module some extra node types are added (the aggregate of
  # the previous `*_add` nodes) and some nodes now have arrays in places where
  # they previously had single nodes.
  prepend(
    Module.new do
      events = %i[
        args
        mlhs
        mrhs
        qsymbols
        qwords
        regexp
        stmts
        string
        symbols
        words
        xstring
      ]

      private

      events.each do |event|
        suffix = event == :string ? 'content' : 'new'

        define_method(:"on_#{event}_#{suffix}") do
          { type: event, body: [], start: lineno, end: lineno }
        end

        define_method(:"on_#{event}_add") do |parts, part|
          parts.tap do |node|
            node[:body] << part
            node[:end] = lineno
          end
        end
      end
    end
  )

  # For most nodes, it's enough to look at the child nodes to determine the
  # start of the parent node. However, for some nodes it's necessary to keep
  # track of the keywords as they come in from the lexer and to modify the start
  # node once we have it. We need accurate start and end lines so that we can
  # embed block comments into the right kind of node.
  prepend(
    Module.new do
      events = %i[begin else elsif ensure if rescue until while]

      def initialize(*args)
        super(*args)
        @keywords = []
      end

      def self.prepended(base)
        base.attr_reader :keywords
      end

      private

      def find_start(body)
        keywords[keywords.rindex { |keyword| keyword[:body] == body }][:start]
      end

      events.each do |event|
        keyword = event.to_s

        define_method(:"on_#{event}") do |*body|
          super(*body).tap { |sexp| sexp.merge!(start: find_start(keyword)) }
        end
      end

      def on_kw(body)
        super(body).tap { |sexp| keywords << sexp }
      end

      def on_program(*body)
        super(*body).tap { |sexp| sexp.merge!(start: 1) }
      end
    end
  )

  # This layer keeps track of inline comments as they come in. Ripper itself
  # doesn't attach comments to the AST, so we need to do it manually. In this
  # case, inline comments are defined as any comments wherein the lexer state is
  # not equal to EXPR_BEG (tracked in the BlockComments layer).
  prepend(
    Module.new do
      # Certain events needs to steal the comments from their children in order
      # for them to display properly.
      events = {
        aref: [:body, 1],
        args_add_block: [:body, 0],
        break: [:body, 0],
        command: [:body, 1],
        command_call: [:body, 3],
        regexp_literal: [:body, 0],
        string_literal: [:body, 0],
        symbol_literal: [:body, 0]
      }

      def initialize(*args)
        super(*args)
        @inline_comments = []
        @last_sexp = nil
      end

      def self.prepended(base)
        base.attr_reader :inline_comments, :last_sexp
      end

      private

      events.each do |event, path|
        define_method(:"on_#{event}") do |*body|
          @last_sexp =
            super(*body).tap do |sexp|
              comments = (sexp.dig(*path) || {}).delete(:comments)
              sexp.merge!(comments: comments) if comments
            end
        end
      end

      SPECIAL_LITERALS = %i[qsymbols qwords symbols words].freeze

      # Special array literals are handled in different ways and so their
      # comments need to be passed up to their parent array node.
      def on_array(*body)
        @last_sexp =
          super(*body).tap do |sexp|
            next unless SPECIAL_LITERALS.include?(body.dig(0, :type))

            comments = sexp.dig(:body, 0).delete(:comments)
            sexp.merge!(comments: comments) if comments
          end
      end

      # Handling this specially because we want to pull the comments out of both
      # child nodes.
      def on_assoc_new(*body)
        @last_sexp =
          super(*body).tap do |sexp|
            comments =
              (sexp.dig(:body, 0).delete(:comments) || []) +
                (sexp.dig(:body, 1).delete(:comments) || [])

            sexp.merge!(comments: comments) if comments.any?
          end
      end

      # Most scanner events don't stand on their own as s-expressions, but the
      # CHAR scanner event is effectively just a string, so we need to track it
      # as a s-expression.
      def on_CHAR(body)
        @last_sexp = super(body)
      end

      # We need to know exactly where the comment is, switching off the current
      # lexer state. In Ruby 2.7.0-dev, that's defined as:
      #
      # enum lex_state_bits {
      #     EXPR_BEG_bit,     /* ignore newline, +/- is a sign. */
      #     EXPR_END_bit,     /* newline significant, +/- is an operator. */
      #     EXPR_ENDARG_bit,  /* ditto, and unbound braces. */
      #     EXPR_ENDFN_bit,   /* ditto, and unbound braces. */
      #     EXPR_ARG_bit,     /* newline significant, +/- is an operator. */
      #     EXPR_CMDARG_bit,  /* newline significant, +/- is an operator. */
      #     EXPR_MID_bit,     /* newline significant, +/- is an operator. */
      #     EXPR_FNAME_bit,   /* ignore newline, no reserved words. */
      #     EXPR_DOT_bit,     /* right after `.' or `::', no reserved words. */
      #     EXPR_CLASS_bit,   /* immediate after `class', no here document. */
      #     EXPR_LABEL_bit,   /* flag bit, label is allowed. */
      #     EXPR_LABELED_bit, /* flag bit, just after a label. */
      #     EXPR_FITEM_bit,   /* symbol literal as FNAME. */
      #     EXPR_MAX_STATE
      # };
      def on_comment(body)
        sexp = { type: :@comment, body: body.chomp, start: lineno, end: lineno }

        case RipperJS.lex_state_name(state)
        when 'EXPR_END', 'EXPR_ARG|EXPR_LABELED', 'EXPR_ENDFN'
          last_sexp.merge!(comments: [sexp])
        when 'EXPR_CMDARG', 'EXPR_END|EXPR_ENDARG', 'EXPR_ENDARG', 'EXPR_ARG',
             'EXPR_FNAME|EXPR_FITEM', 'EXPR_CLASS', 'EXPR_END|EXPR_LABEL'
          inline_comments << sexp
        when 'EXPR_BEG|EXPR_LABEL', 'EXPR_MID'
          inline_comments << sexp.merge!(break: true)
        when 'EXPR_DOT'
          last_sexp.merge!(comments: [sexp.merge!(break: true)])
        end

        sexp
      end

      defined = private_instance_methods(false).grep(/\Aon_/) { $'.to_sym }

      (Ripper::PARSER_EVENTS - defined).each do |event|
        define_method(:"on_#{event}") do |*body|
          super(*body).tap do |sexp|
            @last_sexp = sexp
            next if inline_comments.empty?

            sexp[:comments] = inline_comments.reverse
            @inline_comments = []
          end
        end
      end
    end
  )

  # Nodes that are always on their own line occur when the lexer is in the
  # EXPR_BEG state. Those comments are tracked within the @block_comments
  # instance variable. Then for each node that could contain them, we attach
  # them after the node has been built.
  prepend(
    Module.new do
      events = {
        begin: [0, :body, 0],
        bodystmt: [0],
        class: [2, :body, 0],
        def: [2, :body, 0],
        defs: [4, :body, 0],
        else: [0],
        elsif: [1],
        ensure: [0],
        if: [1],
        program: [0],
        rescue: [2],
        sclass: [1, :body, 0],
        unless: [1],
        until: [1],
        when: [1],
        while: [1]
      }

      def initialize(*args)
        super(*args)
        @block_comments = []
        @current_embdoc = nil
      end

      def self.prepended(base)
        base.attr_reader :block_comments, :current_embdoc
      end

      private

      def attach_comments(sexp, stmts)
        range = sexp[:start]..sexp[:end]
        comments =
          block_comments.group_by { |comment| range.include?(comment[:start]) }

        if comments[true]
          stmts[:body] =
            (stmts[:body] + comments[true]).sort_by { |node| node[:start] }

          @block_comments = comments.fetch(false) { [] }
        end
      end

      events.each do |event, path|
        define_method(:"on_#{event}") do |*body|
          super(*body).tap { |sexp| attach_comments(sexp, body.dig(*path)) }
        end
      end

      def on_comment(body)
        super(body).tap do |sexp|
          block_comments << sexp if RipperJS.lex_state_name(state) == 'EXPR_BEG'
        end
      end

      def on_embdoc_beg(comment)
        @current_embdoc = {
          type: :embdoc, body: comment, start: lineno, end: lineno
        }
      end

      def on_embdoc(comment)
        @current_embdoc[:body] << comment
      end

      def on_embdoc_end(comment)
        @current_embdoc[:body] << comment.chomp
        @block_comments << @current_embdoc
        @current_embdoc = nil
      end

      def on_method_add_block(*body)
        super(*body).tap do |sexp|
          stmts = body[1][:body][1]
          stmts = stmts[:type] == :stmts ? stmts : body[1][:body][1][:body][0]

          attach_comments(sexp, stmts)
        end
      end
    end
  )

  # Tracking heredocs in somewhat interesting. Straight-line heredocs are
  # reported as strings, whereas squiggly-line heredocs are reported as
  # heredocs. We track the start and matching end of the heredoc as "beging" and
  # "ending" respectively.
  prepend(
    Module.new do
      def initialize(*args)
        super(*args)
        @heredoc_stack = []
      end

      def self.prepended(base)
        base.attr_reader :heredoc_stack
      end

      private

      def on_embexpr_beg(body)
        super(body).tap { |sexp| heredoc_stack << sexp }
      end

      def on_embexpr_end(body)
        super(body).tap { heredoc_stack.pop }
      end

      def on_heredoc_beg(beging)
        heredoc = { type: :heredoc, beging: beging, start: lineno, end: lineno }
        heredoc_stack << heredoc
      end

      def on_heredoc_end(ending)
        heredoc_stack[-1].merge!(ending: ending.chomp, end: lineno)
      end

      def on_heredoc_dedent(string, _width)
        heredoc = heredoc_stack.pop
        string.merge!(heredoc.slice(:type, :beging, :ending, :start, :end))
      end

      def on_string_literal(string)
        heredoc = heredoc_stack[-1]

        if heredoc && string[:type] != :heredoc && heredoc[:type] == :heredoc
          heredoc_stack.pop
          string.merge!(heredoc.slice(:type, :beging, :ending, :start, :end))
        else
          super
        end
      end
    end
  )

  # These are the event types that contain _actual_ string content. If there is
  # an encoding magic comment at the top of the file, ripper will actually
  # change into that encoding for the storage of the string. This will break
  # everything, so we need to force the encoding back into UTF-8 so that
  # the JSON library won't break.
  prepend(
    Module.new do
      private

      %w[comment ident tstring_content].each do |event|
        define_method(:"on_#{event}") do |body|
          super(body.force_encoding('UTF-8'))
        end
      end
    end
  )

  # Handles __END__ syntax, which allows individual scripts to keep content
  # after the main ruby code that can be read through DATA. Which looks like:
  #
  # foo.bar
  #
  # __END__
  # some other content that isn't read by ripper normally
  prepend(
    Module.new do
      def initialize(source, *args)
        super(source, *args)
        @source = source
        @ending = nil
      end

      def self.prepended(base)
        base.attr_reader :source, :ending
      end

      private

      def on___end__(body)
        @ending = super(source.split("\n")[lineno..-1].join("\n"))
      end

      def on_program(*body)
        super(*body).tap { |sexp| sexp[:body][0][:body] << ending if ending }
      end
    end
  )

  # Adds the used quote type onto string nodes. This is necessary because we're
  # going to have to stick to whatever quote the user chose if there are escape
  # sequences within the string. For example, if you have '\n' we can't switch
  # to double quotes without changing what it means.
  prepend(
    Module.new do
      private

      def on_tstring_end(quote)
        last_sexp.merge!(quote: quote)
      end

      def on_label_end(quote)
        last_sexp.merge!(quote: quote[0]) # quote is ": or ':
      end
    end
  )

  # Normally access controls are reported as vcall nodes. This module creates a
  # new node type to explicitly track those nodes instead, so that the printer
  # can add new lines as necessary.
  prepend(
    Module.new do
      KEYWORDS = %w[private protected public].freeze

      def initialize(source, *args)
        super(source, *args)
        @lines = source.split("\n")
      end

      def self.prepended(base)
        base.attr_reader :lines
      end

      private

      def on_vcall(ident)
        super(ident).tap do |sexp|
          if !KEYWORDS.include?(ident[:body]) ||
             ident[:body] != lines[lineno - 1].strip
            next
          end

          sexp.merge!(type: :access_ctrl)
        end
      end
    end
  )

  # When the only statement inside of a `def` node is a `begin` node, then you
  # can safely replace the body of the `def` with the body of the `begin`. For
  # example:
  #
  # def foo
  #   begin
  #     try_something
  #   rescue SomeError => error
  #     handle_error(error)
  #   end
  # end
  #
  # can get transformed into:
  #
  # def foo
  #   try_something
  # rescue SomeError => error
  #   handle_error(error)
  # end
  #
  # This module handles this by hoisting up the `bodystmt` node from the inner
  # `begin` up to the `def`.
  prepend(
    Module.new do
      private

      def on_def(ident, params, bodystmt)
        def_bodystmt = bodystmt
        stmts, *other_parts = bodystmt[:body]

        if !other_parts.any? && stmts[:body].length == 1 &&
           stmts.dig(:body, 0, :type) == :begin
          def_bodystmt = stmts.dig(:body, 0, :body, 0)
        end

        super(ident, params, def_bodystmt)
      end
    end
  )

  # By default, Ripper parses the expression `lambda { foo }` as a
  # `method_add_block` node, so we can't turn it back into `-> { foo }`. This
  # module overrides that behavior and reports it back as a `lambda` node
  # instead.
  prepend(
    Module.new do
      private

      def on_method_add_block(invocation, block)
        # It's possible to hit a `method_add_block` node without going through
        # `method_add_arg` node, ex: `super {}`. In that case we're definitely
        # not going to transform into a lambda.
        return super if invocation[:type] != :method_add_arg

        fcall, args = invocation[:body]

        # If there are arguments to the `lambda`, that means `lambda` has been
        # overridden as a function so we cannot transform it into a `lambda`
        # node.
        if fcall[:type] != :fcall || args[:type] != :args || args[:body].any?
          return super
        end

        ident = fcall.dig(:body, 0)
        return super if ident[:type] != :@ident || ident[:body] != 'lambda'

        super.tap do |sexp|
          params, stmts = block[:body]
          params ||= { type: :params, body: [] }

          sexp.merge!(type: :lambda, body: [params, stmts])
        end
      end
    end
  )
end

# If this is the main file we're executing, then most likely this is being
# executed from the parse.js spawn. In that case, read the ruby source from
# stdin and report back the AST over stdout.

if $0 == __FILE__
  builder = RipperJS.new($stdin.read)
  response = builder.parse

  if !response && builder.error?
    STDERR.puts 'Invalid ruby'
    exit 1
  end

  puts JSON.fast_generate(response)
end