module CodeRay
  
  # GZip library for writing and reading token dumps.
  autoload :GZip, 'coderay/helpers/gzip'
  
  # = Tokens  TODO: Rewrite!
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (begin_group, end_group, begin_line, end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely begin_group and end_group.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #   [
  #     [:begin_group, :string],
  #     ['"', :delimiter],
  #     ['a string', :content],
  #     ['"', :delimiter],
  #     [:end_group, :string]
  #   ]
  #
  # Tokens is the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a
  # gzip'ed string that you can put in your DB.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  class Tokens < Array
    
    # The Scanner instance that created the tokens.
    attr_accessor :scanner
    
    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Take the class directly if one was given; otherwise, look the
        # encoder up in the plugin host.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end
    
    # Turn the tokens into a plain string by concatenating the token
    # texts, using the default Encoders::Encoder.
    def to_s
      encode CodeRay::Encoders::Encoder.new
    end
    
    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    def method_missing meth, options = {}
      encode_with meth, options
    rescue PluginHost::PluginNotFound
      super
    end
    
    def encode_with encoder, options = {}
      Encoders[encoder].new(options).encode_tokens self
    end
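    
    # Example for #encode and #method_missing (a usage sketch, not from the
    # original docs; it assumes the standard :ruby scanner and the :statistic,
    # :html, and :terminal encoders that ship with CodeRay):
    #
    #   tokens = CodeRay.scan 'price = 2.59', :ruby
    #   tokens.encode :statistic                    # explicit plugin lookup
    #   tokens.statistic                            # the same via method_missing
    #   tokens.encode CodeRay::Encoders::Terminal   # an Encoder class also works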
    
    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This cannot be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space at the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined into one comment token by the Scanner.
    def optimize
      raise NotImplementedError, 'Tokens#optimize needs to be rewritten.'
      # last_kind = last_text = nil
      # new = self.class.new
      # for text, kind in self
      #   if text.is_a? String
      #     if kind == last_kind
      #       last_text << text
      #     else
      #       new << [last_text, last_kind] if last_kind
      #       last_text = text
      #       last_kind = kind
      #     end
      #   else
      #     new << [last_text, last_kind] if last_kind
      #     last_kind = last_text = nil
      #     new << [text, kind]
      #   end
      # end
      # new << [last_text, last_kind] if last_kind
      # new
    end
    
    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end
    
    # Ensure that all begin_group tokens have a corresponding end_group.
    #
    # TODO: Test this!
    def fix
      raise NotImplementedError, 'Tokens#fix needs to be rewritten.'
      # tokens = self.class.new
      # # Check token nesting using a stack of kinds.
      # opened = []
      # for type, kind in self
      #   case type
      #   when :begin_group
      #     opened.push [:begin_group, kind]
      #   when :begin_line
      #     opened.push [:end_line, kind]
      #   when :end_group, :end_line
      #     expected = opened.pop
      #     if [type, kind] != expected
      #       # Unexpected end; decide what to do based on the kind:
      #       # - token was never opened: delete the end (just skip it)
      #       next unless opened.rindex expected
      #       # - token was opened earlier: also close tokens in between
      #       tokens << token until (token = opened.pop) == expected
      #     end
      #   end
      #   tokens << [type, kind]
      # end
      # # Close remaining opened tokens
      # tokens << token while token = opened.pop
      # tokens
    end
    
    def fix!
      replace fix
    end
    
    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numbering.
    def split_into_lines
      raise NotImplementedError
    end
    
    def split_into_lines!
      replace split_into_lines
    end
    
    # Split the tokens into parts of the given +sizes+.
    #
    # The result will be an Array of Tokens objects. The parts have
    # the text size specified by the parameter. In addition, each
    # part closes all opened tokens. This is useful to insert tokens
    # between them.
    #
    # This method is used by @Scanner#tokenize@ when called with an Array
    # of source strings. The Diff encoder uses it for inline highlighting.
    def split_into_parts *sizes
      parts = []
      opened = []
      content = nil
      part = Tokens.new
      part_size = 0
      size = sizes.first
      i = 0
      for item in self
        case content
        when nil
          content = item
        when String
          if size && part_size + content.size > size  # token must be cut
            if part_size < size  # some part of the token goes into this part
              content = content.dup  # content may not be safe to change
              part << content.slice!(0, size - part_size) << item
            end
            # close all open groups and lines...
            closing = opened.reverse.flatten.map do |content_or_kind|
              case content_or_kind
              when :begin_group
                :end_group
              when :begin_line
                :end_line
              else
                content_or_kind
              end
            end
            part.concat closing
            begin
              parts << part
              part = Tokens.new
              size = sizes[i += 1]
            end until size.nil? || size > 0
            # ...and open them again.
            part.concat opened.flatten
            part_size = 0
            redo unless content.empty?
          else
            part << content << item
            part_size += content.size
          end
          content = nil
        when Symbol
          case content
          when :begin_group, :begin_line
            opened << [content, item]
          when :end_group, :end_line
            opened.pop
          else
            raise ArgumentError, 'Unknown token action: %p, kind = %p' % [content, item]
          end
          part << content << item
          content = nil
        else
          raise ArgumentError, 'Token input junk: %p, kind = %p' % [content, item]
        end
      end
      parts << part
      parts << Tokens.new while parts.size < sizes.size
      parts
    end
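    
    # A sketch of what #split_into_parts does (example data made up here;
    # it assumes the flat text/kind pair layout described above):
    #
    #   tokens = Tokens.new
    #   tokens.text_token 'foobar', :plain
    #   left, right = tokens.split_into_parts 3, 3
    #   left    # => ['foo', :plain]
    #   right   # => ['bar', :plain]
    #
    # If a group spanned the cut, each part would close and reopen it.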
    
    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # in addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases, as it is a good compromise between
    # speed and compression rate.
    #
    # See the GZip module.
    def dump gzip_level = 7
      dump = Marshal.dump self
      dump = GZip.gzip dump, gzip_level
      dump.extend Undumping
    end
    
    # Return the actual number of tokens.
    def count
      size / 2
    end
    
    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end
    
    # Unzip the dump using GZip.gunzip, then
    # undump it using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      dump = GZip.gunzip dump
      Marshal.load dump
    end
    
    if defined?(RUBY_ENGINE) && RUBY_ENGINE['rbx']
      #:nocov:
      def text_token text, kind
        self << text << kind
      end
      def begin_group kind
        self << :begin_group << kind
      end
      def end_group kind
        self << :end_group << kind
      end
      def begin_line kind
        self << :begin_line << kind
      end
      def end_line kind
        self << :end_line << kind
      end
      #:nocov:
    else
      alias text_token push
      def begin_group kind; push :begin_group, kind end
      def end_group kind; push :end_group, kind end
      def begin_line kind; push :begin_line, kind end
      def end_line kind; push :end_line, kind end
    end
    
    alias tokens concat
    
  end
  
end
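
# Round-trip sketch for Tokens#dump and Tokens.load (not part of the library
# itself; it assumes the zlib-backed GZip helper is available):
#
#   tokens = CodeRay.scan 'puts 1', :ruby
#   blob   = tokens.dump    # gzipped Marshal string, extended with Undumping
#   again  = blob.undump    # same as CodeRay::Tokens.load(blob)
#   again.to_s              # => "puts 1"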