lib/ver/plist.rb in ver-2009.12.14 vs lib/ver/plist.rb in ver-2010.02

- old
+ new

@@ -1,14 +1,41 @@ +# ver: sw=2 + require 'nokogiri' +require 'pp' +require_relative 'vendor/better_pp_hash' +# Define some magic constants that aren't available but may be returned from +# Regexp#options so we can make sure that roundtrip via PP works. +# +# 1.9.2 actually defines Regexp::FIXEDENCODING, but there's no release yet, so +# we simply assume the values from the Ruby C source (which means this might not +# work on any other implementation). +# Hopefully nobody tries to convert textmate bundles unless they are on MRI 1.9.1 ;) +class Regexp + NO_ENCODING = //n.options + + alias original_inspect inspect + + # Take into account //n option + def inspect + if options & NO_ENCODING == 0 + original_inspect + else + original_inspect << 'n' + end + end +end + module VER module Plist class XML def initialize(xml) @doc = Nokogiri(xml) @plist = {} @parsed = nil + @exceptions = [] end def parse @parsed || parse! end @@ -19,33 +46,40 @@ if dict @parsed = handle_dict(dict) else raise ArgumentError, 'This is no XML plist' end + + fail "#{@exceptions.size} errors encountered" if @exceptions.any? + @parsed end def handle_array(array) out = [] - key = nil array.children.each do |child| child_name = child.name next if child_name == 'text' out.push( case child_name when 'key' raise 'No key allows in array' - when 'array'; handle_array(child) - when 'dict'; handle_dict(child) - when 'false'; false - when 'integer', 'real'; child.inner_text.to_i - when 'string'; child.inner_text - when 'true'; true + when 'array' + handle_array(child) + when 'dict' + handle_dict(child) + when 'false' + false + when 'integer', 'real' + child.inner_text.to_i + when 'string' + child.inner_text + when 'true' + true else - # $unhandled[child_name] << child - raise "%p: %p is not handled" % [child_name, child] + raise "unhandled %p: %p" % [child_name, child] end ) end out @@ -57,26 +91,45 @@ dict.children.each do |child| child_name = child.name case child_name - when 'key'; key = child.inner_text - when 'text' # ignore + when 'key' + case key = child.inner_text + when /^\d+$/ + key = key.to_i + else + key = key.to_sym + end + when 'text' + # ignore else raise 'No key given' unless key out[key] = case child_name - when 'array'; handle_array(child) - when 'dict'; handle_dict(child) - when 'false'; false - when 'integer', 'real'; child.inner_text.to_i - when 'string'; child.inner_text - when 'true'; true + when 'array' + handle_array(child) + when 'dict' + handle_dict(child) + when 'integer', 'real' + child.inner_text.to_i + when 'true' + true + when 'false' + false + when 'string' + value = child.inner_text + + case key + when :begin, :match, :foldingStartMarker, :foldingStopMarker + sanitize_regexp(value) + else + value + end else - # $unhandled[child_name] << child - raise "%p: %p is not handled" % [child_name, child] + raise "unhandled %p: %p" % [child_name, child] end key = nil end end @@ -93,14 +146,149 @@ end def to_hash parse end + + private + + SANITIZE_REGEXP = {} + r = lambda{|string, replacement| + pattern = + if string.is_a? Regexp + string + else + Regexp.new(Regexp.escape(string)) + end + replacement.force_encoding Encoding::BINARY + SANITIZE_REGEXP[pattern] = replacement + } + + # found in newLisp.tmbundle + r['(?<!(?:\{|\"|]))(?:[\s\n]*)?(;+.*?)$', '(?<!(?:\{|"|\]))(?:\s*)(;+.*)$'] + + # found in Movable Type.tmbundle + r['(<)(\$[mM][tT]:?(?:\w+)?:?\w+)(.*?)(\$)?(>)', '(<)(\$[mM][tT]:?(?:\w*):?\w+)(.*?)(\$)?(>)'] + r['(</?)([mM][tT]:?(?:\w+)?:?\w+)(.*?)>', '(</?)([mM][tT]:?(?:\w*):?\w+)(.*?)>'] + r['\b([a-zA-Z-_:]+)', '\b([a-zA-Z_:-]+)'] + + # found in OCaml.tmbundle + r['(?=(\[<)(?![^\[]+?[^>]]))(.*?)(>])', '(?=(\[<)(?![^\[]+?[^>\]]))(.*?)(>\])'] + r['(\')(.*?)(;)(?=\s*\')|(?=\s*>])', '(\')(.*?)(;)(?=\s*\')|(?=\s*>\])'] + r['(\[)(?!\||<|>)(.*?)(?<!\||>)(])', '(\[)(?!\||<|>)(.*?)(?<!\||>)(\])'] + r['(\[)(\^?)(.*?)(])(?!\\\')', '(\[)(\^?)(.*?)(\])(?!\\\')'] + r['(\[<)(?=.*?>])(.*?)(?=>])', '(\[<)(?=.*?>\])(.*?)(?=>\])'] + r['(\[\|)(.*?)(\|])', '(\[\|)(.*?)(\|\])'] + r['\[<|>]', '\[<|>\]'] + + # found in Txt2tags.tmbundle + r['|[^]]+', '|[^\]]+'] + + # found in Perl Template Toolkit.tmbundle + r['\b([a-zA-Z-_:]+)\s*(=)', '\b([a-zA-Z_:-]+)\s*(=)'] + + # found in Property List.tmbundle + r['<!\[CDATA\[(.*?)]]>', '<!\[CDATA\[(.*?)\]\]>'] + r['(<!\[CDATA\[)(.*?)(]]>)', '(<!\[CDATA\[)(.*?)(\]\]>)'] + + # found in SWeave.tmbundle and a few others + # warning: nested repeat operator ? and * was replaced with '*' + r[/\.\?\*/, '.*'] + + # Found in Textile.tmbundle + r[/\\\[\[\^\]\]/, '\[[^\]]'] + + # found in Twiki.tmbundle + r['(\[)([^]]*)(\]) *(\[)(.*?)(\])', '(\[)([^\]]*)(\]) *(\[)(.*?)(\])'] + + # found in XML.tmbundle + r['<!\[CDATA\[(.*?)]]>', '<!\[CDATA\[(.*?)\]\]>'] + + # '\b\u\w+\u\w+' => '\b\\u\w+\\u\w+' + r[/\\(u)/i, '\\\\\1'] + + # found in MEL.tmbundle/Syntaxes/MEL.plist + r['(\$)[a-zA-Z_\x{7f}-\x{ff}][a-zA-Z0-9_\x{7f}-\x{ff}]*?\b', '(\$)\w\w*?\b'] + + # "[\x{7f}-\x{ff}]" => "[\x7f-\xff]" + r[/\\x\{(\h+)\}/, '\\\\x\1'] + + # remove lots of tabs at line start + r[/^\t+/, "\t"] + + # Escape #@ #$ #{ because they have special meaning in ruby literal regexps + r[/([^\\]|$)#([$@{])/, '\1\#\2'] + + # '(?=#)' => '(?=\#)' + r['(?=#)', '(?=\#)'] + + # found in Nemerle + r['(\{|(|<\[)', '(\{|\(|<\[)'] + r['(\}|)|]>)', '(\}|\)|\]>)'] + + # found in HTML (Active4D) + r['\b([a-zA-Z-:]+)','\b([a-zA-Z:-]+)'] + r['\[CDATA\[(.*?)]](?=>)', '\[CDATA\[(.*?)\]\](?=>)'] + + # found in Bulletin Board.tmbundle + r['[\[]]+', '[\[\]]+'] + + # found in C.tmbundle + r['\bdelete\b(\s*\[\])?|\bnew\b(?!])', '\bdelete\b(\s*\[\])?|\bnew\b(?!\])'] + + # found in reStructuredText.tmbundle + r['(\.\.)\s[A-z][A-z0-9-_]+(::)\s*$', '(\.\.)\s[A-z][A-z0-9_-]+(::)\s*$'] + r['^(\.\.)\s+((\[)(((#?)[^]]*?)|\*)(\]))\s+(.*)', '^(\.\.)\s+((\[)(((#?)[^\]]*?)|\*)(\]))\s+(.*)'] + + # Fix invalid regular expressions so we can write them out as Ruby code. + # + # NOTE: + # Yeah, it's weird to fix regular expressions with regular expressions. + # But it seems like the people writing syntax files slept all through + # regexp class and textmate must be working _very_ hard to allow all + # this without giving any warnings (maybe error messages are just not + # hip in osx). + # Also, there seems to be a lot of copy&pasting between syntax files, as + # soon as someone comes up with a horrible way to match things, + # everybody else is eager to use it in their files as well. + # </rant> + def sanitize_regexp(value) + original = value.dup + + SANITIZE_REGEXP.each do |pattern, replacement| + value.gsub!(pattern, replacement) + end + + Regexp.new(value) + rescue RegexpError => ex + if ex.message =~ /^invalid multibyte escape:/ + begin + /#{value.force_encoding(Encoding::BINARY)}/n + rescue RegexpError => ex + error(ex, original, value) + end + else + error(ex, original, value) + end + end + + def error(exception, original, modified) + puts ' [ exception ] '.center(80, '-') + puts exception, *exception.backtrace + puts ' [ original regexp ] '.center(80, '-') + p original + puts original + puts ' [ modified regexp ] '.center(80, '-') + p modified + puts modified + puts '-' * 80 + @exceptions << exception + end end module_function def parse_xml(filename) XML.new(File.read(filename)).parse end end -end \ No newline at end of file +end