require 'spec/msrun'

# 'xmlparser' is optional.  When it cannot be loaded we warn on stdout and
# fall back to REXML (see Spec::MzXML::Parser#default_parser).
begin
  require 'xmlparser'
rescue LoadError
  puts "*******************************************************************"
  puts "WARNING: 'xmlparser' library not installed:"
  puts "Defaulting to REXML (slower, but guaranteed to parse correct xml)"
  puts "Use :parse_type => 'regex' for faster (but not guaranteed) parsing"
  puts "Or install 'xmlparser'!"
  puts "*******************************************************************"
end

# Superclass for the XMLParser-based handler classes below.  Falls back to
# Object so that those class definitions still load without 'xmlparser'.
begin
  $XMLParserClass = XMLParser
rescue NameError
  $XMLParserClass = Object
end

require 'spec/mzxml'
require 'rexml/document'
require 'rexml/streamlistener'

module Spec::MzXML::REXMLStreamListener; end
module Spec::MzXML::PrecMzByNum; end

# for REXML
# Stream listener that records the text of each <precursorMz> element in
# prec_mz, indexed by the "num" attribute of the most recent <scan>.
class Spec::MzXML::REXMLStreamListener::PrecMzByNum
  include REXML::StreamListener

  attr_accessor :prec_mz

  def initialize
    @prec_mz = []
    @scan_num = nil
    @get_data = false
  end

  def tag_start(name,attrs)
    if name == "scan"
      @scan_num = attrs["num"].to_i
    elsif name == "precursorMz"
      @get_data = true
    end
  end

  def tag_end(name)
    if name == "precursorMz"
      @get_data = false
    end
  end

  def text(txt)
    if @get_data
      @prec_mz[@scan_num] = txt
    end
  end
end

module Spec::MzXML::XMLParser; end

# XMLParser-based counterpart of the REXML listener above: fills prec_mz so
# that prec_mz[scan_num] is the (String) text of that scan's <precursorMz>.
# Character data is appended, so text delivered in several pieces is kept.
# NOTE(review): parsing state lives in class variables and is therefore shared
# by every instance of this class -- confirm that is intended.
class Spec::MzXML::XMLParser::PrecMzByNum < $XMLParserClass
  @@scan_num = nil
  @@get_data = false

  attr_accessor :prec_mz

  def initialize
    @prec_mz = []
  end

  def startElement(name,attrs)
    if name == "scan"
      @@scan_num = attrs["num"].to_i
    elsif name == "precursorMz"
      @prec_mz[@@scan_num] = ""
      @@get_data = true
    end
  end

  def endElement(name)
    if name == "precursorMz"
      @@get_data = false
    end
  end

  def character(data)
    if @@get_data
      @prec_mz[@@scan_num] << data
    end
  end
end

# Returns parallel arrays (times, spectra) where each spectra is an array
# containing alternating mz and intensity (MS1 scans only by default; the
# level is chosen with the ms_level constructor argument)
# and times are strings with the time in seconds
class Spec::MzXML::XMLParser::TimesAndSpectra < $XMLParserClass
  include Spec::MzXML

  @@get_data = false
  @@get_peaks = false
  @@precision = 32  # @TODO: set dynamic

  attr_accessor :times, :spectra

  # Returns [times, spectra] as collected so far.
  def times_and_spectra
    [@times, @spectra]
  end

  # ms_level is stored as a String because it is compared against the raw
  # "msLevel" attribute value.
  def initialize(ms_level=1)
    @ms_level = "#{ms_level}"
    @times = []
    @spectra = []
  end

  def startElement(name,attrs)
    if name == "scan" && attrs["msLevel"] == @ms_level
      @times << attrs["retentionTime"][2...-1]  # strip PT and S: "PTx.xxxxS"
      @@get_peaks = true
    elsif name == "peaks" && @@get_peaks
      @@get_data = true
      @data = ""
    end
  end

  def character(data)
    if @@get_data
      @data << data
    end
  end

  def endElement(name)
    if name == "peaks" && @@get_peaks
      @spectra << base64_peaks_to_array(@data, @@precision)
      @@get_data = false
      @@get_peaks = false
    end
  end
end

# Regular-expression based extraction of precursor information.
class Spec::MzXML::Regexp
  # NOTE(review): this pattern is empty, which looks like lost markup.
  # by_scan_num reads two captures from each match (matched[0] as the scan
  # number, matched[1] as the text searched by the inner regex), so with the
  # empty pattern no scan is ever recorded.  Restore the intended pattern.
  @@scan_re = //mo

  # NOTE(review): prec_re has a single capture group, so captures.reverse
  # yields a one-element array; the intensity capture appears to be missing.
  def self.precursor_mz_and_intensity_by_scan(file)
    prec_re = /msLevel="2".*?([\d\.]+)<\/precursorMz>/mo
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures.reverse}
  end

  # Reads the whole file, scans it with @@scan_re and, for every scan whose
  # body matches regex, stores the block's result at arr[scan_num].
  # (array will likely start at 1!)
  def self.by_scan_num(file, regex)
    arr = []
    File.open(file) do |fh|
      string = fh.read
      matches = string.scan(@@scan_re)
      matches.each do |matched|
        if inner_match = regex.match(matched[1])
          index = matched[0].to_i
          arr[index] = yield(inner_match)
        end
      end
    end
    arr
  end

  # Returns array where array[scan_num] = precursorMz
  # Parent scans are not arrayed
  # Values are strings. Array index likely starts at 1!
  # @TODO: replace the use of a yield block
  def self.precursor_mz_by_scan(file)
    prec_re = /msLevel="2".*?([\d\.]+)<\/precursorMz>/mo
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures[0]}
  end
end

# Entry point for reading mzXML files with one of several back ends
# ("xmlparser", "rexml" or "regex").
class Spec::MzXML::Parser

  # Returns "xmlparser" if a loaded feature ($") has a name matching
  # /xmlparser/, otherwise "rexml".
  def default_parser
    if $".any? { |lib| lib =~ /xmlparser/ }
      return "xmlparser"
    else
      return "rexml"
    end
  end

  # If file is given, immediately calls the method named by parse_type with
  # (file, *args).
  def initialize(file=nil, parse_type=:parse, *args)
    if file
      send(parse_type, file, *args)
    end
  end

  # Parse into a complete object structure (REXML??)
  # NOTE(review): not implemented -- prints a message and exits the process.
  def parse(file)
    # @TODO: write complete parser
    puts "need to write this guy!!!!"
    exit
  end

  # returns: [times_arr, [m/z,inten,m/z,inten...]]
  # where times are time strings (in seconds)
  def times_and_spectra(file)
    parser = Spec::MzXML::XMLParser::TimesAndSpectra.new
    parser.parse(IO.read(file))
    parser.times_and_spectra
  end

  # Returns an array of scans indexed by scan number
  # NOTE that the first scan (zero indexed) will likely be nil!
  # accepts an optional parse_type = 'xmlparser' | 'rexml'
  # Raises ArgumentError for any other parse_type.
  def scans_by_num(mzXML_file, parse_type=nil)
    unless parse_type
      parse_type = default_parser
    end
    scans = []
    case parse_type
    when 'xmlparser'
      parser = Spec::MzXML::XMLParser::TimeMzIntenIndexer.new
      parser.parse(IO.read(mzXML_file))
      scans = parser.scans_by_num
    when 'rexml'
      # use REXML
      # This is really too slow for files of this size
      # The block form closes the file once the document has been built.
      doc = File.open(mzXML_file) { |fh| REXML::Document.new(fh) }
      doc.elements.each('msRun/scan') do |scan|
        rt = scan.attributes['retentionTime']  ## like PT0.154000S"
        level = scan.attributes['msLevel']
        prec_mz = nil
        prec_int = nil
        if level.to_i != 1
          scan.elements.each("precursorMz") do |prec|
            prec_mz = prec.text.to_f
            prec_int = prec.attributes["precursorIntensity"].to_f
          end
        end
        # remove the leading PT and trailing S on the retention time!
        rt = rt[2...-1]
        num = scan.attributes['num'].to_i
        scans[num] = Spec::Scan.new(num, level.to_i, rt.to_f, prec_mz, prec_int)
      end #doc.elements
    else
      # raise (not throw): throw is for catch/throw control flow and would
      # only produce an "uncaught throw" error here.
      raise ArgumentError, "invalid parse type: #{parse_type}"
    end
    ## update the scans for parents
    Spec::Scan.add_parent_scan(scans)
    scans
  end

  # Returns a Hash indexed by filename (with no extension) for a given path
  # extension = glob (string) or regex
  # The basename is given as: file.split('.').first
  # Any other kind of extension prints a message and yields an empty Hash.
  def precursor_mz_by_scan_for_path(path, extension, parse_type=nil)
    hash = {}
    Dir.chdir path do
      files = []
      case extension
      when String
        files = Dir[extension]
      when Regexp
        files = Dir.entries(".").find_all do |dir|
          dir =~ extension
        end
      else
        puts "extension: #{extension} not a String or Regexp!"
      end
      files.each do |file|
        base = file.split('.').first
        hash[base] = precursor_mz_by_scan(file, parse_type)
      end
    end
    hash
  end

  # Returns hash where hash[scan_num] = [precursorMz, precursorIntensity]
  # Parent scans are not hashed
  # Keys and values are both strings
  # (Not implemented yet: currently returns nil.)
  def precursor_mz_and_inten_by_scan(file)
    # in progress
  end

  # first, converts backslash to forward slash in filename.
  # if .mzXML returns the filename
  # if .raw or .RAW converts the file to .mZXML and returns mzXML filename
  # if no recognized extension, looks for .mzXML file, then .RAW file (and
  # converts); returns nil if none of those exist
  # aborts if file was not able to be converted
  # The string passed in is never modified; a new string is returned.
  def file_to_mzxml(file)
    # Non-destructive gsub so the caller's string is left untouched.
    file = file.gsub("\\",'/')
    if file =~ /\.mzXML$/
      return file
    elsif file =~ /\.(RAW|raw)$/  # anchor applies to both spellings
      old_file = file
      ## t2x outputs in cwd (so go to the directory of the file!)
      dir = File.dirname(file)
      basename = File.basename(file)
      Dir.chdir(dir) do
        cmd = "#{Spec::MzXML::MZXML_CONVERTER} #{basename}"
        puts cmd
        puts `#{cmd}`
      end
      file = file.sub(/\.(RAW|raw)$/, '.mzXML')
      unless File.exist? file
        abort "Couldn't convert #{old_file} to #{file}"
      end
      return file
    else
      if File.exist?( file + '.mzXML' )
        return file_to_mzxml(file + '.mzXML')
      elsif File.exist?( file + '.RAW' )
        return file_to_mzxml(file + '.RAW')
      elsif File.exist?( file + '.raw' )
        return file_to_mzxml(file + '.raw')
      else
        return nil
      end
    end
  end

  # Loads a Spec::MSRunIndex from file and returns an array with the prec_mz
  # of each scan (nil where the index has no scan).
  def get_prec_mz_by_scan_for_time_index(file)
    index = Spec::MSRunIndex.new(file)
    index.scans_by_num.collect do |scan|
      scan ? scan.prec_mz : nil
    end
  end

  # Returns array where array[scan_num] = precursorMz
  # Parent scans are not arrayed
  # Values are strings. Array index likely starts at 1!
  # parse_type = "regex" | "rexml" | "xmlparser"
  # also takes a MSRunIndex file (terminates with '.timeIndex')
  # also takes .RAW or .raw files and converts them to mzXML using
  # Spec::MzXML::MZXML_CONVERTER
  # also takes a file without an extension, in which case tests to see if the
  # index file exists, then the .mzXML file, then .RAW/.raw (and converts)
  # An unrecognized parse_type prints a message and returns nil.
  def precursor_mz_by_scan(file, parse_type=nil)
    # If given a time index file:
    if File.exist?(file + '.timeIndex')
      return get_prec_mz_by_scan_for_time_index(file + '.timeIndex')
    elsif File.exist?(file + '.mzXML.timeIndex')
      return get_prec_mz_by_scan_for_time_index(file + '.mzXML.timeIndex')
    elsif file =~ /\.timeIndex$/
      return get_prec_mz_by_scan_for_time_index(file)
    end
    file = file_to_mzxml(file)
    unless parse_type then parse_type = default_parser end
    case parse_type
    when "xmlparser"
      ##XMLParser:
      parser = Spec::MzXML::XMLParser::PrecMzByNum.new
      File.open(file) do |fh|
        parser.parse(fh.read)
      end
      parser.prec_mz
    when "regex"
      Spec::MzXML::Regexp.precursor_mz_by_scan(file)
    when "rexml"
      listener = Spec::MzXML::REXMLStreamListener::PrecMzByNum.new
      # Block form so the file is closed when streaming finishes.
      File.open(file) do |fh|
        REXML::Document.parse_stream(fh, listener)
      end
      listener.prec_mz
    else
      puts "Don't recognize parse_type: #{parse_type}"
    end
  end

  # Returns a hash of basic info on an mzXML run:
  #     *mzXML_elemt*  *hash keys (symbols)*
  #     scanCount      scan_count
  #     startTime      start_time
  #     endTime        end_time
  #     startMz        start_mz
  #     endMz          end_mz
  # :ms_level (msLevel of the first scan) is also set.  :start_mz and :end_mz
  # are only set when that first scan has msLevel 1.
  # :scan_count is an array: [scanCount attribute, scans counted at level 1,
  # level 2, ...], cut off at the first zero entry.
  def basic_info(mzxml_file)
    puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}"
    hash = {}
    scan_count_tmp = []
    (1..5).to_a.each do |n|
      scan_count_tmp[n] = 0
    end
    @fh = File.open(mzxml_file)
    begin
      @line = ""
      scan_count_tmp[0] = _el("scanCount").to_i
      hash[:start_time] = _el("startTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:end_time] = _el("endTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:ms_level] = _el("msLevel").to_i
      # NOTE(review): the first scan is always counted under level 1,
      # whatever msLevel was just read -- confirm this is intended.
      scan_count_tmp[1] = 1
      if hash[:ms_level] == 1
        hash[:start_mz] = _el("startMz").to_f
        hash[:end_mz] = _el("endMz").to_f
      end
      # Tally the remaining scans by msLevel, one line at a time.
      while !@fh.eof?
        @line = @fh.readline
        ms_level = _el("msLevel")
        if ms_level
          scan_count_tmp[ms_level.to_i] += 1
        else
          break
        end
      end
      scan_count = []
      scan_count_tmp.each do |cnt|
        if cnt != 0
          scan_count.push cnt
        else
          break
        end
      end
      hash[:scan_count] = scan_count
    ensure
      # Close the handle even if a lookup above raised.
      @fh.close
    end
    hash
  end

  # returns [start_mz, end_mz] of the first full scan (ms_level == 1)
  # returns nil if the file contains no scan with msLevel 1
  def start_and_end_mz(mzxml_file)
    @fh = File.open(mzxml_file)
    @line = ""
    begin
      loop do
        ms_level = _el("msLevel")
        # No further msLevel attribute before EOF: there is no MS1 scan.
        return nil if ms_level.nil?
        break if ms_level.to_i == 1
        # _el would match the current line again, so step past it before
        # looking for the next msLevel.
        @line = @fh.eof? ? "" : @fh.readline
      end
      start_mz = _el("startMz").to_f
      end_mz = _el("endMz").to_f
      [start_mz, end_mz]
    ensure
      @fh.close
    end
  end

  # Starting with the current @line, reads lines from @fh until one contains
  # name="value"; returns value (String), or nil if EOF is reached first.
  # @line is left on the matching line, so later lookups see it too.
  def _el(name)
    # [^"]* stops at the closing quote, so other attributes on the same line
    # are not swallowed into the value.
    re = /#{name}="([^"]*)"/
    while @line !~ re && !@fh.eof?
      @line = @fh.readline
    end
    match = re.match(@line)
    match ? match[1] : nil
  end

end

# Builds scans_by_num, an array of Spec::Scan objects indexed by scan number,
# from <scan> and <precursorMz> elements.
# Inherits from $XMLParserClass (like the other handler classes in this file)
# so that the file still loads when 'xmlparser' is not installed.
# NOTE(review): prec_mz is assigned (not appended) in #character, so it holds
# only the most recent piece of text the parser delivers -- confirm that
# <precursorMz> text always arrives in one call.
class Spec::MzXML::XMLParser::TimeMzIntenIndexer < $XMLParserClass
  @@scan_num = nil
  @@get_data = false

  attr_accessor :scans_by_num

  def initialize
    @current_scan = nil
    @scans_by_num = []
  end

  def startElement(name,attrs)
    if name == "scan"
      num = attrs["num"].to_i
      @current_scan = Spec::Scan.new(num, attrs["msLevel"].to_i, attrs["retentionTime"].gsub(/^PT/,'').gsub(/S$/,'').to_f)
      scans_by_num[num] = @current_scan
    elsif name == "precursorMz"
      @current_scan.prec_inten = attrs["precursorIntensity"].to_f
      @@get_data = true
    end
  end

  def endElement(name)
    if name == "precursorMz"
      @@get_data = false
    end
  end

  def character(data)
    if @@get_data
      @current_scan.prec_mz = data
    end
  end
end