require 'xml_style_parser'
require 'ms/spectrum'
require 'ms/scan'

# DOM-style reader that turns the <scan> nodes of an mzXML document into
# MS::Scan objects and attaches them to an MS::MSRun.
class MS::Parser::MzXML::DOM
  include XMLStyleParser
  include MS::Parser::MzXML

  #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type:: stored in @method (not read anywhere else in this class)
  # version::    mzXML version as a String; note that it is compared
  #              lexically ('2.0' < '3.0'), not numerically
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Builds an MS::Scan from the attributes of a scan node.
  #
  # Positions filled in here:
  #   0 => num (Integer)
  #   1 => msLevel (Integer)
  #   2 => retentionTime (Float)
  #   3 => startMz (Float, only if the attribute is present)
  #   4 => endMz   (Float, only if startMz is present)
  def new_scan_from_hash(node)
    scan = MS::Scan.new # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    # The first two characters and the last one are dropped before the
    # conversion (presumably the "PT" prefix and "S" suffix of an
    # xs:duration such as "PT1.5S" -- confirm against the schema).
    # The attribute is skipped when absent instead of calling [] on nil.
    if (rt = node['retentionTime'])
      scan[2] = rt[2...-1].to_f
    end
    if (start_mz = node['startMz'])
      scan[3] = start_mz.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Takes a scan node and creates a scan object.
  #
  # scan_n::       the scan node; its children are walked with #each
  # scans_by_num:: Array indexed by scan number, used to resolve a
  #                precursor's 'precursorScanNum' to an already built scan
  # get_spectra::  if false the 'peaks' child is not decoded
  #
  # Positions filled in on top of new_scan_from_hash:
  #   5 => Array of MS::Precursor (possibly empty)
  #   6 => MS::Spectrum (only when get_spectra is true and peaks exist)
  #
  # Versions >= '3.0' are not supported: the process is aborted.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    unless @version < '3.0'
      # note that mzXML version 3.0 *can* have more than one peak...
      # I'm not sure how to deal with that since I have one spectrum/scan
      abort 'not supporting version 3.0 just yet'
    end

    scan = new_scan_from_hash(scan_n)
    precs = []
    scan_n.each do |node|
      case node.name
      when 'precursorMz'
        # should be able to do this!!!
        #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
        prec = MS::Precursor.new
        prec[1] = node['precursorIntensity'].to_f
        prec[0] = node.content.to_f
        if (parent_num = node['precursorScanNum'])
          prec[2] = scans_by_num[parent_num.to_i]
        end
        precs << prec
      when 'peaks'
        next unless get_spectra
        # SHOULD be able to do this!!
        #peaks_n = scan_n.find_first('child::peaks')
        scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
      end
    end
    scan[5] = precs
    scan
  end

  # returns an array of msrun objects
  # (not implemented: always raises NotImplementedError)
  def msruns(file)
    raise NotImplementedError
  end

  # Reads +file+ and returns its contents as a string in which
  #   * a doubled closing scan tag is collapsed into a single one, and
  #   * a closing scan tag is inserted where a new scan opens directly
  #     after a peaks element.
  # we do this in windows style (\r\n) since these are generated off a
  # windows machine only
  def fix_bad_scan_tags(file)
    IO.read(file)
      .gsub(/<\/scan>\s+<\/scan>/m, '</scan>')
      .gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # Parses +file+ and fills in an MSRun object with its scans.
  #
  # options:
  #   :msrun   => MSRun        # use this object instead of creating one
  #   :spectra => *true|false  # if false don't get spectra
  #
  # The +opts+ hash passed in is not modified.
  #
  # Sets scan_count, scans and (when at least one scan was read)
  # start_time / end_time on the MSRun object.  The value returned is that
  # of the final assignment, i.e. the array of scans.
  #
  # Raises NotImplementedError if the document holds more than one msRun
  # node and RuntimeError if no msRun node can be located.
  def msrun(file, opts={})
    get_spectra = opts.fetch(:spectra, true)
    msrun_obj = opts[:msrun] || MS::MSRun.new

    root =
      if @version == '2.0'
        # only for this version are the scan tags repaired before parsing
        get_root_node_from_string(fix_bad_scan_tags(file))
      else
        get_root_node_from_file(file)
      end

    # right now we are only finding the first msRun (probably a rare case of
    # multiple runs in an mzXML file...)
    msrun_n =
      if @version >= '2.0'
        kids = root.children.select { |v| v.name == 'msRun' }
        raise(NotImplementedError, "one msrun per doc right now") if kids.size > 1
        kids.first
      else
        root
      end
    raise RuntimeError, "no msRun node found" if msrun_n.nil?
    raise RuntimeError, "extra node slipped in somehow" if msrun_n.name != 'msRun'

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    # Scans are appended rather than written into a preallocated array so
    # that a scanCount attribute larger than the number of scan nodes does
    # not leave trailing nils behind.
    scans = []
    # we should be able to do this, but it's not working!!!
    #scan_n = msrun_n.find_first('scan')
    msrun_n.each do |scan_n|
      next unless scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans << scan
      scans_by_num[scan[0]] = scan
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    unless scans.empty?
      msrun_obj.start_time = scans.first.time
      msrun_obj.end_time = scans.last.time
    end
    msrun_obj.scans = scans
  end
end