require 'xml_style_parser' require 'ms/spectrum' require 'ms/scan' class MS::Parser::MzXML::Hpricot include XMLStyleParser include MS::Parser::MzXML @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum) def initialize(parse_type=:msrun, version='1.0') @method = parse_type @version = version end def new_scan_from_hash(node) scan = MS::Scan.new # array class creates one with 9 positions scan[0] = node['num'].to_i scan[1] = node['msLevel'].to_i scan[2] = node['retentionTime'][2...-1].to_f if x = node['startMz'] scan[3] = x.to_f scan[4] = node['endMz'].to_f end scan end # takes a scan node and creates a scan object # the parent scan is the one directly above it in mslevel # if the def create_scan(scan_n, scans_by_num, get_spectra=true) if @version < '3.0' scan = new_scan_from_hash(scan_n) precs = [] scan_n.each_child do |node| case node.name when 'precursorMz' # should be able to do this!!! #scan[5] = scan_n.find('child::precursorMz').map do |prec_n| prec = MS::Precursor.new prec[1] = node['precursorIntensity'].to_f prec[0] = node.content.to_f if x = node['precursorScanNum'] prec[2] = scans_by_num[x.to_i] end precs << prec when 'peaks' next unless get_spectra # SHOULD be able to do this!! #peaks_n = scan_n.find_first('child::peaks') scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i) end end scan[5] = precs scan else # for version > 3.0 abort 'not supporting version 3.0 just yet' # note that mzXML version 3.0 *can* have more than one peak... # I'm not sure how to deal with that since I have one spectrum/scan end end # returns an array of msrun objects def msruns(file) raise NotImplementedError end # returns a string with double tags into single and missing # tags after peaks added in # we do this in windows style since these are generated off a windows # machine only def fix_bad_scan_tags(file) IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '').gsub(/<\/peaks>\s+\r\n \r\n MSRun # use this object instead of creating one # :spectra => *true|false # if false don't get spectra def msrun(file, opts={}) unless opts.key?(:spectra) opts[:spectra] = true end msrun_obj = if x = opts[:msrun] msrun_obj = x else MS::MSRun.new end doc = File.open(file) {|fh| ::Hpricot.XML(fh) } #if @version == '2.0' # # may not be necessary in hpricot! # #string = fix_bad_scan_tags(file) # #XML::Parser.string(string).parse #else # XML::Document.file(file) #end msrun_n = doc.at('msRun') ## HEADER scan_count = msrun_n['scanCount'].to_i msrun_obj.scan_count = scan_count scans_by_num = Array.new(scan_count + 1) ## SPECTRUM parent = nil scans = Array.new( scan_count ) scn_index = 0 # we should be able to do this, but it's not working!!! #scan_n = msrun_n.find_first('scan') #while (scn_index < scan_count) get_spectra = opts[:spectra] msrun_n.each_child do |scan_n| p scan_n abort next unless scan_n.name == 'scan' scan = create_scan(scan_n, scans_by_num, get_spectra) scans[scn_index] = scan sc = scan_n.next scans_by_num[scan[0]] = scan scn_index += 1 end ## update the scan's parents MS::MSRun.add_parent_scan(scans) # note that startTime and endTime are optional AND in >2.2 are dateTime # instead of duration types!, so we will just use scan times... # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0 # export. They give the start and end time in seconds, but they are # really minutes. All the more reason to use the first and last scans! msrun_obj.start_time = scans.first.time msrun_obj.end_time = scans.last.time msrun_obj.scans = scans end end =begin ## THIS IS THE SAX PARSER VERSION. IT NEEDS A BIT OF BRUSH UP AND IT WOULD ## WORK. I THINK THE default guy is probably faster def msrun(file, msrun_obj) # Figure out where the first scan is at in the file: pos_after_first_scan = nil File.open(file) do |fh| fh.each do |line| if line =~ /