require 'xml_style_parser'
require 'ms/spectrum'
require 'ms/scan'
class MS::Parser::MzXML::Hpricot
include XMLStyleParser
include MS::Parser::MzXML
@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
# Sets up the parser state.
#
# parse_type:: stored in @method (defaults to :msrun)
# version::    mzXML version string stored in @version; create_scan compares
#              it (as a string) against '3.0'
def initialize(parse_type=:msrun, version='1.0')
  @method, @version = parse_type, version
end
# Builds an MS::Scan from the attributes of a scan node.
#
# node:: object answering #[] with string attribute names
#
# Positions filled in the returned scan:
#   [0] 'num' as Integer
#   [1] 'msLevel' as Integer
#   [2] 'retentionTime' with its first two characters and last character
#       removed, as Float (presumably an xs:duration such as "PT1.5S" --
#       confirm against the mzXML schema)
#   [3], [4] 'startMz' / 'endMz' as Float, only when 'startMz' is present
def new_scan_from_hash(node)
  scan = MS::Scan.new # array-like class (original note: created with 9 positions)
  scan[0] = node['num'].to_i
  scan[1] = node['msLevel'].to_i
  retention = node['retentionTime']
  scan[2] = retention[2...-1].to_f
  start_mz = node['startMz']
  if start_mz
    scan[3] = start_mz.to_f
    scan[4] = node['endMz'].to_f
  end
  scan
end
# takes a scan node and creates a scan object
# the parent scan is the one directly above it in mslevel
# if the precursorMz node carries a precursorScanNum, that scan is looked up in scans_by_num
# scan_n::       scan node; must answer #each_child and attribute lookup via #[]
# scans_by_num:: collection indexed by scan number, used to resolve the
#                'precursorScanNum' attribute of precursorMz children
# get_spectra::  when false, 'peaks' children are skipped and scan[6] is left unset
#
# Returns the scan built by new_scan_from_hash with scan[5] set to the list
# of precursors and, if requested, scan[6] set to the decoded spectrum.
# Calls abort (SystemExit) unless @version sorts before '3.0' as a string.
def create_scan(scan_n, scans_by_num, get_spectra=true)
  # note that mzXML version 3.0 *can* have more than one peak...
  # I'm not sure how to deal with that since I have one spectrum/scan
  abort 'not supporting version 3.0 just yet' unless @version < '3.0'

  scan = new_scan_from_hash(scan_n)
  precursors = []
  scan_n.each_child do |child|
    tag = child.name
    if 'precursorMz' == tag
      precursor = MS::Precursor.new
      precursor[1] = child['precursorIntensity'].to_f
      precursor[0] = child.content.to_f
      parent_num = child['precursorScanNum']
      # link to the already-parsed parent scan when its number is given
      precursor[2] = scans_by_num[parent_num.to_i] if parent_num
      precursors << precursor
    elsif 'peaks' == tag && get_spectra
      scan[6] = MS::Spectrum.from_base64_peaks(child.content, child['precision'].to_i)
    end
  end
  scan[5] = precursors
  scan
end
# returns an array of msrun objects
# Not implemented in this parser: every call raises NotImplementedError,
# whatever +file+ is.
def msruns(file)
  raise(NotImplementedError)
end
# returns a string with double </scan> tags collapsed into a single one and
# missing </scan> tags added in after </peaks>
# we do this in windows style since these are generated off a windows
# machine only
# file:: path of the mzXML file to read
#
# Returns the file contents as a String in which
# * two closing </scan> tags separated only by whitespace are collapsed into
#   one, and
# * a closing </scan> (with \r\n line ends) is inserted between a </peaks>
#   and a directly following <scan.
#
# NOTE(review): the tag text inside the replacement strings and the second
# pattern had been stripped from this file (everything between '<' and '>'
# was lost, leaving the method unparseable). It is restored here from the
# surviving fragments and the surrounding comments -- confirm against the
# upstream source.
def fix_bad_scan_tags(file)
  # File.read rather than IO.read so a path starting with '|' is never
  # treated as a command to run
  text = File.read(file)
  single_closed = text.gsub(/<\/scan>\s+<\/scan>/m, '</scan>')
  single_closed.gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
end
# fills an msrun object from an mzXML file
# opts:
# :msrun => MSRun # use this object instead of creating one
# :spectra => *true|false # if false don't get spectra
#
# file:: path of the mzXML file; it is opened and handed to ::Hpricot.XML
# opts:: option hash (never modified here); :spectra defaults to true and is
#        passed to create_scan as get_spectra
#
# Sets scan_count, start_time, end_time and scans on the msrun object
# (opts[:msrun], or a new MS::MSRun when that is absent).
#
# Returns the array of scans (the value of the final assignment).
# NOTE(review): when no :msrun is passed the freshly created MS::MSRun is not
# reachable by the caller -- consider returning the msrun object instead;
# left unchanged here to keep the return value compatible.
def msrun(file, opts={})
  # read the default without writing it back into the caller's hash
  get_spectra = opts.key?(:spectra) ? opts[:spectra] : true
  msrun_obj = opts[:msrun] || MS::MSRun.new

  # (the version 2.0 tag repair in fix_bad_scan_tags may not be necessary
  # with hpricot, so the file is parsed as is)
  doc = File.open(file) {|fh| ::Hpricot.XML(fh) }
  msrun_n = doc.at('msRun')

  ## HEADER
  scan_count = msrun_n['scanCount'].to_i
  msrun_obj.scan_count = scan_count

  ## SPECTRUM
  # index 0 is unused so a scan can be stored at its own scan number
  scans_by_num = Array.new(scan_count + 1)
  # grown as scans are found, so a scanCount larger than the number of scan
  # elements does not leave nil entries at the end
  scans = []
  msrun_n.each_child do |scan_n|
    # skip anything that is not a scan element; the respond_to? check is a
    # precaution in case some child nodes (e.g. text) do not expose #name
    next unless scan_n.respond_to?(:name) && scan_n.name == 'scan'
    scan = create_scan(scan_n, scans_by_num, get_spectra)
    scans << scan
    scans_by_num[scan[0]] = scan
  end

  ## update the scan's parents
  MS::MSRun.add_parent_scan(scans)

  # note that startTime and endTime are optional AND in >2.2 are dateTime
  # instead of duration types!, so we will just use scan times...
  # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
  # export. They give the start and end time in seconds, but they are
  # really minutes. All the more reason to use the first and last scans!
  unless scans.empty?
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time
  end
  msrun_obj.scans = scans
end
end
=begin
## THIS IS THE SAX PARSER VERSION. IT NEEDS A BIT OF BRUSH UP AND IT WOULD
## WORK. I THINK THE default guy is probably faster
def msrun(file, msrun_obj)
# Figure out where the first scan is at in the file:
pos_after_first_scan = nil
File.open(file) do |fh|
fh.each do |line|
if line =~ /