lib/ms/parser/mzxml/dom.rb in mspire-0.3.9 vs lib/ms/parser/mzxml/dom.rb in mspire-0.4.2
- old
+ new
@@ -1,107 +1,132 @@
require 'xml_style_parser'
require 'ms/spectrum'
require 'ms/scan'
+require 'ms/parser/mzxml'
+require 'tempfile'
class MS::Parser::MzXML::DOM
include XMLStyleParser
include MS::Parser::MzXML
- #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)
+ NetworkOrder = true
+ #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
+
def initialize(parse_type=:msrun, version='1.0')
@method = parse_type
@version = version
end
def new_scan_from_hash(node)
scan = MS::Scan.new # array class creates one with 9 positions
scan[0] = node['num'].to_i
scan[1] = node['msLevel'].to_i
- scan[2] = node['retentionTime'][2...-1].to_f
+ if x = node['retentionTime']
+ scan[2] = x[2...-1].to_f
+ end
if x = node['startMz']
scan[3] = x.to_f
scan[4] = node['endMz'].to_f
end
scan
end
+ # assumes that node contains scans and checks any scan nodes for children
+ def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
+ nodes.each do |scan_n|
+ scan = create_scan(scan_n, scans_by_num, lazy, io)
+ scans[scn_index] = scan
+ scans_by_num[scan[0]] = scan
+ scn_index += 1
+ if @version > '1.0'
+ new_nodes = scan_n.find('child::scan')
+ if new_nodes.size > 0
+ scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, lazy, io)
+ end
+ end
+ end
+ scn_index
+ end
+
# takes a scan node and creates a scan object
# the parent scan is the one directly above it in mslevel
- # if the
- def create_scan(scan_n, scans_by_num, get_spectra=true)
- if @version < '3.0'
- scan = new_scan_from_hash(scan_n)
- precs = []
- scan_n.each do |node|
- case node.name
- when 'precursorMz'
- # should be able to do this!!!
- #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
- prec = MS::Precursor.new
- prec[1] = node['precursorIntensity'].to_f
- prec[0] = node.content.to_f
- if x = node['precursorScanNum']
- prec[2] = scans_by_num[x.to_i]
- end
- precs << prec
- when 'peaks'
- next unless get_spectra
+ # lazy must be a symbol from MS::MSRun.new
+ def create_scan(scan_n, scans_by_num, lazy, io=nil)
+ scan = new_scan_from_hash(scan_n)
+ prec = nil
+ scan_n.each do |node|
+ case node.name
+ when 'precursorMz'
+ # should be able to do this!!!
+ #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
+ raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
+ prec = MS::Precursor.new
+ prec[1] = node['precursorIntensity'].to_f
+ prec[0] = node.content.to_f
+ if x = node['precursorScanNum']
+ prec[2] = scans_by_num[x.to_i]
+ end
+ when 'peaks'
+ case lazy
+ when :no_spectra
+ next
+ when :string
+ scan[6] = MS::Spectrum::LazyString.from_base64_peaks(node.content, node['precision'].to_i)
+ when :io
+ # assumes that parsing was done with a LazyPeaks parser!
+ nc = node.content
+ scan[6] = MS::Spectrum::LazyIO.new(io, nc.first, nc.last, node['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
+ when :not
# SHOULD be able to do this!!
#peaks_n = scan_n.find_first('child::peaks')
scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
end
end
- scan[5] = precs
- scan
- else # for version > 3.0
- abort 'not supporting version 3.0 just yet'
- # note that mzXML version 3.0 *can* have more than one peak...
- # I'm not sure how to deal with that since I have one spectrum/scan
end
+ scan[5] = prec
+ scan
end
# returns an array of msrun objects
def msruns(file)
raise NotImplementedError
end
- # returns a string with double </scan></scan> tags into single and missing
- # </scan> tags after peaks added in
- # we do this in windows style since these are generated off a windows
- # machine only
- def fix_bad_scan_tags(file)
- IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
- end
-
- # right now cannot parse multiple runs out of an mzXML version 2 file since
+ # right now cannot parse multiple runs out of an mzXML version 2 file since
# this is built around a single run per file
# OPTIONS:
- # :msrun => MSRun # use this object instead of creating one
- # :spectra => *true|false # if false don't get spectra
+ # :msrun => (an MSRun object) # use this object instead of creating one
+ # :lazy => [See MS::MSRun for documentation]
def msrun(file, opts={})
- unless opts.key?(:spectra)
- opts[:spectra] = true
- end
+ #unless opts.key?(:spectra)
+ # opts[:spectra] = true
+ #end
msrun_obj =
if x = opts[:msrun]
msrun_obj = x
else
MS::MSRun.new
end
- root =
- if @version == '2.0'
- string = fix_bad_scan_tags(file)
- get_root_node_from_string(string)
+ io =
+ if file.is_a? String # a filename
+ filename = file
+ File.open(file)
else
- get_root_node_from_file(file)
+ file
end
+ root = get_root_node_from_io(io)
+
+ if filename
+ io.close # can close now
+ end
+
# right now we are only finding the first msRun (probably a rare case of
# multiple runs in an mzXML file...)
msrun_n =
if @version >= '2.0'
kids = root.children.select {|v| v.name == 'msRun' }
@@ -116,30 +141,29 @@
## HEADER
scan_count = msrun_n['scanCount'].to_i
msrun_obj.scan_count = scan_count
scans_by_num = Array.new(scan_count + 1)
-
+
## SPECTRUM
parent = nil
scans = Array.new( scan_count )
scn_index = 0
# we should be able to do this, but it's not working!!!
#scan_n = msrun_n.find_first('scan')
#while (scn_index < scan_count)
- get_spectra = opts[:spectra]
+ lazy = opts[:lazy]
- msrun_n.each do |scan_n|
- next unless scan_n.name == 'scan'
- scan = create_scan(scan_n, scans_by_num, get_spectra)
- scans[scn_index] = scan
- #sc = scan_n.next
- scans_by_num[scan[0]] = scan
- scn_index += 1
+ if @version >= '3.0'
+ warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
+ # note that mzXML version 3.0 *can* have more than one peak...
+ # I'm not sure how to deal with that since I have one spectrum/scan
end
+ scan_nodes = msrun_n.find('child::scan')
+ add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
## update the scan's parents
MS::MSRun.add_parent_scan(scans)
# note that startTime and endTime are optional AND in >2.2 are dateTime
@@ -149,11 +173,10 @@
# really minutes. All the more reason to use the first and last scans!
msrun_obj.start_time = scans.first.time
msrun_obj.end_time = scans.last.time
msrun_obj.scans = scans
- end
+ end
end
-