require 'ostruct'

module Lederhosen
  # Represents a single usearch result record.
  #
  # Wraps the parsed fields in an OpenStruct and forwards every unknown
  # message to it, so each field is readable as a method
  # (e.g. +result.query+, +result.identity+).
  class UResult
    # @param hash [Hash] field name => value pairs for one record
    def initialize(hash)
      @source = OpenStruct.new(hash)
    end

    # Forward unknown messages to the wrapped OpenStruct.
    def method_missing(method, *args, &block)
      @source.send(method, *args, &block)
    end

    # Keep respond_to? consistent with method_missing: report the fields the
    # wrapped OpenStruct knows about.
    def respond_to_missing?(method, include_private = false)
      @source.respond_to?(method, include_private) || super
    end

    # @return [Boolean] true when the record type is 'H'
    def hit?
      @source.type == 'H'
    end

    # @return [Boolean] true when the record type is 'N'
    def miss?
      @source.type == 'N'
    end
  end

  # Parses UC files and generates UResult objects.
  class UCParser
    include Enumerable

    # @param handle [#each] any object yielding one line of UC output per
    #   iteration of +each+
    def initialize(handle)
      @handle = handle
    end

    # Yield a UResult for every record line of the handle.
    #
    # Lines starting with '#' (comments) or 'C' (cluster summaries) are
    # skipped, as are blank lines. Without a block an Enumerator is returned.
    #
    # @yieldparam result [UResult]
    # @raise [ArgumentError] when a line has an unknown record type
    def each
      return enum_for(:each) unless block_given?

      @handle.each do |line|
        next if line =~ /^[#C]/ # skip comments and cluster summaries

        record = line.strip
        next if record.empty? # a blank line carries no record

        yield UResult.new(parse_usearch_line(record))
      end
    end

    private

    # Parse one tab-separated line of usearch UC output.
    #
    # Returns a hash in the form:
    #   { :type => 'H', :cluster_no => '0', :identity => '98.5', ... }
    # S, N and H records carry :length, :identity and :strand;
    # C records carry :cluster_size instead.
    #
    # @param str [String] one line of UC output, already stripped
    # @return [Hash] parsed fields keyed by symbol
    # @raise [ArgumentError] when the record type is not S, N, H or C
    def parse_usearch_line(str)
      # from http://drive5.com/usearch/manual/ucout.html
      #  1  Record type S, H, C or N.
      #  2  Cluster number (0-based).
      #  3  Sequence length (S, N and H) or cluster size (C).
      #  4  For H records, percent identity with target.
      #  5  For H records, the strand: + or - for nucleotides, . for proteins.
      #  6  Not used, parsers should ignore this field.
      #  7  Not used, parsers should ignore this field.
      #  8  Compressed alignment or the symbol '=' (equals sign). The =
      #     indicates that the query is 100% identical to the target
      #     sequence (field 10).
      #  9  Label of query sequence (always present).
      # 10  Label of target sequence (H records only).
      fields = str.split("\t")
      type = fields[0]

      dat = {
        :type       => type,
        :cluster_no => fields[1],
        :alignment  => fields[7],
        :query      => fields[8],
        :target     => fields[9],
      }

      # Match the record type exactly; a substring match would accept any
      # type that merely contains S, N or H.
      extra =
        case type
        when 'S', 'N', 'H' # sequence records (seeds, misses, hits)
          {
            :length   => fields[2].to_i,
            :identity => fields[3],
            :strand   => fields[4],
          }
        when 'C' # clusters
          { :cluster_size => fields[2].to_i }
        else
          raise ArgumentError, "Do not understand record type #{type}!"
        end

      dat.merge(extra)
    end
  end
end