# Copyright 2014, 2015 Ryan Moore
# Contact: moorer@udel.edu
#
# This file is part of parse_fasta.
#
# parse_fasta is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# parse_fasta is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
require 'zlib'
# Provides simple interface for parsing fasta format files. Gzipped
# files are no problem.
class FastaFile < File
  # Use it like IO::open
  #
  # Before handing off to File.open, the file is opened once (as gzip
  # if possible, otherwise as plain text) to check that its first
  # character is '>'. That probe handle is always closed again, even
  # when the check fails.
  #
  # @param fname [String] the name of the file to open
  #
  # @return [FastaFile] a FastaFile
  #
  # @raise [ParseFasta::DataFormatError] if the file is empty or its
  #   first character is not '>'
  def self.open(fname, *args)
    begin
      handle = Zlib::GzipReader.open(fname)
    rescue Zlib::GzipFile::Error
      # Not gzip data, so read it as a regular file instead.
      handle = File.open(fname)
    end

    begin
      # getc returns nil at EOF, so an empty file fails the format
      # check rather than raising an unrelated StopIteration.
      raise ParseFasta::DataFormatError unless handle.getc == '>'
    ensure
      handle.close
    end

    # Bare super forwards fname, args and any block to File.open.
    super
  end

  # Returns the records in the fasta file as a hash map with the
  # headers as keys and the Sequences as values.
  #
  # @example Read a fastA into a hash table.
  #   seqs = FastaFile.open('reads.fa').to_hash
  #
  # @return [Hash] A hash with headers as keys, sequences as the
  #   values (Sequence objects). If a header occurs more than once,
  #   the last record with that header wins.
  #
  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
  def to_hash
    records = {}
    each_record { |header, sequence| records[header] = sequence }
    records
  end

  # Analogous to IO#each_line, #each_record is used to go through a
  # fasta file record by record. It will accept gzipped files as well.
  #
  # @param separate_lines [Object] If truthy, separate lines of record
  #   into an array of Sequences, but if falsy, yield a Sequence
  #   object for the sequence instead.
  #
  # @example Parsing a fasta file (default behavior, gzip files are fine)
  #   FastaFile.open('reads.fna.gz').each_record do |header, sequence|
  #     puts [header, sequence.gc].join("\t")
  #   end
  #
  # @example Parsing a fasta file (with truthy value param)
  #   FastaFile.open('reads.fna').each_record(1) do |header, sequence|
  #     # header => 'sequence_1'
  #     # sequence => ['AACTG', 'AGTCGT', ... ]
  #   end
  #
  # @yield The header and sequence for each record in the fasta
  #   file to the block
  #
  # @yieldparam header [String] The header of the fasta record without
  #   the leading '>' and without surrounding whitespace
  #
  # @yieldparam sequence [Sequence, Array] The sequence of the
  #   fasta record. If `separate_lines` is falsy (the default
  #   behavior), will be Sequence, but if truthy will be
  #   Array.
  #
  # @return [Zlib::GzipReader, FastaFile, Enumerator] the (already
  #   closed) gzip reader for gzipped input, self for plain input, or
  #   an Enumerator when no block is given
  #
  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
  def each_record(separate_lines = nil)
    return enum_for(:each_record, separate_lines) unless block_given?

    f = record_source
    begin
      if separate_lines
        # Reading with "\n>" as the separator hands over one whole
        # record per chunk; the helper splits it into header + lines.
        f.each("\n>") do |chunk|
          header, sequences = parse_line_separately(chunk)
          yield(header.strip, sequences)
        end
      else
        each_raw_record(f) do |header, sequence|
          yield(header, Sequence.new(sequence))
        end
      end
    ensure
      # Only the reader opened here is closed; self is left to the caller.
      f.close if f.instance_of?(Zlib::GzipReader)
    end
    f
  end

  # Fast version of #each_record
  #
  # Yields the sequence as a String, not Sequence. No separate lines
  # option.
  #
  # @note If the fastA file has spaces in the sequence, they will be
  #   retained. If this is a problem, use #each_record instead.
  #
  # @yield The header and sequence for each record in the fasta
  #   file to the block
  #
  # @yieldparam header [String] The header of the fasta record without
  #   the leading '>' and without surrounding whitespace
  #
  # @yieldparam sequence [String] The sequence of the fasta record
  #
  # @return [Zlib::GzipReader, FastaFile, Enumerator] the (already
  #   closed) gzip reader for gzipped input, self for plain input, or
  #   an Enumerator when no block is given
  #
  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
  def each_record_fast
    return enum_for(:each_record_fast) unless block_given?

    f = record_source
    begin
      each_raw_record(f) { |header, sequence| yield(header, sequence) }
    ensure
      # Only the reader opened here is closed; self is left to the caller.
      f.close if f.instance_of?(Zlib::GzipReader)
    end
    f
  end

  private

  # Picks the IO to read records from: a fresh Zlib::GzipReader on
  # this file when that can be opened, otherwise self.
  def record_source
    Zlib::GzipReader.open(self)
  rescue Zlib::GzipFile::Error
    self
  end

  # Reads io line by line and yields each record as
  # (stripped header, sequence String with its lines concatenated).
  #
  # One final record is always yielded after the last line, so input
  # without any header line still yields once with an empty header.
  #
  # @raise [ParseFasta::SequenceFormatError] if a sequence line
  #   contains '>'
  def each_raw_record(io)
    header = ""
    sequence = ""

    io.each_line do |line|
      line.chomp!

      if line.start_with?(">")
        # A header line closes the record in progress. While the
        # current header is still empty nothing is yielded and the
        # sequence collected so far is carried over.
        unless header.empty?
          yield(header.strip, sequence)
          sequence = ""
        end
        header = line[1..-1]
      else
        # Check the incoming line itself so that a '>' on the last
        # line of a record is caught too.
        raise ParseFasta::SequenceFormatError if line.include?(">")
        sequence << line
      end
    end

    yield(header.strip, sequence)
  end

  # Splits a record chunk into [header, sequence] with newlines and
  # any '>' at the start or end of a line removed.
  def parse_line(line)
    line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
  end

  # Splits a record chunk into [header, sequences], where sequences is
  # an Array holding one Sequence per non-empty sequence line (empty
  # when the chunk has no sequence part).
  def parse_line_separately(line)
    header, sequence =
      line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }

    sequences =
      if sequence.nil?
        []
      else
        sequence.split("\n")
                .reject { |s| s.empty? }
                .map { |s| Sequence.new(s) }
      end

    [header, sequences]
  end
end