#-- # Copyright (C) 2002, 2003, 2004 Matt Armstrong. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN # NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # #++ # Implements the RMail::Parser, RMail::StreamParser and # RMail::StreamHandler classes. require 'rmail/message' require 'rmail/parser/multipart' module RMail # = Overview # # An RMail::StreamHandler documents the set of methods a # RMail::StreamParser handler must implement. See # RMail::StreamParser.parse. This is a low level interface to the # RMail message parser. # # = Order of Method Calls (Grammar) # # Calls to the methods of this class follow a specific grammar, # described informally below. The words in all caps are productions # in the grammar, while the lower case words are method calls to # this object. # # MESSAGE:: [ #mbox_from ] *( #header_field ) # ( BODY / MULTIPART_BODY ) # # BODY:: *body_begin *( #body_chunk ) #body_end # # MULTIPART_BODY:: #multipart_body_begin # *( #preamble_chunk ) # *( #part_begin MESSAGE #part_end) # *( #epilogue_chunk ) # #multipart_body_end # # = Order of Method Calls (English) # # If the grammar above is not clear, here is a description in English. # # The parser begins calling #header_field, possibly calling # #mbox_from for the first line. Then it determines if the message # was a MIME multipart message. # # If the message is a not a MIME multipart, the parser calls # #body_begin once, then #body_chunk any number of times, then # #body_end. # # If the message header is a MIME multipart message, then # #multipart_body_begin is called, followed by any number of calls # to #preamble_chunk. Then for each part parsed, #part_begin is # called, followed by a recursive set of calls described by the # "MESSAGE" production above, and then #part_end. After all parts # are parsed, any number of calls to #epilogue_chunk are followed by # a single call to #multipart_body_end. # # The recursive nature of MIME multipart messages is represented by # the recursive invocation of the "MESSAGE" production in the # grammar above. class StreamHandler # This method is called for Unix MBOX "From " lines in the message # header, it calls this method with the text. def mbox_from(line) end # This method is called when a header field is parsed. The # +field+ is the full text of the field, the +name+ is the name of # the field and the +value+ is the field's value with leading and # trailing whitespace removed. Note that both +field+ and +value+ # may be multi-line strings. def header_field(field, name, value) end # This method is called before a non-multipart message body is # about to be parsed. def body_begin end # This method is called with a string chunk of data from a # non-multipart message body. The string does not necessarily # begin or end on any particular boundary. def body_chunk(chunk) end # This method is called after all of the non-multipart message # body has been parsed. def body_end end # This method is called before a multipart message body is about # to be parsed. def multipart_body_begin end # This method is called with a chunk of data from a multipart # message body's preamble. The preamble is any text that appears # before the first part of the multipart message body. def preamble_chunk(chunk) end # This method is called when a part of a multipart body begins. def part_begin end # This method is called when a part of a multipart body ends. def part_end end # This method is called with a chunk of data from a multipart # message body's epilogue. The epilogue is any text that appears # after the last part of the multipart message body. def epilogue_chunk(chunk) end # This method is called after a multipart message body has been # completely parsed. # # The +delimiters+ is an Array of strings, one for each boundary # string found in the multipart body. The +boundary+ is the # boundary string used to delimit each part in the multipart body. # You can normally ignore both +delimiters+ and +boundary+ if you # are concerned only about message content. def multipart_body_end(delimiters, boundary) end end # The RMail::StreamParser is a low level message parsing API. It is # useful when you are interested in serially examining all message # content but are not interested in a full object representation of # the object. See StreamParser.parse. class StreamParser class << self # Parse a message from an input source. This method returns # nothing. Instead, the supplied +handler+ is expected to # implement the same methods as RMail::StreamHandler. The # message structure can be inferred from the methods called on # the +handler+. The +input+ can be any Ruby IO source or a # String. # # This is a low level parsing API. For a message parser that # returns an RMail::Message object, see the RMail::Parser class. # RMail::Parser is implemented using RMail::StreamParser. def parse(input, handler) RMail::StreamParser.new(input, handler).parse end end def initialize(input, handler) # :nodoc: @input = input @handler = handler @chunk_size = nil end def parse # :nodoc: input = RMail::Parser::PushbackReader.new(@input) input.chunk_size = @chunk_size if @chunk_size parse_low(input, 0) return nil end # Change the chunk size used to read the message. This is useful # mostly for testing, so we don't document it. attr_accessor :chunk_size # :nodoc: private def parse_low(input, depth) multipart_boundary = parse_header(input, depth) if multipart_boundary parse_multipart_body(input, depth, multipart_boundary) else parse_singlepart_body(input, depth) end end def parse_header(input, depth) data = nil header = nil pushback = nil boundary = nil while chunk = input.read data ||= '' data << chunk if data[0] == ?\n # A leading newline in the message is seen when parsing the # parts of a multipart message. It means there are no # headers. The body part starts directly after this # newline. rest = data[1..-1] else header, rest = data.split(/\n\n/, 2) end break if rest end input.pushback(rest) if header mime = false fields = header.split(/\n(?!\s)/) if fields.first =~ /^From / @handler.mbox_from(fields.first) fields.shift end fields.each { |field| if field =~ /^From / @handler.mbox_from(field) else name, value = RMail::Header::Field.parse(field) case name.downcase when 'mime-version' if value =~ /\b1\.0\b/ mime = true end when 'content-type' # FIXME: would be nice to have a procedural equivalent # to RMail::Header#param. header = RMail::Header.new header['content-type'] = value boundary = header.param('content-type', 'boundary') end @handler.header_field(field, name, value) end } unless mime or depth > 0 boundary = nil end end return boundary end def parse_multipart_body(input, depth, boundary) input = RMail::Parser::MultipartReader.new(input, boundary) input.chunk_size = @chunk_size if @chunk_size @handler.multipart_body_begin # Reach each part, adding it to this entity as appropriate. delimiters = [] while input.next_part if input.preamble? while chunk = input.read @handler.preamble_chunk(chunk) end elsif input.epilogue? while chunk = input.read @handler.epilogue_chunk(chunk) end else @handler.part_begin parse_low(input, depth + 1) @handler.part_end end delimiters << (input.delimiter || "") unless input.epilogue? end @handler.multipart_body_end(delimiters, boundary) end def parse_singlepart_body(input, depth) @handler.body_begin while chunk = input.read @handler.body_chunk(chunk) end @handler.body_end end end # The RMail::Parser class creates RMail::Message objects from Ruby # IO objects or strings. # # To parse from a string: # message = RMail::Parser.read(the_string) # # To parse from an IO object: # message = File.open('my-message') { |f| # RMail::Parser.read(f) # } # # You can also parse from STDIN, etc. # message = RMail::Parser.read(STDIN) # # In all cases, the parser consumes all input. class Parser # This exception class is thrown when the parser encounters an # error. # # Note: the parser tries hard to never throw exceptions -- this # error is thrown only when the API is used incorrectly and not on # invalid input. class Error < StandardError; end # Creates a new parser. Messages of +message_class+ will be # created by the parser. By default, the parser will create # RMail::Message objects. def initialize() @chunk_size = nil end # Parse a message from the IO object +io+ and return a new # message. The +io+ object can also be a string. def parse(input) handler = RMail::Parser::Handler.new parser = RMail::StreamParser.new(input, handler) parser.chunk_size = @chunk_size if @chunk_size parser.parse return handler.message end # Change the chunk size used to read the message. This is useful # mostly for testing. attr_accessor :chunk_size # Parse a message from the IO object +io+ and return a new # message. The +io+ object can also be a string. This is just # shorthand for: # # RMail::Parser.new.parse(io) def Parser.read(input) Parser.new.parse(input) end class Handler < RMail::StreamHandler # :nodoc: def initialize @parts = [ RMail::Message.new ] @preambles = [] @epilogues = [] end def mbox_from(field) @parts.last.header.mbox_from = field end def header_field(field, name, value) @parts.last.header.add_raw(field) end def body_begin @body = nil end def body_chunk(chunk) if @body @body << chunk else @body = chunk end end def body_end @parts.last.body = @body end def multipart_body_begin @preambles.push(nil) @epilogues.push(nil) end def preamble_chunk(chunk) if @preambles.last @preambles.last << chunk else @preambles[-1] = chunk end end def epilogue_chunk(chunk) if @epilogues.last @epilogues.last << chunk else @epilogues[-1] = chunk end end def multipart_body_end(delimiters, boundary) @parts.last.preamble = @preambles.pop @parts.last.epilogue = @epilogues.pop if @parts.last.body.nil? @parts.last.body = [] end @parts.last.set_delimiters(delimiters, boundary) end def part_begin @parts << RMail::Message.new end def part_end part = @parts.pop @parts.last.add_part(part) end def message @parts.first end end end end