class Msg
#
# = Introduction
#
# A big component of +Msg+ files is the property store, which holds
# all the key/value pairs of properties. The message itself, and all
# its Attachments and Recipients have an instance of
# this class.
#
# = Storage model
#
# Property keys (tags?) can be either simple hex numbers, in the
# range 0x0000 - 0xffff, or they can be named properties. In fact,
# properties in the range 0x0000 to 0x7fff are supposed to be the non-
# named properties, and can be considered to be in the +PS_MAPI+
# namespace. (correct?)
#
# Named properties are serialized in the 0x8000 to 0xffff range,
# and are referenced as a guid and long/string pair.
#
# There are key ranges, which can be used to imply things generally
# about keys.
#
# Further, we can give symbolic names to most keys, coming from
# constants in various places. Eg:
#
# 0x0037 => subject
# {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
# # displayed as categories in outlook
# {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
#
# Further, there are completely different names, coming from other
# object models that get mapped to these things (CDO's model,
# Outlook's model etc). Eg "urn:schemas:httpmail:subject"
# I think these can be ignored though, as they aren't defined clearly
# in terms of mapi properties, and i'm really just trying to make
# a mapi property store. (It should also be relatively easy to
# support them later.)
#
# = Usage
#
# The api is driven by a desire to have the simple stuff "just work", ie
#
# properties.subject
# properties.display_name
#
# There also needs to be a way to look up properties more specifically:
#
# properties[0x0037] # => gets the subject
# properties[PS_MAPI, 0x0037] # => still gets the subject
# properties[PS_PUBLIC_STRINGS, 'Keywords'] # => gets the above categories
#
# The abbreviated versions work by "resolving" the symbols to full keys:
#
# properties.resolve :keywords # => [PS_PUBLIC_STRINGS, 'Keywords']
# properties.resolve :subject # => [PS_MAPI, 0x0037]
#
# = Parsing
#
# There are three objects that need to be parsed to load a +Msg+ property store:
#
# 1. The +nameid+ directory (Properties.parse_nameid)
# 2. The many +substg+ objects, whose names should match Properties::SUBSTG_RX
# (Properties#parse_substg)
# 3. The +properties+ file (Properties#parse_properties)
#
# Understanding of the formats is by no means perfect
#
# = TODO
#
# * Test cases.
# * While the key objects are sufficient, the value objects are just plain
# ruby types. It currently isn't possible to write to the values, or to know
# which encoding the value had.
# * Consider other MAPI property stores, such as tnef/pst. Similar model?
# Generalise this one?
# * Have added IO support to Ole::Storage. now need to fix Properties. can't use
# current greedy-loading approach. still want strings to work nicely:
# props.subject
# but don't want to be loading up large binary blobs, typically attachments, eg
# props.attach_data.
# probably the easiest solution is that the binary "encoding", be to return an io
# object instead. and you must read it if you want it as a string
# maybe i can avoid the greedy model anyway? rather than parsing the properties completely,
# have it be load based? you request subject, that translates into, please load the right
# substg, et voila. maybe redo @raw as a lazy loading hash for substg objects, but do the
# others straight away. maybe just parse keys so i know what i've got??
class Properties
# duplicated here for now
SUPPORT_DIR = File.dirname(__FILE__) + '/../..'
# Decoders for the values held in +substg+ objects, keyed by the encoding code that
# Properties#parse_substg extracts from the object's name. Each proc receives the ole
# object and returns the ruby value that gets stored for the property. The :default
# entry is the fallback Properties#parse_substg uses for encodings not listed here.
#
# note that binary and default both use obj.open. not the block form. this means we should
# #close it later, which we don't. as we're only reading though, it shouldn't matter right?
# not really good though FIXME
ENCODINGS = {
  # seems to be used when its going to be a directory instead of a file. eg nested ole.
  # 3701 usually. in which case we shouldn't get here right?
  0x000d => proc { |obj| obj },
  # unicode
  0x001f => proc { |obj| Ole::Types::FROM_UTF16.iconv obj.read },
  # ascii. the data sometimes carries a trailing NUL terminator and sometimes doesn't, so
  # strip exactly one NUL when present and leave the text untouched otherwise. (comparing
  # a[-1] with the integer 0 only worked where String#[] returned an Integer, and the
  # old slice then dropped two characters rather than one.)
  0x001e => proc { |obj| obj.read.chomp 0.chr },
  0x0102 => proc { |obj| obj.open }, # binary?
  :default => proc { |obj| obj.open }
}
# Property set guids, written as brace-delimited lowercase strings.
# these won't be strings for much longer.
# maybe later, the Key#inspect could automatically show symbolic guid names if they
# are part of this builtin list.
#
# PS_MAPI is a placeholder string rather than a real guid. Key.new uses it as the
# default guid, so it marks the plain (non-named) properties.
# FIXME. hey, nice that my fake string is the same length though :)
PS_MAPI = '{not-really-sure-what-this-should-say}'
# prepended to the guid list built by Properties.parse_nameid
PS_PUBLIC_STRINGS = '{00020329-0000-0000-c000-000000000046}'
# string properties in this namespace automatically get added to the internet headers
PS_INTERNET_HEADERS = '{00020386-0000-0000-c000-000000000046}'
# there are a bunch of outlook ones i think
# http://blogs.msdn.com/stephen_griffin/archive/2006/05/10/outlook-2007-beta-documentation-notification-based-indexing-support.aspx
# IPM.Appointment
PSETID_Appointment = '{00062002-0000-0000-c000-000000000046}'
# IPM.Task
PSETID_Task = '{00062003-0000-0000-c000-000000000046}'
# used for IPM.Contact
PSETID_Address = '{00062004-0000-0000-c000-000000000046}'
PSETID_Common = '{00062008-0000-0000-c000-000000000046}'
# didn't find a source for this name. it is for IPM.StickyNote
PSETID_Note = '{0006200e-0000-0000-c000-000000000046}'
# for IPM.Activity. also called the journal?
PSETID_Log = '{0006200a-0000-0000-c000-000000000046}'
# Matches the names of +substg+ objects. The captures are, in order, the 4 hex digit
# property code, the 4 hex digit encoding code, and an optional 8 hex digit offset
# (Properties#load converts them with String#hex and hands them to #parse_substg as
# key, encoding and offset). Note that the pattern is not anchored.
SUBSTG_RX = /__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?/
# access the underlying raw property hash (Key => value)
attr_reader :raw
# unused (non-property) objects after parsing a +Dirent+.
attr_reader :unused
# the result of Properties.parse_nameid, or nil if the loaded +Dirent+ had no nameid
# directory (or nothing has been loaded yet).
attr_reader :nameid
# Set up an empty property store: nothing in the raw hash, no unused objects, and
# the three lazily computed body variants flagged as "not computed yet".
def initialize
  @raw = {}
  @unused = []
  # FIXME
  # false is the "not computed yet" marker that #body, #body_html and #body_rtf test
  # for; nil can't be used as it is a legitimate cached result.
  @body = false
  @body_html = false
  @body_rtf = false
end
#--
# The parsing methods
#++
# Convenience constructor: create a +Properties+, #load the +Dirent+ obj into it,
# and return the populated instance.
def self.load obj
  Properties.new.tap { |properties| properties.load obj }
end
# Parse properties from the children of the +Dirent+ obj.
#
# A child named '__nameid_version1.0' is handled first (via Properties.parse_nameid)
# and stored in @nameid, as it provides the map for later user defined properties;
# @nameid is nil when there is no such child. Of the other children, files are
# dispatched on their name to #parse_properties or #parse_substg, and everything that
# is not a file is collected in @unused. Raises a RuntimeError for a file whose name
# matches neither pattern.
def load obj
  entries = obj.children.dup
  nameid_entry = entries.find { |entry| entry.name == '__nameid_version1.0' }
  @nameid =
    if nameid_entry
      entries.delete nameid_entry
      Properties.parse_nameid nameid_entry
    end
  # i think dirs that match the substg should be decoded as properties too. 0x000d is
  # just another encoding, the dir encoding. it should match whether the object is
  # file / dir. currently only example is embedded msgs anyway
  entries.each do |entry|
    unless entry.file?
      @unused << entry
      next
    end
    # (errors used to be rescued here, logged, and the entry pushed onto @unused)
    name = entry.name
    if name =~ /__properties_version1\.0/
      parse_properties entry
    elsif match = SUBSTG_RX.match(name)
      # key, encoding and the optional offset, as integers. a missing offset stays nil
      codes = match.captures.map { |digits| digits && digits.hex }
      parse_substg(*(codes + [entry]))
    else
      raise "bad name for mapi property #{name.inspect}"
    end
  end
end
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
# proxy keys in the 0x8000 - 0xffff range.
# Returns a hash of integer -> Key.
#
# Three children of obj are consumed: the guid stream (__substg1.0_00020102), the entry
# stream (__substg1.0_00030102) and the string stream (__substg1.0_00040102). Any other
# children are ignored with a warning. All integers are read as little-endian.
def self.parse_nameid obj
  remaining = obj.children.dup
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map do |name|
      remaining.delete obj[name]
    end

  # parse guids
  # this is the guids for named properties (other than builtin ones). the builtin
  # PS_PUBLIC_STRINGS is prepended so that a guid index of 2 maps to element 0.
  stream_guids = guids_obj.read.scan(/.{16}/m).map { |str| Ole::Types.load_guid str }
  guids = [PS_PUBLIC_STRINGS] + stream_guids

  # parse names.
  # the string ids for named properties
  # they are not parsed up front, as they're referred to by offset not
  # index. they are simply sequentially packed, as a long, giving
  # the string length, then padding to 4 byte multiple, and repeat.
  names_data = names_obj.read

  # parse actual props.
  # each 8 byte entry is: a long (numeric id, or offset into the string stream), then a
  # short whose low bit is the "named by string" flag and whose remaining 15 bits are the
  # guid index, then a short giving the property index.
  nameid = {}
  props_obj.read.scan(/.{8}/m).each do |entry|
    id_or_offset, flags, index = entry.unpack 'Vv2'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + index
    prop = if (flags & 1) == 1
      # unpack returns an array, so take the element explicitly. (splatting a one element
      # array into a variable gives an Array, not the Integer, on ruby >= 1.9.)
      len = names_data[id_or_offset, 4].unpack('V')[0]
      Ole::Types::FROM_UTF16.iconv names_data[id_or_offset + 4, len]
    else
      # only the low 16 bits are used as the id
      Log.debug "b not 0" if (id_or_offset >> 16) != 0
      id_or_offset & 0xffff
    end
    # guid index, as described for the entry stream in [MS-OXMSG]: 1 is PS_MAPI, 2 is
    # PS_PUBLIC_STRINGS, and 3 upwards index the guid stream. anything below 1 is not
    # defined, and must not be allowed to wrap around to the end of the guids array.
    guid_off = flags >> 1
    guid = if guid_off == 1
      PS_MAPI
    elsif guid_off >= 2
      guids[guid_off - 2]
    else
      Log.debug "guid off < 1 (#{guid_off})"
      nil
    end
    nameid[pseudo_prop] = Key.new(prop, guid)
  end

  Log.warn "* ignoring #{remaining.length} objects in nameid" unless remaining.empty?
  # this leaves a bunch of other unknown chunks of data with completely unknown meaning.
  # pp [:unknown, child.name, child.data.unpack('H*')[0].scan(/.{16}/m)]
  nameid
end
# Parse a +Dirent+, as per msgconvert.pl. This is how larger properties, such
# as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
#
# +key+ and +encoding+ are the integer codes taken from the object's name, and +offset+
# is the optional multivalue index (nil when the name had none). The decoded value is
# handed to #add_property, whose result is returned. Returns nil without storing
# anything for the offset-less object of a multivalue property.
def parse_substg key, encoding, offset, obj
  multivalue = (encoding & 0x1000) != 0
  if multivalue
    # there is typically one with no offset first, whose data is a series of numbers
    # equal to the lengths of all the sub parts. gives an implied array size i suppose.
    # maybe you can initialize the array at this time. the sizes are the same as all the
    # ole object sizes anyway, its to pre-allocate i suppose.
    #p obj.data.unpack('L*')
    # ignore this one
    return unless offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    # offset is for multivalue encodings only
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    offset = nil
  end

  encoder = ENCODINGS[encoding]
  unless encoder
    Log.warn "unknown encoding #{encoding}"
    #encoder = proc { |obj| obj.io } #.read }. maybe not a good idea
    encoder = ENCODINGS[:default]
  end
  add_property key, encoder[obj], offset
end
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
# such as longs, bools, times etc. The parsing has problems.
#
# The data is treated as an optional leading pad (expected to be 0 or 8 zero bytes, a
# warning is logged otherwise) followed by 16 byte entries. Each entry starts with a
# little-endian long whose high 16 bits are the property code and whose low 16 bits are
# the encoding; the value is read from byte 8 onwards. Decoded values are stored through
# #add_property.
def parse_properties obj
  data = obj.read
  # don't really understand this that well...
  pad = data.length % 16
  unless (pad == 0 || pad == 8) and data[0...pad] == "\000" * pad
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end
  # the block parameter is deliberately not called data, so it can't clash with the
  # local above.
  data[pad..-1].scan(/.{16}/m).each do |entry|
    # 'V' rather than 'L', as the file is little-endian whatever the host is
    property, encoding = ('%08x' % entry.unpack('V')).scan(/.{4}/)
    # doesn't make any sense to me. probably because its a serialization of some internal
    # outlook structure...
    next if property == '0000'
    key = property.hex
    case encoding
    when '0102', '001e', '001f', '101e', '101f', '000d'
      # ignore on purpose. not sure what its for
      # multivalue versions ignored also
    when '0003' # long
      # don't know what all the other data is for
      add_property key, entry[8, 4].unpack('V')[0]
    when '000b' # boolean
      # again, heaps more data than needed. and its not always 0 or 1.
      # they are in fact quite big numbers. this is wrong.
      # p [property, entry[4..-1].unpack('H*')[0]]
      add_property key, entry[8, 4].unpack('V')[0] != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(entry[8..-1])
    else
      Log.warn "ignoring data in __properties section, encoding: #{encoding}"
      Log << entry.unpack('H*').inspect + "\n"
    end
  end
end
# Store +value+ under +key+ in the raw property hash, and return +value+.
#
# Integer keys of 0x8000 and above are translated through @nameid to the Key they
# stand for; when there is no @nameid, or it has no entry for the key, a warning is
# logged and the number is wrapped in a plain Key like any other key. When +pos+ is
# given, the property is stored as an array and +value+ becomes its element +pos+;
# otherwise +value+ replaces whatever was stored before (with a warning).
def add_property key, value, pos=nil
  # map keys in the named property range through nameid
  in_named_range = Integer === key && key >= 0x8000
  mapped = in_named_range && @nameid && @nameid[key]
  if mapped
    key = mapped
  else
    if in_named_range
      if @nameid
        Log.warn "property in named range not in nameid #{key.inspect}"
      else
        Log.warn "no nameid section yet named properties used"
      end
    end
    key = Key.new key
  end

  unless pos
    # take the last.
    Log.warn "duplicate property #{key.inspect}" if @raw[key]
    return @raw[key] = value
  end

  @raw[key] ||= []
  slots = @raw[key]
  # this is actually a trickier problem. the issue is more that they must all be of
  # the same type.
  Log.warn "duplicate property" unless Array === slots
  slots[pos] = value
end
# Resolve an arg (could be key, code, string, or symbol), and possible guid, to a Key.
#
# With a guid, a new Key is built from arg and guid. Without one, a Key is returned
# as is, an Integer is wrapped in a Key, and anything else is converted to a symbol
# and looked up in #sym_to_key. Raises a RuntimeError when no key results.
def resolve arg, guid=nil
  key =
    if guid then Key.new arg, guid
    elsif Key === arg then arg
    elsif Integer === arg then Key.new arg
    else sym_to_key[arg.to_sym]
    end
  raise "unable to resolve key from #{[arg, guid].inspect}" unless key
  key
end
# just so i can get an easy unique list of missing ones
# NOTE(review): nothing visible in this class reads or writes this hash any more -
# presumably it is left over from when missing symbolic names were only reported once.
# confirm it is unused elsewhere before removing it.
@@quiet_property = {}
# Return the map used for converting symbols to keys, building it from the keys of
# the raw property hash on first use and caching it in @sym_to_key afterwards.
#
# Keys whose #to_sym is not a Symbol are still entered (under whatever #to_sym
# returned), and are reported at info level. When two keys share a name, the first one
# seen is kept unless the later one is in PS_MAPI.
def sym_to_key
  return @sym_to_key if @sym_to_key

  @sym_to_key = {}
  @raw.each_key do |key|
    sym = key.to_sym
    # used to use @@quiet_property to only ignore once
    Log.info "couldn't find symbolic name for key #{key.inspect}" unless Symbol === sym
    unless @sym_to_key[sym]
      # first key with this name, just assign
      @sym_to_key[sym] = key
      next
    end
    Log.warn "duplicate key #{key.inspect}"
    # we give preference to PS_MAPI keys
    @sym_to_key[sym] = key if key.guid == PS_MAPI
  end
  @sym_to_key
end
# accessors
#
# Look up the value stored for +arg+ (and optional +guid+), which are turned into a
# key by #resolve. Returns nil when the key can't be resolved, when nothing is stored
# for it, or when any other StandardError is raised during the lookup.
def [] arg, guid=nil
  @raw[resolve(arg, guid)]
rescue
  nil
end
#--
# for completeness, but its a moot point until i can write to the ole
# objects.
#def []= arg, guid=nil, value
# @raw[resolve(arg, guid)] = value
#end
#++
# Makes properties available as plain method calls, eg properties.subject.
#
# A call without arguments whose name doesn't end in "=" is forwarded to #[] with the
# method name. A call with exactly one argument whose name ends in "=" is forwarded to
# []= with the name (minus the "=", as a string) and the argument. Everything else goes
# to super.
def method_missing name, *args
  setter = /(.*)\=$/.match(name.to_s)
  if setter.nil? && args.empty?
    self[name]
  elsif setter && args.length == 1
    self[setter[1]] = args[0]
  else
    super
  end
end
# Return a hash of symbolic name => value, for every entry of #sym_to_key whose name
# really is a Symbol. Values are fetched through #[].
def to_h
  sym_to_key.inject({}) do |hash, (sym, key)|
    hash[sym] = self[key] if Symbol === sym
    hash
  end
end
# Short human readable summary: every symbolic name from #to_h with its inspected
# value. Inspected values longer than 32 characters are cut to their first 30
# characters followed by '..."'.
def inspect
  fields = to_h.map do |name, value|
    shown = value.inspect
    shown = shown[0..29] + '..."' if shown.length > 32
    "#{name}=#{shown}"
  end
  '#<Properties ' + fields.join(' ') + '>'
end
# -----
# temporary pseudo tags
# for providing rtf to plain text conversion. later, html to text too.
#
# Returns the :body property, or, when that is missing or blank, the text that
# ::RTF::Converter.rtf2text produces from #body_rtf (nil if that fails). The result is
# cached in @body, where false means "not computed yet".
def body
  return @body unless @body == false

  @body = begin
    self[:body]
  rescue
    nil
  end
  if !@body || @body.strip.empty?
    @body = begin
      ::RTF::Converter.rtf2text body_rtf
    rescue
      nil
    end
  end
  @body
end
# for providing rtf decompression
#
# Returns the result of RTF.rtfdecompr on the data read from rtf_compressed, or nil
# if any of that raises a StandardError. The result is cached in @body_rtf, where
# false means "not computed yet".
def body_rtf
  return @body_rtf unless @body_rtf == false

  @body_rtf = begin
    RTF.rtfdecompr rtf_compressed.read
  rescue
    nil
  end
end
# for providing rtf to html conversion
#
# Tries, in order, and moving on whenever the result so far is nil or blank: the data
# read from the :body_html property, Msg::RTF.rtf2html on #body_rtf, and finally
# ::RTF::Converter.rtf2text on #body_rtf with :html. A step that raises a StandardError
# counts as nil. The result is cached in @body_html, where false means "not computed yet".
def body_html
  return @body_html unless @body_html == false

  @body_html = begin
    self[:body_html].read
  rescue
    nil
  end
  if !@body_html || @body_html.strip.empty?
    @body_html = begin
      Msg::RTF.rtf2html body_rtf
    rescue
      nil
    end
  end
  # last resort
  if !@body_html || @body_html.strip.empty?
    @body_html = begin
      ::RTF::Converter.rtf2text body_rtf, :html
    rescue
      nil
    end
  end
  @body_html
end
# +Properties+ are accessed by Keys, which are coerced to this class.
# Includes a bunch of methods (hash, ==, eql?) to allow it to work as a key in
# a +Hash+.
#
# Also contains the code that maps keys to symbolic names.
#
# A key is a +code+ (an Integer, or a String for string-named properties) plus the
# +guid+ of the property set it belongs to, which defaults to PS_MAPI.
class Key
  attr_reader :code, :guid

  def initialize code, guid=PS_MAPI
    @code, @guid = code, guid
  end

  # Symbolic name for the key. Integer codes are looked up in MAPITAGS (for PS_MAPI)
  # or NAMED_MAP (for every other guid), and the code itself is returned when the lookup
  # fails for any reason - so the result is not always a Symbol. String codes are simply
  # downcased. Returns nil for any other kind of code.
  def to_sym
    # hmmm, for some stuff, like, eg, the message class specific range, sym-ification
    # of the key depends on knowing our message class. i don't want to store anything else
    # here though, so if that kind of thing is needed, it can be passed to this function.
    # worry about that when some examples arise.
    case code
    when Integer
      if guid == PS_MAPI # and < 0x8000 ?
        # the hash should be updated now that i've changed the process
        MAPITAGS['%04x' % code].first[/_(.*)/, 1].downcase.to_sym rescue code
      else
        # handle other guids here, like mapping names to outlook properties, based on the
        # outlook object model.
        NAMED_MAP[self].to_sym rescue code
      end
    when String
      # note that named properties don't go through the map at the moment. so #categories
      # doesn't work yet
      code.downcase.to_sym
    end
  end

  def to_s
    to_sym.to_s
  end

  # FIXME implement these. currently always returns nil.
  def transmittable?
    # etc, can go here too
  end

  # this stuff is to allow it to be a useful key
  def hash
    [code, guid].hash
  end

  # Two keys are equal when both code and guid are. Comparing the members rather than
  # the hash values means a Key is never equal to a non-Key (such as the plain array
  # [code, guid]), nor to another key whose hash merely collides.
  def == other
    other.is_a?(Key) and code == other.code and guid == other.guid
  end

  alias eql? ==

  # PS_MAPI keys with an integer code are displayed as a plain hex number. Every other
  # key is displayed with its guid and code.
  def inspect
    if Integer === code
      hex = '0x%04x' % code
      if guid == PS_MAPI
        # just display as plain hex number
        hex
      else
        "#<Key #{guid}/#{hex}>"
      end
    else
      # display full guid and code
      "#<Key #{guid}/#{code.inspect}>"
    end
  end
end
#--
# YUCK moved here because we need Key
#++
# data files that provide for the code to symbolic name mapping
# guids in named_map are really constant references to the above
#
# MAPITAGS is whatever mapitags.yaml deserialises to; Key#to_sym indexes it with the
# 4 digit lowercase hex code and expects an array whose first element is the tag name.
# File.open is used rather than Kernel#open, as these are always plain file paths.
MAPITAGS = File.open("#{SUPPORT_DIR}/data/mapitags.yaml") { |file| YAML.load file }
# NAMED_MAP maps Key => name. Each entry of named_map.yaml has a key of the form
# [code, name of a guid constant] - the constant is resolved with const_get. It is
# filled entry by entry, rather than through Hash[*pairs.flatten], so that a value
# which happens to be an array can't get flattened into the key/value sequence.
NAMED_MAP = {}
File.open("#{SUPPORT_DIR}/data/named_map.yaml") { |file| YAML.load file }.each do |key, value|
  NAMED_MAP[Key.new(key[0], const_get(key[1]))] = value
end
end
end