lib/sup/index.rb in sup-0.11 vs lib/sup/index.rb in sup-0.12

- old
+ new

@@ -1,10 +1,11 @@ ENV["XAPIAN_FLUSH_THRESHOLD"] = "1000" require 'xapian' require 'set' require 'fileutils' +require 'monitor' begin require 'chronic' $have_chronic = true rescue LoadError => e @@ -19,11 +20,11 @@ # for searching due to precomputing thread membership. class Index include InteractiveLock STEM_LANGUAGE = "english" - INDEX_VERSION = '2' + INDEX_VERSION = '4' ## dates are converted to integers for xapian, and are used for document ids, ## so we must ensure they're reasonably valid. this typically only affect ## spam. MIN_DATE = Time.at 0 @@ -46,10 +47,11 @@ include Singleton def initialize dir=BASE_DIR @dir = dir + FileUtils.mkdir_p @dir @lock = Lockfile.new lockfile, :retries => 0, :max_age => nil @sync_worker = nil @sync_queue = Queue.new @index_mutex = Monitor.new end @@ -102,19 +104,20 @@ path = File.join(@dir, 'xapian') if File.exists? path @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_OPEN) db_version = @xapian.get_metadata 'version' db_version = '0' if db_version.empty? - if db_version == '1' - info "Upgrading index format 1 to 2" + if false + info "Upgrading index format #{db_version} to #{INDEX_VERSION}" @xapian.set_metadata 'version', INDEX_VERSION elsif db_version != INDEX_VERSION - fail "This Sup version expects a v#{INDEX_VERSION} index, but you have an existing v#{db_version} index. Please downgrade to your previous version and dump your labels before upgrading to this version (then run sup-sync --restore)." + fail "This Sup version expects a v#{INDEX_VERSION} index, but you have an existing v#{db_version} index. Please run sup-dump to save your labels, move #{path} out of the way, and run sup-sync --restore." 
end else @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_CREATE) @xapian.set_metadata 'version', INDEX_VERSION + @xapian.set_metadata 'rescue-version', '0' end @enquire = Xapian::Enquire.new @xapian @enquire.weighting_scheme = Xapian::BoolWeight.new @enquire.docid_order = Xapian::Enquire::ASCENDING end @@ -191,15 +194,19 @@ ## Load message with the given message-id from the index def build_message id entry = synchronize { get_entry id } return unless entry - source = SourceManager[entry[:source_id]] - raise "invalid source #{entry[:source_id]}" unless source + locations = entry[:locations].map do |source_id,source_info| + source = SourceManager[source_id] + raise "invalid source #{source_id}" unless source + Location.new source, source_info + end - m = Message.new :source => source, :source_info => entry[:source_info], - :labels => entry[:labels], :snippet => entry[:snippet] + m = Message.new :locations => locations, + :labels => entry[:labels], + :snippet => entry[:snippet] mk_person = lambda { |x| Person.new(*x.reverse!) } entry[:from] = mk_person[entry[:from]] entry[:to].map!(&mk_person) entry[:cc].map!(&mk_person) @@ -258,10 +265,30 @@ ## was synced from def source_for_id id synchronize { get_entry(id)[:source_id] } end + ## Yields each term in the index that starts with prefix + def each_prefixed_term prefix + term = @xapian._dangerous_allterms_begin prefix + lastTerm = @xapian._dangerous_allterms_end prefix + until term.equals lastTerm + yield term.term + term.next + end + nil + end + + ## Yields (in lexicographical order) the source infos of all locations from + ## the given source with the given source_info prefix + def each_source_info source_id, prefix='', &b + prefix = mkterm :location, source_id, prefix + each_prefixed_term prefix do |x| + yield x[prefix.length..-1] + end + end + class ParseError < StandardError; end ## parse a query string from the user. 
returns a query object ## that can be passed to any index method with a 'query' ## argument. @@ -286,26 +313,10 @@ else "(#{email_field}:#{value} OR #{name_field}:#{value})" end end - ## if we see a label:deleted or a label:spam term anywhere in the query - ## string, we set the extra load_spam or load_deleted options to true. - ## bizarre? well, because the query allows arbitrary parenthesized boolean - ## expressions, without fully parsing the query, we can't tell whether - ## the user is explicitly directing us to search spam messages or not. - ## e.g. if the string is -(-(-(-(-label:spam)))), does the user want to - ## search spam messages or not? - ## - ## so, we rely on the fact that turning these extra options ON turns OFF - ## the adding of "-label:deleted" or "-label:spam" terms at the very - ## final stage of query processing. if the user wants to search spam - ## messages, not adding that is the right thing; if he doesn't want to - ## search spam messages, then not adding it won't have any effect. - query[:load_spam] = true if subs =~ /\blabel:spam\b/ - query[:load_deleted] = true if subs =~ /\blabel:deleted\b/ - ## gmail style "is" operator subs = subs.gsub(/\b(is|has):(\S+)\b/) do field, label = $1, $2 case label when "read" @@ -319,10 +330,33 @@ else "label:#{$2}" end end + ## labels are stored lower-case in the index + subs = subs.gsub(/\blabel:(\S+)\b/) do + label = $1 + "label:#{label.downcase}" + end + + ## if we see a label:deleted or a label:spam term anywhere in the query + ## string, we set the extra load_spam or load_deleted options to true. + ## bizarre? well, because the query allows arbitrary parenthesized boolean + ## expressions, without fully parsing the query, we can't tell whether + ## the user is explicitly directing us to search spam messages or not. + ## e.g. if the string is -(-(-(-(-label:spam)))), does the user want to + ## search spam messages or not? 
+ ## + ## so, we rely on the fact that turning these extra options ON turns OFF + ## the adding of "-label:deleted" or "-label:spam" terms at the very + ## final stage of query processing. if the user wants to search spam + ## messages, not adding that is the right thing; if he doesn't want to + ## search spam messages, then not adding it won't have any effect. + query[:load_spam] = true if subs =~ /\blabel:spam\b/ + query[:load_deleted] = true if subs =~ /\blabel:deleted\b/ + query[:load_killed] = true if subs =~ /\blabel:killed\b/ + ## gmail style attachments "filename" and "filetype" searches subs = subs.gsub(/\b(filename|filetype):(\((.+?)\)\B|(\S+)\b)/) do field, name = $1, ($3 || $4) case field when "filename" @@ -451,10 +485,11 @@ 'attachment_extension' => 'O', 'msgid' => 'Q', 'id' => 'Q', 'thread' => 'H', 'ref' => 'R', + 'location' => 'J', } PREFIX = NORMAL_PREFIX.merge BOOLEAN_PREFIX MSGID_VALUENO = 0 @@ -512,11 +547,11 @@ doc.value MSGID_VALUENO end def get_entry id return unless doc = find_doc(id) - Marshal.load doc.data + doc.entry end def thread_killed? thread_id not run_query(Q.new(Q::OP_AND, mkterm(:thread, thread_id), mkterm(:label, :Killed)), 0, 1).empty? end @@ -545,10 +580,11 @@ pos_terms << mkterm(:type, 'mail') pos_terms.concat(labels.map { |l| mkterm(:label,l) }) pos_terms << opts[:qobj] if opts[:qobj] pos_terms << mkterm(:source_id, opts[:source_id]) if opts[:source_id] + pos_terms << mkterm(:location, *opts[:location]) if opts[:location] if opts[:participants] participant_terms = opts[:participants].map { |p| [:from,:to].map { |d| mkterm(:email, d, (Redwood::Person === p) ? p.email : p) } }.flatten pos_terms << Q.new(Q::OP_OR, participant_terms) end @@ -573,12 +609,11 @@ old_entry = !do_index_static && doc.entry snippet = do_index_static ? 
m.snippet : old_entry[:snippet] entry = { :message_id => m.id, - :source_id => m.source.id, - :source_info => m.source_info, + :locations => m.locations.map { |x| [x.source.id, x.info] }, :date => truncate_date(m.date), :snippet => snippet, :labels => m.labels.to_a, :from => [m.from.email, m.from.name], :to => m.to.map { |p| [p.email, p.name] }, @@ -593,10 +628,11 @@ doc.clear_terms doc.clear_values index_message_static m, doc, entry end + index_message_locations doc, entry, old_entry index_message_threading doc, entry, old_entry index_message_labels doc, entry[:labels], (do_index_static ? [] : old_entry[:labels]) doc.entry = entry synchronize do @@ -635,11 +671,10 @@ # Miscellaneous terms doc.add_term mkterm(:date, m.date) if m.date doc.add_term mkterm(:type, 'mail') doc.add_term mkterm(:msgid, m.id) - doc.add_term mkterm(:source_id, m.source.id) m.attachments.each do |a| a =~ /\.(\w+)$/ or next doc.add_term mkterm(:attachment_extension, $1) end @@ -652,10 +687,17 @@ doc.add_value MSGID_VALUENO, m.id doc.add_value DATE_VALUENO, date_value end + def index_message_locations doc, entry, old_entry + old_entry[:locations].map { |x| x[0] }.uniq.each { |x| doc.remove_term mkterm(:source_id, x) } if old_entry + entry[:locations].map { |x| x[0] }.uniq.each { |x| doc.add_term mkterm(:source_id, x) } + old_entry[:locations].each { |x| (doc.remove_term mkterm(:location, *x) rescue nil) } if old_entry + entry[:locations].each { |x| doc.add_term mkterm(:location, *x) } + end + def index_message_labels doc, new_labels, old_labels return if new_labels == old_labels added = new_labels.to_a - old_labels.to_a removed = old_labels.to_a - new_labels.to_a added.each { |t| doc.add_term mkterm(:label,t) } @@ -714,9 +756,11 @@ when :to then PREFIX['to_email'] else raise "Invalid email term type #{args[0]}" end + args[1].to_s.downcase when :source_id PREFIX['source_id'] + args[0].to_s.downcase + when :location + PREFIX['location'] + [args[0]].pack('n') + args[1].to_s when 
:attachment_extension PREFIX['attachment_extension'] + args[0].to_s.downcase when :msgid, :ref, :thread PREFIX[type.to_s] + args[0][0...(MAX_TERM_LENGTH-1)] else