app.rb in reckon-0.5.0

- old
+ new

@@ -1,99 +1,97 @@
-#coding: utf-8
+# coding: utf-8
 require 'pp'
 require 'yaml'
 
 module Reckon
   class App
-    VERSION = "Reckon 0.4.4"
-    attr_accessor :options, :accounts, :tokens, :seen, :csv_parser, :regexps
+    attr_accessor :options, :seen, :csv_parser, :regexps, :matcher
 
     def initialize(options = {})
+      LOGGER.level = Logger::INFO if options[:verbose]
       self.options = options
-      self.tokens = {}
       self.regexps = {}
-      self.accounts = {}
       self.seen = {}
       self.options[:currency] ||= '$'
       options[:string] = File.read(options[:file]) unless options[:string]
       @csv_parser = CSVParser.new( options )
+      @matcher = CosineSimilarity.new(options)
       learn!
     end
 
     def interactive_output(str)
       return if options[:unattended]
       puts str
     end
 
+    def learn! # seeds from the account-tokens file, then from the existing ledger when one is configured
+      learn_from_account_tokens(options[:account_tokens_file])
+
+      ledger_file = options[:existing_ledger_file]
+      return unless ledger_file
+      fail "#{ledger_file} doesn't exist!" unless File.exist?(ledger_file) # raises RuntimeError
+      learn_from(File.read(ledger_file))
+    end
+
+    def learn_from_account_tokens(filename) # filename: path of the YAML token file; nil means nothing to load
+      return unless filename
+
+      fail "#{filename} doesn't exist!" unless File.exist?(filename) # raises RuntimeError
+
+      extract_account_tokens(YAML.load_file(filename)).each do |account, tokens|
+        tokens.each do |t|
+          if t.start_with?('/') # a leading slash marks the token as a regexp literal for add_regexp
+            add_regexp(account, t)
+          else
+            @matcher.add_document(account, t)
+          end
+        end
+      end
+    end
+
     def learn_from(ledger)
       LedgerParser.new(ledger).entries.each do |entry|
         entry[:accounts].each do |account|
-          learn_about_account( account[:name],
-                              [entry[:desc], account[:amount]].join(" ") ) unless account[:name] == options[:bank_account]
-          seen[entry[:date]] ||= {}
-          seen[entry[:date]][@csv_parser.pretty_money(account[:amount])] = true
+          str = [entry[:desc], account[:amount]].join(" ")
+          @matcher.add_document(account[:name], str) unless account[:name] == options[:bank_account]
+          pretty_date = entry[:date].iso8601
+          seen[pretty_date] ||= {}
+          seen[pretty_date][@csv_parser.pretty_money(account[:amount])] = true
         end
       end
     end
 
-    def already_seen?(row)
-      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
-    end
-
+    # Add tokens from account_tokens_file to accounts
     def extract_account_tokens(subtree, account = nil)
       if subtree.nil?
         puts "Warning: empty #{account} tree"
         {}
       elsif subtree.is_a?(Array)
         { account => subtree }
       else
-        at = subtree.map { |k, v| extract_account_tokens(v, [account, k].compact.join(':')) }
-        at.inject({}) { |k, v| k = k.merge(v)}
-      end
-    end
-
-    def learn!
-      if options[:account_tokens_file]
-        fail "#{options[:account_tokens_file]} doesn't exist!" unless File.exists?(options[:account_tokens_file])
-        extract_account_tokens(YAML.load_file(options[:account_tokens_file])).each do |account, tokens|
-          tokens.each { |t| learn_about_account(account, t, true) }
+        at = subtree.map do |k, v|
+          merged_acct = [account, k].compact.join(':')
+          extract_account_tokens(v, merged_acct)
         end
+        at.inject({}) { |memo, e| memo.merge!(e)}
       end
-      return unless options[:existing_ledger_file]
-      fail "#{options[:existing_ledger_file]} doesn't exist!" unless File.exists?(options[:existing_ledger_file])
-      ledger_data = File.read(options[:existing_ledger_file])
-      learn_from(ledger_data)
     end
 
-    def learn_about_account(account, data, parse_regexps = false)
-      accounts[account] ||= 0
-      if parse_regexps && data.start_with?('/')
-        # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
-        match = data.match(/^\/(.*)\/([ix]*)$/m)
-        fail "failed to parse regexp #{data}" unless match
-        options = 0
-        (match[2] || '').split('').each do |option|
-          case option
-          when 'x' then options |= Regexp::EXTENDED
-          when 'i' then options |= Regexp::IGNORECASE
-          end
+    def add_regexp(account, regex_str)
+      # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb
+      match = regex_str.match(/^\/(.*)\/([ix]*)$/m)
+      fail "failed to parse regexp #{regex_str}" unless match
+      options = 0
+      (match[2] || '').split('').each do |option|
+        case option
+        when 'x' then options |= Regexp::EXTENDED
+        when 'i' then options |= Regexp::IGNORECASE
         end
-        regexps[Regexp.new(match[1], options)] = account
-      else
-        tokenize(data).each do |token|
-          tokens[token] ||= {}
-          tokens[token][account] ||= 0
-          tokens[token][account] += 1
-          accounts[account] += 1
-        end
       end
+      regexps[Regexp.new(match[1], options)] = account
     end
 
-    def tokenize(str)
-      str.downcase.split(/[\s\-]/)
-    end
-
     def walk_backwards
       seen_anything_new = false
       each_row_backwards do |row|
         interactive_output Terminal::Table.new(:rows => [ [ row[:pretty_date], row[:pretty_money], row[:description] ] ])
 
@@ -105,12 +103,11 @@
           end
         else
           seen_anything_new = true
         end
 
-        possible_answers = most_specific_regexp_match(row)
-        possible_answers = weighted_account_match( row ).map! { |a| a[:account] } if possible_answers.empty?
+        possible_answers = suggest(row)
 
         ledger = if row[:money] > 0
           if options[:unattended]
             out_of_account = possible_answers.first || options[:default_outof_account] || 'Income:Unknown'
           else
@@ -154,110 +151,82 @@
         learn_from(ledger) unless options[:account_tokens_file]
         output(ledger)
       end
     end
 
-    def finish
-      options[:output_file].close unless options[:output_file] == STDOUT
-      interactive_output "Exiting."
-      exit
+    def each_row_backwards # yields one hash per CSV row, sorted by ascending :date
+      row_count = @csv_parser.columns.first.length
+      rows = Array.new(row_count) do |index|
+        { :date => @csv_parser.date_for(index),
+          :pretty_date => @csv_parser.pretty_date_for(index),
+          :pretty_money => @csv_parser.pretty_money_for(index),
+          :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
+          :money => @csv_parser.money_for(index),
+          :description => @csv_parser.description_for(index) }
+      end
+      rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row|
+        yield row
+      end
     end
 
-    def output(ledger_line)
-      options[:output_file].puts ledger_line
-      options[:output_file].flush
-    end
-
     def most_specific_regexp_match( row )
       matches = regexps.map { |regexp, account|
         if match = regexp.match(row[:description])
           [account, match[0]]
         end
       }.compact
       matches.sort_by! { |account, matched_text| matched_text.length }.map(&:first)
     end
 
-    # Weigh accounts by how well they match the row
-    def weighted_account_match( row )
-      query_tokens = tokenize(row[:description])
-
-      search_vector = []
-      account_vectors = {}
-
-      query_tokens.each do |token|
-        idf = Math.log((accounts.keys.length + 1) / ((tokens[token] || {}).keys.length.to_f + 1))
-        tf = 1.0 / query_tokens.length.to_f
-        search_vector << tf*idf
-
-        accounts.each do |account, total_terms|
-          tf = (tokens[token] && tokens[token][account]) ? tokens[token][account] / total_terms.to_f : 0
-          account_vectors[account] ||= []
-          account_vectors[account] << tf*idf
-        end
-      end
-
-      # Should I normalize the vectors?  Probably unnecessary due to tf-idf and short documents.
-
-      account_vectors = account_vectors.to_a.map do |account, account_vector|
-        { :cosine => (0...account_vector.length).to_a.inject(0) { |m, i| m + search_vector[i] * account_vector[i] },
-          :account => account }
-      end
-      account_vectors.sort! {|a, b| b[:cosine] <=> a[:cosine] }
-
-      # Return empty set if no accounts matched so that we can fallback to the defaults in the unattended mode
-      if options[:unattended]
-        if account_vectors.first && account_vectors.first[:account]
-          account_vectors = [] if account_vectors.first[:cosine] == 0
-        end
-      end
-
-      return account_vectors
+    def suggest(row) # regexp-matched accounts first, followed by the matcher's similar accounts
+      regexp_accounts = most_specific_regexp_match(row)
+      regexp_accounts + @matcher.find_similar(row[:description]).map { |n| n[:account] }
     end
 
     def ledger_format(row, line1, line2)
       out = "#{row[:pretty_date]}\t#{row[:description]}\n"
       out += "\t#{line1.first}\t\t\t\t\t#{line1.last}\n"
       out += "\t#{line2.first}\t\t\t\t\t#{line2.last}\n\n"
       out
     end
 
+    def output(ledger_line) # writes ledger_line to options[:output_file]
+      options[:output_file].puts ledger_line
+      options[:output_file].flush # flushed after every write rather than left buffered
+    end
+
+    def already_seen?(row) # truthy when `seen` holds an entry for the row's :pretty_date and :pretty_money
+      seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]]
+    end
+
+    def finish # closes the output file, prints a farewell unless unattended, then exits the process
+      options[:output_file].close unless options[:output_file] == STDOUT # STDOUT is left open
+      interactive_output "Exiting."
+      exit
+    end
+
     def output_table
       output = Terminal::Table.new do |t|
         t.headings = 'Date', 'Amount', 'Description'
         each_row_backwards do |row|
           t << [ row[:pretty_date], row[:pretty_money], row[:description] ]
         end
       end
       interactive_output output
     end
 
-    def each_row_backwards
-      rows = []
-      (0...@csv_parser.columns.first.length).to_a.each do |index|
-        rows << { :date => @csv_parser.date_for(index),
-          :pretty_date => @csv_parser.pretty_date_for(index),
-          :pretty_money => @csv_parser.pretty_money_for(index),
-          :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate),
-          :money => @csv_parser.money_for(index),
-          :description => @csv_parser.description_for(index) }
-      end
-      rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row|
-        yield row
-      end
-    end
-
     def self.parse_opts(args = ARGV)
       options = { :output_file => STDOUT }
       parser = OptionParser.new do |opts|
         opts.banner = "Usage: Reckon.rb [options]"
         opts.separator ""
 
         opts.on("-f", "--file FILE", "The CSV file to parse") do |file|
           options[:file] = file
         end
 
-        opts.on("-a", "--account name", "The Ledger Account this file is for") do |a|
+        opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a|
           options[:bank_account] = a
         end
 
         opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
           options[:verbose] = v
@@ -281,10 +250,18 @@
 
         opts.on("", "--ignore-columns 1,2,5", "Columns to ignore in the CSV file - the first column is column 1") do |ignore|
           options[:ignore_columns] = ignore.split(",").map { |i| i.to_i }
         end
 
+        opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:money_column] = column_number
+        end
+
+        opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number|
+          options[:date_column] = column_number
+        end
+
         opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header|
           options[:contains_header] = 1
           options[:contains_header] = contains_header.to_i if contains_header
         end
 
@@ -314,15 +291,15 @@
 
         opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a|
           options[:account_tokens_file] = a
         end
 
-        opts.on("", "--default-into-account name", "Default into account") do |a|
+        opts.on("", "--default-into-account NAME", "Default into account") do |a|
           options[:default_into_account] = a
         end
 
-        opts.on("", "--default-outof-account name", "Default 'out of' account") do |a|
+        opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a|
           options[:default_outof_account] = a
         end
 
         opts.on("", "--suffixed", "If --currency should be used as a suffix. Defaults to false.") do |e|
           options[:suffixed] = e
@@ -349,10 +326,9 @@
           exit
         end
       end
 
       unless options[:bank_account]
-
         fail "Please specify --account for the unattended mode" if options[:unattended]
 
         options[:bank_account] = ask("What is the account name of this bank account in Ledger? ") do |q|
           q.readline = true
           q.validate = /^.{2,}$/