lib/reckon/app.rb in reckon-0.4.4 vs lib/reckon/app.rb in reckon-0.5.0

- old
+ new

@@ -1,99 +1,97 @@ -#coding: utf-8 +# coding: utf-8 require 'pp' require 'yaml' module Reckon class App - VERSION = "Reckon 0.4.4" - attr_accessor :options, :accounts, :tokens, :seen, :csv_parser, :regexps + attr_accessor :options, :seen, :csv_parser, :regexps, :matcher def initialize(options = {}) + LOGGER.level = Logger::INFO if options[:verbose] self.options = options - self.tokens = {} self.regexps = {} - self.accounts = {} self.seen = {} self.options[:currency] ||= '$' options[:string] = File.read(options[:file]) unless options[:string] @csv_parser = CSVParser.new( options ) + @matcher = CosineSimilarity.new(options) learn! end def interactive_output(str) return if options[:unattended] puts str end + def learn! + learn_from_account_tokens(options[:account_tokens_file]) + + ledger_file = options[:existing_ledger_file] + return unless ledger_file + fail "#{ledger_file} doesn't exist!" unless File.exists?(ledger_file) + learn_from(File.read(ledger_file)) + end + + def learn_from_account_tokens(filename) + return unless filename + + fail "#{filename} doesn't exist!" unless File.exists?(filename) + + extract_account_tokens(YAML.load_file(filename)).each do |account, tokens| + tokens.each do |t| + if t.start_with?('/') + add_regexp(account, t) + else + @matcher.add_document(account, t) + end + end + end + end + def learn_from(ledger) LedgerParser.new(ledger).entries.each do |entry| entry[:accounts].each do |account| - learn_about_account( account[:name], - [entry[:desc], account[:amount]].join(" ") ) unless account[:name] == options[:bank_account] - seen[entry[:date]] ||= {} - seen[entry[:date]][@csv_parser.pretty_money(account[:amount])] = true + str = [entry[:desc], account[:amount]].join(" ") + @matcher.add_document(account[:name], str) unless account[:name] == options[:bank_account] + pretty_date = entry[:date].iso8601 + seen[pretty_date] ||= {} + seen[pretty_date][@csv_parser.pretty_money(account[:amount])] = true end end end - def already_seen?(row) - seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]] - end - + # Add tokens from account_tokens_file to accounts def extract_account_tokens(subtree, account = nil) if subtree.nil? puts "Warning: empty #{account} tree" {} elsif subtree.is_a?(Array) { account => subtree } else - at = subtree.map { |k, v| extract_account_tokens(v, [account, k].compact.join(':')) } - at.inject({}) { |k, v| k = k.merge(v)} - end - end - - def learn! - if options[:account_tokens_file] - fail "#{options[:account_tokens_file]} doesn't exist!" unless File.exists?(options[:account_tokens_file]) - extract_account_tokens(YAML.load_file(options[:account_tokens_file])).each do |account, tokens| - tokens.each { |t| learn_about_account(account, t, true) } + at = subtree.map do |k, v| + merged_acct = [account, k].compact.join(':') + extract_account_tokens(v, merged_acct) end + at.inject({}) { |memo, e| memo.merge!(e)} end - return unless options[:existing_ledger_file] - fail "#{options[:existing_ledger_file]} doesn't exist!" unless File.exists?(options[:existing_ledger_file]) - ledger_data = File.read(options[:existing_ledger_file]) - learn_from(ledger_data) end - def learn_about_account(account, data, parse_regexps = false) - accounts[account] ||= 0 - if parse_regexps && data.start_with?('/') - # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb - match = data.match(/^\/(.*)\/([ix]*)$/m) - fail "failed to parse regexp #{data}" unless match - options = 0 - (match[2] || '').split('').each do |option| - case option - when 'x' then options |= Regexp::EXTENDED - when 'i' then options |= Regexp::IGNORECASE - end + def add_regexp(account, regex_str) + # https://github.com/tenderlove/psych/blob/master/lib/psych/visitors/to_ruby.rb + match = regex_str.match(/^\/(.*)\/([ix]*)$/m) + fail "failed to parse regexp #{regex_str}" unless match + options = 0 + (match[2] || '').split('').each do |option| + case option + when 'x' then options |= Regexp::EXTENDED + when 'i' then options |= Regexp::IGNORECASE end - regexps[Regexp.new(match[1], options)] = account - else - tokenize(data).each do |token| - tokens[token] ||= {} - tokens[token][account] ||= 0 - tokens[token][account] += 1 - accounts[account] += 1 - end end + regexps[Regexp.new(match[1], options)] = account end - def tokenize(str) - str.downcase.split(/[\s\-]/) - end - def walk_backwards seen_anything_new = false each_row_backwards do |row| interactive_output Terminal::Table.new(:rows => [ [ row[:pretty_date], row[:pretty_money], row[:description] ] ]) @@ -105,12 +103,11 @@ end else seen_anything_new = true end - possible_answers = most_specific_regexp_match(row) - possible_answers = weighted_account_match( row ).map! { |a| a[:account] } if possible_answers.empty? + possible_answers = suggest(row) ledger = if row[:money] > 0 if options[:unattended] out_of_account = possible_answers.first || options[:default_outof_account] || 'Income:Unknown' else @@ -154,110 +151,82 @@ learn_from(ledger) unless options[:account_tokens_file] output(ledger) end end - def finish - options[:output_file].close unless options[:output_file] == STDOUT - interactive_output "Exiting." - exit + def each_row_backwards + rows = [] + (0...@csv_parser.columns.first.length).to_a.each do |index| + rows << { :date => @csv_parser.date_for(index), + :pretty_date => @csv_parser.pretty_date_for(index), + :pretty_money => @csv_parser.pretty_money_for(index), + :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate), + :money => @csv_parser.money_for(index), + :description => @csv_parser.description_for(index) } + end + rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row| + yield row + end end - def output(ledger_line) - options[:output_file].puts ledger_line - options[:output_file].flush - end - def most_specific_regexp_match( row ) matches = regexps.map { |regexp, account| if match = regexp.match(row[:description]) [account, match[0]] end }.compact matches.sort_by! { |account, matched_text| matched_text.length }.map(&:first) end - # Weigh accounts by how well they match the row - def weighted_account_match( row ) - query_tokens = tokenize(row[:description]) - - search_vector = [] - account_vectors = {} - - query_tokens.each do |token| - idf = Math.log((accounts.keys.length + 1) / ((tokens[token] || {}).keys.length.to_f + 1)) - tf = 1.0 / query_tokens.length.to_f - search_vector << tf*idf - - accounts.each do |account, total_terms| - tf = (tokens[token] && tokens[token][account]) ? tokens[token][account] / total_terms.to_f : 0 - account_vectors[account] ||= [] - account_vectors[account] << tf*idf - end - end - - # Should I normalize the vectors? Probably unnecessary due to tf-idf and short documents. - - account_vectors = account_vectors.to_a.map do |account, account_vector| - { :cosine => (0...account_vector.length).to_a.inject(0) { |m, i| m + search_vector[i] * account_vector[i] }, - :account => account } - end - account_vectors.sort! {|a, b| b[:cosine] <=> a[:cosine] } - - # Return empty set if no accounts matched so that we can fallback to the defaults in the unattended mode - if options[:unattended] - if account_vectors.first && account_vectors.first[:account] - account_vectors = [] if account_vectors.first[:cosine] == 0 - end - end - - return account_vectors + def suggest(row) + most_specific_regexp_match(row) + + @matcher.find_similar(row[:description]).map { |n| n[:account] } end def ledger_format(row, line1, line2) out = "#{row[:pretty_date]}\t#{row[:description]}\n" out += "\t#{line1.first}\t\t\t\t\t#{line1.last}\n" out += "\t#{line2.first}\t\t\t\t\t#{line2.last}\n\n" out end + def output(ledger_line) + options[:output_file].puts ledger_line + options[:output_file].flush + end + + def already_seen?(row) + seen[row[:pretty_date]] && seen[row[:pretty_date]][row[:pretty_money]] + end + + def finish + options[:output_file].close unless options[:output_file] == STDOUT + interactive_output "Exiting." + exit + end + def output_table output = Terminal::Table.new do |t| t.headings = 'Date', 'Amount', 'Description' each_row_backwards do |row| t << [ row[:pretty_date], row[:pretty_money], row[:description] ] end end interactive_output output end - def each_row_backwards - rows = [] - (0...@csv_parser.columns.first.length).to_a.each do |index| - rows << { :date => @csv_parser.date_for(index), - :pretty_date => @csv_parser.pretty_date_for(index), - :pretty_money => @csv_parser.pretty_money_for(index), - :pretty_money_negated => @csv_parser.pretty_money_for(index, :negate), - :money => @csv_parser.money_for(index), - :description => @csv_parser.description_for(index) } - end - rows.sort { |a, b| a[:date] <=> b[:date] }.each do |row| - yield row - end - end - def self.parse_opts(args = ARGV) options = { :output_file => STDOUT } parser = OptionParser.new do |opts| opts.banner = "Usage: Reckon.rb [options]" opts.separator "" opts.on("-f", "--file FILE", "The CSV file to parse") do |file| options[:file] = file end - opts.on("-a", "--account name", "The Ledger Account this file is for") do |a| + opts.on("-a", "--account NAME", "The Ledger Account this file is for") do |a| options[:bank_account] = a end opts.on("-v", "--[no-]verbose", "Run verbosely") do |v| options[:verbose] = v @@ -281,10 +250,18 @@ opts.on("", "--ignore-columns 1,2,5", "Columns to ignore in the CSV file - the first column is column 1") do |ignore| options[:ignore_columns] = ignore.split(",").map { |i| i.to_i } end + opts.on("", "--money-column 2", Integer, "Specify the money column instead of letting Reckon guess - the first column is column 1") do |column_number| + options[:money_column] = column_number + end + + opts.on("", "--date-column 3", Integer, "Specify the date column instead of letting Reckon guess - the first column is column 1") do |column_number| + options[:date_column] = column_number + end + opts.on("", "--contains-header [N]", "The first row of the CSV is a header and should be skipped. Optionally add the number of rows to skip.") do |contains_header| options[:contains_header] = 1 options[:contains_header] = contains_header.to_i if contains_header end @@ -314,15 +291,15 @@ opts.on("-t", "--account-tokens FILE", "YAML file with manually-assigned tokens for each account (see README)") do |a| options[:account_tokens_file] = a end - opts.on("", "--default-into-account name", "Default into account") do |a| + opts.on("", "--default-into-account NAME", "Default into account") do |a| options[:default_into_account] = a end - opts.on("", "--default-outof-account name", "Default 'out of' account") do |a| + opts.on("", "--default-outof-account NAME", "Default 'out of' account") do |a| options[:default_outof_account] = a end opts.on("", "--suffixed", "If --currency should be used as a suffix. Defaults to false.") do |e| options[:suffixed] = e @@ -349,10 +326,9 @@ exit end end unless options[:bank_account] - fail "Please specify --account for the unattended mode" if options[:unattended] options[:bank_account] = ask("What is the account name of this bank account in Ledger? ") do |q| q.readline = true q.validate = /^.{2,}$/