lib/banktools-se/ocr.rb in banktools-se-2.5.0 vs lib/banktools-se/ocr.rb in banktools-se-2.6.0

- old
+ new

@@ -1,6 +1,6 @@ -# http://web.archive.org/web/20111216065227/http://www.bgc.se/upload/Gemensamt/Trycksaker/Manualer/BG6070.pdf section 5.2 +# https://www.bankgirot.se/globalassets/dokument/anvandarmanualer/bankgiroinbetalningar_anvandarmanual_sv_31okt2016.pdf section 5.2 module BankTools module SE class OCR class InvalidOCR < StandardError; end @@ -64,17 +64,31 @@ ocr[0...-digits_to_chop] end # max_length is 19 because that's the longest allowed integer by default in a Postgres integer column with Ruby on Rails. So attempting some queries with longer OCRs may cause exceptions. def self.find_all_in_string(string, length_digit: false, pad: "", min_length: 4, max_length: 19) - expanded_string = [ string, *[ "\n", ";", "." ].map { |x| string.gsub(x, "") } ].join(" ") + # First, treat the input as one long string of digits. + # E.g. "1234 and 5678" becomes "12345678". + digit_string = string.gsub(/\D/, "") + digit_string_length = digit_string.length - numbers = expanded_string.scan(/\d+/) + candidates = [] - expanded_numbers = with_numbers_found_by_removing_prefix_and_postfix(numbers). - reject { |n| n.length < min_length || n.length > max_length } + # Then find all substrings of min_length, and of all other lengths, up to max_length. + # So e.g. find all four-digit substrings ("1234", "2345", …), all five-digit substrings and so on. - expanded_numbers.select { |candidate| + 0.upto(digit_string.length - min_length) do |start_pos| + min_end_pos = [ start_pos + min_length, digit_string_length ].min - 1 + max_end_pos = [ start_pos + max_length, digit_string_length ].min - 1 + + min_end_pos.upto(max_end_pos) do |end_pos| + candidates << digit_string.slice(start_pos..end_pos) + end + end + + # Finally, limit these substrings to ones that are actually valid OCRs. + + candidates.select { |candidate| begin to_number(candidate, length_digit: length_digit, pad: pad) true rescue InvalidOCR false