Sha256: 5f0d192c5743ef0fc827d628a01005bad75467e0d607c0f9a96314e08fbc67dc
Contents?: true
Size: 741 Bytes
Versions: 17
Compression:
Stored size: 741 Bytes
Contents
module Embulk
  module Guess

    # Guess plugin that picks a charset for the parser from a sample of the
    # input, using ICU4J's CharsetDetector. Registered under the name
    # 'charset'.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Detects the most likely charset of +sample_buffer+.
      #
      # config        - guess configuration; not read by this plugin.
      # sample_buffer - String holding a sample of the input bytes.
      #
      # Returns a Hash of the form {"parser" => {"charset" => name}} where
      # name is the detected charset name, or "UTF-8" when detection is
      # unavailable, unreliable, or reports ISO-8859-1.
      def guess(config, sample_buffer)
        # ICU4J
        detector = com.ibm.icu.text.CharsetDetector.new
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        # detect returns null (nil in JRuby) when no charset matches at all;
        # treat that the same as a low-confidence match instead of raising
        # NoMethodError on nil.
        if best_match.nil? || best_match.getConfidence < 50
          name = "UTF-8"
        else
          name = best_match.getName
          if name == "ISO-8859-1"
            # ISO-8859-1 means ASCII which is a subset
            # of UTF-8 in most of cases due to lack of
            # sample data set
            name = "UTF-8"
          end
        end

        return {"parser" => {"charset" => name}}
      end
    end

  end
end
Version data entries
17 entries across 17 versions & 1 rubygems