Sha256: 0e356ca589e4016446f035656ae06486301d3c96050eb745e455947a348546eb
Contents?: true
Size: 672 Bytes
Versions: 8
Compression:
Stored size: 672 Bytes
Contents
module Embulk
  # Guess plugin that suggests a parser charset for a sample of input bytes.
  #
  # Detection is delegated to ICU4J's CharsetDetector. The plugin registers
  # itself under the guess name 'charset'.
  class GuessCharset < GuessPlugin
    Plugin.register_guess('charset', self)

    # Guess the character encoding of the sampled data.
    #
    # @param config the current guess configuration (not read by this plugin)
    # @param sample_buffer sample of the input; must respond to #to_java_bytes
    # @return [Hash] {"parser" => {"charset" => name}} where name is the
    #   detected charset name, or "UTF-8" when detection is inconclusive
    def guess(config, sample_buffer)
      # ICU4J
      detector = com.ibm.icu.text.CharsetDetector.new
      detector.setText(sample_buffer.to_java_bytes)
      best_match = detector.detect

      # CharsetDetector#detect returns null (nil under JRuby) when no
      # charset matches at all; treat that like a low-confidence match
      # instead of raising NoMethodError on nil.
      if best_match.nil? || best_match.getConfidence < 50
        name = "UTF-8"
      else
        name = best_match.getName
        if name == "ISO-8859-1"
          # ISO-8859-1 means ASCII which is a subset
          # of UTF-8 in most of cases due to lack of
          # sample data set
          name = "UTF-8"
        end
      end

      return {"parser" => {"charset" => name}}
    end
  end
end
Version data entries
8 entries across 8 versions & 1 rubygems