Sha256: 061ad7311824163d4325f4b2b4c25c203774dcb65a7a9ddd38e7f871f14d1473
Contents?: true
Size: 1022 Bytes
Versions: 29
Compression:
Stored size: 1022 Bytes
Contents
module Embulk
  module Guess

    # Guess plugin that proposes a parser "charset" from a sample of the
    # input bytes. Registered under the name 'charset'.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Detected charset name => charset name actually reported to the parser.
      STATIC_MAPPING = {
        # ISO-8859-1 means ASCII, which is a subset of UTF-8, in most cases
        # due to the lack of a sample data set.
        "ISO-8859-1" => "UTF-8",

        # Shift_JIS is used almost only by Windows, which in fact uses "CP932".
        # And "CP932" as called by Microsoft actually means "MS932" in Java.
        "Shift_JIS" => "MS932",
      }

      # Guesses the charset of +sample_buffer+.
      #
      # config        - guess configuration; not read by this plugin.
      # sample_buffer - sample of the input; must respond to #to_java_bytes.
      #
      # Returns a Hash of the form {"parser" => {"charset" => name}}, where
      # name is "UTF-8" when the detector yields no match or a match whose
      # confidence is below 50, and otherwise the detected name after
      # translation through STATIC_MAPPING.
      def guess(config, sample_buffer)
        detector = org.embulk.deps.guess.CharsetDetector.create
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        # NOTE(review): ICU's CharsetDetector#detect is documented to return
        # null when nothing matches; assuming the Embulk wrapper can do the
        # same, treat a missing match like a low-confidence one instead of
        # raising NoMethodError on nil.
        if best_match.nil? || best_match.getConfidence < 50
          name = "UTF-8"
        else
          detected = best_match.getName
          # Names without an entry in STATIC_MAPPING are passed through as-is.
          name = STATIC_MAPPING.fetch(detected, detected)
        end

        return {"parser" => {"charset" => name}}
      end
    end

  end
end
Version data entries
29 entries across 29 versions & 1 rubygems