Sha256: 061ad7311824163d4325f4b2b4c25c203774dcb65a7a9ddd38e7f871f14d1473

Contents?: true

Size: 1022 Bytes

Versions: 29

Compression:

Stored size: 1022 Bytes

Contents

module Embulk
  module Guess

    # Guess plugin registered under the name 'charset'. It runs a charset
    # detector over a sample of the input and proposes the result as the
    # parser's "charset" option.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Detected charset names that are replaced by another name before being
      # reported. Frozen so this shared table cannot be mutated at runtime.
      STATIC_MAPPING = {
        # A detection result of ISO-8859-1 in most cases means the sample was
        # plain ASCII, which is a subset of UTF-8; the sample data set is too
        # small to tell them apart.
        "ISO-8859-1" => "UTF-8",

        # Shift_JIS is used almost only by Windows, which in fact uses "CP932".
        # And what Microsoft calls "CP932" is actually named "MS932" in Java.
        "Shift_JIS" => "MS932",
      }.freeze

      # Guesses the charset of the sampled input.
      #
      # @param config not read by this method
      # @param sample_buffer sampled input; must respond to +to_java_bytes+
      # @return [Hash] <tt>{"parser" => {"charset" => name}}</tt>, where +name+
      #   is "UTF-8" when the detector's confidence is below 50, and otherwise
      #   the detected name after applying STATIC_MAPPING
      def guess(config, sample_buffer)
        detector = org.embulk.deps.guess.CharsetDetector.create
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        name =
          if best_match.getConfidence < 50
            # Detection is not confident enough; fall back to UTF-8.
            "UTF-8"
          else
            detected = best_match.getName
            # Use the mapped name when one exists, else the detected name as is.
            STATIC_MAPPING.fetch(detected, detected)
          end

        {"parser" => {"charset" => name}}
      end
    end

  end
end

Version data entries

29 entries across 29 versions & 1 rubygem

Version Path
embulk-0.10.32-java lib/embulk/guess/charset.rb
embulk-0.10.31-java lib/embulk/guess/charset.rb
embulk-0.10.30-java lib/embulk/guess/charset.rb
embulk-0.10.29-java lib/embulk/guess/charset.rb
embulk-0.10.28-java lib/embulk/guess/charset.rb
embulk-0.10.27-java lib/embulk/guess/charset.rb
embulk-0.10.26-java lib/embulk/guess/charset.rb
embulk-0.10.25-java lib/embulk/guess/charset.rb
embulk-0.10.24-java lib/embulk/guess/charset.rb