Sha256: 6cba148de9cdfa99da9466f710a438a7e449bbd501f02e5bf84a93210220b352

Contents?: true

Size: 990 Bytes

Versions: 26

Compression:

Stored size: 990 Bytes

Contents

module Embulk
  module Guess

    # Guess plugin that proposes the parser's "charset" option by running
    # ICU4J's CharsetDetector over the sample bytes.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Detected charset names that are replaced by a more practical name
      # before being returned from #guess.
      STATIC_MAPPING = {
        # A detection of ISO-8859-1 most often just means the sample was plain
        # ASCII (the sample data is too small to tell them apart), and ASCII
        # is a subset of UTF-8, so UTF-8 is the safer suggestion.
        "ISO-8859-1" => "UTF-8",

        # Shift_JIS is used almost only by Windows, which in fact uses "CP932".
        # What Microsoft calls "CP932" is named "MS932" in Java.
        "Shift_JIS" => "MS932",
      }.freeze

      # Detection results whose confidence is below this value are ignored
      # and UTF-8 is suggested instead.
      MIN_CONFIDENCE = 50

      # Guesses the character set of the sample data.
      #
      # @param config the guess configuration (not used by this plugin)
      # @param sample_buffer sample data; must respond to #to_java_bytes
      # @return [Hash] {"parser" => {"charset" => name}} where name is the
      #   detected (and possibly remapped) charset, or "UTF-8" when nothing
      #   was detected with enough confidence
      def guess(config, sample_buffer)
        # ICU4J
        detector = com.ibm.icu.text.CharsetDetector.new
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        name =
          # CharsetDetector#detect returns null (nil in JRuby) when no charset
          # matches at all; treat that the same as a low-confidence match
          # instead of failing with NoMethodError on nil.
          if best_match.nil? || best_match.getConfidence < MIN_CONFIDENCE
            "UTF-8"
          else
            detected = best_match.getName
            STATIC_MAPPING.fetch(detected, detected)
          end

        {"parser" => {"charset" => name}}
      end
    end

  end
end

Version data entries

26 entries across 26 versions & 1 rubygem

Version Path
embulk-0.6.27 lib/embulk/guess/charset.rb
embulk-0.6.26 lib/embulk/guess/charset.rb
embulk-0.6.25 lib/embulk/guess/charset.rb
embulk-0.6.24 lib/embulk/guess/charset.rb
embulk-0.6.23 lib/embulk/guess/charset.rb
embulk-0.6.22 lib/embulk/guess/charset.rb
embulk-0.6.21 lib/embulk/guess/charset.rb
embulk-0.6.20 lib/embulk/guess/charset.rb
embulk-0.6.19 lib/embulk/guess/charset.rb
embulk-0.6.18 lib/embulk/guess/charset.rb
embulk-0.6.17 lib/embulk/guess/charset.rb
embulk-0.6.16 lib/embulk/guess/charset.rb
embulk-0.6.15 lib/embulk/guess/charset.rb
embulk-0.6.14 lib/embulk/guess/charset.rb
embulk-0.6.13 lib/embulk/guess/charset.rb
embulk-0.6.12 lib/embulk/guess/charset.rb
embulk-0.6.11 lib/embulk/guess/charset.rb
embulk-0.6.10 lib/embulk/guess/charset.rb
embulk-0.6.9 lib/embulk/guess/charset.rb
embulk-0.6.8 lib/embulk/guess/charset.rb