Sha256: 20d146a50c2382832112134389fe070a5c5fc17190bd98f620eb335575453f0a

Contents?: true

Size: 1.38 KB

Versions: 104

Compression:

Stored size: 1.38 KB

Contents

module Embulk
  module Guess

    # Guess plugin that detects the character encoding of a data sample with
    # ICU4J's CharsetDetector and reports it as the parser's "charset" option.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Detection results whose confidence is below this value are discarded
      # and "UTF-8" is reported instead.
      MIN_CONFIDENCE = 50

      # Detected charset name => charset name that is actually reported.
      STATIC_MAPPING = {
        # A detection result of ISO-8859-1 most often just means the sample was
        # plain ASCII (a subset of UTF-8): the sample data set was too small to
        # contain any distinguishing bytes. Report UTF-8 in that case.
        "ISO-8859-1" => "UTF-8",

        # Shift_JIS is used almost only by Windows that uses "CP932" in fact.
        # And "CP932" called by Microsoft actually means "MS932" in Java.
        "Shift_JIS" => "MS932",
      }.freeze

      # Guesses the charset of +sample_buffer+.
      #
      # @param config unused by this plugin
      # @param sample_buffer sample of the input; must respond to
      #   +to_java_bytes+ (e.g. a String under JRuby)
      # @return [Hash] <tt>{"parser" => {"charset" => name}}</tt>
      def guess(config, sample_buffer)
        detector = charset_detector_class.new
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        # CharsetDetector#detect returns null (nil here) when no charset
        # matches at all; treat that the same way as a low-confidence match
        # instead of failing on nil.getConfidence.
        name =
          if best_match.nil? || best_match.getConfidence < MIN_CONFIDENCE
            "UTF-8"
          else
            detected = best_match.getName
            STATIC_MAPPING.fetch(detected, detected)
          end

        {"parser" => {"charset" => name}}
      end

      private

      # Returns ICU4J's CharsetDetector class, loading rjack-icu on demand.
      def charset_detector_class
        com.ibm.icu.text.CharsetDetector
      rescue NameError
        # icu4j is removed from embulk.gem package explicitly at embulk.gemspec
        # if gem is packaged for JRuby to reduce binary size. Instead, if it's
        # packaged for JRuby, embulk.gemspec adds rjack-icu to its dependency.
        require 'rjack-icu'
        com.ibm.icu.text.CharsetDetector
      end
    end

  end
end

Version data entries

104 entries across 104 versions & 1 rubygem

Version Path
embulk-0.8.39 lib/embulk/guess/charset.rb
embulk-0.8.39-java lib/embulk/guess/charset.rb
embulk-0.8.38 lib/embulk/guess/charset.rb
embulk-0.8.38-java lib/embulk/guess/charset.rb
embulk-0.8.37 lib/embulk/guess/charset.rb
embulk-0.8.37-java lib/embulk/guess/charset.rb
embulk-0.8.36 lib/embulk/guess/charset.rb
embulk-0.8.36-java lib/embulk/guess/charset.rb
embulk-0.8.35 lib/embulk/guess/charset.rb
embulk-0.8.35-java lib/embulk/guess/charset.rb
embulk-0.8.34 lib/embulk/guess/charset.rb
embulk-0.8.34-java lib/embulk/guess/charset.rb
embulk-0.8.33 lib/embulk/guess/charset.rb
embulk-0.8.33-java lib/embulk/guess/charset.rb
embulk-0.8.32 lib/embulk/guess/charset.rb
embulk-0.8.32-java lib/embulk/guess/charset.rb
embulk-0.8.31 lib/embulk/guess/charset.rb
embulk-0.8.31-java lib/embulk/guess/charset.rb
embulk-0.8.30 lib/embulk/guess/charset.rb
embulk-0.8.30-java lib/embulk/guess/charset.rb