lib/opener/language_identifier/detector.rb in opener-language-identifier-4.1.0 vs lib/opener/language_identifier/detector.rb in opener-language-identifier-4.2.0
- old
+ new
@@ -1,41 +1,160 @@
-require 'singleton'
-
-import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
-
module Opener
class LanguageIdentifier
##
- # Singleton class wrapped around the Cybozu detector. The Cybozu code uses
- # the factory pattern and stores a bunch of things on class level. As such
- # the Cybozu code is *not* thread-safe.
+ # Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
+ # class automatically handles switching of profiles based on input sizes,
+ # assigning priorities to languages, etc.
#
class Detector
- attr_reader :options
+ attr_reader :profiles_path, :short_profiles_path
- include Singleton
+ ##
+ # Path to the directory containing the default profiles.
+ #
+ # @return [String]
+ #
+ DEFAULT_PROFILES_PATH = File.expand_path(
+ '../../../../core/target/classes/profiles',
+ __FILE__
+ )
- def initialize(options={})
- @options = options
- @detector = CybozuDetector.new(profiles_path)
- @semaphore = Mutex.new
+ ##
+ # Path to the directory containing the default short profiles.
+ #
+ # @return [String]
+ #
+ DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
+ '../../../../core/target/classes/short_profiles',
+ __FILE__
+ )
+
+ ##
+ # The maximum input length (in characters) for which the short profiles
+ # are used. Longer input is analysed using the regular profiles set.
+ #
+ # @return [Fixnum]
+ #
+ SHORT_THRESHOLD = 15
+
+ ##
+ # Prioritize OpeNER languages over the rest. Languages not covered by this
+ # list are automatically given a default priority.
+ #
+ # @return [Hash]
+ #
+ PRIORITIES = {
+ 'en' => 1.0,
+ 'es' => 0.9,
+ 'it' => 0.9,
+ 'fr' => 0.9,
+ 'de' => 0.9,
+ 'nl' => 0.9
+ }
+
+ ##
+ # The default priority for non OpeNER languages.
+ #
+ # @return [Float]
+ #
+ DEFAULT_PRIORITY = 0.5
+
+ ##
+ # @param [Hash] options
+ #
+ # @option options [String] :profiles_path
+ # @option options [String] :short_profiles_path
+ #
+ def initialize(options = {})
+ options.each do |key, value|
+ instance_variable_set("@#{key}", value) if respond_to?(key)
+ end
+
+ @profiles_path ||= DEFAULT_PROFILES_PATH
+ @short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
end
+ ##
+ # @return [String]
+ #
def detect(input)
- @semaphore.synchronize do
- @detector.detect(input)
- end
+ return new_detector(input).detect
end
+ ##
+ # @return [Array]
+ #
def probabilities(input)
- @semaphore.synchronize do
- result = @detector.detect_langs(input)
+ return new_detector(input).get_probabilities.to_array
+ end
+
+ ##
+ # Returns a new detector with the profiles set based on the input.
+ #
+ # This method analyses a lowercased version of the input as this yields
+ # better results for short text.
+ #
+ # @param [String] input
+ # @return [com.cybozu.labs.langdetect.Detector]
+ #
+ def new_detector(input)
+ factory = com.cybozu.labs.langdetect.DetectorFactory.new
+
+ factory.load_profile(determine_profiles(input))
+ factory.set_seed(1)
+
+ priorities = build_priorities(input, factory.langlist)
+ detector = com.cybozu.labs.langdetect.Detector.new(factory)
+
+ detector.set_prior_map(priorities)
+ detector.append(input.downcase)
+
+ return detector
+ end
+
+ ##
+ # Builds a java.util.HashMap mapping languages to their priorities, for
+ # both OpeNER and non-OpeNER languages.
+ #
+ # If the input is at most SHORT_THRESHOLD characters long, non-OpeNER
+ # languages are _disabled_ by giving them a priority of 0.0. This is to
+ # ensure that the OpeNER languages are detected properly when analysing
+ # only 1-2 words.
+ #
+ # @param [String] input
+ # @param [Array<String>] languages
+ # @return [java.util.HashMap]
+ #
+ def build_priorities(input, languages)
+ priorities = java.util.HashMap.new
+ priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY
+
+ PRIORITIES.each do |lang, val|
+ priorities.put(lang, val)
end
+
+ languages.each do |language|
+ unless priorities.contains_key(language)
+ priorities.put(language, priority)
+ end
+ end
+
+ return priorities
end
- def profiles_path
- default_path = File.expand_path("../../../../core/target/classes/profiles", __FILE__)
- options.fetch(:profiles_path, default_path)
+ ##
+ # @param [String] input
+ # @return [String]
+ #
+ def determine_profiles(input)
+ return short_input?(input) ? short_profiles_path : profiles_path
end
- end
- end
-end
+
+ ##
+ # @param [String] input
+ # @return [TrueClass|FalseClass]
+ #
+ def short_input?(input)
+ return input.length <= SHORT_THRESHOLD
+ end
+ end # Detector
+ end # LanguageIdentifier
+end # Opener