lib/opener/language_identifier/detector.rb in opener-language-identifier-4.1.0 vs lib/opener/language_identifier/detector.rb in opener-language-identifier-4.2.0

- old
+ new

module Opener
  class LanguageIdentifier
    ##
    # Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
    # class automatically handles switching of profiles based on input sizes,
    # assigning priorities to languages, etc.
    #
    class Detector
      attr_reader :profiles_path, :short_profiles_path

      ##
      # Path to the directory containing the default profiles.
      #
      # @return [String]
      #
      DEFAULT_PROFILES_PATH = File.expand_path(
        '../../../../core/target/classes/profiles',
        __FILE__
      ).freeze

      ##
      # Path to the directory containing the default short profiles.
      #
      # @return [String]
      #
      DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
        '../../../../core/target/classes/short_profiles',
        __FILE__
      ).freeze

      ##
      # The number of characters up to (and including) which an input is
      # considered "short". Longer inputs are analysed using the regular
      # (longer) profiles set.
      #
      # @return [Integer]
      #
      SHORT_THRESHOLD = 15

      ##
      # Prioritize OpeNER languages over the rest. Languages not covered by this
      # list are automatically given a default priority.
      #
      # @return [Hash]
      #
      PRIORITIES = {
        'en' => 1.0,
        'es' => 0.9,
        'it' => 0.9,
        'fr' => 0.9,
        'de' => 0.9,
        'nl' => 0.9
      }.freeze

      ##
      # The default priority for non OpeNER languages.
      #
      # @return [Float]
      #
      DEFAULT_PRIORITY = 0.5

      ##
      # Every option whose key matches a method this object responds to is
      # stored in the instance variable of the same name; other keys are
      # ignored. Paths that are not given (or are nil) fall back to the
      # default profile directories.
      #
      # @param [Hash] options
      #
      # @option options [String] :profiles_path
      # @option options [String] :short_profiles_path
      #
      def initialize(options = {})
        options.each do |key, value|
          instance_variable_set("@#{key}", value) if respond_to?(key)
        end

        @profiles_path       ||= DEFAULT_PROFILES_PATH
        @short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
      end

      ##
      # Returns the result of calling `detect` on a detector built for the
      # given input.
      #
      # @param [String] input
      # @return [String]
      #
      def detect(input)
        new_detector(input).detect
      end

      ##
      # Returns the result of `get_probabilities` of a detector built for the
      # given input, converted using `to_array`.
      #
      # @param [String] input
      # @return [Array]
      #
      def probabilities(input)
        new_detector(input).get_probabilities.to_array
      end

      ##
      # Returns a new detector with the profiles set based on the input.
      #
      # This method analyses a lowercased version of the input as this yields
      # better results for short text.
      #
      # NOTE(review): a new factory is created and the profiles are loaded on
      # every call. Consider caching one factory per profiles directory if
      # this turns out to be expensive; confirm that the factory can safely be
      # shared before doing so.
      #
      # @param [String] input
      # @return [com.cybozu.labs.langdetect.Detector]
      #
      def new_detector(input)
        factory = com.cybozu.labs.langdetect.DetectorFactory.new

        factory.load_profile(determine_profiles(input))

        # A fixed seed is set on the factory before the detector is created.
        factory.set_seed(1)

        priorities = build_priorities(input, factory.langlist)
        detector   = com.cybozu.labs.langdetect.Detector.new(factory)

        detector.set_prior_map(priorities)
        detector.append(input.downcase)

        detector
      end

      ##
      # Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER
      # languages.
      #
      # If the input is short (see {#short_input?}) the non OpeNER languages
      # are given a priority of 0.0, which is meant to _disable_ them. This is
      # to ensure that the OpeNER languages are detected properly when
      # analysing only 1-2 words.
      #
      # @param [String] input
      # @param [Array<String>] languages
      # @return [java.util.HashMap]
      #
      def build_priorities(input, languages)
        priorities = java.util.HashMap.new
        fallback   = short_input?(input) ? 0.0 : DEFAULT_PRIORITY

        # Seed the map with the fixed OpeNER priorities first so that they are
        # never overwritten by the fallback priority below.
        PRIORITIES.each { |lang, value| priorities.put(lang, value) }

        languages.each do |language|
          priorities.put(language, fallback) unless priorities.contains_key(language)
        end

        priorities
      end

      ##
      # Returns the profiles directory to use for the given input: the short
      # profiles for short input, the regular profiles otherwise.
      #
      # @param [String] input
      # @return [String]
      #
      def determine_profiles(input)
        short_input?(input) ? short_profiles_path : profiles_path
      end

      ##
      # Returns true when the input is at most SHORT_THRESHOLD characters long.
      #
      # @param [String] input
      # @return [TrueClass|FalseClass]
      #
      def short_input?(input)
        input.length <= SHORT_THRESHOLD
      end
    end # Detector
  end # LanguageIdentifier
end # Opener