lib/opener/language_identifier/detector.rb in opener-language-identifier-4.2.2 vs lib/opener/language_identifier/detector.rb in opener-language-identifier-4.3.0

- old
+ new

@@ -4,167 +4,44 @@ # Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This # class automatically handles switching of profiles based on input sizes, # assigning priorities to languages, etc. # class Detector - attr_reader :profiles_path, :short_profiles_path - ## - # Path to the directory containing the default profiles. - # - # @return [String] - # - DEFAULT_PROFILES_PATH = File.expand_path( - '../../../../core/target/classes/profiles', - __FILE__ - ) + attr_reader :backend ## - # Path to the directory containing the default short profiles. - # - # @return [String] - # - DEFAULT_SHORT_PROFILES_PATH = File.expand_path( - '../../../../core/target/classes/short_profiles', - __FILE__ - ) - - ## - # The amount of characters after which the detector should switch to using - # the longer profiles set. - # - # @return [Fixnum] - # - SHORT_THRESHOLD = 15 - - ## - # Prioritize OpeNER languages over the rest. Languages not covered by this - # list are automatically given a default priority. - # - # @return [Hash] - # - PRIORITIES = { - 'en' => 1.0, - 'es' => 0.9, - 'it' => 0.9, - 'fr' => 0.9, - 'de' => 0.9, - 'nl' => 0.9, - - # These languages are disabled (for the time being) due to conflicting - # with other (OpeNER) languages too often. - 'af' => 0.0, # conflicts with Dutch - } - - ## - # The default priority for non OpeNER languages. - # - # @return [Float] - # - DEFAULT_PRIORITY = 0.5 - - ## # @param [Hash] options # - # @option options [String] :profiles_path - # @option options [String] :short_profiles_path # - def initialize(options = {}) - options.each do |key, value| - instance_variable_set("@#{key}", value) if respond_to?(key) - end + def initialize backend = nil, fallback = nil + klass = Backend.const_get backend.to_sym if backend + klass ||= LanguageDetection + @backend = klass.new - @profiles_path ||= DEFAULT_PROFILES_PATH - @short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH + klass = Backend.const_get fallback.to_sym if fallback + @fallback = klass.new if klass + + @timeout = ENV['TIMEOUT']&.to_i end ## # @return [String] # def detect(input) - return new_detector(input).detect - - # The core Java code raise an exception when it can't detect a language. - # Since this isn't actually something fatal we'll capture this and return - # "unknown" instead. - rescue com.cybozu.labs.langdetect.LangDetectException - return 'unknown' + backend_detect @backend, input + rescue + raise unless @fallback + puts 'Using fallback backend' if ENV['DEBUG'] + backend_detect @fallback, input end - ## - # @return [Array] - # - def probabilities(input) - return new_detector(input).get_probabilities.to_array - end - - ## - # Returns a new detector with the profiles set based on the input. - # - # This method analyses a lowercased version of the input as this yields - # better results for short text. - # - # @param [String] input - # @return [CybozuDetector] - # - def new_detector(input) - factory = com.cybozu.labs.langdetect.DetectorFactory.new - - factory.load_profile(determine_profiles(input)) - factory.set_seed(1) - - priorities = build_priorities(input, factory.langlist) - detector = com.cybozu.labs.langdetect.Detector.new(factory) - - detector.set_prior_map(priorities) - detector.append(input.downcase) - - return detector - end - - ## - # Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER - # languages. - # - # If the input size is smaller than the short profiles threshold non - # OpeNER languages are _disabled_. This is to ensure that these languages - # are detected properly when analysing only 1-2 words. - # - # @param [String] input - # @param [Array<String>] languages - # @return [java.util.HashMap] - # - def build_priorities(input, languages) - priorities = java.util.HashMap.new - priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY - - PRIORITIES.each do |lang, val| - priorities.put(lang, val) + def backend_detect backend, input + return backend.detect input unless @timeout + Timeout.timeout @timeout do + backend.detect input end - - languages.each do |language| - unless priorities.contains_key(language) - priorities.put(language, priority) - end - end - - return priorities end - ## - # @param [String] input - # @return [String] - # - def determine_profiles(input) - return short_input?(input) ? short_profiles_path : profiles_path - end - - ## - # @param [String] input - # @return [TrueClass|FalseClass] - # - def short_input?(input) - return input.length <= SHORT_THRESHOLD - end - end # Detector - end # LanguageIdentifier -end # Opener + end + end +end