lib/pragmatic_segmenter/segmenter.rb in pragmatic_segmenter-0.0.3 vs lib/pragmatic_segmenter/segmenter.rb in pragmatic_segmenter-0.0.4
- old
+ new
@@ -15,67 +15,33 @@
require 'pragmatic_segmenter/languages/french'
require 'pragmatic_segmenter/languages/italian'
require 'pragmatic_segmenter/languages/spanish'
require 'pragmatic_segmenter/languages/russian'
require 'pragmatic_segmenter/languages/japanese'
+require 'pragmatic_segmenter/languages/common'
+require 'pragmatic_segmenter/language_support'
require 'pragmatic_segmenter/rules'
module PragmaticSegmenter
# This class segments a text into an array of sentences.
class Segmenter
- include Rules
+ include LanguageSupport
attr_reader :text, :language, :doc_type
+
def initialize(text:, **args)
- return [] unless text
+ return unless text
@language = args[:language] || 'en'
@doc_type = args[:doc_type]
- if args[:clean].eql?(false)
- @text = text.dup
- else
- case @language
- when 'en'
- @text = PragmaticSegmenter::Languages::English::Cleaner.new(text: text.dup, doc_type: args[:doc_type]).clean
- when 'ja'
- @text = PragmaticSegmenter::Languages::Japanese::Cleaner.new(text: text.dup, doc_type: args[:doc_type]).clean
- else
- @text = PragmaticSegmenter::Cleaner.new(text: text.dup, doc_type: args[:doc_type]).clean
- end
+ @text = text.dup
+
+ unless args[:clean].eql?(false)
+ @text = cleaner_class.new(text: @text, doc_type: args[:doc_type]).clean
end
end
def segment
return [] unless text
- case language
- when 'en'
- PragmaticSegmenter::Process.new(text: text, doc_type: doc_type).process
- when 'de'
- PragmaticSegmenter::Languages::Deutsch::Process.new(text: text, doc_type: doc_type).process
- when 'es'
- PragmaticSegmenter::Languages::Spanish::Process.new(text: text, doc_type: doc_type).process
- when 'it'
- PragmaticSegmenter::Languages::Italian::Process.new(text: text, doc_type: doc_type).process
- when 'ja'
- PragmaticSegmenter::Languages::Japanese::Process.new(text: text, doc_type: doc_type).process
- when 'el'
- PragmaticSegmenter::Languages::Greek::Process.new(text: text, doc_type: doc_type).process
- when 'ru'
- PragmaticSegmenter::Languages::Russian::Process.new(text: text, doc_type: doc_type).process
- when 'ar'
- PragmaticSegmenter::Languages::Arabic::Process.new(text: text, doc_type: doc_type).process
- when 'am'
- PragmaticSegmenter::Languages::Amharic::Process.new(text: text, doc_type: doc_type).process
- when 'hi'
- PragmaticSegmenter::Languages::Hindi::Process.new(text: text, doc_type: doc_type).process
- when 'hy'
- PragmaticSegmenter::Languages::Armenian::Process.new(text: text, doc_type: doc_type).process
- when 'fa'
- PragmaticSegmenter::Languages::Persian::Process.new(text: text, doc_type: doc_type).process
- when 'my'
- PragmaticSegmenter::Languages::Burmese::Process.new(text: text, doc_type: doc_type).process
- when 'ur'
- PragmaticSegmenter::Languages::Urdu::Process.new(text: text, doc_type: doc_type).process
- else
- PragmaticSegmenter::Process.new(text: text, doc_type: doc_type).process
- end
+
+ process_class.new(text: text, doc_type: doc_type).process
end
end
end