lib/pragmatic_segmenter/languages/deutsch.rb in pragmatic_segmenter-0.3.3 vs lib/pragmatic_segmenter/languages/deutsch.rb in pragmatic_segmenter-0.3.4

- old
+ new

@@ -16,77 +16,62 @@ SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/ # Rubular: http://rubular.com/r/TkZomF9tTM BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/ - # Rubular: http://rubular.com/r/hZxoyQwKT1 - NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯') - # Rubular: http://rubular.com/r/ityNMwdghj - NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯') + module Numbers + # Rubular: http://rubular.com/r/hZxoyQwKT1 + NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯') + # Rubular: http://rubular.com/r/ityNMwdghj + NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯') + + All = [ + Common::Numbers::All, + NumberPeriodSpaceRule, + NegativeNumberPeriodSpaceRule + ] + end + MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember'] # Rubular: http://rubular.com/r/B4X33QKIL8 SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯') # Rubular: http://rubular.com/r/iUNSkCuso0 SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯') - - class Process < PragmaticSegmenter::Process + class Processor < PragmaticSegmenter::Processor private - def between_punctuation(txt) - BetweenPunctuation.new(text: txt).replace - end + def replace_numbers + @text.apply Numbers::All - def replace_numbers(txt) - Number.new(text: txt).replace + replace_period_in_deutsch_dates end - def replace_abbreviations(txt) - AbbreviationReplacer.new(text: txt, language: Deutsch).replace - end - end - - class Cleaner < PragmaticSegmenter::Cleaner - private - - def abbreviations - Abbreviation::ABBREVIATIONS - end - end - - class Number < PragmaticSegmenter::Number - def replace - super - @text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule) - replace_period_in_deutsch_dates(@text) - end - - def replace_period_in_deutsch_dates(txt) + def replace_period_in_deutsch_dates MONTHS.each do |month| # Rubular: http://rubular.com/r/zlqgj7G5dA - txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯') + @text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯') end - txt end end class AbbreviationReplacer < AbbreviationReplacer def replace - @reformatted_text = text.apply( + @text = text.apply( @language::PossessiveAbbreviationRule, @language::SingleLetterAbbreviationRules::All, SingleLowerCaseLetterRule, SingleLowerCaseLetterAtStartOfLineRule) - @reformatted_text = search_for_abbreviations_in_string(@reformatted_text) - @reformatted_text = replace_multi_period_abbreviations(@reformatted_text) - @reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All) - replace_abbreviation_as_sentence_boundary(@reformatted_text) + @text = search_for_abbreviations_in_string(@text) + @text = replace_multi_period_abbreviations(@text) + @text.apply(Languages::Common::AmPmRules::All) + replace_abbreviation_as_sentence_boundary(@text) end private def scan_for_replacements(txt, am, index, character_array) @@ -95,18 +80,10 @@ end class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation private - def sub_punctuation_between_double_quotes(txt) - btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt) - PragmaticSegmenter::PunctuationReplacer.new( - matches_array: btwn_dbl_quote, - text: txt - ).replace - end - - def sub_punctuation_between_double_quotes_de(txt) + def btwn_dbl_quote(txt) if txt.include?('„') btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX) txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q| btwn_dbl_quote << q end