Sha256: 5b5a85b76e55bbdfb817ef7b53e1a8b90827cf8fb93da58e4ea67191bce20058

Contents?: true

Size: 804 Bytes

Versions: 3

Compression:

Stored size: 804 Bytes

Contents

# coding: utf-8
# Text filter that rewrites its input so that each sentence ends a line.
#
# A line break is inserted after every character listed in END_CHAR, except
# where that character is the final character of one of NOT_END_WORDS.
class Tefil::EachSentence < Tefil::TextFilterBase

  # Characters after which a line break is inserted.
  # NOTE(review): "." is listed twice; the second entry may have been meant
  # to be a different (e.g. full-width) period -- confirm against upstream.
  END_CHAR = %w(. ? . 。).freeze

  # Words whose trailing period must not be treated as a sentence end.
  NOT_END_WORDS = ["Fig.", "FIG."].freeze

  # @param options [Hash] filter options. :minimum is stored in @minimum;
  #   :smart_filename is always forced to true before the options are passed
  #   on to the superclass.
  def initialize(options = {})
    @minimum = options[:minimum]
    # Merge into a copy rather than assigning, so the caller's hash is left
    # untouched (and a frozen hash can be passed in).
    super(options.merge(smart_filename: true))
  end

  # Reads all of in_io, breaks every input line into sentences and writes
  # the result to out_io.
  #
  # @param in_io  [#read] source stream; it is read in full
  # @param out_io [#puts] destination stream
  def process_stream(in_io, out_io)
    results = in_io.read.strip.split("\n").map { |line| break_into_sentences(line) }
    out_io.puts results.join("\n")
  end

  private

  # Returns +line+ with a newline inserted after each sentence-ending
  # character, with whitespace tidied up.
  #
  # @param line [String] a single input line (without its line terminator)
  # @return [String] the line split into one sentence per line
  def break_into_sentences(line)
    text = ''.dup
    # Append in place (<<) instead of += so no new string is allocated for
    # every character.
    line.each_char do |char|
      text << char
      text << "\n" if END_CHAR.include?(char)
    end

    # Undo the break after abbreviations such as "Fig.". The word is escaped
    # so that its "." matches only a literal period; unescaped, "Fig?" followed
    # by a newline would also match and be rewritten to "Fig.".
    NOT_END_WORDS.each do |word|
      text.gsub!(/#{Regexp.escape(word)}\n/) { word }
    end

    text.gsub!(/\n  */, "\n") # drop spaces that directly follow a line break
    text.strip!
    text.gsub!(/  */, " ")    # collapse runs of spaces into a single space
    text
  end

end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
tefil-0.1.4 lib/tefil/eachsentence.rb
tefil-0.1.3 lib/tefil/eachsentence.rb
tefil-0.1.2 lib/tefil/eachsentence.rb