Sha256: 3eb7e72ec7a7fa2ba63cf8b80aa592a5bff13818efc42dd1b668b5ada58b8527

Contents?: true

Size: 798 Bytes

Versions: 1

Compression:

Stored size: 798 Bytes

Contents

# coding: utf-8
# Text filter that re-flows its input so that a line break follows every
# sentence-ending character.
class Tefil::EachSentence < Tefil::TextFilterBase

  # Characters after which a line break is inserted.
  # NOTE(review): "." is listed twice; the second entry may originally have
  # been a full-width period -- confirm against the upstream source.
  END_CHAR = %w(. ? . 。)

  # Tokens that end with a period without ending a sentence. A line break
  # inserted directly after one of these is removed again.
  NOT_END_WORDS = ["Fig.", "FIG."]

  # options - Hash of filter options.
  #           :minimum is stored in @minimum (nothing in this class reads it).
  #           :smart_filename is always forced to true before the options are
  #           handed to the superclass.
  #
  # The caller's hash is left untouched; a merged copy is passed to super.
  def initialize(options = {})
    @minimum = options[:minimum]
    super(options.merge(smart_filename: true))
  end

  # Reads all of in_io, strips surrounding whitespace, and processes the text
  # line by line so that each sentence sits on its own line. The result is
  # written to out_io with a single puts.
  #
  # in_io  - readable IO-like object (must respond to #read).
  # out_io - writable IO-like object (must respond to #puts).
  def process_stream(in_io, out_io)
    # Built once per call rather than once per line. Regexp.union escapes the
    # words, so the "." in "Fig." matches only a literal period and can no
    # longer rewrite e.g. "Fig?" into "Fig.".
    abbreviation_break = /(#{Regexp.union(NOT_END_WORDS)})\n/

    results = in_io.read.strip.split("\n").map do |line|
      split_into_sentences(line, abbreviation_break)
    end
    out_io.puts results.join("\n")
  end

  private

  # Returns a copy of line with a newline after every END_CHAR character,
  # except directly after a NOT_END_WORDS token. Spaces that follow an
  # inserted newline are dropped and runs of spaces are collapsed to one.
  #
  # Every END_CHAR triggers a break regardless of what follows it, so a
  # period inside a number such as "3.14" also splits the line.
  def split_into_sentences(line, abbreviation_break)
    text = ''.dup
    line.each_char do |char|
      text << char
      text << "\n" if END_CHAR.include?(char)
    end

    text.gsub!(abbreviation_break, '\1') # undo breaks after abbreviations
    text.gsub!(/\n +/, "\n")             # no leading spaces on new sentences
    text.strip!
    text.gsub!(/ +/, " ")                # collapse runs of spaces
    text
  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
tefil-0.1.1 lib/tefil/eachsentence.rb