Sha256: a8c0669ad0d3c94a65fdb32307420b1e395dcaa27d086fb3ef436e35b6f113d4

Contents?: true

Size: 781 Bytes

Versions: 1

Compression:

Stored size: 781 Bytes

Contents

# encoding: UTF-8

require File.expand_path('../../vendor/icu4j-53_1.jar', __FILE__)

java_import 'com.ibm.icu.util.ULocale'
java_import 'com.ibm.icu.text.BreakIterator'

# Splits text at the word boundaries reported by ICU and marks every
# boundary with a pilcrow character.
module Pilcrow
  class << self

    # Segments +text+ with ICU's word break iterator for +locale+ and joins
    # the resulting segments with a pilcrow.
    #
    # @param text [String] the text to segment; it must be transcodable to
    #   UTF-16, since segment boundaries are resolved in UTF-16 code units
    # @param locale [String] locale identifier handed to +ULocale.new+
    # @return [String] the segments of +text+ separated by "\u00B6"
    def process(text, locale)
      insert_markers(segment_text(text, locale))
    end

    private

    # Cuts +text+ into the consecutive segments delimited by the break
    # iterator's boundaries.
    #
    # @param text [String] the text to segment
    # @param locale [String] locale identifier handed to +ULocale.new+
    # @return [Array<String>] the segments, in order, in the encoding of +text+
    def segment_text(text, locale)
      brkiter = BreakIterator.getWordInstance(ULocale.new(locale))
      brkiter.setText(text)

      # The iterator reports boundaries as UTF-16 code unit offsets, whereas
      # String#[] counts characters. The two diverge as soon as the text holds
      # a character outside the Basic Multilingual Plane (two code units, one
      # character), so slice a UTF-16 copy by code units instead.
      utf16 = text.encode(Encoding::UTF_16LE)
      start = brkiter.first
      segments = []

      until (stop = brkiter.next) == BreakIterator::DONE
        # Every UTF-16 code unit occupies exactly two bytes.
        piece = utf16.byteslice(start * 2, (stop - start) * 2)
        segments << piece.encode(text.encoding)
        start = stop
      end

      segments
    end

    # Joins +segments+ into a single string, placing a pilcrow between
    # neighbouring segments.
    #
    # @param segments [Array<String>] the segments to join
    # @return [String] the joined text
    def insert_markers(segments)
      # pilcrow character: http://en.wikipedia.org/wiki/Pilcrow
      segments.join("\u00B6")
    end

  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
pilcrow-1.0.0 lib/pilcrow.rb