DataFile.rb in jldrill-0.6.0.1

- old
+ new

@@ -1,20 +1,23 @@
+# encoding: utf-8
 require 'Context/Publisher'
+require 'kconv'
 
 module JLDrill
     # This class represents a data file in JLDrill.   This
     # is an abstract class meant to define the interface
     # for having a file which can be read in the background
     # in JLDrill.
     class DataFile
-        attr_reader :file, :lines, :parsed, :publisher, :stepSize
+        attr_reader :file, :lines, :parsed, :publisher, :stepSize, :encoding  # :encoding is the value assigned in createLines
         attr_writer :lines, :stepSize
 
         def initialize
             @publisher = Context::Publisher.new(self)
             # Default to reporting every 100 lines
             @stepSize = 100
+            @encoding = nil  # placeholder; createLines assigns the guessed encoding
             self.reset
         end
 
         # Returns the number of items you have created
         def dataSize
@@ -70,18 +73,39 @@
 				retVal = @parsed.to_f / @lines.size.to_f
 			end
 			return retVal
 		end
 
+        # Try to determine the encoding from the first 999 characters
+        # of the string.  Keeping the sample length a multiple of 3 is
+        # meant to avoid splitting a multi-byte UTF-8 character.  I can't
+        # help but think that this function is prone to failure since UTF-8
+        # characters are variable length, but I can't think of a better idea.
+        # Note that this problem will only manifest itself on Ruby 1.8.
+        def findEncoding(buffer)  # buffer: String holding the file contents; returns Kconv's guess
+            encoding = Kconv.guess(buffer[0..998])  # only indices 0..998 of buffer are sampled
+            return encoding  # caller compares this against Kconv::UTF8
+        end
+
+        # Make sure the encoding is correct and split the lines
+        def createLines(buffer)  # buffer: String holding the file contents; fills @encoding and @lines
+            @encoding = findEncoding(buffer)  # keeps the guessed source encoding, even if we convert below
+            if (@encoding != Kconv::UTF8)  # anything not reported as UTF-8 gets converted
+                buffer = Kconv.kconv(buffer, Kconv::UTF8, @encoding)  # args: string, target encoding, source encoding
+            end
+            @lines = buffer.split("\n")  # split drops the "\n" separators; an empty buffer yields []
+        end
+
         # Read the file into memory.  This is done before parsing
         def readLines
             begin
-                @lines = IO.readlines(@file)
+                buffer = IO.read(@file)  # read the whole file into a single string
             rescue
                 Context::Log::warning("JLDrill::DataFile",
                                       "Could not load #{@file}.")
-                @lines = []
+                buffer = ""  # on failure fall through with empty content so createLines still runs
             end
+            createLines(buffer)  # sets @encoding and splits buffer into @lines
             @parsed = 0
         end
 
         # Load in the file data, but don't parse it yet
 		def load(file)