lib/jldrill/model/DataFile.rb in jldrill-0.5.1.7 vs lib/jldrill/model/DataFile.rb in jldrill-0.6.0.1

- old
+ new

@@ -1,20 +1,23 @@
+# encoding: utf-8
 require 'Context/Publisher'
+require 'kconv'
 
 module JLDrill
     # This class represents a data file in JLDrill. This
     # is an abstract class meant to define the interface
     # for having a file which can be read in the background
     # in JLDrill.
     class DataFile
-        attr_reader :file, :lines, :parsed, :publisher, :stepSize
+        attr_reader :file, :lines, :parsed, :publisher, :stepSize, :encoding
         attr_writer :lines, :stepSize
 
         def initialize
             @publisher = Context::Publisher.new(self)
             # Default to reporting every 100 lines
             @stepSize = 100
+            @encoding = nil
             self.reset
         end
 
         # Returns the number of items you have created
         def dataSize
@@ -70,18 +73,39 @@
                 retVal = @parsed.to_f / @lines.size.to_f
             end
             return retVal
         end
 
+        # Try to determine the encoding from the first 999 characters
+        # of the string. By keeping it to a multiple of 3 we avoid
+        # splitting the encodings for UTF8 strings. I can't help but
+        # think that this function is prone to failure since UTF8 characters
+        # are variable length, but I can't think of a better idea.
+        # Note this problem will only manifest itself on ruby 1.8
+        def findEncoding(buffer)
+            encoding = Kconv.guess(buffer[0..998])
+            return encoding
+        end
+
+        # Make sure the encoding is correct and split the lines
+        def createLines(buffer)
+            @encoding = findEncoding(buffer)
+            if (@encoding != Kconv::UTF8)
+                buffer = Kconv.kconv(buffer, Kconv::UTF8, @encoding)
+            end
+            @lines = buffer.split("\n")
+        end
+
         # Read the file into memory. This is done before parsing
         def readLines
             begin
-                @lines = IO.readlines(@file)
+                buffer = IO.read(@file)
             rescue
                 Context::Log::warning("JLDrill::DataFile", "Could not load #{@file}.")
-                @lines = []
+                buffer = ""
             end
+            createLines(buffer)
             @parsed = 0
         end
 
         # Load in the file data, but don't parse it yet
         def load(file)