Sha256: f16a09f522a054a08a1ac4349239c56a6c5684f427c856aab9b29427e5011848

Contents?: true

Size: 1.55 KB

Versions: 59

Compression:

Stored size: 1.55 KB

Contents

module Dap
module Input

  #
  # WARC
  #
  # Input source that reads records from a WARC file one at a time.
  #
  # The first record is consumed during construction and must be a
  # "warcinfo" record; it is kept in +header+, and the "key: value" lines
  # found in its content are exposed through +info+.
  #
  # NOTE(review): +open+ and +fd+ are not defined in this class; they are
  # presumably supplied by FileSource -- confirm against that module.
  #
  class InputWARC

    include FileSource

    # header: the first record of the file, as returned by read_record
    # info:   Hash of the "key: value" pairs parsed from the header content
    attr_accessor :header, :info

    # Opens the input and reads the leading warcinfo record.
    #
    # @param args [Array] only the first element is used; it is handed to +open+
    # @raise [RuntimeError] if the first record is missing or is not a warcinfo record
    def initialize(args)
      self.open(args.first)
      read_warc_header
    end

    # Reads the first record and populates +header+ and +info+.
    #
    # @raise [RuntimeError] if no record could be read, or if the record's
    #   WARC-Type is not "warcinfo"
    def read_warc_header
      self.header = read_record

      raise RuntimeError, "Invalid WARC header" if self.header == Error::EOF

      unless self.header['warc_type'].to_s == "warcinfo"
        raise RuntimeError, "Invalid WARC header (missing warcinfo)"
      end

      self.info = {}
      self.header['content'].to_s.split("\n").each do |line|
        k, v = line.strip.split(/\s*:\s*/, 2)
        # Lines without a colon carry no value and are ignored.
        next unless v
        self.info[k] = v
      end
    end

    # Reads the next record from the underlying file.
    #
    # Header names are normalised to lower case with dashes replaced by
    # underscores (e.g. "Content-Length" becomes "content_length"), and the
    # record body is stored under the "content" key.
    #
    # @return [Hash, Object] the parsed record, or Error::EOF when the input
    #   is exhausted or the record is malformed (bad version line, missing or
    #   negative Content-Length, or a non-blank final trailer line)
    def read_record
      # readline raises EOFError at end of input (handled by the rescue
      # below), so the version line is never nil here.
      version = self.fd.readline
      return Error::EOF unless version =~ /^WARC\/\d+\.\d+/

      warc = {}

      # Header fields run until the first blank line.
      loop do
        line = self.fd.readline.strip
        break if line.empty?

        k, v = line.split(/\s*:\s*/, 2)
        warc[k.downcase.gsub('-', '_')] = v.to_s
      end

      return Error::EOF unless warc['content_length']

      # A negative length would make IO#read raise ArgumentError; treat it as
      # a malformed record, like every other parse failure in this method.
      length = warc['content_length'].to_i
      return Error::EOF if length < 0

      warc['content'] = self.fd.read(length)

      # Two lines are consumed after the content. The first is discarded
      # without inspection; only the second must be blank.
      self.fd.readline
      trailer = self.fd.readline
      return Error::EOF unless trailer.strip.empty?

      warc
    rescue ::EOFError
      Error::EOF
    end

  end

end
end

Version data entries

59 entries across 59 versions & 1 rubygems

Version Path
dap-1.3.1 lib/dap/input/warc.rb
dap-1.3.0 lib/dap/input/warc.rb
dap-1.2.9 lib/dap/input/warc.rb
dap-1.2.8 lib/dap/input/warc.rb
dap-1.2.7 lib/dap/input/warc.rb
dap-1.2.6 lib/dap/input/warc.rb
dap-1.2.5 lib/dap/input/warc.rb
dap-1.2.4 lib/dap/input/warc.rb
dap-1.2.3 lib/dap/input/warc.rb
dap-1.2.2 lib/dap/input/warc.rb
dap-1.2.1 lib/dap/input/warc.rb
dap-1.2.0 lib/dap/input/warc.rb
dap-1.0.2 lib/dap/input/warc.rb
dap-1.0.1 lib/dap/input/warc.rb
dap-1.0.0 lib/dap/input/warc.rb
dap-0.1.24 lib/dap/input/warc.rb
dap-0.1.23 lib/dap/input/warc.rb
dap-0.1.22 lib/dap/input/warc.rb
dap-0.1.21 lib/dap/input/warc.rb
dap-0.1.20 lib/dap/input/warc.rb