Sha256: f16a09f522a054a08a1ac4349239c56a6c5684f427c856aab9b29427e5011848
Contents?: true
Size: 1.55 KB
Versions: 59
Compression:
Stored size: 1.55 KB
Contents
module Dap
module Input

  #
  # WARC
  #
  # Input source that reads records from a WARC file. The WARC header record
  # is consumed on construction; each later call to #read_record yields the
  # next record as a Hash.
  #
  # NOTE(review): relies on the FileSource mixin for #open and #fd, and on an
  # Error::EOF sentinel resolved through the enclosing modules -- neither is
  # defined here; confirm against their definitions.
  #
  class InputWARC

    include FileSource

    # header: the first record of the file (a Hash as built by #read_record).
    # info:   key/value pairs parsed from the content of that first record.
    attr_accessor :header, :info

    # Opens the source named by the first element of +args+ and validates
    # the leading WARC header record.
    #
    # @param args [Array] argument list; only the first element is used
    # @raise [RuntimeError] if the header record is missing or is not a
    #   "warcinfo" record
    def initialize(args)
      self.open(args.first)
      read_warc_header
    end

    # Reads the first record, checks that it is a "warcinfo" record and
    # stores it in #header. Each "key: value" line of its content is stored
    # in #info; lines without a colon separator are ignored.
    #
    # @raise [RuntimeError] if no valid record could be read, or the record
    #   type is not "warcinfo"
    def read_warc_header
      self.header = read_record

      if self.header == Error::EOF
        raise RuntimeError, "Invalid WARC header"
      end

      unless self.header['warc_type'].to_s == "warcinfo"
        raise RuntimeError, "Invalid WARC header (missing warcinfo)"
      end

      self.info = {}
      self.header['content'].to_s.split("\n").each do |line|
        key, value = line.strip.split(/\s*:\s*/, 2)
        next unless value
        self.info[key] = value
      end
    end

    # Reads the next record from the underlying file.
    #
    # Header names are lower-cased and have '-' replaced by '_' (so
    # "Content-Length" becomes 'content_length'); header values are kept as
    # strings. The record body is stored under 'content'.
    #
    # @return [Hash, Object] the parsed record, or Error::EOF when the input
    #   is exhausted or the record is malformed (bad version line, missing or
    #   negative Content-Length, missing blank separator line)
    def read_record
      version = self.fd.readline
      unless version && version =~ /^WARC\/\d+\.\d+/
        return Error::EOF
      end

      warc = {}

      # Header lines run until the first blank line.
      loop do
        line = self.fd.readline.strip
        break if line.empty?

        key, value = line.split(/\s*:\s*/, 2)
        warc[key.downcase.gsub('-', '_')] = value.to_s
      end

      return Error::EOF unless warc['content_length']

      # A negative length would make the read call raise ArgumentError
      # (assuming fd is an IO), which the EOFError handler below does not
      # cover; treat it like any other malformed record instead.
      length = warc['content_length'].to_i
      return Error::EOF if length < 0

      warc['content'] = self.fd.read(length)

      # Two lines follow the body; only the second is required to be blank.
      self.fd.readline
      separator = self.fd.readline
      return Error::EOF unless separator.strip.empty?

      warc
    rescue ::EOFError
      Error::EOF
    end

  end

end
end
Version data entries
59 entries across 59 versions & 1 rubygems