#!/usr/bin/ruby -w

autoload :XMLTV, 'xmltv/xmltv'
require 'cgi'
require 'pp' # PP.pp is used in the error paths of #transform

module XMLTV
  # Grabber that collects programme listings from a Dutch TV-guide web site
  # and converts them into the programme hashes used by the XMLTV framework.
  #
  # Helpers such as +fetch+, +base_url+, +save_object+, +load_cachefile+,
  # +cachefile+, +channel_list+, +proghash+ and +date_stats+ are not defined
  # in this file; they are presumably provided by Grabber (verify there).
  class TvgidsGrabber < Grabber
    # Site genre (lower-cased Dutch) => category name stored in the output.
    Cattrans = {
      'amusement' => 'Talk',
      'animatie' => 'Animated',
      'comedy' => 'Comedy',
      'documentaire' => 'Documentary',
      'educatief' => 'Educational',
      'erotiek' => 'Adult',
      'film' => 'Movies',
      'informatief' => 'Educational',
      'jeugd' => 'Children',
      'kunst/cultuur' => 'Arts/Culture',
      'misdaad' => 'Crime/Mystery',
      'muziek' => 'Music',
      'natuur' => 'Science/Nature',
      'nieuws/actualiteiten' => 'News',
      'overige' => 'Unknown',
      'religieus' => 'Religion',
      'serie/soap' => 'Drama',
      'sport' => 'Sports',
      'theater' => 'Arts/Culture',
      'wetenschap' => 'Science/Nature'
    }

    # Detail-page field name => key used under progdata['credits'].
    Roletrans = {
      'regie' => 'director',
      'acteurs' => 'actor',
      'presentatie' => 'presenter',
      'scenario' => 'writer'
    }

    # Detail-page field name => top-level progdata key.
    Titeltrans = {
      'titel aflevering' => 'sub-title',
      'jaar van premiere' => 'date',
      'aflevering' => 'episode-num'
    }

    # Fetches one programme detail page and returns its fields as a Hash.
    #
    # href:: absolute URL, or a site-relative path starting with '/'.
    #
    # The result has a 'desc' entry (the description paragraphs joined with
    # spaces) plus one entry per property row, keyed by the row header text
    # stripped, with colons removed and lower-cased. Rows whose cell has
    # class 'personen' yield an Array of names; all other rows a String.
    def grab_detail(href)
      # href[0, 1] gives a one-character String on every Ruby version,
      # unlike href[0], which is an Integer on 1.8.
      href = "#{base_url}#{href}" if href[0, 1] == '/'
      STDERR.puts "#{Time.now}: #{href} #{@channelhash.size}" if XmltvOptions.verbose

      program = Hash.new
      details = fetch(href)

      desc = []
      details.at('//table#progDetail').search('//tr//p').each do |para|
        # The 'meerLinks' paragraph ends the description text.
        break if para['class'] == 'meerLinks'
        line = para.inner_text.strip
        desc << line unless line.empty?
      end
      program['desc'] = desc.join(' ').to_utf

      details.search('//div#progPropt//tr/th').each do |header|
        content = header.at('../td')
        value =
          if content['class'] == 'personen'
            # Keep only the non-empty text nodes inside the div.
            text_nodes = content.at('div').search('.').find_all { |node| node.text? }
            names = text_nodes.map { |node| node.to_s.strip.to_utf }
            names.reject { |name| name.empty? }
          else
            content.inner_text.strip.to_utf
          end
        program[header.inner_text.strip.gsub(':', '').downcase] = value
      end
      program
    end

    # Returns the search URL that lists the programmes of channel +chan_id+.
    #
    # NOTE(review): grab_detail appends paths starting with '/' directly to
    # base_url, so this URL contains a double slash after the host. Left
    # unchanged because the site evidently accepts it - confirm before
    # normalising.
    def channel_url(chan_id)
      "#{base_url}//zoeken/?periode=9&station=#{chan_id}"
    end

    # Reads the channel list from the first two <optgroup> elements of the
    # search page, stores it with save_object and returns it as a Hash of
    # option value => option label.
    def fetch_all_channels
      page = fetch(channel_url(1))
      channels = Hash.new
      page.search('//optgroup')[0..1].each do |group|
        group.search('/option').each do |option|
          channels[option['value']] = option.inner_text
        end
      end
      save_object(channels, channel_list)
      channels
    end

    # Deletes every cache entry whose 'datum' lies before Vandaag and
    # returns the number of entries removed. +cache+ is modified in place.
    def clean_cache(cache)
      size_before = cache.size
      cache.delete_if { |_id, entry| Date.dutch(entry['datum']) < Vandaag }
      size_before - cache.size
    end

    # Downloads the listing for +chan_id+, fetches the detail page of every
    # programme not yet in the cache and saves the cache when anything new
    # was fetched.
    #
    # Returns the programme count reported by the site, or nil when the
    # page has no result table (the site's message is then printed).
    def grab_channel(chan_id)
      url = channel_url(chan_id)
      page = fetch(url)
      @channelhash = load_cachefile(chan_id)
      period = datum = nil
      fetched = 0

      total = page.at("//table.overzicht//tr//td/strong")
      if total.nil?
        # No result table: report what the site says instead.
        niks = page.at("//div#resultaten").at("//td").inner_text
        STDERR.puts url, niks
        return
      end
      found = remaining = total.inner_text.to_i

      page.search("//table.overzicht//tr").each do |row|
        td = row.at('td')
        next if td.nil? || row['class'] == 'zoekstring'

        if td['class'] == 'bloktitel'
          # Block header row: remember period and date for the rows below,
          # keeping the previous value when an element is absent.
          h5 = td.at('h5')
          period = h5.inner_text if h5
          h4 = td.at('h4')
          datum = h4.inner_text if h4
          next
        end

        tijd = row.at('/th').inner_text
        next unless tijd =~ /\d\d:\d\d/

        det = row.at('/td//a')
        href = det['href']
        id = href[/ID=(\d+)/, 1]
        remaining -= 1
        next if @channelhash[id]

        fetched += 1
        begin
          @channelhash[id] = program = grab_detail(href)
        rescue
          # Show the offending row before propagating the error.
          STDERR.puts href, row, '====='
          raise
        end
        program['title'] = det.inner_text.strip.to_utf
        program['period'] = period
        program['datum'] = datum
        program['tijd'] = tijd
        program['progtip'] = '4/5' if row['class'] == 'progTip'
      end

      STDERR.puts "Something wrong remaining: #{remaining}" if remaining != 0
      save_object(@channelhash, cachefile(chan_id)) if fetched > 0
      found
    end

    # Parses a string of the form "<day> <month name> <year>, <hh>:<mm>",
    # optionally followed by a second "<hh>:<mm>" stop time.
    #
    # Returns [day, month, year, start_hour, start_min] with stop_hour and
    # stop_min appended when a stop time is present. The month number is
    # the position of the lower-cased month name in Date::Maanden.
    #
    # Raises DateError when the string does not match at all (including a
    # nil +str+) or when the month name is not found.
    def parse_times(str)
      md = /(\d+)\s(\w+)\s(\d+),\s(\d+):(\d+)/.match(str)
      # Without this guard a non-matching string surfaced as NoMethodError
      # on nil, which #transform treats as fatal instead of skipping.
      raise DateError.new(str) if md.nil?

      rsl = md.captures.map do |x|
        x =~ /\d/ ? x.to_i : Date::Maanden.index(x.downcase)
      end

      # Only text after the start time may supply the stop time.
      mdstop = /(\d+):(\d+)/.match(md.post_match)
      rsl.concat(mdstop.captures.map { |x| x.to_i }) if mdstop

      raise DateError.new(str) if rsl.include?(nil)
      rsl
    end

    # Converts every cached entry in @channelhash into a programme hash
    # (created by proghash) and returns them as an Array.
    #
    # Entries whose date cannot be parsed are reported on STDERR and
    # skipped; any other error is reported and re-raised.
    def transform(chan_id)
      progdata_array = Array.new
      @channelhash.each_value do |entry|
        begin
          progdata = proghash(entry, chan_id)
          # Entries in the 'Nacht' period are moved forward by Dag.
          shift = entry['period'] == 'Nacht' ? Dag : 0
          dag, maand, jaar, startuur, startmin, stopuur, stopmin =
            parse_times(entry['datum en tijdstip'])
          next if dag.nil?

          progdata['start'] = start = Time.local(jaar, maand, dag, startuur, startmin) + shift
          if stopuur
            stop = Time.local(jaar, maand, dag, stopuur, stopmin) + shift
            # A late-evening start with an early-morning stop crosses midnight.
            stop += Dag if start > stop && start.hour >= 21 && stop.hour <= 7
            progdata['stop'] = stop
          end
          date_stats(chan_id, progdata['start'])

          if (b = entry['bijzonderheden'])
            b.downcase.split(',').each do |bijz|
              case bijz
              when /breedbeeld/ then progdata['video']['aspect'] = '16:9'
              when /zwart/ then progdata['video']['colour'] = 'no'
              when /teletekst/ then progdata['subtitles']['type'] = 'teletext'
              when /stereo/ then progdata['audio']['stereo'] = 'stereo'
              end
            end
          end

          %w{ regie acteurs scenario presentatie }.each do |role|
            progdata['credits'][Roletrans[role]] = entry[role] if entry[role]
          end

          progdata['category'] = Cattrans[entry['genre'].downcase] if entry['genre']
          progdata['star-rating']['value'] = entry['progtip'] if entry['progtip']

          # Hash#each yields [key, value] pairs; both must be destructured,
          # otherwise the lookup key is the pair itself and nothing is copied.
          Titeltrans.each do |key, target|
            progdata[target] = entry[key] if entry[key]
          end

          progdata_array << progdata
        rescue DateError => exc
          STDERR.puts exc.class, exc.message
          PP.pp(entry, STDERR)
          next
        rescue StandardError => exc
          STDERR.puts exc, exc.message, exc.backtrace
          PP.pp(entry, STDERR)
          raise
        end
      end
      progdata_array
    end
  end
end

XMLTV::TvgidsGrabber.new.run