#encoding:utf-8
require 'nokogiri'
require 'open-uri'
require 'timeout'

# Grabs channel and programme-schedule data from tvmao.com pages, optionally
# through a rotating list of HTTP proxies.
#
# All grabbing entry points are module-level methods (`Grabepg.getschedule`,
# ...). Module-level state (@site, @proxyindex, @show_schedule, @date,
# @img_down_file) is shared between calls, so the module is not safe to use
# from several threads at once.
module Grabepg
  # Image retrieval: Net::HTTP.get(url)
  # Image file type detection:
  attr_reader :channel        # channel list
  attr_reader :site           # site address
  attr_reader :proxyindex     # index of the proxy in use
  attr_reader :show_schedule  # air-time table per show
  attr_reader :img_down_path  # where image download paths are stored

  DEFAULT_GrabtvType=["cctv","satellite","digital",]
  DEFAULT_SITE = "http://www.tvmao.com"

  # How many times get_doc_with_proxy tries a URL before giving up.
  MAX_FETCH_ATTEMPTS = 4

  # Converts a weekday number to its Chinese name.
  #
  # Accepts both Time#wday numbering (Sunday is 0) and 1..7 numbering
  # (Sunday is 7); Sunday is rendered as "星期日" in both cases.
  #
  # @param whatday [#to_i] weekday number
  # @return [String] "星期" plus the day character, or bare "星期" when the
  #   number is outside 0..7
  def self.conversion_what_day(whatday)
    suffixes = %w[日 一 二 三 四 五 六 日]
    day = whatday.to_i
    (0..7).cover?(day) ? "星期" + suffixes[day] : "星期"
  end

  # Left-pads a one-character value with "0" so 1..9 become "01".."09".
  #
  # @param num [#to_s]
  # @return [String]
  def self.dispose_time(num)
    text = num.to_s
    text.length < 2 ? "0" + text : text
  end

  # Formats a time as "<weekday>(MM-DD)".
  #
  # @param time [Time]
  # @return [String]
  def self.get_week_date_time(time)
    "#{conversion_what_day(time.wday)}(#{dispose_time(time.month)}-#{dispose_time(time.day)})"
  end

  # Number of seconds in +day_num+ days.
  def self.del_day_num(day_num)
    day_num * 60 * 60 * 24
  end

  # Formatted date (see get_week_date_time) of the day +num+ days before now.
  def self.get_time_day_prior(num)
    get_week_date_time(Time.now - del_day_num(num))
  end

  # List of last week's dates that should be deleted.
  #
  # NOTE(review): in the copy of this file under review the method text breaks
  # off right after `ret<` inside the loop. The loop body below (appending
  # get_time_day_prior(i)) and the absence of an else branch are a
  # reconstruction; confirm against version control.
  #
  # @return [Array<String>] empty unless today is Monday
  def self.del_time_list
    ret = []
    if Time.now.wday == 1
      (0..7).each { |i| ret << get_time_day_prior(i) }
    end
    ret
  end

  # NOTE(review): the text that followed del_time_list is missing from the copy
  # under review (it looks like a `<...>` span was stripped). What was lost
  # includes the signature and opening of a method that builds channel info.
  # Its surviving tail is retained verbatim below so nothing is dropped;
  # restore the full method from version control.
  #
  #   {name:channel_name,herf:herf,type:type}})
  #         channel_info.merge!({channel_id=>{"channel_name"=>channel_name,"channel_type"=>type,"channel_id"=>channel_id,"img_path"=>img_path}})
  #         channel_urls.merge!({channel_id=>herf})
  #       end
  #     end
  #     @img_down_file.close
  #     p "Channel: #{@channel}"
  #     {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
  #   end

  # Fetches +url+ and parses it as HTML, going through a proxy from
  # +proxylist+ when one is available.
  #
  # A failed attempt is logged, moves on to the next proxy and is retried, up
  # to MAX_FETCH_ATTEMPTS attempts in total. An empty or nil list means a
  # direct connection.
  #
  # @param proxylist [Array<String>, nil] "host:port" entries
  # @param url [String]
  # @return [Nokogiri::HTML::Document]
  # @raise [RuntimeError] when every attempt failed
  def self.get_doc_with_proxy(proxylist, url)
    proxies = Array(proxylist)
    @proxyindex ||= 0
    attempts = 0
    proxy = nil
    begin
      attempts += 1
      unless proxies.empty?
        @proxyindex %= proxies.size
        # A nil slot falls through to the following entry.
        proxy = proxies[@proxyindex] || proxies[@proxyindex + 1]
      end
      doc = fetch_html(url, proxy)
    rescue => err
      p "*************************Proxy:#{proxy}, url:#{url}"
      @proxyindex += 1 # do not hit the same failing proxy again
      retry if attempts < MAX_FETCH_ATTEMPTS
      raise RuntimeError, "Error: #{err.to_s}"
    end
    @proxyindex += 1
    p "*************************Proxy:#{proxy}, url:#{url}" unless doc
    doc
  end

  # Grabs the schedule shown on one day page.
  #
  # Side effect: sets @date to "<week>(<date>)" as read from the page header;
  # the callers in this module use it as the key for the returned list.
  #
  # @param channel [String] channel id, used for logging only
  # @param url [String] day page URL
  # @param proxylist [Array<String>, nil]
  # @return [Array<Hash>] one hash per programme row with the keys
  #   "schedule_name", "schedule_logo", "schedule_start",
  #   "show_infomation_herf", "type" and "name"
  def self.get_schedulelist_atday(channel, url, proxylist)
    p "Grab: #{url}"
    doc = get_doc_with_proxy(proxylist, url)
    date, week = doc.css('div[class="mt10 clear"]')[0].content.split(" ")
    p "Channel: #{channel} Date: #{date} Week: #{week}"
    @date = "#{week}(#{date})"
    schedule_list = []
    doc.css('ul[id="pgrow"]')[0].css("li").each do |row|
      fields = row.content.split(" ")
      next unless fields.size > 1
      time, title = fields
      schedule_herf = schedule_link(row)
      # Reset per row so a row without a detail link does not inherit the
      # previous row's details.
      show_type = []
      show_name = ""
      show_img = nil
      unless schedule_herf.nil? || schedule_herf.empty?
        show_infomation = get_show_infomation(proxylist, schedule_herf)
        show_type = show_infomation["type"]
        show_name = show_infomation["name"]
        show_img = show_infomation["img"]
      end
      p "Time: #{time} schedule: #{title} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name} img:#{show_img}"
      schedule_list << {"schedule_name"=>title,"schedule_logo"=>show_img,"schedule_start"=>time,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
    end
    schedule_list
  end

  # Builds the day-page URLs for a window of days.
  #
  # NOTE(review): the week index starts from Time#wday, so a Sunday start
  # yields "w0.html", and the range is inclusive on both ends, so it yields
  # day_num + 1 URLs unless capped. Both are kept as found; confirm against
  # the site's URL scheme.
  #
  # @param url [String] channel path; its first two "-"-separated parts are kept
  # @param start_time [Integer] offset from today in days (negative = earlier)
  # @param day_num [Integer] length of the window in days
  # @param base [String, nil] site root; defaults to @site, then DEFAULT_SITE
  # @return [Array<String>]
  def self.get_assign_date_url(url, start_time, day_num, base = nil)
    prefix = week_url_prefix(base || @site || DEFAULT_SITE, url)
    today = Time.now.wday
    first_day = today + start_time
    first_day = 1 if first_day < 0
    last_day = [first_day + day_num, today + 7].min
    first_day.upto(last_day).map { |i| "#{prefix}w#{i}.html" }
  end

  # Instance-level form kept for objects that include this module; uses the
  # includer's +site+ when it is set.
  def get_assign_date_url(url, start_time, day_num)
    Grabepg.get_assign_date_url(url, start_time, day_num, site)
  end

  # Grabs the schedule for a window of days.
  #
  # @param channel [String] channel id
  # @param herf [String] channel path on the site
  # @param proxylist [Array<String>, nil]
  # @param start_num [Integer] offset from today in days
  # @param day_num [Integer] number of days; values below 1 or not comparable
  #   with 1 are treated as 1
  # @param img_dir_down_path [String, nil] directory for the
  #   "schedule_img_down_path" log; defaults to this file's directory
  # @return [Hash] {"channel_schedule"=>{date=>rows}, "show_schedule"=>{name=>airings}}
  def self.getScheduleAssignDate(channel, herf, proxylist, start_num, day_num, img_dir_down_path = @img_down_dir_path)
    day_num = normalize_day_num(day_num)
    urls = get_assign_date_url(herf, start_num, day_num)
    grab_schedules(channel, urls, proxylist, img_dir_down_path)
  end

  # Kept because existing callers use it.
  # Grabs the schedule for pages w1..w<day_num> of a channel.
  #
  # @param channel [String] channel id
  # @param herf [String] channel path on the site
  # @param proxylist [Array<String>, nil]
  # @param day_num [Integer] number of day pages; values below 1 or not
  #   comparable with 1 are treated as 1
  # @param img_dir_down_path [String, nil] directory for the
  #   "schedule_img_down_path" log; defaults to this file's directory
  # @return [Hash] {"channel_schedule"=>{date=>rows}, "show_schedule"=>{name=>airings}}
  def self.getschedule(channel, herf, proxylist, day_num = 7, img_dir_down_path = @img_down_dir_path)
    p "Day Num is #{day_num}"
    day_num = normalize_day_num(day_num)
    prefix = week_url_prefix(@site || DEFAULT_SITE, herf)
    urls = 1.upto(day_num).map { |i| "#{prefix}w#{i}.html" }
    grab_schedules(channel, urls, proxylist, img_dir_down_path)
  end

  # Grabs the details of one show: name, genres and picture URL.
  #
  # Side effects: resets @proxyindex to 0, defaults @site, writes
  # "<name>:<img>" to the open image log (if any) and records the show's
  # air times in @show_schedule.
  #
  # @param proxy_list [Array<String>, nil]
  # @param schedule_herf [String] show path relative to the site root
  # @return [Hash] {"type"=>Array<String>, "name"=>String, "img"=>String or nil};
  #   on any error the problem is logged and
  #   {"type"=>[], "name"=>"", "img"=>nil} is returned, so callers can always
  #   index the result
  def self.get_show_infomation(proxy_list, schedule_herf)
    @proxyindex = 0
    @site ||= "http://www.tvmao.com"
    @show_schedule ||= {}
    schedule_herf = @site + schedule_herf
    doc = get_doc_with_proxy(proxy_list, schedule_herf)
    name = doc.css('span[itemprop="name"]')[0].content
    # Picture of the show, when the page has one.
    picture = doc.css('img[class="tvc"]')[0]
    schedule_img_down_path = picture.get_attribute('src') if picture
    type = collect_genres(doc)
    # The detail page may list further genres.
    type.concat(collect_genres(get_doc_with_proxy(proxy_list, "#{schedule_herf}/detail")))
    type.uniq!
    if @img_down_file && !@img_down_file.closed?
      @img_down_file.puts("#{name}:#{schedule_img_down_path}")
    end
    unless @show_schedule.has_key?(name)
      @show_schedule[name] = get_show_schedule(proxy_list, schedule_herf)
    end
    {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
  rescue => e
    p "Error In get_show_infomation msg : #{e.to_s}"
    {"type"=>[],"name"=>"","img"=>nil}
  end

  # Grabs the air-time table of a show from its "/playingtime" page.
  #
  # @param proxylist [Array<String>, nil]
  # @param herf [String] absolute show URL
  # @return [Array<Hash>] hashes with "week", "date", "time", "channel_name"
  #   and "show_name"
  def self.get_show_schedule(proxylist, herf)
    doc = get_doc_with_proxy(proxylist, herf + "/playingtime")
    rows = doc.css('div[id="epg"]')[0].css("div[class='c1 col']")
    # The first row is skipped, as in the original loop.
    rows.drop(1).map do |epg|
      week, date, time = epg.css('div[class="f1 fld"]')[0].content.split(" ")
      channel_name = epg.css('div[class="f2 fld"]')[0].content
      show_name = epg.css('div[class="f3 fld"]')[0].content
      {"week"=>week,"date"=>date,"time"=>time,"channel_name"=>channel_name,"show_name"=>show_name}
    end
  end

  # Returns the proxies that can load the programme page in time.
  #
  # @param use_time [Numeric] slowest acceptable response time, in seconds
  # @return [Array<String>] "host:port" entries that answered within +use_time+
  def self.get_topfast_list(use_time)
    get_proxy_list.select do |ip_port|
      started = Time.now
      begin
        Timeout.timeout(use_time) { fetch_html("http://www.tvmao.com/program", ip_port) }
        p "http://#{ip_port} use_time:#{(Time.now - started).round(2)}"
        true
      rescue Errno::ETIMEDOUT, Timeout::Error
        p "Use http://#{ip_port} timeout"
        false
      rescue Errno::ECONNREFUSED
        p "Use http://#{ip_port} Error connection"
        false
      rescue StandardError => e
        p "Use http://#{ip_port} Error:#{e.to_s}"
        false
      end
    end
  end

  # Grabs the proxy list.
  #
  # Entries on port 3128 and entries with no ip or no port are left out.
  #
  # NOTE(review): regex_port has an empty lookbehind `(?<=)`; this looks like
  # a stripped `<...>` prefix in the copy under review. The pattern is kept
  # byte-for-byte; confirm against version control.
  #
  # @return [Array<String>] "ip:port" entries
  def self.get_proxy_list()
    list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
    list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0
    regex_port = /(?<=)[0-9]*?(?=<\/TD>)/
    regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
    ips_ports = []
    list.each do |proxy_txt|
      port = proxy_txt[regex_port].to_s
      ip = proxy_txt[regex_ip].to_s
      next if ip.empty? || port.empty? || port.eql?('3128')
      ips_ports << "#{ip}:#{port}"
    end
    p "Count: #{ips_ports.count}"
    ips_ports
  end

  # Downloads +url+ and returns every substring matching the row pattern.
  #
  # NOTE(review): the pattern may have lost a leading `<...>` tag in the copy
  # under review; it is kept byte-for-byte.
  #
  # @param url [String]
  # @return [Array<String>]
  def self.gg(url)
    regex_list = /.*<\/TD>/
    page = URI.parse(url).open { |f| f.read }
    page.scan(regex_list)
  end

  # Placeholder; has no behaviour.
  def save_img
  end

  # Fetches +url+ (through +proxy+ when given) and parses it as HTML.
  # URI#open is used so the URL is never treated as a file name or command.
  def self.fetch_html(url, proxy = nil)
    options = {}
    options[:proxy] = "http://#{proxy}" unless proxy.nil? || proxy.empty?
    URI.parse(url).open(options) { |io| Nokogiri::HTML(io) }
  end

  # Site root plus the first two "-"-separated parts of +path+, each followed by "-".
  def self.week_url_prefix(base, path)
    parts = path.split("-")
    "#{base}#{parts[0]}-#{parts[1]}-"
  end

  # Returns +day_num+, or 1 when it is below 1 or cannot be compared with 1.
  def self.normalize_day_num(day_num)
    day_num < 1 ? 1 : day_num
  rescue StandardError
    1
  end

  # Href of a schedule row: its direct link, else the drama link cut at "/episode/section".
  def self.schedule_link(row)
    anchor = row.xpath('a[@href]')[0]
    return anchor.get_attribute("href") if anchor
    drama = row.css('a[class="drama"]')[0]
    return nil unless drama
    drama.get_attribute("href").gsub("/episode/section","#%#").split("#%#")[0]
  end

  # Genre texts found in span and a elements marked itemprop="genre".
  def self.collect_genres(doc)
    ['span[itemprop="genre"]', 'a[itemprop="genre"]'].flat_map do |selector|
      doc.css(selector).map { |node| node.content }
    end
  end

  # Grabs every URL in +urls+ while the image log file is open, and always
  # closes the file afterwards.
  def self.grab_schedules(channel, urls, proxylist, img_dir_down_path)
    img_dir_down_path ||= File.dirname(__FILE__)
    @show_schedule = {}
    channel_schedule = {}
    File.open(File.join(img_dir_down_path,"schedule_img_down_path"),"w+") do |file|
      @img_down_file = file
      urls.each do |url|
        @date = ""
        schedule_list = get_schedulelist_atday(channel, url, proxylist)
        channel_schedule[@date] = schedule_list unless @date.empty?
      end
    end
    {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
  ensure
    @img_down_file = nil
  end

  private_class_method :fetch_html, :week_url_prefix, :normalize_day_num,
                       :schedule_link, :collect_genres, :grab_schedules
end