#encoding:utf-8
require 'nokogiri'
require 'open-uri'
require 'timeout'
require File.expand_path("../grabepg/grab_base.rb", __FILE__)
require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)

# NOTE(review): the copy of this file that was reviewed had lost its line
# breaks, and in three places the text between a '<' and a later '>' was
# missing. Line structure has been restored. The affected places are marked
# with NOTE(review) / FIXME below and must be checked against version control.
module Grabepg
  # Scrapes channel schedules and show details from tvmao.com, optionally
  # through a list of HTTP proxies.
  class GrabTvmao
    # To change this template use File | Settings | File Templates.
    # Fetching an image: Net::HTTP.get(url)
    # Determining the image file type:

    attr_reader :channel        # channel list
    attr_reader :site           # site address
    attr_reader :proxyindex     # proxy index
    attr_reader :show_schedule  # schedule keyed by show
    attr_reader :img_down_path  # where the image download paths are stored

    DEFAULT_GrabtvType=["cctv","satellite","digital",]
    DEFAULT_SITE = "http://www.tvmao.com"

    # Chinese weekday label => number used in the "-w<N>.html" page suffix.
    WEEKDAY_NUMBERS = {
      "星期一"=>1, "星期二"=>2, "星期三"=>3, "星期四"=>4,
      "星期五"=>5, "星期六"=>6, "星期日"=>7
    }.freeze

    def initialize
      @grabbase = GrabBase.new
      @@proxyindex = 0
    end

    # Fills in show types (and missing logos) for a batch of schedule entries
    # using the tvmao page for the same day.
    #
    # channel   - channel the schedule belongs to
    # url       - address the schedule is fetched from
    # date      - date label, e.g. "星期一(01-02)"
    # schedule  - entries to update; each is a Hash with "schedule_start",
    #             "type" and "schedule_logo" keys
    # proxylist - list of proxies
    #
    # Returns the updated entries. When several entries share a start time,
    # only the last one is kept.
    def get_show_type_by_batch(channel,url,date,schedule,proxylist)
      by_start = {}
      schedule.each do |entry|
        by_start[entry["schedule_start"].gsub(":","").to_i] = entry
      end

      day_url = get_show_type_url(url,date)
      get_schedulelist_atday(channel,day_url,proxylist).each do |fetched|
        begin
          start_num = fetched["schedule_start"].gsub(":","").to_i
          next unless by_start.has_key?(start_num)
          entry = by_start[start_num]
          # A missing "type" on the entry is treated as an empty list so the
          # union below cannot fail on nil.
          entry["type"] = (entry["type"] || []) | fetched["type"] if fetched["type"]
          p "*****************************************************************************************"
          p "Schedule: #{entry}"
          p "schedule_logo_1: #{entry["schedule_logo"]}"
          p "schedule_logo_2: #{entry[:schedule_logo]}"
          if entry["schedule_logo"]==""
            # get_schedulelist_atday stores the image under "schedule_logo";
            # the "img" key read here previously is never set by it.
            logo = fetched["schedule_logo"]
            entry["schedule_logo"] = logo unless logo.nil? || logo.empty?
          end
        rescue => e
          # Best effort: one bad entry must not abort the batch.
          p "Error In get_show_type_by_batch msg : #{e.to_s}"
          next
        end
      end
      by_start.values
    end

    # Looks up the type list of the show that starts at +time+.
    #
    # channel   - channel the schedule belongs to
    # url       - address the schedule is fetched from
    # date      - date label
    # time      - show start time, e.g. "19:30"
    # proxylist - list of proxies
    #
    # Returns the "type" of the last entry with that start time, or [] when
    # there is none.
    def get_show_type(channel,url,date,time,proxylist)
      day_url = get_show_type_url(url,date)
      schedules = get_schedulelist_atday(channel,day_url,proxylist)
      wanted = time.gsub(":","").to_i
      match = schedules.select { |item| item["schedule_start"].gsub(":","").to_i == wanted }.last
      (match && match["type"]) || []
    end

    # Builds the day page URL for +url+ from the weekday label at the start of
    # +date+ (the part before "("). An unrecognised label yields "w0".
    def get_show_type_url(url,date)
      whatday = WEEKDAY_NUMBERS.fetch(date.split("(")[0], 0)
      "#{week_url_prefix(url)}w#{whatday}.html"
    end

    # Converts a Time#wday style number to its Chinese weekday name.
    # 0 and 7 both mean Sunday; any other value outside 1..6 gives "星期".
    def conversion_what_day(whatday)
      suffix = case whatday.to_i
               when 1 then "一"
               when 2 then "二"
               when 3 then "三"
               when 4 then "四"
               when 5 then "五"
               when 6 then "六"
               when 0, 7 then "日"
               else ""
               end
      "星期" + suffix
    end

    # Left-pads a one-character number with "0" so it has two digits.
    def dispose_time(num)
      text = num.to_s
      text.length < 2 ? "0" + text : text
    end

    # Formats +time+ as "<weekday>(MM-DD)".
    def get_week_date_time(time)
      conversion_what_day(time.wday) + "(" + dispose_time(time.month) + "-" + dispose_time(time.day) + ")"
    end

    # Number of seconds in +day_num+ days.
    def del_day_num(day_num)
      day_num*60*60*24
    end

    # Date label for the day +num+ days before now.
    def get_time_day_prior(num)
      get_week_date_time(Time.now - del_day_num(num))
    end

    # List of the previous week's dates to delete.
    # NOTE(review): the reviewed text broke off at "ret<" inside the loop.
    # The append of get_time_day_prior(i) and the closing lines are
    # reconstructed; confirm against version control.
    def del_time_list
      ret = []
      if Time.now.wday == 1
        (0..7).each { |i| ret << get_time_day_prior(i) }
      end
      ret
    end

    # FIXME(review): a method that compared two strings (str1, str2) stood
    # here. Its "def" line and its last lines were lost, so it cannot be
    # restored from the reviewed text. The surviving fragment is kept verbatim:
    #
    #   >str2.length _length=str2.length type = 2 else _length=str1.length type =1 end
    #   _str_list = [] _str = ""
    #   for i in 0.._length case type
    #   when 2 n=i 0.upto(str1.length-1).each do |j| p "N: #{n}"
    #     if(str2[n]==str1[j]) _str =_str+str2[n] n = n+1 p "Str = #{_str}"
    #     else _str_list << _str _str = "" end end
    #   when 1 n=i 0.upto(str2.length-1).each do |j| p "N: #{n}"
    #     if(str1[n]==str2[j]) _str =_str+str1[n] n=n+1 p "Str = #{_str}"
    #     else _str_list << _str _str = "" end end
    #   end end
    #   p _str_list _str = "" _str_list.each do |str| if _str.length
    #
    # FIXME(review): a method that built the channel list stood here. Only its
    # last lines survived; how the values were extracted is unknown. The
    # surviving fragment is kept verbatim:
    #
    #   {name:channel_name,herf:herf,type:type}})
    #   channel_info.merge!({channel_id=>{"channel_name"=>channel_name,"channel_type"=>type,"channel_id"=>channel_id,"img_path"=>img_path}})
    #   channel_urls.merge!({channel_id=>herf}) end end
    #   @img_down_file.close
    #   p "Channel: #{@channel}"
    #   {"channel_info"=>channel_info,"channel_urls"=>channel_urls} end

    # Handles a failed fetch through +proxy+: moves on to the next proxy and
    # retries get_doc_with_proxy.
    #
    # Raises RuntimeError once ten consecutive attempts have failed.
    def err_doc_proxy(proxy,proxylist,url="",err="")
      # Drop a blank entry so it is not picked again. The nil check has to
      # come first, and delete_at takes the index as an argument.
      proxylist.delete_at(@@proxyindex) if proxy.nil? || proxy.empty?
      @no_firest = (@no_firest || 0) + 1
      p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
      #proxylist.delete(proxy) # remove the failing proxy; but if the page itself is at fault this causes a bug, to be fixed
      @@proxyindex += 1
      # The list may just have shrunk, so use its current size.
      @@proxyindex %= proxylist.size unless proxylist.empty?
      if @no_firest >= 10
        @no_firest = 0
        raise RuntimeError,"Error: #{err}"
      end
      get_doc_with_proxy(proxylist,url)
    end

    # Fetches +url+ and returns the parsed HTML document, going through a
    # proxy taken from +proxylist+ when the list is not empty.
    #
    # Raises RuntimeError when the direct fetch fails, or when the retries in
    # err_doc_proxy are used up.
    def get_doc_with_proxy(proxylist,url)
      if proxylist.nil? || proxylist.empty?
        begin
          return fetch_html(url)
        rescue => err
          p "Error : Proxy:, url:#{url}"
          raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
        end
      end

      @@proxyindex ||= 0
      @size = proxylist.size
      # The index is mixed with the clock so successive calls spread over the list.
      @@proxyindex = (@@proxyindex + Time.now.to_i + 1) % @size
      @@proxyindex += 1 unless proxylist[@@proxyindex]
      proxy = proxylist[@@proxyindex]

      doc = nil
      begin
        doc = fetch_html(url, proxy) unless proxy.nil? || proxy.empty?
      rescue => err
        p "IN Rescue"
        doc = err_doc_proxy(proxy,proxylist,url,err.to_s)
        @no_firest = 0
        p "Get DOC"
        @@proxyindex = (@@proxyindex + Time.now.to_i + 1) % proxylist.size unless proxylist.empty?
        return doc
      end

      # A blank proxy leaves doc nil. The retry runs outside the begin block
      # so the RuntimeError raised at the retry limit reaches the caller.
      if doc.nil?
        p "DOC is nil"
        doc = err_doc_proxy(proxy,proxylist,url,"doc nil")
      end
      @no_firest = 0
      doc
    end

    # Fetches the schedule of one day.
    #
    # Sets @date to "<week>(<date>)" as read from the page.
    # Returns an Array of Hashes with the keys "schedule_name",
    # "schedule_logo", "schedule_start", "show_infomation_herf", "type" and
    # "name".
    def get_schedulelist_atday(channel,url,proxylist)
      p "Grab: #{url}"
      doc = get_doc_with_proxy(proxylist,url)
      img_url = "http://static.haotv.me/channel/logo/" + channel + ".jpg"
      data = doc.css('div[class="mt10 clear"]')[0].content.split(" ")
      date = data[0]
      week = data[1]
      p "Channel: #{channel} Date: #{date} Week: #{week}"
      @date = "#{week}(#{date})"
      logo_node = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
      img_url = logo_node.get_attribute("src") if logo_node
      p "**************IMG: #{img_url}"

      schedule_list = []
      doc.css('ul[id="pgrow"]')[0].css("li").each do |item|
        schedule_herf = nil
        link = item.xpath('a[@href]')[0]
        if link
          schedule_herf = link.get_attribute("href")
        else
          drama = item.css('a[class="drama"]')[0]
          if drama
            # Keep only the part of the link before "/episode/section".
            schedule_herf = drama.get_attribute("href").gsub("/episode/section","#%#").split("#%#")[0]
          end
        end

        fields = item.content.split(" ")
        next unless fields.size > 1
        time = fields[0]
        schedule_name = fields[1]
        # Reset for every row so a show without a detail link does not take
        # over the type of the previous show.
        show_type = []
        show_name = ""
        show_img = nil
        unless schedule_herf.nil? || schedule_herf.empty?
          p "Show_infomation:#{schedule_herf} Time:#{time}"
          show_infomation = get_show_infomation(proxylist,schedule_herf)
          show_type = show_infomation["type"]
          show_name = show_infomation["name"]
          show_img = show_infomation["img"]
        end
        p "Time: #{time} schedule: #{schedule_name} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name} img:#{show_img}"
        schedule_list << {"schedule_name"=>schedule_name,"schedule_logo"=>show_img,"schedule_start"=>time,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
      end
      schedule_list
    end

    # Builds the day page URLs for a given start day and length.
    #
    # start_time - Integer offset from today: positive is that many days
    #              ahead, negative that many days back
    # day_num    - Integer number of days to fetch, counted from the start day
    #
    # NOTE(review): Time#wday is 0 on Sunday, so a Sunday start yields "w0",
    # while get_show_type_url maps Sunday to 7. Left as it was; confirm which
    # suffix the site expects.
    def get_assign_date_url(url,start_time,day_num)
      prefix = week_url_prefix(url)
      today = Time.now.wday
      first_day = today + start_time
      first_day = 1 if first_day < 0
      last_day = first_day + day_num - 1
      last_day = today + 7 if last_day > today + 7
      first_day.upto(last_day).map { |i| "#{prefix}w#{i}.html" }
    end

    # Fetches the schedule for a given range of days.
    #
    # Returns {"channel_schedule"=>{date=>entries}, "show_schedule"=>Hash}.
    def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
      day_num = normalize_day_num(day_num)
      urls = get_assign_date_url(herf,start_num,day_num)
      grab_schedules(channel,urls,proxylist,img_dir_down_path)
    end

    # Kept because existing code already calls it.
    # Fetches the schedule of one week (pages w1 .. w<day_num>).
    #
    # Returns {"channel_schedule"=>{date=>entries}, "show_schedule"=>Hash}.
    def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
      p "Day Num is #{day_num}"
      day_num = normalize_day_num(day_num)
      prefix = week_url_prefix(herf)
      urls = 1.upto(day_num).map { |i| "#{prefix}w#{i}.html" }
      grab_schedules(channel,urls,proxylist,img_dir_down_path)
    end

    # Fetches the details of one show.
    #
    # Returns {"type"=>Array, "name"=>String, "img"=>String or nil}. When
    # anything fails the error is printed and empty defaults are returned, so
    # callers can always index the result like a Hash.
    def get_show_infomation(proxy_list,schedule_herf)
      @site ||= DEFAULT_SITE
      schedule_herf = @site + schedule_herf
      doc = get_doc_with_proxy(proxy_list,schedule_herf)
      type = []
      name = doc.css('span[itemprop="name"]')[0].content
      # image of the show
      img_node = doc.css('img[class="tvc"]')[0]
      schedule_img_down_path = img_node.get_attribute('src') if img_node
      doc.css('span[itemprop="genre"]').each { |node| type << node.content }
      doc.css('a[itemprop="genre"]').each { |node| type << node.content }

      detail = get_doc_with_proxy(proxy_list,"#{schedule_herf}/detail")
      if detail
        detail.css('span[itemprop="genre"]').each { |node| type << node.content }
      end
      type.uniq!
      @show_schedule ||= {}
      #@show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
      {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
    rescue => e
      p "Error In get_show_infomation msg : #{e.to_s}"
      {"type"=>[],"name"=>"","img"=>nil}
    end

    # Fetches the broadcast times of one show from its "/playingtime" page.
    #
    # Returns an Array of Hashes with "week", "date", "time", "channel_name"
    # and "show_name". The first row of the table is skipped.
    def get_show_schedule(proxylist,herf)
      doc = get_doc_with_proxy(proxylist,herf + "/playingtime")
      schedule = []
      epg = doc.css('div[id="epg"]')[0]
      return schedule unless epg
      epg.css("div[class='c1 col']").each_with_index do |row,index|
        next if index == 0
        times = row.css('div[class="f1 fld"]')[0].content.split(" ")
        channel_name = row.css('div[class="f2 fld"]')[0].content
        show_name = row.css('div[class="f3 fld"]')[0].content
        schedule << {"week"=>times[0],"date"=>times[1],"time"=>times[2],"channel_name"=>channel_name,"show_name"=>show_name}
      end
      schedule
    end

    # Returns the proxies from get_proxy_list that answer fast enough.
    #
    # use_time - Integer, slowest acceptable time in seconds (used as the
    #            timeout for each probe)
    def get_topfast_list(use_time)
      fast_list = []
      get_proxy_list.each do |ip_port|
        reachable = false
        time_use = -1
        time_start = Time.now.to_i
        begin
          Timeout.timeout(use_time) do
            URI.parse("http://www.tvmao.com/program").open(:proxy=>"http://#{ip_port}") { |io| Nokogiri::HTML(io) }
          end
          time_use = Time.now.to_i - time_start
          reachable = true
          p "http://#{ip_port} use_time:#{time_use}"
        rescue Timeout::Error, StandardError => e
          case e
          when Errno::ETIMEDOUT, Timeout::Error
            p "Use http://#{ip_port} timeout"
          when Errno::ECONNREFUSED
            p "Use http://#{ip_port} Error connection"
          else
            p "Use http://#{ip_port} Error:#{e.to_s}"
          end
        end
        # Timing is in whole seconds, so a proxy that answers in under a
        # second measures 0; success is tracked separately to keep it.
        fast_list << ip_port if reachable && time_use < 8
      end
      fast_list
    end

    # Fetches a list of "ip:port" proxies from proxycn.cn.
    # Entries on port 3128 and entries without an IP are left out.
    def get_proxy_list()
      list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
      list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0
      # NOTE(review): the look-behind of regex_port is empty in the reviewed
      # text; its original content appears to have been lost. Confirm.
      regex_port = /(?<=)[0-9]*?(?=<\/TD>)/
      regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
      ips_ports = []
      list.each do |proxy_txt|
        port = proxy_txt[regex_port]
        ip = proxy_txt[regex_ip]
        # ip is nil when the row has no whois link; skip it instead of
        # emitting ":port".
        next if ip.nil? || ip.empty?
        next if port.to_s.eql?('3128')
        ips_ports << ip.to_s + ":" + port.to_s
      end
      p "Count: #{ips_ports.count}"
      ips_ports
    end

    # Downloads +url+ and returns every substring that matches regex_list.
    # NOTE(review): the start of regex_list appears to have been lost in the
    # reviewed text. Confirm.
    def gg(url)
      regex_list = /.*<\/TD>/
      contxt = URI.parse(url).open { |f| f.read }
      contxt.scan(regex_list)
    end

    def save_img
    end

    private

    # Site root: @site when set, otherwise DEFAULT_SITE.
    def base_site
      @site || DEFAULT_SITE
    end

    # Site root plus the first two "-" separated parts of +url+, ending in "-".
    def week_url_prefix(url)
      parts = url.split("-")
      "#{base_site}#{parts[0]}-#{parts[1]}-"
    end

    # Returns +day_num+, or 1 when it is below 1 or cannot be compared with 1.
    def normalize_day_num(day_num)
      day_num < 1 ? 1 : day_num
    rescue StandardError
      1
    end

    # Downloads +url+ (through +proxy+ when given) and parses it as HTML.
    # URI#open works on old and new Ruby alike; Kernel#open stopped accepting
    # URLs in Ruby 3.0.
    def fetch_html(url, proxy = nil)
      options = proxy ? { :proxy => proxy.to_s } : {}
      Nokogiri::HTML(URI.parse(url).open(options) { |io| io.read })
    end

    # Fetches every page in +urls+ and collects the schedules by date.
    # The "schedule_img_down_path" file is created in +img_dir_down_path+
    # (default: the directory of this file) and is always closed again.
    def grab_schedules(channel, urls, proxylist, img_dir_down_path)
      img_dir_down_path ||= File.dirname(__FILE__)
      @img_down_file = File.new(File.join(img_dir_down_path,"schedule_img_down_path"),"w+")
      @show_schedule = {}
      channel_schedule = {}
      begin
        urls.each do |url|
          @date = ""
          schedule_list = get_schedulelist_atday(channel,url,proxylist)
          channel_schedule[@date] = schedule_list unless @date.empty?
        end
      ensure
        @img_down_file.close
      end
      {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
    end
  end
end