lib/grab_tvmao.rb in grab_epg-0.2.3 vs lib/grab_tvmao.rb in grab_epg-0.2.4

- old
+ new

@@ -1,11 +1,15 @@ #encoding:utf-8 require 'nokogiri' require 'open-uri' -module GrabTvmao +require File.expand_path("../grabepg/grab_base.rb", __FILE__) +require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__) + +module Grabepg + class GrabTvmao # To change this template use File | Settings | File Templates. #图片的获取: Net::HTTP.get(url) #图片的文件类型获取: @@ -20,14 +24,119 @@ DEFAULT_SITE = "http://www.tvmao.com" + def initialize + @grabbase = GrabBase.new + end + + + #批量从tvmao获取节目类型 + #channel 节目表属于的屏道 + #url 节目表获取的网络地址 + #date 日期 + #schedule 需要批量修改的时间表 + #proxylist 代理列表 + def get_show_type_by_batch(channel,url,date,schedule,proxylist) + _schedule = {} + schedule.each do |s| + time = s["schedule_start"].gsub(":","").to_i + _schedule.merge!(time=>s) + end + url = get_show_type_url(url,date) + schedules = get_schedulelist_atday(channel,url,proxylist) + type = nil + schedules.each do |schedule| + schedule_time_num = schedule["schedule_start"].gsub(":","").to_i + if _schedule.has_key?(schedule_time_num) + _schedule[schedule_time_num]["type"]=_schedule[schedule_time_num]["type"]|schedule["type"] + p "*****************************************************************************************" + p "Schedule: #{_schedule[schedule_time_num]}" + p "schedule_logo_1: #{_schedule[schedule_time_num]["schedule_logo"]}" + p "schedule_logo_2: #{_schedule[schedule_time_num][:schedule_logo]}" + if _schedule[schedule_time_num]["schedule_logo"]=="" + unless schedule["img"]=="" + _schedule[schedule_time_num]["schedule_logo"]=schedule["img"] + end + end + end + end + ret = [] + _schedule.each do |key,value| + ret << value + end + + ret + end + + #批量从tvmao获取节目类型 + #channel 节目表属于的屏道 + #url 节目表获取的网络地址 + #date 日期 + #time 节目开始时间 + #proxylist 代理列表 + def get_show_type(channel,url,date,time,proxylist) + url = get_show_type_url(url,date) + schedules = get_schedulelist_atday(channel,url,proxylist) + _time_num = time.gsub(":","").to_i + type = nil + schedules.each do |schedule| + schedule_time_num = schedule["schedule_start"].gsub(":","").to_i + if _time_num==schedule_time_num + type = schedule["type"] + end + end + if type + return type + else + return [] + end + end + + def get_show_type_url(url,date) + whatday = 0 + _date = date.split("(")[0] + case _date + when "星期一" + whatday=1 + when "星期二" + whatday=2 + when "星期三" + whatday=3 + when "星期四" + whatday=4 + when "星期五" + whatday=5 + when "星期六" + whatday=6 + when "星期日" + whatday=7 + end + + get_week_url = lambda {|url,whatday| + _url = "http://www.tvmao.com" + urls = [] + _urls = url.split("-") + 0.upto(1).each do |i| + _url = _url+"#{_urls[i]}"+"-" + end + url = _url+"w#{whatday}.html" + return url + } + return get_week_url.call(url,whatday) + end + + + + + + #将星期的wday获取值转化为中文名 #conversion wady to chinese - def self.conversion_what_day(whatday) + def conversion_what_day(whatday) ret = "星期" case whatday.to_i when 1 ret += "一" when 2 @@ -45,58 +154,58 @@ end ret end #如果时间为1~9的一位则为其在数字前加0补齐二位 - def self.dispose_time(num) + def dispose_time(num) num = num.to_s if num.length < 2 num = "0"+num end num end #转化当前时间的格式 - def self.get_week_date_time(time) + def get_week_date_time(time) month = time.month day = time.day whatday = time.wday ret = conversion_what_day(whatday) + "(" + dispose_time(month) + "-"+dispose_time(day)+")" ret end #前几天需要减去的num - def self.del_day_num(day_num) + def del_day_num(day_num) ret = day_num*60*60*24 ret end #获取距离当前多少天的之前的日期 - def self.get_time_day_prior(num) + def get_time_day_prior(num) time = Time.now - del_day_num(num) ret = get_week_date_time(time) ret end #前面一周要删除的日期的列表 - def self.del_time_list + def del_time_list ret = [] time = Time.now wday = time.wday if(wday==1) for i in 0..7 - ret<<self.get_time_day_prior(i) + ret<<get_time_day_prior(i) end end ret end #调用此方法的例子 - def self.start + def start #作用是获取俩个字符串的相似度 #get str1 and str2 similarity get_similarity_string = lambda { |str1,str2| _length = 0 type = 0 @@ -199,18 +308,18 @@ end end end - def self.img_down_path + def img_down_path @img_down_path end #获取网站的频道表 #img_path 图片存放路径 - def self.getchannels(img_dir_path) + def getchannels(img_dir_path) @channel = [] @site=DEFAULT_SITE @proxyindex = 0 @img_down_dir_path = img_dir_path @img_down_file = File.new(File.join(img_dir_path,"channel_img_down_path"),'w+') @@ -250,43 +359,80 @@ @img_down_file.close p "Channel: #{@channel}" {"channel_info"=>channel_info,"channel_urls"=>channel_urls} end - #使用代理获取url的html的doc值 - def self.get_doc_with_proxy(proxylist,url) - unless @proxyindex - @proxyindex = 0 - end - @proxyindex=@proxyindex%proxylist.size - if(proxylist[@proxyindex]) - proxy = proxylist[@proxyindex] - else - proxy = proxylist[@proxyindex+1] - end - begin - doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty? - doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty? - @no_firest = 0 - rescue => err + def err_doc_proxy(proxy,proxylist,url="",err="") + if proxy.empty?||proxy.nil? + proxylist.delete_at[@proxyindex] + end + + unless @no_firest @no_firest = 0 end @no_firest += 1 - p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}" + p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}" #proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复 - get_doc_with_proxy(proxylist,url) if @no_firest<4 - raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4 + @proxyindex += 1 + @proxyindex=@proxyindex%@size + doc=get_doc_with_proxy(proxylist,url) if @no_firest<4 + unless @no_firest<4 + @no_firest=0 + raise RuntimeError,"Error: #{err}" + end + doc end - @proxyindex += 1 - doc - end + + #使用代理获取url的html的doc值 + def get_doc_with_proxy(proxylist,url) + unless proxylist.nil?||proxylist.empty? + unless @proxyindex + @proxyindex = 0 + end + @size = proxylist.size + @proxyindex=@proxyindex%proxylist.size + if(proxylist[@proxyindex]) + proxy = proxylist[@proxyindex] + else + proxy = proxylist[@proxyindex+1] + end + begin + doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty? + if doc.nil? + p "DOC is nil" + doc=err_doc_proxy(proxy,proxylist,url,"doc nil") + @no_firest=0 + end + @no_firest = 0 + rescue => err + p "IN Rescue" + doc=err_doc_proxy(proxy,proxylist,url,err.to_s) + @no_firest=0 + p "Get DOC" + @proxyindex += 1 + @proxyindex=@proxyindex%@size + return doc + end + @proxyindex += 1 + @proxyindex=@proxyindex%@size + else + begin + doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty? + rescue => err + p "Error : Proxy:#{proxy}, url:#{url}" + raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy" + end + end + doc + end + #获取某天的节目表 - def self.get_schedulelist_atday(channel,url,proxylist) + def get_schedulelist_atday(channel,url,proxylist) p "Grab: #{url}" doc = get_doc_with_proxy(proxylist,url) show_type = [] @@ -320,10 +466,11 @@ if schedule.content.split(" ").size>1 time = schedule.content.split(" ")[0] schedule = schedule.content.split(" ")[1] show_name = "" unless schedule_herf.nil?||schedule_herf.empty? + p "Show_infomation:#{schedule_herf} Time:#{time}" show_infomation=get_show_infomation(proxylist,schedule_herf) show_type=show_infomation["type"] show_name = show_infomation["name"] show_img = show_infomation["img"] end @@ -335,11 +482,11 @@ end #获取制定时间和长度url #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天 #day_num 为int型 代表抓取的时间从开始时间计算的多少天 - def self.get_assign_date_url(url,start_time,day_num) + def get_assign_date_url(url,start_time,day_num) site="http://www.tvmao.com" if(@site) site=@site end @@ -371,11 +518,11 @@ end #获取指定时间段的节目表 - def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path) + def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path) begin day_num = 1 if day_num<1 rescue day_num = 1 end @@ -392,11 +539,11 @@ @show_schedule = {} channel_schedule = {} get_assign_date_url(herf,start_num,day_num).each do |url| @date = "" - schedule_list = self.get_schedulelist_atday(channel,url,proxylist) + schedule_list = get_schedulelist_atday(channel,url,proxylist) channel_schedule.merge!({@date=>schedule_list}) unless @date.empty? end @img_down_file.close {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule} end @@ -405,11 +552,11 @@ #因原已调用所以保留 #获取一周节目表 - def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path) + def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path) p "Day Num is #{day_num}" begin day_num = 1 if day_num<1 rescue day_num = 1 @@ -440,29 +587,27 @@ } channel_schedule = {} get_week_url.call(herf,day_num).each do |url| @date = "" - schedule_list = self.get_schedulelist_atday(channel,url,proxylist) + schedule_list = get_schedulelist_atday(channel,url,proxylist) channel_schedule.merge!({@date=>schedule_list}) unless @date.empty? end @img_down_file.close {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule} end #获取节目详细信息 - def self.get_show_infomation(proxy_list,schedule_herf) + def get_show_infomation(proxy_list,schedule_herf) begin @proxyindex = 0 unless @site @site = "http://www.tvmao.com" end schedule_herf = @site + schedule_herf - doc=get_doc_with_proxy(proxy_list,schedule_herf) - #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title'] - # p "title: %s" % title + doc = get_doc_with_proxy(proxy_list,schedule_herf) type = [] name = doc.css('span[itemprop="name"]')[0].content #获取节目的图片 if doc.css('img[class="tvc"]') @@ -477,53 +622,56 @@ doc.css('a[itemprop="genre"]').each do |_type| type<<_type.content end url = "#{schedule_herf}/detail" doc = get_doc_with_proxy(proxy_list,url) - doc.css('span[itemprop="genre"]').each do |_type| - type << _type.content + if doc + doc.css('span[itemprop="genre"]').each do |_type| + type << _type.content + end end - doc.css('a[itemprop="genre"]').each do |_type| - type<<_type.content - end type.uniq! - @img_down_file.puts("#{name}:#{schedule_img_down_path}") + unless @show_schedule + @show_schedule={} + end @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name) {"type"=>type,"name"=>name,"img"=>schedule_img_down_path} - rescue => e - p "Error In get_show_infomation msg : #{e.to_s}" + #rescue => e + # p "Error In get_show_infomation msg : #{e.to_s}" end end #获取节目的时间表 - def self.get_show_schedule(proxylist,herf) + def get_show_schedule(proxylist,herf) url = herf + "/playingtime" doc = get_doc_with_proxy(proxylist,url) i = 0 schedule = [] - doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg| - unless(i==0) - time = epg.css('div[class="f1 fld"]')[0].content - channel_name = epg.css('div[class="f2 fld"]')[0].content - show_name = epg.css('div[class="f3 fld"]')[0].content - times = time.split(" ") - week = times[0] - date = times[1] - _time = times[2] - schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name} + if doc.css('div[id="epg"]')[0] + doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg| + unless(i==0) + time = epg.css('div[class="f1 fld"]')[0].content + channel_name = epg.css('div[class="f2 fld"]')[0].content + show_name = epg.css('div[class="f3 fld"]')[0].content + times = time.split(" ") + week = times[0] + date = times[1] + _time = times[2] + schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name} + end + i += 1 end - i += 1 end schedule end #获取指定访问速度的代理服务器 #time为最慢速度的时间 int型 代表秒 - def self.get_topfast_list(use_time) + def get_topfast_list(use_time) fast_list = [] time_use = 0 ips_ports = get_proxy_list() ips_ports.each do |ip_port| time_start = Time.now.to_i @@ -553,11 +701,11 @@ end fast_list end #获取代理列表 - def self.get_proxy_list() + def get_proxy_list() list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html') if list.count ==0 list = gg('http://www.proxycn.cn/html_proxy/http-1.html') end ips_ports = [] @@ -573,11 +721,11 @@ end p "Count: #{ips_ports.count}" ips_ports end - def self.gg(url) + def gg(url) regex_list = /<TD class="list">.*<\/TD>/ href =URI.parse(url) contxt = "" href.open{ |f| f.each_line {|line| contxt =contxt + line + "\n"} @@ -586,7 +734,7 @@ end def save_img end - + end end