grab_tvmao.rb in grab_epg-0.2.4

- old
+ new

@@ -1,11 +1,15 @@
 #encoding:utf-8
 
 require 'nokogiri'
 require 'open-uri'
 
-module GrabTvmao
+require File.expand_path("../grabepg/grab_base.rb", __FILE__)
+require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
+
+module Grabepg
+  class GrabTvmao
   # To change this template use File | Settings | File Templates.
 
 
   #图片的获取： Net::HTTP.get(url)
   #图片的文件类型获取：
@@ -20,14 +24,119 @@
   DEFAULT_SITE = "http://www.tvmao.com"
 
 
 
 
+  # Creates the scraper and stores a new GrabBase helper in @grabbase.
+  # NOTE(review): GrabBase is presumably provided by grabepg/grab_base.rb (required at the top) — confirm.
+  def initialize
+    @grabbase = GrabBase.new
+  end
 
+
+
+  # Merges show-type and logo data scraped from tvmao into an existing schedule list.
+  #
+  # channel   - channel the schedule belongs to (passed through to get_schedulelist_atday)
+  # url       - channel page URL; rewritten by get_show_type_url for the requested weekday
+  # date      - date label whose weekday selects the page
+  # schedule  - array of hashes to enrich; each needs "schedule_start" and may carry
+  #             "type" and "schedule_logo"
+  # proxylist - proxy list handed to the page fetcher
+  #
+  # Returns the enriched entries as an array. Entries are indexed by start time, so
+  # two input entries with the same start time collapse into the later one.
+  def get_show_type_by_batch(channel,url,date,schedule,proxylist)
+    by_start = {}
+    schedule.each do |entry|
+      by_start[entry["schedule_start"].gsub(":","").to_i] = entry
+    end
+
+    day_url = get_show_type_url(url,date)
+    get_schedulelist_atday(channel,day_url,proxylist).each do |scraped|
+      entry = by_start[scraped["schedule_start"].gsub(":","").to_i]
+      next unless entry
+
+      # Array() guards against a missing "type" on either side: nil | array would
+      # store a boolean and array | nil would raise TypeError.
+      entry["type"] = Array(entry["type"]) | Array(scraped["type"])
+
+      # Only fill a missing/empty logo, and only with a usable image path.
+      if entry["schedule_logo"].to_s.empty? && !scraped["img"].to_s.empty?
+        entry["schedule_logo"] = scraped["img"]
+      end
+    end
+
+    by_start.values
+  end
+
+  # Looks up the show type of a single programme on tvmao.
+  #
+  # channel   - channel the schedule belongs to
+  # url       - channel page URL; rewritten by get_show_type_url for the requested weekday
+  # date      - date label whose weekday selects the page
+  # time      - programme start time, compared after removing ":" (e.g. "19:30")
+  # proxylist - proxy list handed to the page fetcher
+  #
+  # Returns the "type" of the last scraped entry that starts at +time+, or [] when
+  # nothing matches or the match carries no type.
+  def get_show_type(channel,url,date,time,proxylist)
+    schedules = get_schedulelist_atday(channel,get_show_type_url(url,date),proxylist)
+    wanted = time.gsub(":","").to_i
+    hits = schedules.select { |entry| entry["schedule_start"].gsub(":","").to_i == wanted }
+    found = hits.last
+    (found && found["type"]) || []
+  end
+
+  # Builds the tvmao weekly-schedule URL for the weekday named in +date+.
+  #
+  # url  - site-relative channel path; only its first two "-"-separated parts are kept
+  # date - label such as "星期三(05-08)"; the text before "(" selects the weekday
+  #
+  # Returns "http://www.tvmao.com<part0>-<part1>-w<N>.html" where N is 1 (Monday) to
+  # 7 (Sunday), or 0 when the weekday name is not recognised.
+  def get_show_type_url(url,date)
+    weekday_numbers = {
+      "星期一"=>1, "星期二"=>2, "星期三"=>3, "星期四"=>4,
+      "星期五"=>5, "星期六"=>6, "星期日"=>7
+    }
+    day_number = weekday_numbers.fetch(date.split("(")[0], 0)
+    parts = url.split("-")
+    "http://www.tvmao.com" + "#{parts[0]}-#{parts[1]}-" + "w#{day_number}.html"
+  end
+
+
+
+
+
+
 #将星期的wday获取值转化为中文名
#conversion wday to chinese
-  def self.conversion_what_day(whatday)
+  def conversion_what_day(whatday)
     ret = "星期"
     case whatday.to_i
       when 1
         ret += "一"
       when 2
@@ -45,58 +154,58 @@
     end
     ret
   end
 
  # Converts the value to a String and prefixes "0" when it is shorter than two
  # characters (e.g. 5 -> "05"); longer values are returned unchanged.
-  def self.dispose_time(num)
+  def dispose_time(num)
     num = num.to_s
     if num.length < 2
       num = "0"+num
     end
     num
   end
 
  # Formats a time as "<weekday name>(MM-DD)", e.g. "星期一(05-06)", using
  # conversion_what_day for the weekday and dispose_time for zero padding.
  # NOTE(review): Time#wday is 0 on Sunday; the branch of conversion_what_day that
  # would handle 0 is not visible in this excerpt — confirm Sunday is rendered correctly.
-  def self.get_week_date_time(time)
+  def get_week_date_time(time)
     month = time.month
     day = time.day
     whatday = time.wday
     ret = conversion_what_day(whatday) + "(" + dispose_time(month) + "-"+dispose_time(day)+")"
     ret
   end
 
  # Converts a number of days into seconds (day_num * 60 * 60 * 24) so it can be
  # subtracted from a Time.
-  def self.del_day_num(day_num)
+  def del_day_num(day_num)
     ret = day_num*60*60*24
     ret
   end
 
  # Returns the date label produced by get_week_date_time for the moment +num+ days
  # before the current time.
-  def self.get_time_day_prior(num)
+  def get_time_day_prior(num)
     time = Time.now - del_day_num(num)
     ret = get_week_date_time(time)
     ret
   end
 
  # Lists the date labels of the previous week that should be deleted.
  # Only when today is Monday (wday == 1) does it return labels for offsets 0..7
  # (today plus the seven preceding days, eight entries); otherwise it returns [].
-  def self.del_time_list
+  def del_time_list
     ret = []
     time = Time.now
     wday = time.wday
     if(wday==1)
       for i in 0..7
-        ret<<self.get_time_day_prior(i)
+        ret<<get_time_day_prior(i)
       end
     end
     ret
  end
 
 
 
 
   #调用此方法的例子
-  def self.start
+  def start
     #作用是获取俩个字符串的相似度
     #get str1 and str2 similarity
     get_similarity_string = lambda { |str1,str2|
       _length = 0
       type = 0
@@ -199,18 +308,18 @@
 
         end
       end
     end
 
-  def self.img_down_path
+  # Reader for @img_down_path.
+  # NOTE(review): no assignment to @img_down_path is visible in this excerpt
+  # (getchannels sets @img_down_dir_path instead) — confirm this is not always nil.
+  def img_down_path
     @img_down_path
   end
 
 
   #获取网站的频道表
   #img_path 图片存放路径
-  def self.getchannels(img_dir_path)
+  def getchannels(img_dir_path)
     @channel = []
     @site=DEFAULT_SITE
     @proxyindex = 0
     @img_down_dir_path = img_dir_path
     @img_down_file = File.new(File.join(img_dir_path,"channel_img_down_path"),'w+')
@@ -250,43 +359,80 @@
     @img_down_file.close
     p "Channel: #{@channel}"
     {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
   end
 
-  #使用代理获取url的html的doc值
-  def self.get_doc_with_proxy(proxylist,url)
-    unless @proxyindex
-      @proxyindex = 0
-    end
-    @proxyindex=@proxyindex%proxylist.size
-    if(proxylist[@proxyindex])
-      proxy = proxylist[@proxyindex]
-    else
-      proxy = proxylist[@proxyindex+1]
-    end
-    begin
-      doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
-      doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
-      @no_firest = 0
-    rescue => err
 
+    # Handles a failed fetch through +proxy+: drops an unusable proxy entry,
+    # advances to the next proxy and retries via get_doc_with_proxy.
+    #
+    # proxy     - proxy that was just tried (may be nil or empty)
+    # proxylist - proxy list; when +proxy+ is nil/empty the entry at @proxyindex is removed
+    # url       - page being fetched
+    # err       - description of the failure, used in the log line and the raised error
+    #
+    # Returns the document obtained by the retry. Raises RuntimeError once
+    # @no_firest reaches 4 failures (the counter is reset before raising).
+    def err_doc_proxy(proxy,proxylist,url="",err="")
+      # nil must be tested first (nil has no #empty?), and delete_at takes its
+      # index as an argument rather than through [].
+      if proxy.nil?||proxy.empty?
+        proxylist.delete_at(@proxyindex)
+      end
+
 
       unless @no_firest
         @no_firest = 0
       end
 
       @no_firest += 1
-      p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}"
+      p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
       #proxylist.delete(proxy)    # would drop the failing proxy, but a page-side error would then remove a good proxy (known issue, unfixed)
-      get_doc_with_proxy(proxylist,url) if @no_firest<4
-      raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
+      @proxyindex += 1
+      @proxyindex=@proxyindex%@size
+      doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
+      unless @no_firest<4
+        @no_firest=0
+        raise RuntimeError,"Error: #{err}"
+      end
+      doc
     end
-    @proxyindex += 1
-    doc
-  end
 
+
+    # Downloads +url+ and parses it with Nokogiri::HTML, rotating through the
+    # entries of +proxylist+.
+    #
+    # proxylist - indexable list of proxies (each is passed as open's :proxy option);
+    #             nil or empty means the page is fetched directly
+    # url       - page to download
+    #
+    # Returns the parsed document. With proxies, a failure is handed to
+    # err_doc_proxy, which retries and may raise RuntimeError; without proxies a
+    # failure is re-raised as RuntimeError immediately.
+    # NOTE(review): Kernel#open on a URL depends on open-uri and is not supported by
+    # newer Ruby versions (URI.open replaces it) — confirm the targeted Ruby version.
+    def get_doc_with_proxy(proxylist,url)
+      unless proxylist.nil?||proxylist.empty?
+        unless @proxyindex
+          @proxyindex = 0
+        end
+        # @size is read back by err_doc_proxy for its own modulo step.
+        @size = proxylist.size
+        @proxyindex=@proxyindex%proxylist.size
+        if(proxylist[@proxyindex])
+          proxy = proxylist[@proxyindex]
+        else
+          # NOTE(review): @proxyindex+1 can point past the end of the list, leaving proxy nil.
+          proxy = proxylist[@proxyindex+1]
+        end
+        begin
+          # doc stays nil when the chosen proxy is nil/empty; the check below then
+          # treats that as a failure and delegates to err_doc_proxy.
+          doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
+          if doc.nil?
+            p "DOC is nil"
+            doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
+            @no_firest=0
+          end
+          @no_firest = 0
+        rescue => err
+          # Any StandardError from the fetch (or from the nil-doc path above) is retried
+          # through err_doc_proxy; an error raised by that call propagates to the caller.
+          p "IN Rescue"
+          doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
+          @no_firest=0
+          p "Get DOC"
+          @proxyindex += 1
+          @proxyindex=@proxyindex%@size
+          return doc
+        end
+        # Move on to the next proxy for the following request.
+        @proxyindex += 1
+        @proxyindex=@proxyindex%@size
+      else
+        begin
+          # proxy is never assigned on this path, so it is nil here and the
+          # modifier condition below is always true.
+          doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty?
+        rescue => err
+          p "Error : Proxy:#{proxy}, url:#{url}"
+          raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
+        end
+      end
+      doc
+    end
+
   #获取某天的节目表
-  def self.get_schedulelist_atday(channel,url,proxylist)
+  def get_schedulelist_atday(channel,url,proxylist)
     p "Grab: #{url}"
     doc = get_doc_with_proxy(proxylist,url)
     show_type = []
 
 
@@ -320,10 +466,11 @@
       if schedule.content.split(" ").size>1
         time = schedule.content.split(" ")[0]
         schedule = schedule.content.split(" ")[1]
         show_name = ""
         unless schedule_herf.nil?||schedule_herf.empty?
+          p "Show_infomation:#{schedule_herf} Time:#{time}"
           show_infomation=get_show_infomation(proxylist,schedule_herf)
           show_type=show_infomation["type"]
           show_name = show_infomation["name"]
           show_img = show_infomation["img"]
         end
@@ -335,11 +482,11 @@
   end
 
   #获取制定时间和长度url
   #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
   #day_num 为int型 代表抓取的时间从开始时间计算的多少天
-  def self.get_assign_date_url(url,start_time,day_num)
+  def get_assign_date_url(url,start_time,day_num)
     site="http://www.tvmao.com"
     if(@site)
       site=@site
     end
 
@@ -371,11 +518,11 @@
   end
 
 
 
   #获取指定时间段的节目表
-  def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
+  def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
     begin
       day_num = 1 if day_num<1
     rescue
       day_num = 1
     end
@@ -392,11 +539,11 @@
     @show_schedule = {}
 
     channel_schedule = {}
     get_assign_date_url(herf,start_num,day_num).each do |url|
       @date = ""
-      schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
+      schedule_list = get_schedulelist_atday(channel,url,proxylist)
       channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
     end
     @img_down_file.close
     {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
   end
@@ -405,11 +552,11 @@
 
 
 
   #因原已调用所以保留
   #获取一周节目表
-  def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
+  def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
     p "Day Num is #{day_num}"
     begin
       day_num = 1 if day_num<1
     rescue
       day_num = 1
@@ -440,29 +587,27 @@
     }
 
     channel_schedule = {}
     get_week_url.call(herf,day_num).each do |url|
       @date = ""
-      schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
+      schedule_list = get_schedulelist_atday(channel,url,proxylist)
       channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
     end
     @img_down_file.close
     {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
   end
 
 
   #获取节目详细信息
-  def self.get_show_infomation(proxy_list,schedule_herf)
+  def get_show_infomation(proxy_list,schedule_herf)
     begin
     @proxyindex = 0
     unless @site
       @site = "http://www.tvmao.com"
     end
     schedule_herf = @site + schedule_herf
-    doc=get_doc_with_proxy(proxy_list,schedule_herf)
-    #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
-   # p "title: %s" % title
+    doc = get_doc_with_proxy(proxy_list,schedule_herf)
     type = []
     name = doc.css('span[itemprop="name"]')[0].content
 
     #获取节目的图片
     if doc.css('img[class="tvc"]')
@@ -477,53 +622,56 @@
     doc.css('a[itemprop="genre"]').each do |_type|
       type<<_type.content
     end
     url = "#{schedule_herf}/detail"
     doc = get_doc_with_proxy(proxy_list,url)
-    doc.css('span[itemprop="genre"]').each do |_type|
-      type << _type.content
+    if doc
+      doc.css('span[itemprop="genre"]').each do |_type|
+        type << _type.content
+      end
     end
-    doc.css('a[itemprop="genre"]').each do |_type|
-      type<<_type.content
-    end
     type.uniq!
-    @img_down_file.puts("#{name}:#{schedule_img_down_path}")
+    unless @show_schedule
+      @show_schedule={}
+    end
     @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
     {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
-    rescue => e
-      p "Error In get_show_infomation msg : #{e.to_s}"
+    #rescue => e
+    #  p "Error In get_show_infomation msg : #{e.to_s}"
     end
   end
 
   # Scrapes the broadcast timetable of a show from its "<herf>/playingtime" page.
   # Returns an array of hashes with the keys "week", "date", "time",
   # "channel_name" and "show_name"; the array is empty when nothing is parsed.
-  def self.get_show_schedule(proxylist,herf)
+  def get_show_schedule(proxylist,herf)
     url = herf + "/playingtime"
     doc = get_doc_with_proxy(proxylist,url)
     i = 0
     schedule = []
-    doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
-      unless(i==0)
-        time = epg.css('div[class="f1 fld"]')[0].content
-        channel_name = epg.css('div[class="f2 fld"]')[0].content
-        show_name = epg.css('div[class="f3 fld"]')[0].content
-        times = time.split(" ")
-        week = times[0]
-        date = times[1]
-        _time = times[2]
-        schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
+    # Guard added in the new version: skip parsing when the page has no div#epg.
+    if  doc.css('div[id="epg"]')[0]
+      doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
+        # The first matching row is skipped — presumably a header row; verify against the page.
+        unless(i==0)
+          time = epg.css('div[class="f1 fld"]')[0].content
+          channel_name = epg.css('div[class="f2 fld"]')[0].content
+          show_name = epg.css('div[class="f3 fld"]')[0].content
+          # The time cell is split on spaces into week, date and clock time.
+          times = time.split(" ")
+          week = times[0]
+          date = times[1]
+          _time = times[2]
+          schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
+        end
+        i += 1
       end
-      i += 1
     end
     schedule
   end
 
 
 
 
   #获取指定访问速度的代理服务器
   #time为最慢速度的时间 int型 代表秒
-  def self.get_topfast_list(use_time)
+  def get_topfast_list(use_time)
     fast_list = []
     time_use = 0
     ips_ports = get_proxy_list()
     ips_ports.each do |ip_port|
       time_start = Time.now.to_i
@@ -553,11 +701,11 @@
     end
     fast_list
   end
 
   #获取代理列表
-  def self.get_proxy_list()
+  def get_proxy_list()
     list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
     if list.count ==0
       list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
     end
     ips_ports = []
@@ -573,11 +721,11 @@
     end
     p "Count: #{ips_ports.count}"
     ips_ports
   end
 
-  def self.gg(url)
+  def gg(url)
     regex_list = /<TD class="list">.*<\/TD>/
     href =URI.parse(url)
     contxt = ""
     href.open{ |f|
       f.each_line {|line| contxt =contxt + line + "\n"}
@@ -586,7 +734,7 @@
   end
 
   # Placeholder with an empty body; no image-saving logic is implemented here.
   def save_img
 
   end
-
+ end
 end