lib/grab_tvmao.rb in grab_epg-0.2.3 vs lib/grab_tvmao.rb in grab_epg-0.2.4
- old
+ new
@@ -1,11 +1,15 @@
#encoding:utf-8
require 'nokogiri'
require 'open-uri'
-module GrabTvmao
+require File.expand_path("../grabepg/grab_base.rb", __FILE__)
+require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
+
+module Grabepg
+ class GrabTvmao
# To change this template use File | Settings | File Templates.
#图片的获取: Net::HTTP.get(url)
#图片的文件类型获取:
@@ -20,14 +24,119 @@
DEFAULT_SITE = "http://www.tvmao.com"
+ def initialize # Set up per-instance helper state
+ @grabbase = GrabBase.new # GrabBase is presumably provided by the required grabepg/grab_base.rb — confirm
+ end
+
+
+ #Fill in show types (and missing logos) for a batch of schedule entries using tvmao data.
+ #channel    channel the schedule belongs to (passed through to get_schedulelist_atday)
+ #url        channel schedule URL; rewritten for the requested day by get_show_type_url
+ #date       weekday label, e.g. "星期一(05-06)"
+ #schedule   array of hashes with "schedule_start", "type" and "schedule_logo" keys; the hashes are updated in place
+ #proxylist  proxy list (passed through to get_schedulelist_atday)
+ #Returns the entries of schedule keyed by start time (a later entry with the same start time replaces an earlier one).
+ def get_show_type_by_batch(channel,url,date,schedule,proxylist)
+   # Index the caller's entries by start time as an integer ("08:30" -> 830).
+   by_start = {}
+   schedule.each do |entry|
+     by_start[entry["schedule_start"].gsub(":","").to_i] = entry
+   end
+
+   day_url = get_show_type_url(url,date)
+   get_schedulelist_atday(channel,day_url,proxylist).each do |grabbed|
+     target = by_start[grabbed["schedule_start"].gsub(":","").to_i]
+     next unless target
+
+     # Array() guards against a nil type on either side: nil | x evaluates to a boolean, not a list.
+     target["type"] = Array(target["type"]) | Array(grabbed["type"])
+
+     # Only borrow the grabbed image when we have no logo and the image is actually present.
+     logo = target["schedule_logo"]
+     img = grabbed["img"]
+     if (logo.nil? || logo.empty?) && !(img.nil? || img.empty?)
+       target["schedule_logo"] = img
+     end
+   end
+
+   by_start.values
+ end
+
+ #Look up the type list of a single show on tvmao by its start time.
+ #channel    channel the schedule belongs to (passed through to get_schedulelist_atday)
+ #url        channel schedule URL; rewritten for the requested day by get_show_type_url
+ #date       weekday label, e.g. "星期一(05-06)"
+ #time       start time of the show, e.g. "08:30"
+ #proxylist  proxy list (passed through to get_schedulelist_atday)
+ #Returns the "type" value of the last grabbed entry starting at time, or [] when there is none.
+ def get_show_type(channel,url,date,time,proxylist)
+   day_url = get_show_type_url(url,date)
+   listing = get_schedulelist_atday(channel,day_url,proxylist)
+   wanted = time.gsub(":","").to_i
+
+   # Walk the whole listing so that the last entry with a matching start time wins.
+   hit = nil
+   listing.each do |item|
+     hit = item if item["schedule_start"].gsub(":","").to_i == wanted
+   end
+
+   (hit && hit["type"]) || []
+ end
+
+ #Build the tvmao URL of a channel's schedule page for a given weekday.
+ #url   channel schedule path; only its first two "-"-separated parts are kept
+ #date  weekday label such as "星期一(05-06)"; only the text before "(" is looked at
+ #Returns "http://www.tvmao.com<part0>-<part1>-w<N>.html" where N is 1 (星期一) to 7 (星期日),
+ #or 0 when the weekday name is not recognised.
+ def get_show_type_url(url,date)
+   weekday_names = ["星期一","星期二","星期三","星期四","星期五","星期六","星期日"]
+   position = weekday_names.index(date.split("(")[0])
+   day_number = position ? position+1 : 0
+
+   # values_at pads with nil, so a path with fewer than two parts still contributes two "-" separators.
+   prefix = url.split("-").values_at(0,1).map { |part| "#{part}-" }.join
+   "http://www.tvmao.com" + prefix + "w#{day_number}.html"
+ end
+
+
+
+
+
+
#将星期的wday获取值转化为中文名
#conversion wady to chinese
- def self.conversion_what_day(whatday)
+ def conversion_what_day(whatday)
ret = "星期"
case whatday.to_i
when 1
ret += "一"
when 2
@@ -45,58 +154,58 @@
end
ret
end
#Left-pad a value shorter than two characters (e.g. 1..9) with "0"; the result is always a String
- def self.dispose_time(num)
+ def dispose_time(num)
num = num.to_s
if num.length < 2
num = "0"+num
end
num
end
#Format a Time as "<weekday name>(MM-DD)", combining conversion_what_day and dispose_time
- def self.get_week_date_time(time)
+ def get_week_date_time(time)
month = time.month
day = time.day
whatday = time.wday #Time#wday: 0 is Sunday, 1 is Monday ... 6 is Saturday
ret = conversion_what_day(whatday) + "(" + dispose_time(month) + "-"+dispose_time(day)+")"
ret
end
#Number of seconds in day_num days; subtracted from a Time to step back that many days
- def self.del_day_num(day_num)
+ def del_day_num(day_num)
ret = day_num*60*60*24
ret
end
#Date label (as produced by get_week_date_time) for the day that lies num days before now
- def self.get_time_day_prior(num)
+ def get_time_day_prior(num)
time = Time.now - del_day_num(num)
ret = get_week_date_time(time)
ret
end
#List of date labels from the previous week that should be deleted.
#Only when today is Monday (wday==1) does it return labels for today and the 7 days before it (8 entries);
#on any other day it returns an empty array.
- def self.del_time_list
+ def del_time_list
ret = []
time = Time.now
wday = time.wday
if(wday==1)
for i in 0..7
- ret<<self.get_time_day_prior(i)
+ ret<<get_time_day_prior(i)
end
end
ret
end
#调用此方法的例子
- def self.start
+ def start
#作用是获取俩个字符串的相似度
#get str1 and str2 similarity
get_similarity_string = lambda { |str1,str2|
_length = 0
type = 0
@@ -199,18 +308,18 @@
end
end
end
#Reader for @img_down_path.
#NOTE(review): the visible part of getchannels assigns @img_down_dir_path, not @img_down_path — confirm which one is intended
- def self.img_down_path
+ def img_down_path
@img_down_path
end
#获取网站的频道表
#img_path 图片存放路径
- def self.getchannels(img_dir_path)
+ def getchannels(img_dir_path)
@channel = []
@site=DEFAULT_SITE
@proxyindex = 0
@img_down_dir_path = img_dir_path
@img_down_file = File.new(File.join(img_dir_path,"channel_img_down_path"),'w+')
@@ -250,43 +359,80 @@
@img_down_file.close
p "Channel: #{@channel}"
{"channel_info"=>channel_info,"channel_urls"=>channel_urls}
end
- #使用代理获取url的html的doc值
- def self.get_doc_with_proxy(proxylist,url)
- unless @proxyindex
- @proxyindex = 0
- end
- @proxyindex=@proxyindex%proxylist.size
- if(proxylist[@proxyindex])
- proxy = proxylist[@proxyindex]
- else
- proxy = proxylist[@proxyindex+1]
- end
- begin
- doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
- @no_firest = 0
- rescue => err
+ #Recover from a failed fetch through a proxy.
+ #proxy      the proxy that was tried (may be nil or empty)
+ #proxylist  proxy list; an unusable (nil/empty) entry at @proxyindex is removed from it
+ #url        the URL that was being fetched
+ #err        description of the failure, used in the log line and the raised message
+ #Moves on to the next proxy and retries through get_doc_with_proxy while fewer than 4 attempts
+ #have been counted in @no_firest; after that the counter is reset and RuntimeError is raised.
+ #Returns the document produced by the retry.
+ def err_doc_proxy(proxy,proxylist,url="",err="")
+ if proxy.nil?||proxy.empty?
+ proxylist.delete_at(@proxyindex)
+ end
+
+
unless @no_firest
@no_firest = 0
end
@no_firest += 1
- p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}"
+ p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
#proxylist.delete(proxy) #Remove the failing proxy — but if the page itself is at fault this would cause a bug; still to be fixed
- get_doc_with_proxy(proxylist,url) if @no_firest<4
- raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
+ @proxyindex += 1
+ @proxyindex=@proxyindex%@size
+ doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
+ unless @no_firest<4
+ @no_firest=0
+ raise RuntimeError,"Error: #{err}"
+ end
+ doc
end
- @proxyindex += 1
- doc
- end
+
+ #Fetch url and parse it into a Nokogiri HTML document, going through a proxy from proxylist when the list is non-empty
+ def get_doc_with_proxy(proxylist,url)
+ unless proxylist.nil?||proxylist.empty?
+ unless @proxyindex
+ @proxyindex = 0
+ end
+ @size = proxylist.size #remembered for the modulo wrap-around below and in err_doc_proxy
+ @proxyindex=@proxyindex%proxylist.size
+ if(proxylist[@proxyindex])
+ proxy = proxylist[@proxyindex]
+ else
+ proxy = proxylist[@proxyindex+1] #NOTE(review): no bounds check, so this may still be nil
+ end
+ begin
+ doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
+ if doc.nil? #also reached when the fetch above was skipped because proxy is nil/empty
+ p "DOC is nil"
+ doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
+ @no_firest=0
+ end
+ @no_firest = 0
+ rescue => err
+ p "IN Rescue"
+ doc=err_doc_proxy(proxy,proxylist,url,err.to_s) #err_doc_proxy re-enters this method while @no_firest<4, otherwise raises
+ @no_firest=0
+ p "Get DOC"
+ @proxyindex += 1
+ @proxyindex=@proxyindex%@size
+ return doc
+ end
+ @proxyindex += 1 #rotate to the next proxy for the following request
+ @proxyindex=@proxyindex%@size
+ else
+ begin
+ doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty? #proxy is never assigned on this branch, so it is nil and the direct fetch always runs
+ rescue => err
+ p "Error : Proxy:#{proxy}, url:#{url}"
+ raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
+ end
+ end
+ doc
+ end
+
#获取某天的节目表
- def self.get_schedulelist_atday(channel,url,proxylist)
+ def get_schedulelist_atday(channel,url,proxylist)
p "Grab: #{url}"
doc = get_doc_with_proxy(proxylist,url)
show_type = []
@@ -320,10 +466,11 @@
if schedule.content.split(" ").size>1
time = schedule.content.split(" ")[0]
schedule = schedule.content.split(" ")[1]
show_name = ""
unless schedule_herf.nil?||schedule_herf.empty?
+ p "Show_infomation:#{schedule_herf} Time:#{time}"
show_infomation=get_show_infomation(proxylist,schedule_herf)
show_type=show_infomation["type"]
show_name = show_infomation["name"]
show_img = show_infomation["img"]
end
@@ -335,11 +482,11 @@
end
#获取制定时间和长度url
#start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
#day_num 为int型 代表抓取的时间从开始时间计算的多少天
- def self.get_assign_date_url(url,start_time,day_num)
+ def get_assign_date_url(url,start_time,day_num)
site="http://www.tvmao.com"
if(@site)
site=@site
end
@@ -371,11 +518,11 @@
end
#获取指定时间段的节目表
- def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
+ def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
begin
day_num = 1 if day_num<1
rescue
day_num = 1
end
@@ -392,11 +539,11 @@
@show_schedule = {}
channel_schedule = {}
get_assign_date_url(herf,start_num,day_num).each do |url|
@date = ""
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
end
@img_down_file.close
{"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
end
@@ -405,11 +552,11 @@
#因原已调用所以保留
#获取一周节目表
- def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
+ def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
p "Day Num is #{day_num}"
begin
day_num = 1 if day_num<1
rescue
day_num = 1
@@ -440,29 +587,27 @@
}
channel_schedule = {}
get_week_url.call(herf,day_num).each do |url|
@date = ""
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
end
@img_down_file.close
{"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
end
#获取节目详细信息
- def self.get_show_infomation(proxy_list,schedule_herf)
+ def get_show_infomation(proxy_list,schedule_herf)
begin
@proxyindex = 0
unless @site
@site = "http://www.tvmao.com"
end
schedule_herf = @site + schedule_herf
- doc=get_doc_with_proxy(proxy_list,schedule_herf)
- #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
- # p "title: %s" % title
+ doc = get_doc_with_proxy(proxy_list,schedule_herf)
type = []
name = doc.css('span[itemprop="name"]')[0].content
#获取节目的图片
if doc.css('img[class="tvc"]')
@@ -477,53 +622,56 @@
doc.css('a[itemprop="genre"]').each do |_type|
type<<_type.content
end
url = "#{schedule_herf}/detail"
doc = get_doc_with_proxy(proxy_list,url)
- doc.css('span[itemprop="genre"]').each do |_type|
- type << _type.content
+ if doc
+ doc.css('span[itemprop="genre"]').each do |_type|
+ type << _type.content
+ end
end
- doc.css('a[itemprop="genre"]').each do |_type|
- type<<_type.content
- end
type.uniq!
- @img_down_file.puts("#{name}:#{schedule_img_down_path}")
+ unless @show_schedule
+ @show_schedule={}
+ end
@show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
{"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
- rescue => e
- p "Error In get_show_infomation msg : #{e.to_s}"
+ #rescue => e
+ # p "Error In get_show_infomation msg : #{e.to_s}"
end
end
#Fetch a show's air-time table from its "/playingtime" page.
#Returns an array of hashes with "week", "date", "time", "channel_name" and "show_name" keys.
- def self.get_show_schedule(proxylist,herf)
+ def get_show_schedule(proxylist,herf)
url = herf + "/playingtime"
doc = get_doc_with_proxy(proxylist,url)
i = 0
schedule = []
- doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
- unless(i==0)
- time = epg.css('div[class="f1 fld"]')[0].content
- channel_name = epg.css('div[class="f2 fld"]')[0].content
- show_name = epg.css('div[class="f3 fld"]')[0].content
- times = time.split(" ")
- week = times[0]
- date = times[1]
- _time = times[2]
- schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
+ if doc.css('div[id="epg"]')[0] #the page may have no epg block; then the result stays empty
+ doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
+ unless(i==0) #the first row is skipped — presumably a header row, confirm against the page markup
+ time = epg.css('div[class="f1 fld"]')[0].content
+ channel_name = epg.css('div[class="f2 fld"]')[0].content
+ show_name = epg.css('div[class="f3 fld"]')[0].content
+ times = time.split(" ")
+ week = times[0]
+ date = times[1]
+ _time = times[2]
+ schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
+ end
+ i += 1
end
- i += 1
end
schedule
end
#获取指定访问速度的代理服务器
#time为最慢速度的时间 int型 代表秒
- def self.get_topfast_list(use_time)
+ def get_topfast_list(use_time)
fast_list = []
time_use = 0
ips_ports = get_proxy_list()
ips_ports.each do |ip_port|
time_start = Time.now.to_i
@@ -553,11 +701,11 @@
end
fast_list
end
#获取代理列表
- def self.get_proxy_list()
+ def get_proxy_list()
list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
if list.count ==0
list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
end
ips_ports = []
@@ -573,11 +721,11 @@
end
p "Count: #{ips_ports.count}"
ips_ports
end
- def self.gg(url)
+ def gg(url)
regex_list = /<TD class="list">.*<\/TD>/
href =URI.parse(url)
contxt = ""
href.open{ |f|
f.each_line {|line| contxt =contxt + line + "\n"}
@@ -586,7 +734,7 @@
end
#Placeholder: the body is empty, so calling it does nothing and returns nil
def save_img
end
-
+ end
end