# encoding: utf-8
require File.expand_path("../grab_base.rb", __FILE__)

module Grabepg

  # Scrapes channel lists and program schedules from tvsou.com.
  class GrabTvsou
    include Grabepg

    # Home page URL of the site being scraped
    attr_reader :home_page
    # List of proxies used for HTTP requests
    attr_reader :proxy_list
    attr_reader :grabbase
    # Channel storage: name => {url:, type:}
    attr_reader :channels
    # Schedule storage
    attr_reader :schedules
    # Minimum interval between two programs
    attr_reader :default_min_interval

    # CSS class of a home-page <li> => channel category
    # (央视 = CCTV, 卫视 = satellite, 海外 = overseas, 地方 = local)
    ChannelTypeMap = {"yg_ys_li"=>"央视", "yg_ws_li"=>"卫视", "yg_hw_li"=>"海外", "yg_df_li"=>"地方"}

    # grabtype: whether to grab the data through the mobile site or the web site
    def initialize(grabtype, proxy_list)
      @home_page  = get_url(grabtype)
      @proxy_list = proxy_list
      @grabbase   = GrabBase.new
      @channels   = {}
      @site       = "http://m.tvsou.com"
    end

    # Returns which tvsou entry URL to grab from.
    # type: "mobile" or "webpage"
    def get_url(type)
      return "http://m.tvsou.com/index.asp" if type.eql?("mobile")
    end

    # Builds the query date ("YYYY-M-D") and the display date
    # (weekday name plus "MM-D") for a given Time.
    def get_data_year_month_day(time)
      month = time.month.to_s.rjust(2, "0")
      {time: "#{time.year}-#{time.month}-#{time.day}",
       date: "#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{time.day})"}
    end

    # Builds the list of dates to grab.
    # start_time: offset in days from today
    # use_time:   number of days to grab
    def get_data(start_time, use_time)
      time = Time.now + start_time * 24 * 60 * 60
      ret = []
      use_time.times do |i|
        ret << get_data_year_month_day(time + i * 24 * 60 * 60)
      end
      ret
    end

    # Processes the home page and collects the URLs and info of the CCTV
    # and satellite channels.
    def dispose_home_page
      get_channellist = lambda { |li, type|
        channellist = {}
        li.css('a').each do |a|
          channellist[a.content] = {url: a.get_attribute("href"), type: type} unless channellist.has_key?(a.content)
        end
        channellist
      }
      doc = @grabbase.get_doc_with_proxy(@proxy_list, @home_page)
      doc.css("li").each do |li|
        case ChannelTypeMap[li.get_attribute("class")]
        when "央视"
          @channels.merge!(get_channellist.call(li, "CCTV"))
        when "卫视"
          @channels.merge!(get_channellist.call(li, "WTV"))
        when "海外"
          # overseas channels: not handled yet
        when "地方"
          # local channels: handled by dispose_channel_page
        end
      end
      @channels
    end

    # Gets the channel list from a channel-list index page.
    # url is the index page of the channel list; local (地方) channels
    # need this method. Not implemented yet.
    def dispose_channel_page(url, channel_type)
    end

    # Builds one schedule URL per requested day by replacing the
    # programDT parameter of the given href.
    def dispose_href_schedule_data(href, start_time, use_time)
      hrefs  = href.split("&programDT=")
      _hrefs = hrefs[1].split("&")
      ret = []
      get_data(start_time, use_time).each do |time|
        url = hrefs[0] + "&programDT=" + time[:time]
        1.upto(_hrefs.length - 1) do |i|
          url += "&" + _hrefs[i]
        end
        ret << {url: url, time: time[:time], date: time[:date]}
      end
      ret
    end

    # Parses the schedule pages of one channel for the requested days.
    # Returns {date => [{time:, schedule_name:, url:}, ...]}
    def dispose_schedule_page(url, start_time, use_time)
      url  = @site + "/" + url
      urls = url.split("?")
      doc  = @grabbase.get_doc_with_proxy(@proxy_list, url)
      _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
      _url = urls[0] + _url
      day_urls = dispose_href_schedule_data(_url, start_time, use_time)

      ret = {}
      last_time = -5
      last_schedule = {}
      day_urls.each do |day_url|
        p "Grab url: #{day_url}"
        next unless day_url
        doc = @grabbase.get_doc_with_proxy(@proxy_list, day_url[:url])
        schedules = []
        doc.css('div[class="time"]')[0].css("li[class='gray']").each do |schedule|
          begin
            _dispose      = schedule.content
            _dispose_show = schedule.css("span")[0].text
            time = _dispose.gsub(_dispose_show, "")
            link = schedule.css('a')[0]
            _url = link ? @site + "/" + link.get_attribute("href") : nil
            entry = {time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: _url}
            schedules << entry
            # Drop the previous program if the two start less than 5 minutes apart.
            now = time.gsub(":", "").to_i
            schedules.delete(last_schedule) if (now - last_time) < 5
            last_schedule = entry
            last_time = now
          rescue => err
            p "Schedule: #{schedule} (#{err})"
          end
        end
        ret[day_url[:date]] = schedules
      end
      ret
    end

    # Parses a program detail page and returns its name, image and synopsis.
    def dispose_show_info(url)
      doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
      show_name = doc.css('div[class="tv_info_top"]')[0].content
      _doc      = doc.css("div[class='tv_info']")
      img_url   = _doc.css("img")[0].get_attribute("src").gsub(" ", "")
      show_info = _doc.css("p")[0].content.gsub("[全文]", "")
      {show_name: show_name, img_url: img_url, show_info: show_info}
    end
  end
end