lib/webmaster_tools.rb in webmaster_tools-0.1.1 vs lib/webmaster_tools.rb in webmaster_tools-0.1.2

- old
+ new

@@ -6,23 +6,34 @@
# required parameters:
#
# :username - google username or email
# :password - password in plaintext
class WebmasterTools
-   LOGIN = "https://accounts.google.com/ServiceLogin?service=sitemaps"
-   AUTH = "https://accounts.google.com/ServiceLoginAuth"
-   REMOVAL = "https://www.google.com/webmasters/tools/removals-request?hl=en&siteUrl=%s&urlt=%s"
-   INFO = "https://www.google.com/webmasters/tools/sitemaps-dl?hl=en&siteUrl=%s&security_token=%s"
-   DASHBOARD = "https://www.google.com/webmasters/tools/dashboard?hl=en&siteUrl=%s"
-   ERRORS = "https://www.google.com/webmasters/tools/crawl-errors?hl=en&siteUrl=%s"
-   STATS = "https://www.google.com/webmasters/tools/crawl-stats?hl=en&siteUrl=%s"
-   TOKEN = "https://www.google.com/webmasters/tools/gwt/SITEMAPS_READ"
-   GWT = "https://www.google.com/webmasters/tools/gwt/"
-   GWT_PERM = "E3DA43109D05B1A5067480CE25494CC2"
+   LOGIN     = "https://accounts.google.com/ServiceLogin?service=sitemaps"
+   AUTH      = "https://accounts.google.com/ServiceLoginAuth"
-   PAYLOAD = "7|0|11|%s|3EA173CEE6992CFDEAB5C18469B06594|com.google.crawl.wmconsole.fe.feature.gwt.sitemaps.shared.SitemapsService|getDataForMainPage|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|Z|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|3|5|6|6|5|7|8|5|9|10|11|5|1|0|"
+   DASHBOARD = "https://www.google.com/webmasters/tools/dashboard?hl=en&siteUrl=%s"
+   STATS     = "https://www.google.com/webmasters/tools/crawl-stats?hl=en&siteUrl=%s"
+   SUGGESTS  = "https://www.google.com/webmasters/tools/html-suggestions?hl=en&siteUrl=%s"
+   REMOVAL   = "https://www.google.com/webmasters/tools/removals-request?hl=en&siteUrl=%s&urlt=%s"
+   GWT_URL   = "https://www.google.com/webmasters/tools/gwt/"
+   GWT = {
+     :info => {
+       :action => "SITEMAPS_READ",
+       :perm => "E3DA43109D05B1A5067480CE25494CC2",
+       :data => "7|0|11|%s|3EA173CEE6992CFDEAB5C18469B06594|com.google.crawl.wmconsole.fe.feature.gwt.sitemaps.shared.SitemapsService|getDataForMainPage|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|Z|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|3|5|6|6|5|7|8|5|9|10|11|5|1|0|",
+       :dl => "https://www.google.com/webmasters/tools/sitemaps-dl?hl=en&siteUrl=%s&security_token=%s",
+     },
+     :error => {
+       :action => "CRAWLERRORS_READ",
+       :perm => "E3DA43109D05B1A5067480CE25494CC2", #"2367B7971367CA7B851B969834DDB639",
+       :data => "7|0|10|%s|1AC39E92A6D484F754108CEEAACB245D|com.google.crawl.wmconsole.fe.feature.gwt.crawlerrors.shared.CrawlErrorsService|getSiteLevelData|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|1|5|5|6|7|1|8|9|10|5|",
+       :dl => "https://www.google.com/webmasters/tools/crawl-errors-new-dl?hl=en&siteUrl=%s&security_token=%s",
+     }
+   }
+
  def initialize(username, password)
    login(username, password)
  end

  def login(username, password)
@@ -40,37 +51,12 @@
    page.search("#sitemap tbody .rightmost").map do |node|
      { :indexed_web => node.text.gsub(/\D/, '').to_i }
    end
  end

-   def security_token(url)
-     # looks like `crawl_error_counts(url)` contains the security_token as well (if data available)...
-     dashboard(url) # to trigger referer
-     url = norm_url(url)
-     page = agent.post(TOKEN, PAYLOAD % [GWT, url], {
-       "X-GWT-Module-Base" => GWT,
-       "X-GWT-Permutation" => GWT_PERM,
-       "Content-Type" => "text/x-gwt-rpc; charset=utf-8",
-     })
-     page.content.scan(/security_token=([^"]+)/).flatten.first
-   end
+   ################
-   def crawl_info(url)
-     token = security_token(url)
-     url = CGI::escape norm_url(url)
-     page = agent.get(INFO % [url, token])
-
-     lines = page.content.split("\n").map do |line|
-       line.split(",")
-     end
-     head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
-
-     $lines = lines.map do |line|
-       Hash[head.zip(line)]
-     end
-   end
-
  def crawl_stats(url)
    url = CGI::escape norm_url(url)
    types = %w(pages kilobytes milliseconds).map(&:to_sym)
    head = %w(high avg low).map(&:to_sym)
@@ -81,33 +67,102 @@
    end.each_slice(3).map do |slice|
      Hash[head.zip(slice)]
    end)]
  end

-   def crawl_error_counts(url)
+   def suggests(url)
    url = CGI::escape norm_url(url)
-     page = agent.get(ERRORS % url)
+     page = agent.get(SUGGESTS % url)

-     page.search(".categories a").inject({}) do |hash, n|
-       key, value = n.text.split("\n")
-       hash[key.downcase.gsub(' ', '_').to_sym] = value.gsub(/\D/, '').to_i
+     page.search(".g-section tr").inject({}) do |hash, n|
+       if (key = n.search("a").first) && (value = n.search(".pages").first)
+         hash[to_key(key.text)] = to_value(value.text)
+       end
      hash
    end
  end

  def remove_url(url_with_file)
-     url = CGI::escape norm_url(url_with_file)
-     page = agent.get(REMOVAL % [url, CGI::escape(url_with_file)])
-     page = agent.submit page.form
-     raise "could not submit URL" unless page.search(".wmt-external-url").map(&:text).include?(url_with_file)
+     url   = CGI::escape norm_url(url_with_file)
+     page  = agent.get(REMOVAL % [url, CGI::escape(url_with_file)])
+     page  = agent.submit page.form
+     files = page.search(".wmt-external-url").map { |n| File.basename(n.text) }
+     raise "could not submit URL" unless files.include?(File.basename(url_with_file))
  end

+   ###########################
+
+   def crawl_info(url)
+     url = norm_url(url)
+     token = security_token(:info, url)
+     page = agent.get(GWT[:info][:dl] % [CGI::escape(url), token])
+
+     lines = page.content.split("\n").map do |line|
+       line.split(",")
+     end
+     head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
+
+     lines.map do |line|
+       Hash[head.zip(line)]
+     end
+   end
+
+   def crawl_error_counts(url, split = false)
+     url = norm_url(url)
+     token = security_token(:error, url)
+     page = agent.get(GWT[:error][:dl] % [CGI::escape(url), token])
+
+     lines = page.content.split("\n").map do |line|
+       line.split(",")
+     end
+     head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
+
+     errors = lines.inject({}) do |hash, line|
+       url, response_code, _, detected, category = *line
+       detected = "20#{$3}-#{'%02d' % $1.to_i}-#{'%02d' % $2.to_i}" if /(\d{1,2})\/(\d{1,2})\/(\d{2})/ =~ detected
+       unless category.to_s.empty?
+         sub_hash = split ? (hash[detected] ||= {}) : hash
+         sub_hash[category] ||= 0
+         sub_hash[category] += 1
+       end
+       hash
+     end
+     Hash[errors.sort { |a,b| a[0] <=> b[0] }]
+   end
+
  private

  def agent
    @agent ||= Mechanize.new
  end

  def norm_url(url)
    schema, host, _ = url.scan(/^(https?:\/\/)?(.+?)(\/.*)?$/).flatten
    "#{schema || 'http://'}#{host}/"
  end
+
+   def security_token(action, url)
+     dashboard(url) # to trigger referer
+     page = agent.post(GWT_URL + GWT[action][:action], GWT[action][:data] % [GWT_URL, url], {
+       "X-GWT-Module-Base" => GWT_URL,
+       "X-GWT-Permutation" => GWT[action][:perm],
+       "Content-Type" => "text/x-gwt-rpc; charset=utf-8",
+     })
+     page.content.scan(/security_token=([^"]+)/).flatten.first
+   end
+
+   def to_key(key)
+     key.downcase.gsub(' ', '_').to_sym
+   end
+
+   def to_value(value)
+     value.gsub(/\D/, '').to_i
+   end
+
+   # def security_token2(url)
+   #   url = CGI::escape norm_url(url)
+   #   page = agent.get(REMOVAL % [url, url])
+   #   page.form.fields.select do |field|
+   #     field.name == "security_token"
+   #   end.first.value
+   # end
end
+
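
For orientation, a minimal usage sketch of the public API as it stands after this change. The credentials and site URL below are placeholders, and the return shapes noted in the comments are inferred from the scraping code in the diff above rather than from documented behaviour.

    require "webmaster_tools"

    # Placeholder credentials -- a Google account with Webmaster Tools access is assumed.
    wt   = WebmasterTools.new("user@example.com", "secret")
    site = "http://example.com/"

    wt.dashboard(site)                # e.g. [{ :indexed_web => 123 }], scraped from the dashboard page
    wt.crawl_stats(site)              # { :pages => { :high => ..., :avg => ..., :low => ... }, :kilobytes => ..., :milliseconds => ... }
    wt.suggests(site)                 # HTML-suggestion counts keyed by category symbol
    wt.crawl_info(site)               # sitemap rows downloaded via GWT[:info][:dl]
    wt.crawl_error_counts(site)       # { "category" => count, ... }
    wt.crawl_error_counts(site, true) # same counts, grouped by detection date
    wt.remove_url("http://example.com/old-page.html")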