lib/webmaster_tools.rb in webmaster_tools-0.1.1 vs lib/webmaster_tools.rb in webmaster_tools-0.1.2
- old
+ new
@@ -6,23 +6,34 @@
# required parameters:
#
#   :username - Google username or email
#   :password - password in plaintext
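#
# hypothetical usage (credentials and site URL are placeholders):
#
#   wmt = WebmasterTools.new("user@example.com", "secret")
#   wmt.crawl_stats("example.com")        # => { :pages => { :high => ..., :avg => ..., :low => ... }, ... }
#   wmt.crawl_error_counts("example.com") # => hash of error category => count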
class WebmasterTools
- LOGIN = "https://accounts.google.com/ServiceLogin?service=sitemaps"
- AUTH = "https://accounts.google.com/ServiceLoginAuth"
- REMOVAL = "https://www.google.com/webmasters/tools/removals-request?hl=en&siteUrl=%s&urlt=%s"
- INFO = "https://www.google.com/webmasters/tools/sitemaps-dl?hl=en&siteUrl=%s&security_token=%s"
- DASHBOARD = "https://www.google.com/webmasters/tools/dashboard?hl=en&siteUrl=%s"
- ERRORS = "https://www.google.com/webmasters/tools/crawl-errors?hl=en&siteUrl=%s"
- STATS = "https://www.google.com/webmasters/tools/crawl-stats?hl=en&siteUrl=%s"
- TOKEN = "https://www.google.com/webmasters/tools/gwt/SITEMAPS_READ"
- GWT = "https://www.google.com/webmasters/tools/gwt/"
- GWT_PERM = "E3DA43109D05B1A5067480CE25494CC2"
+ LOGIN = "https://accounts.google.com/ServiceLogin?service=sitemaps"
+ AUTH = "https://accounts.google.com/ServiceLoginAuth"
- PAYLOAD = "7|0|11|%s|3EA173CEE6992CFDEAB5C18469B06594|com.google.crawl.wmconsole.fe.feature.gwt.sitemaps.shared.SitemapsService|getDataForMainPage|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|Z|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|3|5|6|6|5|7|8|5|9|10|11|5|1|0|"
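+   # page-scraping URLs; %s takes the CGI-escaped site URL (REMOVAL also takes the page URL to remove)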
+ DASHBOARD = "https://www.google.com/webmasters/tools/dashboard?hl=en&siteUrl=%s"
+ STATS = "https://www.google.com/webmasters/tools/crawl-stats?hl=en&siteUrl=%s"
+ SUGGESTS = "https://www.google.com/webmasters/tools/html-suggestions?hl=en&siteUrl=%s"
+ REMOVAL = "https://www.google.com/webmasters/tools/removals-request?hl=en&siteUrl=%s&urlt=%s"
+ GWT_URL = "https://www.google.com/webmasters/tools/gwt/"
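+   # GWT-RPC endpoints, keyed by feature. Each entry carries the RPC action name,
+   # the X-GWT-Permutation header value, the serialized request body (:data, whose
+   # %s placeholders take the module base URL and the site URL) and the CSV export
+   # URL (:dl, whose %s placeholders take the escaped site URL and the security token).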
+ GWT = {
+ :info => {
+ :action => "SITEMAPS_READ",
+ :perm => "E3DA43109D05B1A5067480CE25494CC2",
+ :data => "7|0|11|%s|3EA173CEE6992CFDEAB5C18469B06594|com.google.crawl.wmconsole.fe.feature.gwt.sitemaps.shared.SitemapsService|getDataForMainPage|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|Z|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|3|5|6|6|5|7|8|5|9|10|11|5|1|0|",
+ :dl => "https://www.google.com/webmasters/tools/sitemaps-dl?hl=en&siteUrl=%s&security_token=%s",
+ },
+ :error => {
+ :action => "CRAWLERRORS_READ",
+ :perm => "E3DA43109D05B1A5067480CE25494CC2", #"2367B7971367CA7B851B969834DDB639",
+ :data => "7|0|10|%s|1AC39E92A6D484F754108CEEAACB245D|com.google.crawl.wmconsole.fe.feature.gwt.crawlerrors.shared.CrawlErrorsService|getSiteLevelData|com.google.crawl.wmconsole.fe.feature.gwt.common.shared.FeatureContext/2156265033|/webmasters/tools|com.google.crawl.wmconsole.fe.feature.gwt.config.FeatureKey/497977451|en|%s|com.google.crawl.wmconsole.fe.base.PermissionLevel/2330262508|1|2|3|4|1|5|5|6|7|1|8|9|10|5|",
+ :dl => "https://www.google.com/webmasters/tools/crawl-errors-new-dl?hl=en&siteUrl=%s&security_token=%s",
+ }
+ }
+
def initialize(username, password)
login(username, password)
end
def login(username, password)
@@ -40,37 +51,12 @@
page.search("#sitemap tbody .rightmost").map do |node|
{ :indexed_web => node.text.gsub(/\D/, '').to_i }
end
end
- def security_token(url)
- # looks like `crawl_error_counts(url)` contains the security_token as well (if data available)...
- dashboard(url) # to trigger referer
- url = norm_url(url)
- page = agent.post(TOKEN, PAYLOAD % [GWT, url], {
- "X-GWT-Module-Base" => GWT,
- "X-GWT-Permutation" => GWT_PERM,
- "Content-Type" => "text/x-gwt-rpc; charset=utf-8",
- })
- page.content.scan(/security_token=([^"]+)/).flatten.first
- end
+ ################
- def crawl_info(url)
- token = security_token(url)
- url = CGI::escape norm_url(url)
- page = agent.get(INFO % [url, token])
-
- lines = page.content.split("\n").map do |line|
- line.split(",")
- end
- head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
-
- $lines = lines.map do |line|
- Hash[head.zip(line)]
- end
- end
-
def crawl_stats(url)
url = CGI::escape norm_url(url)
types = %w(pages kilobytes milliseconds).map(&:to_sym)
head = %w(high avg low).map(&:to_sym)
@@ -81,33 +67,102 @@
end.each_slice(3).map do |slice|
Hash[head.zip(slice)]
end)]
end
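+   # Scrapes the "HTML suggestions" page; returns a hash mapping each suggestion
+   # category (symbolized from the row's link text) to its affected-page count.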
- def crawl_error_counts(url)
+ def suggests(url)
url = CGI::escape norm_url(url)
- page = agent.get(ERRORS % url)
+ page = agent.get(SUGGESTS % url)
- page.search(".categories a").inject({}) do |hash, n|
- key, value = n.text.split("\n")
- hash[key.downcase.gsub(' ', '_').to_sym] = value.gsub(/\D/, '').to_i
+ page.search(".g-section tr").inject({}) do |hash, n|
+ if (key = n.search("a").first) && (value = n.search(".pages").first)
+ hash[to_key(key.text)] = to_value(value.text)
+ end
hash
end
end
def remove_url(url_with_file)
- url = CGI::escape norm_url(url_with_file)
- page = agent.get(REMOVAL % [url, CGI::escape(url_with_file)])
- page = agent.submit page.form
- raise "could not submit URL" unless page.search(".wmt-external-url").map(&:text).include?(url_with_file)
+ url = CGI::escape norm_url(url_with_file)
+ page = agent.get(REMOVAL % [url, CGI::escape(url_with_file)])
+ page = agent.submit page.form
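+     # compare basenames rather than full URLs; presumably the listed entries
+     # may not echo the submitted URL verbatim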
+ files = page.search(".wmt-external-url").map { |n| File.basename(n.text) }
+ raise "could not submit URL" unless files.include?(File.basename(url_with_file))
end
+ ###########################
+
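+   # Downloads the sitemaps CSV export for the site and returns one hash per
+   # data row, keyed by the symbolized CSV header names.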
+ def crawl_info(url)
+ url = norm_url(url)
+ token = security_token(:info, url)
+ page = agent.get(GWT[:info][:dl] % [CGI::escape(url), token])
+
+ lines = page.content.split("\n").map do |line|
+ line.split(",")
+ end
+ head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
+
+ lines.map do |line|
+ Hash[head.zip(line)]
+ end
+ end
+
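+   # Downloads the crawl-errors CSV export and tallies errors per category.
+   # With split = true the counts are grouped by detection date (normalized
+   # from MM/DD/YY to YYYY-MM-DD); the result is sorted by key.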
+ def crawl_error_counts(url, split = false)
+ url = norm_url(url)
+ token = security_token(:error, url)
+ page = agent.get(GWT[:error][:dl] % [CGI::escape(url), token])
+
+ lines = page.content.split("\n").map do |line|
+ line.split(",")
+ end
+ head = lines.shift.map { |key| key.downcase.gsub(' ', '_').to_sym }
+
+ errors = lines.inject({}) do |hash, line|
+ url, response_code, _, detected, category = *line
+ detected = "20#{$3}-#{'%02d' % $1.to_i}-#{'%02d' % $2.to_i}" if /(\d{1,2})\/(\d{1,2})\/(\d{2})/ =~ detected
+ unless category.to_s.empty?
+ sub_hash = split ? (hash[detected] ||= {}) : hash
+ sub_hash[category] ||= 0
+ sub_hash[category] += 1
+ end
+ hash
+ end
+ Hash[errors.sort { |a,b| a[0] <=> b[0] }]
+ end
+
private
def agent
@agent ||= Mechanize.new
end
def norm_url(url)
schema, host, _ = url.scan(/^(https?:\/\/)?(.+?)(\/.*)?$/).flatten
"#{schema || 'http://'}#{host}/"
end
+
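+     # Loads the dashboard first (so the referer is set), then issues the GWT-RPC
+     # request for the given action and scrapes the security_token out of the
+     # response body.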
+ def security_token(action, url)
+ dashboard(url) # to trigger referer
+ page = agent.post(GWT_URL + GWT[action][:action], GWT[action][:data] % [GWT_URL, url], {
+ "X-GWT-Module-Base" => GWT_URL,
+ "X-GWT-Permutation" => GWT[action][:perm],
+ "Content-Type" => "text/x-gwt-rpc; charset=utf-8",
+ })
+ page.content.scan(/security_token=([^"]+)/).flatten.first
+ end
+
+ def to_key(key)
+ key.downcase.gsub(' ', '_').to_sym
+ end
+
+ def to_value(value)
+ value.gsub(/\D/, '').to_i
+ end
+
+ # def security_token2(url)
+ # url = CGI::escape norm_url(url)
+ # page = agent.get(REMOVAL % [url, url])
+ # page.form.fields.select do |field|
+ # field.name == "security_token"
+ # end.first.value
+ # end
end
+