require 'scraperwiki'
require 'mechanize'
require 'logger'
# Listing page fetched first by the script (the `page=exhibit` view of the
# council's DA tracking module); paging continues from this page.
starting_url = 'https://www2.bmcc.nsw.gov.au/datracking/Modules/applicationmaster/default.aspx?page=exhibit'
# Passed down to scrape_table, which stores it unchanged as the 'comment_url'
# field of every record.
comment_url = 'http://www.bmcc.nsw.gov.au/sustainableliving/developmentapplicationsinnotification'
# Normalises whitespace in a fragment of scraped text.
#
# Carriage returns and line feeds become spaces, runs of spaces collapse to a
# single space, and leading/trailing whitespace is stripped. Other whitespace
# inside the string (e.g. tabs) is left alone. Returns a new String; the
# argument is not modified.
def clean_whitespace(a)
  single_line = a.tr("\r\n", ' ')
  single_line.squeeze(' ').strip
end
# Scrapes every application row of the table on +doc+ and saves the rows whose
# council reference is not already stored.
#
# doc         - page object; must respond to +search+ (CSS selector) and +uri+.
# comment_url - URL stored unchanged as the 'comment_url' of each record.
#
# Rows without the six expected cells, or without a link in the first cell,
# are skipped. A date cell that is not in dd/mm/yyyy form still raises from
# Date.strptime.
#
# Returns whatever +doc.search(...).each+ returns (the row collection).
def scrape_table(doc, comment_url)
  doc.search('table tbody tr').each do |tr|
    # Columns in table
    # Show  Number  Exhibit Start  Exhibit End  Details  Village
    tds = tr.search('td')
    # Not an application row (e.g. a placeholder row with a single cell).
    next if tds.length < 6

    link = tds[0].at('a')
    next if link.nil?

    h = tds.map(&:inner_html)

    # The details cell holds the street address on its first line and the
    # description on the remaining lines.
    # NOTE(review): the separator is a newline character - confirm that this
    # matches the cell markup the site currently serves.
    detail_lines = h[4].split("\n")
    street = clean_whitespace(detail_lines.first.to_s)
    village = clean_whitespace(h[5])

    record = {
      'info_url' => (doc.uri + link['href']).to_s,
      'comment_url' => comment_url,
      'council_reference' => clean_whitespace(h[1]),
      'on_notice_from' => Date.strptime(clean_whitespace(h[2]), '%d/%m/%Y').to_s,
      'on_notice_to' => Date.strptime(clean_whitespace(h[3]), '%d/%m/%Y').to_s,
      'address' => "#{street}, #{village}, NSW",
      'description' => clean_whitespace(detail_lines.drop(1).join),
      'date_scraped' => Date.today.to_s
    }

    reference = record['council_reference']
    already_saved =
      begin
        # The reference is scraped text, so it is passed as a bind value
        # rather than interpolated into the SQL string.
        !ScraperWiki.select('* from data where `council_reference`=?', [reference]).empty?
      rescue StandardError
        # Best effort: a failing lookup is treated as "not saved yet"
        # (presumably the data table does not exist on a first run).
        false
      end

    if already_saved
      puts "Skipping already saved record #{reference}"
    else
      p record
      ScraperWiki.save_sqlite(['council_reference'], record)
    end
  end
end
# Scrapes +doc+ with scrape_table, then keeps submitting the page's first form
# through the "next page" pager button and scraping each resulting page until
# there is no further page.
#
# doc         - first listing page; must respond to +at+ and +forms+.
# comment_url - passed through unchanged to scrape_table.
#
# Paging stops when the page has no '.rgPageNext' element, or when that
# element's onclick handler contains "return false".
#
# Returns nil.
def scrape_and_follow_next_link(doc, comment_url)
  loop do
    scrape_table(doc, comment_url)

    next_button = doc.at('.rgPageNext')
    # No pager element on the page: nothing to follow.
    break if next_button.nil?
    # On the last page the button's handler just returns false.
    break if next_button['onclick'].to_s.include?('return false')

    form = doc.forms.first
    # The joy of dealing with ASP.NET
    form['__EVENTTARGET'] = next_button['name']
    form['__EVENTARGUMENT'] = ''
    # It doesn't seem to work without these stupid values being set.
    # Would be good to figure out where precisely in the javascript these values are coming from.
    # NOTE(review): field name and values are already percent-encoded here -
    # confirm the form submission does not encode them a second time.
    form['ctl00%24RadScriptManager1'] =
      'ctl00%24cphContent%24ctl00%24ctl00%24cphContent%24ctl00%24Radajaxpanel2Panel%7Cctl00%24cphContent%24ctl00%24ctl00%24RadGrid1%24ctl00%24ctl03%24ctl01%24ctl10'
    form['ctl00_RadScriptManager1_HiddenField'] =
      '%3B%3BSystem.Web.Extensions%2C%20Version%3D3.5.0.0%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D31bf3856ad364e35%3Aen-US%3A0d787d5c-3903-4814-ad72-296cea810318%3Aea597d4b%3Ab25378d2%3BTelerik.Web.UI%2C%20Version%3D2009.1.527.35%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A1e3fef00-f492-4ed8-96ce-6371bc241e1c%3A16e4e7cd%3Af7645509%3A24ee1bba%3Ae330518b%3A1e771326%3Ac8618e41%3A4cacbc31%3A8e6f0d33%3Aed16cbdc%3A58366029%3Aaa288e2d'

    # The submitted form's response is the next page to scrape.
    doc = form.submit(form.button_with(:name => next_button['name']))
  end
end
#ScraperWiki.save_metadata('authority_name', 'Blue Mountains City Council')
#ScraperWiki.save_metadata('authority_short', 'Blue Mountains')
#ScraperWiki.save_metadata('state', 'NSW')
# Entry point: fetch the first listing page and scrape it plus every page
# reachable through the pager.
# Mechanize is used to fetch the page because ScraperWiki.scrape failed here.
agent = Mechanize.new do |a|
# NOTE(review): VERIFY_NONE switches off TLS certificate verification for all
# requests made by this agent - confirm the council site still requires it.
a.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
doc = agent.get(starting_url)
scrape_and_follow_next_link(doc, comment_url)