require 'httparty'
require 'scraperwiki-api/version'

module ScraperWiki
  # A Ruby wrapper for the ScraperWiki API.
  # @see https://scraperwiki.com/docs/api
  class API
    include HTTParty
    base_uri 'api.scraperwiki.com/api/1.0'

    class Error < StandardError; end
    class ScraperNotFound < Error; end

    # Initializes a ScraperWiki API object.
    #
    # @param [String] apikey an API key
    def initialize(apikey = nil)
      @apikey = apikey
    end

    # Queries and extracts data via a general-purpose SQL interface.
    #
    # To make an RSS feed, you need to use SQL's +AS+ keyword (e.g. "SELECT name
    # AS description") to make columns called +title+, +link+, +description+,
    # +guid+ (optional, uses link if not available) and +pubDate+ or +date+.
    #
    # +jsondict+ example output:
    #
    #   [
    #     {
    #       "fieldA": "valueA",
    #       "fieldB": "valueB",
    #       "fieldC": "valueC"
    #     },
    #     ...
    #   ]
    #
    # +jsonlist+ example output:
    #
    #   {
    #     "keys": ["fieldA", "fieldB", "fieldC"],
    #     "data": [
    #       ["valueA", "valueB", "valueC"],
    #       ...
    #     ]
    #   }
    #
    # +csv+ example output:
    #
    #   fieldA,fieldB,fieldC
    #   valueA,valueB,valueC
    #   ...
    #
    # @param [String] shortname the scraper's shortname (as it appears in the URL)
    # @param [String] query a SQL query
    # @param [Hash] opts optional arguments
    # @option opts [String] :format one of "jsondict", "jsonlist", "csv",
    #   "htmltable" or "rss2"
    # @option opts [String, Array] :attach ";"-delimited list (or Array) of
    #   shortnames of other scrapers whose data you need to access
    # @see https://scraperwiki.com/docs/ruby/ruby_help_documentation/
    #
    # @note The query string parameter is +name+, not +shortname+,
    #   {https://scraperwiki.com/docs/api#sqlite as documented}
    def datastore_sqlite(shortname, query, opts = {})
      if Array === opts[:attach]
        opts[:attach] = opts[:attach].join ';'
      end
      request_with_apikey '/datastore/sqlite', {name: shortname, query: query}.merge(opts)
    end
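
    # A usage sketch, not part of the original source: the API key and the
    # scraper shortname below are hypothetical placeholders. Assuming the
    # default "jsondict" format, the parsed response is an array of hashes.
    #
    #   api = ScraperWiki::API.new('my-api-key')
    #   rows = api.datastore_sqlite('example-scraper', 'SELECT * FROM swdata LIMIT 10')
    #   rows.each { |row| puts row['fieldA'] }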

    # Extracts data about a scraper's code, owner, history, etc.
    #
    # * The +runid+ is a Unix timestamp with microseconds and a UUID.
    # * The value of +records+ is the same as that of +total_rows+ under
    #   +datasummary+.
    # * +run_interval+ is the number of seconds between runs.
    #
    # Example output:
    #
    #   [
    #     {
    #       "code": "require 'nokogiri'\n...",
    #       "datasummary": {
    #         "tables": {
    #           "swdata": {
    #             "keys": [
    #               "fieldA",
    #               ...
    #             ],
    #             "count": 42,
    #             "sql": "CREATE TABLE `swdata` (...)"
    #           },
    #           "swvariables": {
    #             "keys": [
    #               "value_blob",
    #               "type",
    #               "name"
    #             ],
    #             "count": 2,
    #             "sql": "CREATE TABLE `swvariables` (`value_blob` blob, `type` text, `name` text)"
    #           },
    #           ...
    #         },
    #         "total_rows": 44,
    #         "filesize": 1000000
    #       },
    #       "description": "Scrapes websites for data.",
    #       "language": "ruby",
    #       "title": "Example scraper",
    #       "tags": [],
    #       "short_name": "example-scraper",
    #       "userroles": {
    #         "owner": [
    #           "johndoe"
    #         ],
    #         "editor": [
    #           "janedoe",
    #           ...
    #         ]
    #       },
    #       "last_run": "1970-01-01T00:00:00",
    #       "created": "1970-01-01T00:00:00",
    #       "runevents": [
    #         {
    #           "still_running": false,
    #           "pages_scraped": 5,
    #           "run_started": "1970-01-01T00:00:00",
    #           "last_update": "1970-01-01T00:00:00",
    #           "runid": "1325394000.000000_xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx",
    #           "records_produced": 42
    #         },
    #         ...
    #       ],
    #       "records": 44,
    #       "wiki_type": "scraper",
    #       "privacy_status": "visible",
    #       "run_interval": 604800,
    #       "attachable_here": [],
    #       "attachables": [],
    #       "history": [
    #         ...,
    #         {
    #           "date": "1970-01-01T00:00:00",
    #           "version": 0,
    #           "user": "johndoe",
    #           "session": "Thu, 1 Jan 1970 00:00:08 GMT"
    #         }
    #       ]
    #     }
    #   ]
    #
    # @param [String] shortname the scraper's shortname (as it appears in the URL)
    # @param [Hash] opts optional arguments
    # @option opts [String] :version version number (-1 for most recent) [default -1]
    # @option opts [String] :history_start_date history and runevents are
    #   restricted to this date or after; use YYYY-MM-DD format
    # @option opts [String, Array] :quietfields "|"-delimited list (or Array) of
    #   fields to exclude from the output. Must be a subset of
    #   'code|runevents|datasummary|userroles|history'
    #
    # @note Returns an array although the array seems to always have only one item
    # @note The +tags+ field seems to always be an empty array
    # @note The query string parameter is +name+, not +shortname+,
    #   {https://scraperwiki.com/docs/api#getinfo as documented}
    def scraper_getinfo(shortname, opts = {})
      if Array === opts[:quietfields]
        opts[:quietfields] = opts[:quietfields].join '|'
      end
      request_with_apikey '/scraper/getinfo', {name: shortname}.merge(opts)
    end

    # See what the scraper did during each run.
    #
    # Example output:
    #
    #   [
    #     {
    #       "run_ended": "1970-01-01T00:00:00",
    #       "first_url_scraped": "http://www.iana.org/domains/example/",
    #       "pages_scraped": 5,
    #       "run_started": "1970-01-01T00:00:00",
    #       "runid": "1325394000.000000_xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx",
    #       "domainsscraped": [
    #         {
    #           "domain": "http://example.com",
    #           "bytes": 1000000,
    #           "pages": 5
    #         },
    #         ...
    #       ],
    #       "output": "...",
    #       "records_produced": 42
    #     }
    #   ]
    #
    # @param [String] shortname the scraper's shortname (as it appears in the URL)
    # @param [Hash] opts optional arguments
    # @option opts [String] :runid a run ID
    #
    # @note Returns an array although the array seems to always have only one item
    # @note The query string parameter is +name+, not +shortname+,
    #   {https://scraperwiki.com/docs/api#getruninfo as documented}
    def scraper_getruninfo(shortname, opts = {})
      request_with_apikey '/scraper/getruninfo', {name: shortname}.merge(opts)
    end

    # Find out information about a user.
    #
    # Example output:
    #
    #   [
    #     {
    #       "username": "johndoe",
    #       "profilename": "John Doe",
    #       "coderoles": {
    #         "owner": [
    #           "johndoe.emailer",
    #           "example-scraper",
    #           ...
    #         ],
    #         "email": [
    #           "johndoe.emailer"
    #         ],
    #         "editor": [
    #           "yet-another-scraper",
    #           ...
    #         ]
    #       },
    #       "datejoined": "1970-01-01T00:00:00"
    #     }
    #   ]
    #
    # @param [String] username a username
    #
    # @note Returns an array although the array seems to always have only one item
    # @note The date joined field is +date_joined+ (with underscore) on
    #   {#scraper_usersearch}
    def scraper_getuserinfo(username)
      request_with_apikey '/scraper/getuserinfo', username: username
    end
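
    # A usage sketch, not part of the original source ("example-scraper" and
    # the API key are hypothetical). :quietfields accepts an Array, which
    # scraper_getinfo joins with "|" before sending the request; .first
    # unwraps the single-item array the API returns.
    #
    #   api = ScraperWiki::API.new('my-api-key')
    #   info = api.scraper_getinfo('example-scraper', quietfields: %w(code history)).first
    #   puts info['short_name'], info['run_interval']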

    # Search the titles and descriptions of all the scrapers.
    #
    # Example output:
    #
    #   [
    #     {
    #       "description": "Scrapes websites for data.",
    #       "language": "ruby",
    #       "created": "1970-01-01T00:00:00",
    #       "title": "Example scraper",
    #       "short_name": "example-scraper",
    #       "privacy_status": "public"
    #     },
    #     ...
    #   ]
    #
    # @param [Hash] opts optional arguments
    # @option opts [String] :searchquery search terms
    # @option opts [Integer] :maxrows number of results to return [default 5]
    # @option opts [String] :requestinguser the name of the user making the
    #   search, which changes the order of the matches
    def scraper_search(opts = {})
      request_with_apikey '/scraper/search', opts
    end

    # Search for a user by name.
    #
    # Example output:
    #
    #   [
    #     {
    #       "username": "johndoe",
    #       "profilename": "John Doe",
    #       "date_joined": "1970-01-01T00:00:00"
    #     },
    #     ...
    #   ]
    #
    # @param [Hash] opts optional arguments
    # @option opts [String] :searchquery search terms
    # @option opts [Integer] :maxrows number of results to return [default 5]
    # @option opts [String, Array] :nolist space-separated list (or Array) of
    #   usernames to exclude from the output
    # @option opts [String] :requestinguser the name of the user making the
    #   search, which changes the order of the matches
    #
    # @note The date joined field is +datejoined+ (without underscore) on
    #   {#scraper_getuserinfo}
    def scraper_usersearch(opts = {})
      if Array === opts[:nolist]
        opts[:nolist] = opts[:nolist].join ' '
      end
      request '/scraper/usersearch', opts
    end

    private

    # Adds the API key to the query string, if an API key was provided.
    def request_with_apikey(path, opts = {})
      if @apikey
        opts[:apikey] = @apikey
      end
      request path, opts
    end

    # Performs the GET request and returns HTTParty's parsed response.
    def request(path, opts)
      self.class.get(path, query: opts).parsed_response
    end
  end
end
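
# A quick demo, not part of the original source: run this file directly (with
# the gem's lib directory on the load path) to exercise the search endpoint,
# which works without an API key. The search term "example" is an arbitrary
# placeholder.
if __FILE__ == $PROGRAM_NAME
  api = ScraperWiki::API.new
  # scraper_search returns an array of hashes, one per matching scraper.
  Array(api.scraper_search(searchquery: 'example', maxrows: 3)).each do |scraper|
    puts "#{scraper['short_name']}: #{scraper['title']}"
  end
end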