lib/infoboxer/media_wiki.rb in infoboxer-0.1.2.1 vs lib/infoboxer/media_wiki.rb in infoboxer-0.2.0

- old
+ new

@@ -1,8 +1,9 @@
 # encoding: utf-8
-require 'rest-client'
-require 'json'
+#require 'rest-client'
+#require 'json'
+require 'mediawiktory'
 require 'addressable/uri'

 require_relative 'media_wiki/traits'
 require_relative 'media_wiki/page'
@@ -34,11 +35,11 @@
       #
       # You can also use per-instance option, see {#initialize}
       attr_accessor :user_agent
     end

-    attr_reader :api_base_url
+    attr_reader :api_base_url, :traits

     # Creating new MediaWiki client. {Infoboxer.wiki} provides shortcut
     # for it, as well as shortcuts for some well-known wikis, like
     # {Infoboxer.wikipedia}.
     #
@@ -47,30 +48,35 @@
     #   in different wikis.
     # @param options Only one option is currently supported:
     #   * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
     def initialize(api_base_url, options = {})
       @api_base_url = Addressable::URI.parse(api_base_url)
-      @resource = RestClient::Resource.new(api_base_url, headers: headers(options))
+      @client = MediaWiktory::Client.new(api_base_url, user_agent: user_agent(options))
+      @traits = Traits.get(@api_base_url.host, namespaces: extract_namespaces)
     end

     # Receive "raw" data from Wikipedia (without parsing or wrapping in
     # classes).
     #
     # @return [Array<Hash>]
     def raw(*titles)
-      postprocess @resource.get(
-        params: DEFAULT_PARAMS.merge(titles: titles.join('|'))
-      )
+      titles.each_slice(50).map{|part|
+        @client.query.
+          titles(*part).
+          prop(revisions: {prop: :content}, info: {prop: :url}).
+          redirects(true). # FIXME: should be done transparently by MediaWiktory?
+          perform.pages
+      }.inject(:concat) # somehow flatten(1) fails!
     end

-    # Receive list of parsed wikipedia pages for list of titles provided.
+    # Receive list of parsed MediaWiki pages for list of titles provided.
     # All pages are received with single query to MediaWiki API.
     #
-    # **NB**: currently, if you are requesting more than 50 titles at
-    # once (MediaWiki limitation for single request), Infoboxer will
-    # **not** try to get other pages with subsequent queries. This will
-    # be fixed in future.
+    # **NB**: if you are requesting more than 50 titles at once
+    # (MediaWiki limitation for single request), Infoboxer will do as
+    # many queries as necessary to extract them all (it will be like
+    # `(titles.count / 50.0).ceil` requests)
     #
     # @return [Tree::Nodes<Page>] array of parsed pages. Notes:
     #   * if you call `get` with only one title, one page will be
     #     returned instead of an array
     #   * if some of pages are not in wiki, they will not be returned,
@@ -85,78 +91,120 @@
     #   ```
     #   and obtain meaningful results instead of NoMethodError or some
     #   NotFound.
     #
     def get(*titles)
-      pages = raw(*titles).reject{|raw| raw[:content].nil?}.
+      pages = raw(*titles).
+        tap{|pages| pages.detect(&:invalid?).tap{|i| i && fail(i.raw.invalidreason)}}.
+        select(&:exists?).
         map{|raw|
-          traits = Traits.get(@api_base_url.host, extract_traits(raw))
-
           Page.new(self,
-            Parser.paragraphs(raw[:content], traits),
-            raw.merge(traits: traits))
+            Parser.paragraphs(raw.content, traits),
+            raw)
         }
       titles.count == 1 ? pages.first : Tree::Nodes[*pages]
     end

+    # Receive list of parsed MediaWiki pages from specified category.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param title Category title. You can use namespaceless title (like
+    #   `"Countries in South America"`), title with namespace (like
+    #   `"Category:Countries in South America"`) or title with local
+    #   namespace (like `"Catégorie:Argentine"` for French Wikipedia)
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def category(title)
+      title = normalize_category_title(title)
+
+      list(categorymembers: {title: title, limit: 50})
+    end
+
+    # Receive list of parsed MediaWiki pages for provided search query.
+    # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch)
+    # for details.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param query Search query. For old installations, look at
+    #   https://www.mediawiki.org/wiki/Help:Searching
+    #   for search syntax. For new ones (including Wikipedia), see at
+    #   https://www.mediawiki.org/wiki/Help:CirrusSearch.
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def search(query)
+      list(search: {search: query, limit: 50})
+    end
+
+    # Receive list of parsed MediaWiki pages with titles startin from prefix.
+    # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bprefixsearch)
+    # for details.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param prefix page title prefix.
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def prefixsearch(prefix)
+      list(prefixsearch: {search: prefix, limit: 100})
+    end
+
+    def inspect
+      "#<#{self.class}(#{@api_base_url.host})>"
+    end
+
     private

-    # @private
-    PROP = [
-      'revisions',  # to extract content of the page
-      'info',       # to extract page canonical url
-      'categories', # to extract default category prefix
-      'images'      # to extract default media prefix
-    ].join('|')
+    def list(query)
+      response = @client.query.
+        generator(query).
+        prop(revisions: {prop: :content}, info: {prop: :url}).
+        redirects(true). # FIXME: should be done transparently by MediaWiktory?
+        perform

-    # @private
-    DEFAULT_PARAMS = {
-      action: :query,
-      format: :json,
-      redirects: true,
+      response.continue! while response.continue?

-      prop: PROP,
-      rvprop: :content,
-      inprop: :url,
-    }
+      pages = response.pages.select(&:exists?).
+        map{|raw|
+          Page.new(self,
+            Parser.paragraphs(raw.content, traits),
+            raw)
+        }

-    def headers(options)
-      {'User-Agent' => options[:user_agent] || options[:ua] || self.class.user_agent || UA}
+      Tree::Nodes[*pages]
     end

-    def extract_traits(raw)
-      raw.select{|k, v| [:file_prefix, :category_prefix].include?(k)}
+    def normalize_category_title(title)
+      # FIXME: shouldn't it go to MediaWiktory?..
+      namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
+      namespace, titl = nil, title unless traits.category_namespace.include?(namespace)
+
+      namespace ||= traits.category_namespace.first
+      [namespace, titl].join(':')
     end

-    def guess_traits(pages)
-      categories = pages.map{|p| p['categories']}.compact.flatten
-      images = pages.map{|p| p['images']}.compact.flatten
-      {
-        file_prefix: images.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
-        category_prefix: categories.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
-      }
+    def user_agent(options)
+      options[:user_agent] || options[:ua] || self.class.user_agent || UA
     end

-    def postprocess(response)
-      pages = JSON.parse(response)['query']['pages']
-      traits = guess_traits(pages.values)
-
-      pages.map{|id, data|
-        if id.to_i < 0
-          {
-            title: data['title'],
-            content: nil,
-            not_found: true
-          }
-        else
-          {
-            title: data['title'],
-            content: data['revisions'].first['*'],
-            url: data['fullurl'],
-          }.merge(traits)
-        end
+    def extract_namespaces
+      siteinfo = @client.query.meta(siteinfo: {prop: [:namespaces, :namespacealiases]}).perform
+      siteinfo.raw.query.namespaces.map{|_, namespace|
+        aliases = siteinfo.raw.query.namespacealiases.select{|a| a.id == namespace.id}.map{|a| a['*']}
+        namespace.merge(aliases: aliases)
       }
-    rescue JSON::ParserError
-      fail RuntimeError, "Not a JSON response, seems there's not a MediaWiki API: #{@api_base_url}"
     end
   end
 end
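
For orientation, here is a minimal usage sketch of the 0.2.0 client from the caller's side. The method names (`get`, `category`, `search`, `prefixsearch`), the single-page return for a single title, and the 50-title batching are taken from the diff above; the `Infoboxer.wikipedia` shortcut is the one referenced in the doc comments, while the concrete titles, category name and query string are made-up illustrations.

    require 'infoboxer'

    client = Infoboxer.wikipedia   # shortcut referenced in the doc comments above

    # One title => one Page (not an array), as documented for #get.
    page = client.get('Argentina')

    # More than 50 titles: since 0.2.0, #get transparently splits this into
    # (titles.count / 50.0).ceil API requests instead of stopping at the
    # first 50 as 0.1.x did.
    titles = ['Argentina', 'Bolivia', 'Chile']   # imagine 60+ titles here
    pages = client.get(*titles)

    # New in 0.2.0: fetching whole categories, search results and prefix matches.
    # Each of these fetches *all* matching pages, in 50-page batches.
    south_america = client.category('Countries in South America')
    results       = client.search('argentina tango')
    prefixed      = client.prefixsearch('Argent')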
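
The private `normalize_category_title` helper added above is easy to misread because of the parallel assignments, so here is a standalone sketch of the same logic with a hard-coded stand-in for `traits.category_namespace` (assumed here to be a list like `['Category', 'Catégorie']`; in the real code the values come from the wiki's siteinfo via `extract_namespaces`).

    # CATEGORY_NAMESPACES is a hypothetical stand-in for traits.category_namespace.
    CATEGORY_NAMESPACES = ['Category', 'Catégorie'].freeze

    def normalize_category_title(title)
      # Split off a namespace prefix if one is present.
      namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
      # An unrecognized prefix is treated as part of the title itself.
      namespace, titl = nil, title unless CATEGORY_NAMESPACES.include?(namespace)

      namespace ||= CATEGORY_NAMESPACES.first
      [namespace, titl].join(':')
    end

    normalize_category_title('Countries in South America')
    # => "Category:Countries in South America"
    normalize_category_title('Category:Countries in South America')
    # => "Category:Countries in South America"
    normalize_category_title('Catégorie:Argentine')
    # => "Catégorie:Argentine"

In short: a recognized category-namespace prefix is kept as-is, anything else is treated as part of the title, and the wiki's primary category namespace is prepended.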