lib/infoboxer/media_wiki.rb in infoboxer-0.2.7 vs lib/infoboxer/media_wiki.rb in infoboxer-0.2.8

- old
+ new

@@ -1,22 +1,21 @@
 # encoding: utf-8
-#require 'rest-client'
-#require 'json'
+
 require 'mediawiktory'
 require 'addressable/uri'
 
-require_relative 'media_wiki/mediawiktory_patch'
 require_relative 'media_wiki/traits'
 require_relative 'media_wiki/page'
 
 module Infoboxer
   # MediaWiki client class.
   #
   # Usage:
   #
   # ```ruby
-  # client = Infoboxer::MediaWiki.new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
+  # client = Infoboxer::MediaWiki
+  #   .new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
   # page = client.get('Argentina')
   # ```
   #
   # Consider using shortcuts like {Infoboxer.wiki}, {Infoboxer.wikipedia},
   # {Infoboxer.wp} and so on instead of direct instantiation of this class
@@ -25,21 +24,25 @@
   class MediaWiki
     # Default Infoboxer User-Agent header.
     #
     # You can set yours as an option to {Infoboxer.wiki} and its shortcuts,
     # or to {#initialize}
-    UA = "Infoboxer/#{Infoboxer::VERSION} (https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)"
+    UA = "Infoboxer/#{Infoboxer::VERSION} "\
+         '(https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)'.freeze
 
     class << self
       # User agent getter/setter.
       #
       # Default value is {UA}.
       #
       # You can also use per-instance option, see {#initialize}
+      #
+      # @return [String]
       attr_accessor :user_agent
     end
 
+    # @private
     attr_reader :api_base_url, :traits
 
     # Creating new MediaWiki client. {Infoboxer.wiki} provides shortcut
     # for it, as well as shortcuts for some well-known wikis, like
     # {Infoboxer.wikipedia}.
@@ -49,43 +52,64 @@
     #   in different wikis.
     # @param options Only one option is currently supported:
     #   * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
     def initialize(api_base_url, options = {})
       @api_base_url = Addressable::URI.parse(api_base_url)
-      @client = MediaWiktory::Client.new(api_base_url, user_agent: user_agent(options))
+      @client = MediaWiktory::Wikipedia::Api.new(api_base_url, user_agent: user_agent(options))
       @traits = Traits.get(@api_base_url.host, namespaces: extract_namespaces)
     end
 
     # Receive "raw" data from Wikipedia (without parsing or wrapping in
     # classes).
     #
-    # @return [Array<Hash>]
-    def raw(*titles)
-      return [] if titles.empty? # could emerge on "automatically" created page lists, should work
-
-      titles.each_slice(50).map{|part|
-        @client.query.
-          titles(*part).
-          prop(revisions: {prop: :content}, info: {prop: :url}).
-          redirects(true). # FIXME: should be done transparently by MediaWiktory?
-          perform.pages
-      }.inject(:concat). # somehow flatten(1) fails!
-        sort_by{|page|
-          res_title = page.alt_titles.detect{|t| titles.map(&:downcase).include?(t.downcase)} # FIXME?..
-          titles.index(res_title) || 1_000
-        }
+    # @param titles [Array<String>] List of page titles to get.
+    # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+    #   [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+    #   for the list of available properties.
+    #
+    # @return [Hash{String => Hash}] Hash of `{requested title => raw MediaWiki object}`. Note that
+    #   even missing (does not exist in current Wiki) or invalid (impossible title) pages will still be
+    #   present in the response, just with a `"missing"` or `"invalid"` key, exactly as MediaWiki returns them.
+    def raw(*titles, prop: [])
+      # could emerge on "automatically" created page lists, should work
+      return {} if titles.empty?
+
+      titles.each_slice(50).map do |part|
+        response = @client
+                   .query
+                   .titles(*part)
+                   .prop(:revisions, :info, *prop).prop(:content, :timestamp, :url)
+                   .redirects
+                   .response
+
+        sources = response['pages'].values.map { |page| [page['title'], page] }.to_h
+        redirects =
+          if response['redirects']
+            response['redirects'].map { |r| [r['from'], sources[r['to']]] }.to_h
+          else
+            {}
+          end
+
+        # This way, for an 'Einstein' query we'll have {'Albert Einstein' => page, 'Einstein' => same page}
+        sources.merge(redirects)
+      end.inject(:merge)
    end
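A rough sketch of the reworked `raw` contract: it now returns a hash keyed by requested title, with redirect sources aliased to their targets and missing pages kept in the result. The client URL and return values below are illustrative, not taken from the diff:

```ruby
client = Infoboxer::MediaWiki.new('https://en.wikipedia.org/w/api.php')

pages = client.raw('Einstein', 'Definitely No Such Page')

# Redirect source and target are both present as keys, pointing to the same object
pages['Einstein']['title']                       # => "Albert Einstein" (hypothetical value)
pages['Einstein'].equal?(pages['Albert Einstein']) # => true

# Missing pages stay in the hash, flagged the way MediaWiki flags them
pages['Definitely No Such Page'].key?('missing') # => true
```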
 
     # Receive list of parsed MediaWiki pages for list of titles provided.
     # All pages are received with single query to MediaWiki API.
     #
     # **NB**: if you are requesting more than 50 titles at once
     # (MediaWiki limitation for single request), Infoboxer will do as
     # many queries as necessary to extract them all (it will be like
     # `(titles.count / 50.0).ceil` requests)
     #
-    # @return [Tree::Nodes<Page>] array of parsed pages. Notes:
+    # @param titles [Array<String>] List of page titles to get.
+    # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+    #   [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+    #   for the list of available properties.
+    #
+    # @return [Page, Tree::Nodes<Page>] array of parsed pages. Notes:
     #   * if you call `get` with only one title, one page will be
     #     returned instead of an array
     #   * if some of the pages are not in the wiki, they will not be returned,
     #     therefore the resulting array can be shorter than the titles array;
     #     you can always check `pages.map(&:title)` to see what you've
@@ -94,26 +118,19 @@
     #
     # ```ruby
     # Infoboxer.wp.get('Argentina', 'Chile', 'Something non-existing').
     #   infobox.fetch('some value')
     # ```
-    # and obtain meaningful results instead of NoMethodError or some
-    # NotFound.
+    # and obtain meaningful results instead of `NoMethodError` or
+    # `SomethingNotFound`.
     #
-    def get(*titles)
-      pages = raw(*titles).
-        tap{|pages| pages.detect(&:invalid?).tap{|i| i && fail(i.raw.invalidreason)}}.
-        select(&:exists?).
-        map{|raw|
-          Page.new(self,
-            Parser.paragraphs(raw.content, traits),
-            raw)
-        }
+    def get(*titles, prop: [])
+      pages = get_h(*titles, prop: prop).values.compact
       titles.count == 1 ? pages.first : Tree::Nodes[*pages]
     end
 
-    # Same as {#get}, but returns hash of {requested title => page}.
+    # Same as {#get}, but returns hash of `{requested title => page}`.
     #
     # Useful quirks:
     # * when a requested page does not exist, its key will still be present in
     #   the resulting hash (value will be `nil`);
     # * when requested page redirects to another, key will still be the
@@ -121,118 +138,130 @@
     #   with key 'Einstein' and page titled 'Albert Einstein'.
     #
     # This allows you to be in full control of which pages of a large list
     # you've received.
     #
+    # @param titles [Array<String>] List of page titles to get.
+    # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+    #   [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+    #   for the list of available properties.
+    #
+    #
     # @return [Hash<String, Page>]
     #
-    def get_h(*titles)
-      pages = [*get(*titles)]
-      titles.map{|t|
-        [t, pages.detect{|p| p.source.alt_titles.map(&:downcase).include?(t.downcase)}]
-      }.to_h
+    def get_h(*titles, prop: [])
+      raw_pages = raw(*titles, prop: prop)
+                  .tap { |ps| ps.detect { |_, p| p['invalid'] }.tap { |_, i| i && fail(i['invalidreason']) } }
+                  .reject { |_, p| p.key?('missing') }
+      titles.map { |title| [title, make_page(raw_pages, title)] }.to_h
     end
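A quick sketch of the `get`/`get_h` behavior after this change, assuming the `client` from the earlier example; returned objects are illustrative:

```ruby
client.get('Argentina')          # => a single Page (one title => no array)
client.get('Argentina', 'Chile') # => Tree::Nodes[<Page>, <Page>]

# get_h keeps every requested title as a key; missing pages map to nil
client.get_h('Einstein', 'Definitely No Such Page')
# => {'Einstein' => <Page "Albert Einstein">, 'Definitely No Such Page' => nil}

# extra properties are forwarded to the underlying MediaWiktory query
client.get('Argentina', prop: [:categories])
```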
 
     # Receive list of parsed MediaWiki pages from specified category.
     #
     # **NB**: currently, this API **always** fetches all pages from
     # category, there is no option to "take first 20 pages". Pages are
     # fetched in 50-page batches, then parsed. So, for large category
     # it can really take a while to fetch all pages.
     #
-    # @param title Category title. You can use namespaceless title (like
-    #   `"Countries in South America"`), title with namespace (like
+    # @param title [String] Category title. You can use namespaceless title (like
+    #   `"Countries in South America"`), title with namespace (like
     #   `"Category:Countries in South America"`) or title with local
     #   namespace (like `"Catégorie:Argentine"` for French Wikipedia)
     #
     # @return [Tree::Nodes<Page>] array of parsed pages.
     #
     def category(title)
       title = normalize_category_title(title)
-
-      list(categorymembers: {title: title, limit: 50})
+
+      list(@client.query.generator(:categorymembers).title(title).limit('max'))
     end
 
     # Receive list of parsed MediaWiki pages for provided search query.
     # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch)
     # for details.
     #
     # **NB**: currently, this API **always** fetches all pages from
     # category, there is no option to "take first 20 pages". Pages are
-    # fetched in 50-page batches, then parsed. So, for large category
+    # fetched in 50-page batches, then parsed. So, for a large search query
     # it can really take a while to fetch all pages.
     #
-    # @param query Search query. For old installations, look at
+    # @param query [String] Search query. For old installations, look at
     #   https://www.mediawiki.org/wiki/Help:Searching
     #   for search syntax. For new ones (including Wikipedia), see at
     #   https://www.mediawiki.org/wiki/Help:CirrusSearch.
     #
     # @return [Tree::Nodes<Page>] array of parsed pages.
     #
     def search(query)
-      list(search: {search: query, limit: 50})
+      list(@client.query.generator(:search).search(query).limit('max'))
     end
 
     # Receive list of parsed MediaWiki pages with titles starting from prefix.
     # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bprefixsearch)
     # for details.
     #
     # **NB**: currently, this API **always** fetches all pages from
     # category, there is no option to "take first 20 pages". Pages are
-    # fetched in 50-page batches, then parsed. So, for large category
+    # fetched in 50-page batches, then parsed. So, for a large search query
     # it can really take a while to fetch all pages.
     #
-    # @param prefix page title prefix.
+    # @param prefix [String] Page title prefix.
     #
     # @return [Tree::Nodes<Page>] array of parsed pages.
     #
     def prefixsearch(prefix)
-      list(prefixsearch: {search: prefix, limit: 100})
+      list(@client.query.generator(:prefixsearch).search(prefix).limit('max'))
     end
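All three of these now build a generator query and hand it to the private `#list` shown below, which keeps continuing the request until results are exhausted. A usage sketch, with illustrative titles and queries:

```ruby
# Namespaceless, prefixed, and locally-prefixed category titles all work
client.category('Countries in South America')
client.category('Category:Countries in South America')

client.search('intitle:Argentina') # CirrusSearch syntax on Wikipedia
client.prefixsearch('Argent')      # pages whose titles start with "Argent"
```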
 
+    # @return [String]
     def inspect
       "#<#{self.class}(#{@api_base_url.host})>"
     end
 
     private
 
+    def make_page(raw_pages, title)
+      _, source = raw_pages.detect { |ptitle, _| ptitle.casecmp(title).zero? }
+      source or return nil
+      Page.new(self, Parser.paragraphs(source['revisions'].first['*'], traits), source)
+    end
+
     def list(query)
-      response = @client.query.
-        generator(query).
-        prop(revisions: {prop: :content}, info: {prop: :url}).
-        redirects(true). # FIXME: should be done transparently by MediaWiktory?
-        perform
+      response = query
+                 .prop(:revisions, :info)
+                 .prop(:content, :timestamp, :url)
+                 .redirects
+                 .response
 
-      response.continue! while response.continue?
+      response = response.continue while response.continue?
 
-      pages = response.pages.select(&:exists?).
-        map{|raw|
-          Page.new(self,
-            Parser.paragraphs(raw.content, traits),
-            raw)
-        }
+      return Tree::Nodes[] if response['pages'].nil?
+
+      pages = response['pages']
+              .values.select { |p| p['missing'].nil? }
+              .map { |raw| Page.new(self, Parser.paragraphs(raw['revisions'].first['*'], traits), raw) }
 
       Tree::Nodes[*pages]
     end
 
     def normalize_category_title(title)
       # FIXME: shouldn't it go to MediaWiktory?..
       namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
       namespace, titl = nil, title unless traits.category_namespace.include?(namespace)
-
+
       namespace ||= traits.category_namespace.first
       [namespace, titl].join(':')
     end
 
     def user_agent(options)
       options[:user_agent] || options[:ua] || self.class.user_agent || UA
     end
 
     def extract_namespaces
-      siteinfo = @client.query.meta(siteinfo: {prop: [:namespaces, :namespacealiases]}).perform
-      siteinfo.raw.query.namespaces.map{|_, namespace|
-        aliases = siteinfo.raw.query.namespacealiases.select{|a| a.id == namespace.id}.map{|a| a['*']}
-        namespace.merge(aliases: aliases)
-      }
+      siteinfo = @client.query.meta(:siteinfo).prop(:namespaces, :namespacealiases).response
+      siteinfo['namespaces'].map do |_, namespace|
+        aliases =
+          siteinfo['namespacealiases'].select { |a| a['id'] == namespace['id'] }.map { |a| a['*'] }
+        namespace.merge('aliases' => aliases)
+      end
     end
   end
 end
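Finally, a note on the `user_agent` helper visible above: the per-instance option (also accepted under the `:ua` alias) takes precedence over the class-level setting, which in turn falls back to the `UA` constant. A sketch of that lookup chain; the header strings are made up:

```ruby
# Class-level default for all clients created afterwards
Infoboxer::MediaWiki.user_agent = 'MyResearchBot/1.0 (bot@example.com)'

# A per-instance option wins over the class-level setting
client = Infoboxer::MediaWiki.new(
  'https://en.wikipedia.org/w/api.php',
  user_agent: 'OneOffScript/0.1'
)

# With neither set, requests go out with the default Infoboxer UA string
```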