lib/infoboxer/media_wiki.rb in infoboxer-0.1.2.1 vs lib/infoboxer/media_wiki.rb in infoboxer-0.2.0

- old
+ new

@@ -1,8 +1,9 @@
 # encoding: utf-8
-require 'rest-client'
-require 'json'
+#require 'rest-client'
+#require 'json'
+require 'mediawiktory'
 require 'addressable/uri'

 require_relative 'media_wiki/traits'
 require_relative 'media_wiki/page'
@@ -34,11 +35,11 @@
       #
       # You can also use per-instance option, see {#initialize}
       attr_accessor :user_agent
     end

-    attr_reader :api_base_url
+    attr_reader :api_base_url, :traits

     # Creating new MediaWiki client. {Infoboxer.wiki} provides shortcut
     # for it, as well as shortcuts for some well-known wikis, like
     # {Infoboxer.wikipedia}.
     #
@@ -47,30 +48,35 @@
     #   in different wikis.
     # @param options Only one option is currently supported:
     #   * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
     def initialize(api_base_url, options = {})
       @api_base_url = Addressable::URI.parse(api_base_url)
-      @resource = RestClient::Resource.new(api_base_url, headers: headers(options))
+      @client = MediaWiktory::Client.new(api_base_url, user_agent: user_agent(options))
+      @traits = Traits.get(@api_base_url.host, namespaces: extract_namespaces)
     end

     # Receive "raw" data from Wikipedia (without parsing or wrapping in
     # classes).
     #
     # @return [Array<Hash>]
     def raw(*titles)
-      postprocess @resource.get(
-        params: DEFAULT_PARAMS.merge(titles: titles.join('|'))
-      )
+      titles.each_slice(50).map{|part|
+        @client.query.
+          titles(*part).
+          prop(revisions: {prop: :content}, info: {prop: :url}).
+          redirects(true). # FIXME: should be done transparently by MediaWiktory?
+          perform.pages
+      }.inject(:concat) # somehow flatten(1) fails!
     end

-    # Receive list of parsed wikipedia pages for list of titles provided.
+    # Receive list of parsed MediaWiki pages for list of titles provided.
     # All pages are received with single query to MediaWiki API.
     #
-    # **NB**: currently, if you are requesting more than 50 titles at
-    # once (MediaWiki limitation for single request), Infoboxer will
-    # **not** try to get other pages with subsequent queries. This will
-    # be fixed in future.
+    # **NB**: if you are requesting more than 50 titles at once
+    # (MediaWiki limitation for single request), Infoboxer will do as
+    # many queries as necessary to extract them all (it will be like
+    # `(titles.count / 50.0).ceil` requests)
     #
     # @return [Tree::Nodes<Page>] array of parsed pages. Notes:
     #   * if you call `get` with only one title, one page will be
     #     returned instead of an array
     #   * if some of pages are not in wiki, they will not be returned,
@@ -85,78 +91,120 @@
     #   ```
     #   and obtain meaningful results instead of NoMethodError or some
     #   NotFound.
     #
     def get(*titles)
-      pages = raw(*titles).reject{|raw| raw[:content].nil?}.
+      pages = raw(*titles).
+        tap{|pages| pages.detect(&:invalid?).tap{|i| i && fail(i.raw.invalidreason)}}.
+        select(&:exists?).
         map{|raw|
-          traits = Traits.get(@api_base_url.host, extract_traits(raw))
-
           Page.new(self,
-            Parser.paragraphs(raw[:content], traits),
-            raw.merge(traits: traits))
+            Parser.paragraphs(raw.content, traits),
+            raw)
         }
       titles.count == 1 ? pages.first : Tree::Nodes[*pages]
     end

+    # Receive list of parsed MediaWiki pages from specified category.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param title Category title. You can use namespaceless title (like
+    #   `"Countries in South America"`), title with namespace (like
+    #   `"Category:Countries in South America"`) or title with local
+    #   namespace (like `"Catégorie:Argentine"` for French Wikipedia)
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def category(title)
+      title = normalize_category_title(title)
+
+      list(categorymembers: {title: title, limit: 50})
+    end
+
+    # Receive list of parsed MediaWiki pages for provided search query.
+    # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch)
+    # for details.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param query Search query. For old installations, look at
+    #   https://www.mediawiki.org/wiki/Help:Searching
+    #   for search syntax. For new ones (including Wikipedia), see at
+    #   https://www.mediawiki.org/wiki/Help:CirrusSearch.
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def search(query)
+      list(search: {search: query, limit: 50})
+    end
+
+    # Receive list of parsed MediaWiki pages with titles startin from prefix.
+    # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bprefixsearch)
+    # for details.
+    #
+    # **NB**: currently, this API **always** fetches all pages from
+    # category, there is no option to "take first 20 pages". Pages are
+    # fetched in 50-page batches, then parsed. So, for large category
+    # it can really take a while to fetch all pages.
+    #
+    # @param prefix page title prefix.
+    #
+    # @return [Tree::Nodes<Page>] array of parsed pages.
+    #
+    def prefixsearch(prefix)
+      list(prefixsearch: {search: prefix, limit: 100})
+    end
+
+    def inspect
+      "#<#{self.class}(#{@api_base_url.host})>"
+    end
+
     private

-    # @private
-    PROP = [
-      'revisions',  # to extract content of the page
-      'info',       # to extract page canonical url
-      'categories', # to extract default category prefix
-      'images'      # to extract default media prefix
-    ].join('|')
+    def list(query)
+      response = @client.query.
+        generator(query).
+        prop(revisions: {prop: :content}, info: {prop: :url}).
+        redirects(true). # FIXME: should be done transparently by MediaWiktory?
+        perform

-    # @private
-    DEFAULT_PARAMS = {
-      action: :query,
-      format: :json,
-      redirects: true,
+      response.continue! while response.continue?

-      prop: PROP,
-      rvprop: :content,
-      inprop: :url,
-    }
+      pages = response.pages.select(&:exists?).
+        map{|raw|
+          Page.new(self,
+            Parser.paragraphs(raw.content, traits),
+            raw)
+        }

-    def headers(options)
-      {'User-Agent' => options[:user_agent] || options[:ua] || self.class.user_agent || UA}
+      Tree::Nodes[*pages]
     end

-    def extract_traits(raw)
-      raw.select{|k, v| [:file_prefix, :category_prefix].include?(k)}
+    def normalize_category_title(title)
+      # FIXME: shouldn't it go to MediaWiktory?..
+      namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
+      namespace, titl = nil, title unless traits.category_namespace.include?(namespace)
+
+      namespace ||= traits.category_namespace.first
+      [namespace, titl].join(':')
     end

-    def guess_traits(pages)
-      categories = pages.map{|p| p['categories']}.compact.flatten
-      images = pages.map{|p| p['images']}.compact.flatten
-      {
-        file_prefix: images.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
-        category_prefix: categories.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
-      }
+    def user_agent(options)
+      options[:user_agent] || options[:ua] || self.class.user_agent || UA
     end

-    def postprocess(response)
-      pages = JSON.parse(response)['query']['pages']
-      traits = guess_traits(pages.values)
-
-      pages.map{|id, data|
-        if id.to_i < 0
-          {
-            title: data['title'],
-            content: nil,
-            not_found: true
-          }
-        else
-          {
-            title: data['title'],
-            content: data['revisions'].first['*'],
-            url: data['fullurl'],
-          }.merge(traits)
-        end
+    def extract_namespaces
+      siteinfo = @client.query.meta(siteinfo: {prop: [:namespaces, :namespacealiases]}).perform
+      siteinfo.raw.query.namespaces.map{|_, namespace|
+        aliases = siteinfo.raw.query.namespacealiases.select{|a| a.id == namespace.id}.map{|a| a['*']}
+        namespace.merge(aliases: aliases)
       }
-    rescue JSON::ParserError
-      fail RuntimeError, "Not a JSON response, seems there's not a MediaWiki API: #{@api_base_url}"
     end
   end
 end
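
For orientation, here is a minimal usage sketch of the 0.2.0 client from the caller's side. The method names (`get`, `category`, `search`, `prefixsearch`), the single-page return for a single title, and the 50-title batching are taken from the diff above; the `Infoboxer.wikipedia` shortcut is the one referenced in the doc comments, while the concrete titles, category name and query string are made-up illustrations.

    require 'infoboxer'

    client = Infoboxer.wikipedia   # shortcut referenced in the doc comments above

    # One title => one Page (not an array), as documented for #get.
    page = client.get('Argentina')

    # More than 50 titles: since 0.2.0, #get transparently splits this into
    # (titles.count / 50.0).ceil API requests instead of stopping at the
    # first 50 as 0.1.x did.
    titles = ['Argentina', 'Bolivia', 'Chile']   # imagine 60+ titles here
    pages = client.get(*titles)

    # New in 0.2.0: fetching whole categories, search results and prefix matches.
    # Each of these fetches *all* matching pages, in 50-page batches.
    south_america = client.category('Countries in South America')
    results       = client.search('argentina tango')
    prefixed      = client.prefixsearch('Argent')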
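
The private `normalize_category_title` helper added above is easy to misread because of the parallel assignments, so here is a standalone sketch of the same logic with a hard-coded stand-in for `traits.category_namespace` (assumed here to be a list like `['Category', 'Catégorie']`; in the real code the values come from the wiki's siteinfo via `extract_namespaces`).

    # CATEGORY_NAMESPACES is a hypothetical stand-in for traits.category_namespace.
    CATEGORY_NAMESPACES = ['Category', 'Catégorie'].freeze

    def normalize_category_title(title)
      # Split off a namespace prefix if one is present.
      namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
      # An unrecognized prefix is treated as part of the title itself.
      namespace, titl = nil, title unless CATEGORY_NAMESPACES.include?(namespace)

      namespace ||= CATEGORY_NAMESPACES.first
      [namespace, titl].join(':')
    end

    normalize_category_title('Countries in South America')
    # => "Category:Countries in South America"
    normalize_category_title('Category:Countries in South America')
    # => "Category:Countries in South America"
    normalize_category_title('Catégorie:Argentine')
    # => "Catégorie:Argentine"

In short: a recognized category-namespace prefix is kept as-is, anything else is treated as part of the title, and the wiki's primary category namespace is prepended.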