lib/infoboxer/media_wiki.rb in infoboxer-0.2.7 vs lib/infoboxer/media_wiki.rb in infoboxer-0.2.8
- old
+ new
@@ -1,22 +1,21 @@
# encoding: utf-8
-#require 'rest-client'
-#require 'json'
+
require 'mediawiktory'
require 'addressable/uri'
-require_relative 'media_wiki/mediawiktory_patch'
require_relative 'media_wiki/traits'
require_relative 'media_wiki/page'
module Infoboxer
# MediaWiki client class.
#
# Usage:
#
# ```ruby
- # client = Infoboxer::MediaWiki.new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
+ # client = Infoboxer::MediaWiki
+ # .new('http://en.wikipedia.org/w/api.php', user_agent: 'My Own Project')
# page = client.get('Argentina')
# ```
#
# Consider using shortcuts like {Infoboxer.wiki}, {Infoboxer.wikipedia},
# {Infoboxer.wp} and so on instead of direct instantiation of this class
@@ -25,21 +24,25 @@
class MediaWiki
# Default Infoboxer User-Agent header.
#
# You can set yours as an option to {Infoboxer.wiki} and its shortcuts,
# or to {#initialize}
- UA = "Infoboxer/#{Infoboxer::VERSION} (https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)"
+ UA = "Infoboxer/#{Infoboxer::VERSION} "\
+ '(https://github.com/molybdenum-99/infoboxer; zverok.offline@gmail.com)'.freeze
class << self
# User agent getter/setter.
#
# Default value is {UA}.
#
# You can also use per-instance option, see {#initialize}
+ #
+ # @return [String]
attr_accessor :user_agent
end
+ # @private
attr_reader :api_base_url, :traits
# Creating a new MediaWiki client. {Infoboxer.wiki} provides a shortcut
# for it, as well as shortcuts for some well-known wikis, like
# {Infoboxer.wikipedia}.
@@ -49,43 +52,64 @@
# in different wikis.
# @param options Only one option is currently supported:
# * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
def initialize(api_base_url, options = {})
@api_base_url = Addressable::URI.parse(api_base_url)
- @client = MediaWiktory::Client.new(api_base_url, user_agent: user_agent(options))
+ @client = MediaWiktory::Wikipedia::Api.new(api_base_url, user_agent: user_agent(options))
@traits = Traits.get(@api_base_url.host, namespaces: extract_namespaces)
end
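# Usage sketch (illustrative, not from the gem source): the User-Agent can be set
# globally through the class-level accessor above, or per instance via the
# `:user_agent` / `:ua` option; the values below are placeholders.
#
# ```ruby
# Infoboxer::MediaWiki.user_agent = 'MyBot/1.0 (https://example.com; me@example.com)'
#
# client = Infoboxer::MediaWiki
#   .new('https://fr.wikipedia.org/w/api.php', ua: 'MyBot/1.0 (https://example.com)')
# ```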
# Receive "raw" data from Wikipedia (without parsing or wrapping in
# classes).
#
- # @return [Array<Hash>]
- def raw(*titles)
- return [] if titles.empty? # could emerge on "automatically" created page lists, should work
-
- titles.each_slice(50).map{|part|
- @client.query.
- titles(*part).
- prop(revisions: {prop: :content}, info: {prop: :url}).
- redirects(true). # FIXME: should be done transparently by MediaWiktory?
- perform.pages
- }.inject(:concat). # somehow flatten(1) fails!
- sort_by{|page|
- res_title = page.alt_titles.detect{|t| titles.map(&:downcase).include?(t.downcase)} # FIXME?..
- titles.index(res_title) || 1_000
- }
+ # @param titles [Array<String>] List of page titles to get.
+ # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+ # [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+ # for the list of available properties.
+ #
+ # @return [Hash{String => Hash}] Hash of `{requested title => raw MediaWiki object}`. Note that
+ # even missing (not existing in the current wiki) or invalid (impossible title) pages will still
+ # be present in the response; they will just have a `"missing"` or `"invalid"` key, just as
+ # MediaWiki returns them.
+ def raw(*titles, prop: [])
+ # could emerge on "automatically" created page lists, should work
+ return {} if titles.empty?
+
+ titles.each_slice(50).map do |part|
+ response = @client
+ .query
+ .titles(*part)
+ .prop(:revisions, :info, *prop).prop(:content, :timestamp, :url)
+ .redirects
+ .response
+
+ sources = response['pages'].values.map { |page| [page['title'], page] }.to_h
+ redirects =
+ if response['redirects']
+ response['redirects'].map { |r| [r['from'], sources[r['to']]] }.to_h
+ else
+ {}
+ end
+
+ # This way for 'Einstein' query we'll have {'Albert Einstein' => page, 'Einstein' => same page}
+ sources.merge(redirects)
+ end.inject(:merge)
end
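# Usage sketch (illustrative, not from the gem source; page titles are examples).
# Thanks to the redirects merging above, the result is keyed by the requested
# title as well as by the canonical one:
#
# ```ruby
# data = Infoboxer.wp.raw('Einstein')
# data['Einstein']['title']     # => "Albert Einstein" -- redirect resolved, yet the
#                               #    page is still reachable by the requested title
# data.key?('Albert Einstein')  # => true, same underlying page
# ```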
# Receive list of parsed MediaWiki pages for list of titles provided.
# All pages are received with single query to MediaWiki API.
#
# **NB**: if you are requesting more than 50 titles at once
# (MediaWiki limitation for single request), Infoboxer will do as
# many queries as necessary to extract them all (it will be like
# `(titles.count / 50.0).ceil` requests)
#
- # @return [Tree::Nodes<Page>] array of parsed pages. Notes:
+ # @param titles [Array<String>] List of page titles to get.
+ # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+ # [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+ # for the list of available properties.
+ #
+ # @return [Page, Tree::Nodes<Page>] parsed page or array of parsed pages. Notes:
# * if you call `get` with only one title, one page will be
# returned instead of an array
# * if some of the pages are not in the wiki, they will not be returned,
# therefore the resulting array can be shorter than the titles array;
# you can always check `pages.map(&:title)` to see what you've
@@ -94,26 +118,19 @@
#
# ```ruby
# Infoboxer.wp.get('Argentina', 'Chile', 'Something non-existing').
# infobox.fetch('some value')
# ```
- # and obtain meaningful results instead of NoMethodError or some
- # NotFound.
+ # and obtain meaningful results instead of `NoMethodError` or
+ # `SomethingNotFound`.
#
- def get(*titles)
- pages = raw(*titles).
- tap{|pages| pages.detect(&:invalid?).tap{|i| i && fail(i.raw.invalidreason)}}.
- select(&:exists?).
- map{|raw|
- Page.new(self,
- Parser.paragraphs(raw.content, traits),
- raw)
- }
+ def get(*titles, prop: [])
+ pages = get_h(*titles, prop: prop).values.compact
titles.count == 1 ? pages.first : Tree::Nodes[*pages]
end
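# Usage sketch (illustrative, not from the gem source; page titles are examples):
#
# ```ruby
# Infoboxer.wp.get('Argentina')                       # => single Page
# Infoboxer.wp.get('Argentina', 'Chile')              # => Tree::Nodes of two Pages
# Infoboxer.wp.get('Argentina', 'Hopefully Missing')  # => Tree::Nodes of one Page,
#                                                     #    the missing title is dropped
# ```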
- # Same as {#get}, but returns hash of {requested title => page}.
+ # Same as {#get}, but returns hash of `{requested title => page}`.
#
# Useful quirks:
# * when the requested page does not exist, its key will still be present in
# the resulting hash (the value will be `nil`);
# * when the requested page redirects to another, the key will still be the
@@ -121,118 +138,130 @@
# with key 'Einstein' and page titled 'Albert Einstein'.
#
# This allows you to be in full control of what pages of large list
# you've received.
#
+ # @param titles [Array<String>] List of page titles to get.
+ # @param prop [Array<Symbol>] List of additional page properties to get, refer to
+ # [MediaWiktory::Actions::Query#prop](http://www.rubydoc.info/gems/mediawiktory/MediaWiktory/Wikipedia/Actions/Query#prop-instance_method)
+ # for the list of available properties.
+ #
# @return [Hash<String, Page>]
#
- def get_h(*titles)
- pages = [*get(*titles)]
- titles.map{|t|
- [t, pages.detect{|p| p.source.alt_titles.map(&:downcase).include?(t.downcase)}]
- }.to_h
+ def get_h(*titles, prop: [])
+ raw_pages = raw(*titles, prop: prop)
+ .tap { |ps| ps.detect { |_, p| p['invalid'] }.tap { |_, i| i && fail(i['invalidreason']) } }
+ .reject { |_, p| p.key?('missing') }
+ titles.map { |title| [title, make_page(raw_pages, title)] }.to_h
end
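# Usage sketch (illustrative, not from the gem source; page titles are examples):
#
# ```ruby
# pages = Infoboxer.wp.get_h('Einstein', 'Hopefully Missing')
# pages['Einstein'].title      # => "Albert Einstein" (redirect followed)
# pages['Hopefully Missing']   # => nil, but the key is still present
# ```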
# Receive list of parsed MediaWiki pages from specified category.
#
# **NB**: currently, this API **always** fetches all pages from the
# category; there is no option to "take first 20 pages". Pages are
# fetched in 50-page batches, then parsed. So, for a large category
# it can really take a while to fetch all pages.
#
- # @param title Category title. You can use namespaceless title (like
- # `"Countries in South America"`), title with namespace (like
+ # @param title [String] Category title. You can use namespaceless title (like
+ # `"Countries in South America"`), title with namespace (like
# `"Category:Countries in South America"`) or title with local
# namespace (like `"Catégorie:Argentine"` for French Wikipedia)
#
# @return [Tree::Nodes<Page>] array of parsed pages.
#
def category(title)
title = normalize_category_title(title)
-
- list(categorymembers: {title: title, limit: 50})
+
+ list(@client.query.generator(:categorymembers).title(title).limit('max'))
end
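# Usage sketch (illustrative, not from the gem source), using the title forms
# described above:
#
# ```ruby
# Infoboxer.wp.category('Countries in South America')
# Infoboxer.wp.category('Category:Countries in South America')
# Infoboxer::MediaWiki
#   .new('https://fr.wikipedia.org/w/api.php')
#   .category('Catégorie:Argentine')
# ```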
# Receive list of parsed MediaWiki pages for provided search query.
# See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch)
# for details.
#
# **NB**: currently, this API **always** fetches all pages matching
# the query; there is no option to "take first 20 pages". Pages are
- # fetched in 50-page batches, then parsed. So, for large category
+ # fetched in 50-page batches, then parsed. So, for a large search query
# it can really take a while to fetch all pages.
#
- # @param query Search query. For old installations, look at
+ # @param query [String] Search query. For old installations, look at
# https://www.mediawiki.org/wiki/Help:Searching
# for search syntax. For new ones (including Wikipedia), see
# https://www.mediawiki.org/wiki/Help:CirrusSearch.
#
# @return [Tree::Nodes<Page>] array of parsed pages.
#
def search(query)
- list(search: {search: query, limit: 50})
+ list(@client.query.generator(:search).search(query).limit('max'))
end
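# Usage sketch (illustrative, not from the gem source; the query is an example of
# the CirrusSearch syntax linked above):
#
# ```ruby
# Infoboxer.wp.search('"south american" intitle:Republic')
# # => Tree::Nodes of all matching pages, fetched in 50-page batches
# ```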
# Receive list of parsed MediaWiki pages with titles starting with the provided prefix.
# See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bprefixsearch)
# for details.
#
# **NB**: currently, this API **always** fetches all pages matching
# the prefix; there is no option to "take first 20 pages". Pages are
- # fetched in 50-page batches, then parsed. So, for large category
+ # fetched in 50-page batches, then parsed. So, for a prefix matching many pages
# it can really take a while to fetch all pages.
#
- # @param prefix page title prefix.
+ # @param prefix [String] Page title prefix.
#
# @return [Tree::Nodes<Page>] array of parsed pages.
#
def prefixsearch(prefix)
- list(prefixsearch: {search: prefix, limit: 100})
+ list(@client.query.generator(:prefixsearch).search(prefix).limit('max'))
end
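# Usage sketch (illustrative, not from the gem source; the prefix is an example):
#
# ```ruby
# Infoboxer.wp.prefixsearch('Buenos A')
# # => Tree::Nodes of pages whose titles start with "Buenos A"
# ```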
+ # @return [String]
def inspect
"#<#{self.class}(#{@api_base_url.host})>"
end
private
+ def make_page(raw_pages, title)
+ _, source = raw_pages.detect { |ptitle, _| ptitle.casecmp(title).zero? }
+ source or return nil
+ Page.new(self, Parser.paragraphs(source['revisions'].first['*'], traits), source)
+ end
+
def list(query)
- response = @client.query.
- generator(query).
- prop(revisions: {prop: :content}, info: {prop: :url}).
- redirects(true). # FIXME: should be done transparently by MediaWiktory?
- perform
+ response = query
+ .prop(:revisions, :info)
+ .prop(:content, :timestamp, :url)
+ .redirects
+ .response
- response.continue! while response.continue?
+ response = response.continue while response.continue?
- pages = response.pages.select(&:exists?).
- map{|raw|
- Page.new(self,
- Parser.paragraphs(raw.content, traits),
- raw)
- }
+ return Tree::Nodes[] if response['pages'].nil?
+ pages = response['pages']
+ .values.select { |p| p['missing'].nil? }
+ .map { |raw| Page.new(self, Parser.paragraphs(raw['revisions'].first['*'], traits), raw) }
+
Tree::Nodes[*pages]
end
def normalize_category_title(title)
# FIXME: shouldn't it go to MediaWiktory?..
namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
namespace, titl = nil, title unless traits.category_namespace.include?(namespace)
-
+
namespace ||= traits.category_namespace.first
[namespace, titl].join(':')
end
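# Worked example for the normalization above (illustrative, not from the gem
# source), assuming English Wikipedia, where `traits.category_namespace`
# includes "Category":
#
# ```ruby
# normalize_category_title('Countries in South America')
# # => "Category:Countries in South America"
# normalize_category_title('Category:Countries in South America')
# # => "Category:Countries in South America" (already prefixed, unchanged)
# ```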
def user_agent(options)
options[:user_agent] || options[:ua] || self.class.user_agent || UA
end
def extract_namespaces
- siteinfo = @client.query.meta(siteinfo: {prop: [:namespaces, :namespacealiases]}).perform
- siteinfo.raw.query.namespaces.map{|_, namespace|
- aliases = siteinfo.raw.query.namespacealiases.select{|a| a.id == namespace.id}.map{|a| a['*']}
- namespace.merge(aliases: aliases)
- }
+ siteinfo = @client.query.meta(:siteinfo).prop(:namespaces, :namespacealiases).response
+ siteinfo['namespaces'].map do |_, namespace|
+ aliases =
+ siteinfo['namespacealiases'].select { |a| a['id'] == namespace['id'] }.map { |a| a['*'] }
+ namespace.merge('aliases' => aliases)
+ end
end
end
end