lib/infoboxer/media_wiki.rb in infoboxer-0.1.2.1 vs lib/infoboxer/media_wiki.rb in infoboxer-0.2.0
- old
+ new
@@ -1,8 +1,9 @@
# encoding: utf-8
-require 'rest-client'
-require 'json'
+#require 'rest-client'
+#require 'json'
+require 'mediawiktory'
require 'addressable/uri'
require_relative 'media_wiki/traits'
require_relative 'media_wiki/page'
@@ -34,11 +35,11 @@
#
# You can also use per-instance option, see {#initialize}
attr_accessor :user_agent
end
- attr_reader :api_base_url
+ attr_reader :api_base_url, :traits
# Creating new MediaWiki client. {Infoboxer.wiki} provides shortcut
# for it, as well as shortcuts for some well-known wikis, like
# {Infoboxer.wikipedia}.
#
@@ -47,30 +48,35 @@
# in different wikis.
# @param options Only one option is currently supported:
# * `:user_agent` (also aliased as `:ua`) -- custom User-Agent header.
def initialize(api_base_url, options = {})
@api_base_url = Addressable::URI.parse(api_base_url)
- @resource = RestClient::Resource.new(api_base_url, headers: headers(options))
+ @client = MediaWiktory::Client.new(api_base_url, user_agent: user_agent(options))
+ @traits = Traits.get(@api_base_url.host, namespaces: extract_namespaces)
end
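# Usage sketch for the constructor above; the API URL and User-Agent string
# are illustrative values, not defaults shipped with the library:
require 'infoboxer'

client = Infoboxer::MediaWiki.new(
  'https://en.wikipedia.org/w/api.php',
  user_agent: 'MyBot/1.0 (admin@example.com)') # `ua:` works as an alias
client.traits       # per-wiki Traits, built from the host and fetched namespaces
client.api_base_url # parsed Addressable::URI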
# Receive "raw" data from Wikipedia (without parsing or wrapping in
# classes).
#
# @return [Array<Hash>]
def raw(*titles)
- postprocess @resource.get(
- params: DEFAULT_PARAMS.merge(titles: titles.join('|'))
- )
+ titles.each_slice(50).map{|part|
+ @client.query.
+ titles(*part).
+ prop(revisions: {prop: :content}, info: {prop: :url}).
+ redirects(true). # FIXME: should be done transparently by MediaWiktory?
+ perform.pages
+ }.inject(:concat) # somehow flatten(1) fails!
end
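# A minimal sketch of #raw, assuming the `Infoboxer.wikipedia` shortcut from
# the class docs; the titles are arbitrary examples:
raw_pages = Infoboxer.wikipedia.raw('Argentina', 'Bolivia', 'Chile')
raw_pages.count # => 3; more than 50 titles would be fetched in 50-title slices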
- # Receive list of parsed wikipedia pages for list of titles provided.
+ # Receive list of parsed MediaWiki pages for list of titles provided.
# All pages are received with single query to MediaWiki API.
#
- # **NB**: currently, if you are requesting more than 50 titles at
- # once (MediaWiki limitation for single request), Infoboxer will
- # **not** try to get other pages with subsequent queries. This will
- # be fixed in future.
+ # **NB**: if you are requesting more than 50 titles at once
+ # (MediaWiki's limit for a single request), Infoboxer will do as
+ # many queries as necessary to fetch them all (that is,
+ # `(titles.count / 50.0).ceil` requests).
#
# @return [Tree::Nodes<Page>] array of parsed pages. Notes:
# * if you call `get` with only one title, one page will be
# returned instead of an array
# * if some of pages are not in wiki, they will not be returned,
@@ -85,78 +91,120 @@
# ```
# and obtain meaningful results instead of NoMethodError or some
# NotFound.
#
def get(*titles)
- pages = raw(*titles).reject{|raw| raw[:content].nil?}.
+ pages = raw(*titles).
+ tap{|pages| pages.detect(&:invalid?).tap{|i| i && fail(i.raw.invalidreason)}}.
+ select(&:exists?).
map{|raw|
- traits = Traits.get(@api_base_url.host, extract_traits(raw))
-
Page.new(self,
- Parser.paragraphs(raw[:content], traits),
- raw.merge(traits: traits))
+ Parser.paragraphs(raw.content, traits),
+ raw)
}
titles.count == 1 ? pages.first : Tree::Nodes[*pages]
end
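# Sketch of #get's return-value convention described above, assuming the
# `Infoboxer.wikipedia` shortcut:
page  = Infoboxer.wikipedia.get('Argentina')            # one title  => single Page
pages = Infoboxer.wikipedia.get('Argentina', 'Bolivia') # two titles => Tree::Nodes
# e.g. 120 titles are fetched with (120 / 50.0).ceil == 3 API requests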
+ # Receive list of parsed MediaWiki pages from specified category.
+ #
+ # **NB**: currently, this API **always** fetches all pages from
+ # category, there is no option to "take first 20 pages". Pages are
+ # fetched in 50-page batches, then parsed. So, for a large category
+ # it can really take a while to fetch all pages.
+ #
+ # @param title Category title. You can use namespaceless title (like
+ # `"Countries in South America"`), title with namespace (like
+ # `"Category:Countries in South America"`) or title with local
+ # namespace (like `"Catégorie:Argentine"` for French Wikipedia)
+ #
+ # @return [Tree::Nodes<Page>] array of parsed pages.
+ #
+ def category(title)
+ title = normalize_category_title(title)
+
+ list(categorymembers: {title: title, limit: 50})
+ end
+
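# Sketch of #category with the title forms listed above; `Infoboxer.wiki` is
# the generic shortcut mentioned in the class docs, and the URL is an example:
Infoboxer.wikipedia.category('Countries in South America')
Infoboxer.wikipedia.category('Category:Countries in South America')
Infoboxer.wiki('https://fr.wikipedia.org/w/api.php').category('Catégorie:Argentine')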
+ # Receive list of parsed MediaWiki pages for provided search query.
+ # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch)
+ # for details.
+ #
+ # **NB**: currently, this API **always** fetches all pages matching
+ # the query, there is no option to "take first 20 pages". Pages are
+ # fetched in 50-page batches, then parsed. So, for a query with many
+ # results it can really take a while to fetch all pages.
+ #
+ # @param query Search query. For older installations, see
+ # https://www.mediawiki.org/wiki/Help:Searching
+ # for the search syntax. For newer ones (including Wikipedia), see
+ # https://www.mediawiki.org/wiki/Help:CirrusSearch.
+ #
+ # @return [Tree::Nodes<Page>] array of parsed pages.
+ #
+ def search(query)
+ list(search: {search: query, limit: 50})
+ end
+
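# Sketch of #search, assuming the `Infoboxer.wikipedia` shortcut; the query is
# an arbitrary example (CirrusSearch syntax on Wikipedia):
results = Infoboxer.wikipedia.search('Argentina economy')
results # => Tree::Nodes of already parsed pages, not bare titles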
+ # Receive list of parsed MediaWiki pages with titles starting with the provided prefix.
+ # See [MediaWiki API docs](https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bprefixsearch)
+ # for details.
+ #
+ # **NB**: currently, this API **always** fetches all pages with the
+ # provided prefix, there is no option to "take first 20 pages". Pages
+ # are fetched in 100-page batches, then parsed. So, for a common
+ # prefix it can really take a while to fetch all pages.
+ #
+ # @param prefix page title prefix.
+ #
+ # @return [Tree::Nodes<Page>] array of parsed pages.
+ #
+ def prefixsearch(prefix)
+ list(prefixsearch: {search: prefix, limit: 100})
+ end
+
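# Sketch of #prefixsearch; the prefix is an arbitrary example:
pages = Infoboxer.wikipedia.prefixsearch('Buenos A')
# matches titles like "Buenos Aires", "Buenos Aires Province", ...;
# results are fetched in 100-title batches and returned parsed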
+ def inspect
+ "#<#{self.class}(#{@api_base_url.host})>"
+ end
+
private
- # @private
- PROP = [
- 'revisions', # to extract content of the page
- 'info', # to extract page canonical url
- 'categories', # to extract default category prefix
- 'images' # to extract default media prefix
- ].join('|')
- # @private
- DEFAULT_PARAMS = {
- action: :query,
- format: :json,
- redirects: true,
- prop: PROP,
- rvprop: :content,
- inprop: :url,
- }
- def headers(options)
- {'User-Agent' => options[:user_agent] || options[:ua] || self.class.user_agent || UA}
+ def list(query)
+ response = @client.query.
+ generator(query).
+ prop(revisions: {prop: :content}, info: {prop: :url}).
+ redirects(true). # FIXME: should be done transparently by MediaWiktory?
+ perform
+ response.continue! while response.continue?
+ pages = response.pages.select(&:exists?).
+ map{|raw|
+ Page.new(self,
+ Parser.paragraphs(raw.content, traits),
+ raw)
+ }
+ Tree::Nodes[*pages]
end
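# Note on the continuation loop in #list above: per the MediaWiki API
# convention, a response that still carries a "continue" marker has more
# batches to deliver, so the query is re-issued (response.continue!) until
# response.continue? turns false, and only then are all collected pages parsed.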
- def extract_traits(raw)
- raw.select{|k, v| [:file_prefix, :category_prefix].include?(k)}
+ def normalize_category_title(title)
+ # FIXME: shouldn't it go to MediaWiktory?..
+ namespace, titl = title.include?(':') ? title.split(':', 2) : [nil, title]
+ namespace, titl = nil, title unless traits.category_namespace.include?(namespace)
+
+ namespace ||= traits.category_namespace.first
+ [namespace, titl].join(':')
end
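# Worked examples for the normalization above, assuming traits.category_namespace
# is something like ["Catégorie", "Category"] (French Wikipedia):
#   "Catégorie:Argentine" -> "Catégorie:Argentine"       (recognized namespace kept)
#   "Category:Argentine"  -> "Category:Argentine"        (known alias kept as given)
#   "Argentine"           -> "Catégorie:Argentine"       (default namespace prepended)
#   "Ville:Argentine"     -> "Catégorie:Ville:Argentine" (unknown prefix stays in the title)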
- def guess_traits(pages)
- categories = pages.map{|p| p['categories']}.compact.flatten
- images = pages.map{|p| p['images']}.compact.flatten
- {
- file_prefix: images.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
- category_prefix: categories.map{|i| i['title'].scan(/^([^:]+):/)}.flatten.uniq,
- }
+ def user_agent(options)
+ options[:user_agent] || options[:ua] || self.class.user_agent || UA
end
- def postprocess(response)
- pages = JSON.parse(response)['query']['pages']
- traits = guess_traits(pages.values)
-
- pages.map{|id, data|
- if id.to_i < 0
- {
- title: data['title'],
- content: nil,
- not_found: true
- }
- else
- {
- title: data['title'],
- content: data['revisions'].first['*'],
- url: data['fullurl'],
- }.merge(traits)
- end
+ def extract_namespaces
+ siteinfo = @client.query.meta(siteinfo: {prop: [:namespaces, :namespacealiases]}).perform
+ siteinfo.raw.query.namespaces.map{|_, namespace|
+ aliases = siteinfo.raw.query.namespacealiases.select{|a| a.id == namespace.id}.map{|a| a['*']}
+ namespace.merge(aliases: aliases)
}
- rescue JSON::ParserError
- fail RuntimeError, "Not a JSON response, seems there's not a MediaWiki API: #{@api_base_url}"
end
end
end