require 'json'

module Wikipedia
  class Page
    def initialize(json)
      @json = json
      @data = JSON.parse(json)
    end
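
    # The MediaWiki query API keys results by page id, e.g.
    # {"query"=>{"pages"=>{"123"=>{...}}}} (shape inferred from the
    # accessors below); we only request one page at a time, so take
    # the first entry.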
    def page
      @data['query']['pages'].values.first
    end

    def content
      page['revisions'].first['*'] if page['revisions']
    end

    def sanitized_content
      self.class.sanitize(content)
    end
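
    # Redirect pages carry their target in the wikitext, e.g.
    # "#REDIRECT [[Other title]]". redirect? returns the MatchData,
    # and redirect_title reads the target out of capture group 1.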
    def redirect?
      content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    def redirect_title
      if (matches = redirect?)
        matches[1]
      end
    end

    def title
      page['title']
    end

    def categories
      page['categories'].map { |c| c['title'] } if page['categories']
    end

    def links
      page['links'].map { |l| l['title'] } if page['links']
    end

    def images
      page['images'].map { |i| i['title'] } if page['images']
    end
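
    # 'imageinfo' is assumed to be present only when the queried page is
    # itself a File: page fetched with the imageinfo prop, hence the
    # guards below.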
    def image_url
      page['imageinfo'].first['url'] if page['imageinfo']
    end

    def image_urls
      if (list = images)
        filtered = list.select { |i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?('LinkFA-star') }
        # one find_image request per title, so this costs an extra API
        # call for every image on the page
        filtered.map { |title| Wikipedia.find_image(title).image_url }
      end
    end

    def image_descriptionurl
      page['imageinfo'].first['descriptionurl'] if page['imageinfo']
    end

    def raw_data
      @data
    end

    def json
      @json
    end
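
    # Best-effort wikitext-to-HTML cleanup. Illustrative example (input
    # made up for documentation, not from a real article):
    #   sanitize("'''Ruby''' is a [[programming language|language]].")
    #   # => "<b>Ruby</b> is a language."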
    def self.sanitize(s)
      if s
        s = s.dup
        # strip anything inside curly braces (templates); repeat because
        # stripping the innermost pair exposes the next level of nesting
        s.gsub!(/\{\{[^\{\}]+?\}\}/, '') while s =~ /\{\{[^\{\}]+?\}\}/
        # strip info box
        s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')
        # strip internal links, keeping the display text
        s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
        s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')
        # strip images and file links
        s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
        s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')
        # convert bold/italic to html
        s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
        s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
        s.gsub!(/''(.+?)''/, '<i>\1</i>')
        # misc: drop <ref> footnotes, HTML comments, and nbsp entities
        s.gsub!(%r{<ref[^<>]*>[\s\S]*?</ref>}, '')
        s.gsub!(/<!--.*?-->/m, '')
        s.gsub!('&nbsp;', ' ')
        s.strip!
        # create paragraphs
        sections = s.split("\n\n")
        if sections.size > 1
          s = sections.map { |section| "<p>#{section.strip}</p>" }.join("\n")
        end
        s
      end
    end
  end
end
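
# Usage sketch (assumes a module-level finder such as Wikipedia.find that
# fetches the API JSON and wraps it in Page; Wikipedia.find_image is already
# referenced by #image_urls above):
#
#   page = Wikipedia::Page.new(json_from_api)
#   page.title              # e.g. "Ruby (programming language)"
#   page.redirect_title     # target title if the page is a redirect
#   page.sanitized_content  # wikitext with templates and link markup stripped
#   page.image_urls         # full URLs for jpg/jpeg/png/gif files on the page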