require 'json'

module Wikipedia
  # Wraps a JSON response and exposes the attributes of the first record found
  # under `query.pages` through reader methods. Readers return nil when the
  # response does not carry the requested attribute.
  class Page
    # @return [String] the JSON text this page was built from
    attr_reader :json

    # @param json [String] JSON document to parse
    # @raise [JSON::ParserError] when +json+ is not valid JSON
    def initialize(json)
      @json = json
      @data = JSON.parse(json)
    end

    # @return [Hash, nil] first record under `query.pages`, nil when absent
    def page
      query = @data['query']
      pages = query && query['pages']
      pages.values.first if pages
    end

    # @return [String, nil] the `*` field of the first revision
    def content
      first_entry('revisions', '*')
    end

    # @return [String, nil] #content converted by Page.sanitize
    def sanitized_content
      self.class.sanitize(content)
    end

    # @return [MatchData, nil] truthy when the content is a redirect; capture
    #   group 1 holds the redirect target
    def redirect?
      raw = content
      raw && raw.match(/\#REDIRECT\s*\[\[(.*?)\]\]/i)
    end

    # @return [String, nil] redirect target, nil when the page is no redirect
    def redirect_title
      match = redirect?
      match[1] if match
    end

    def title
      page_attribute('title')
    end

    def fullurl
      page_attribute('fullurl')
    end

    def editurl
      page_attribute('editurl')
    end

    def text
      page_attribute('extract')
    end

    # @return [String, nil] the part of the extract that precedes the first
    #   `==` marker, stripped; nil when the extract is missing or empty
    def summary
      extract = text
      return if extract.nil? || extract == ''

      # The limit keeps the leading piece even when the extract consists of
      # nothing but `==` markers (a plain split would return an empty array).
      extract.split('==', 2).first.strip
    end

    def categories
      titles_in('categories')
    end

    def links
      titles_in('links')
    end

    def extlinks
      entries = page_attribute('extlinks')
      entries.map { |entry| entry['*'] } if entries
    end

    # @return [Hash{String=>String}, nil] language code => `*` value
    def langlinks
      entries = page_attribute('langlinks')
      Hash[entries.map { |entry| [entry['lang'], entry['*']] }] if entries
    end

    def images
      titles_in('images')
    end

    def image_url
      first_entry('imageinfo', 'url')
    end

    def image_thumburl
      first_entry('imageinfo', 'thumburl')
    end

    def image_descriptionurl
      first_entry('imageinfo', 'descriptionurl')
    end

    def image_urls
      metadata = image_metadata
      metadata.map(&:image_url) unless metadata.nil?
    end

    # @param width [Integer, nil] forwarded as `iiurlwidth` when given
    def image_thumburls(width = nil)
      options = width.nil? ? {} : { iiurlwidth: width }
      metadata = image_metadata(options)
      metadata.map(&:image_thumburl) unless metadata.nil?
    end

    def image_descriptionurls
      metadata = image_metadata
      metadata.map(&:image_descriptionurl) unless metadata.nil?
    end

    # @return [String, nil] thumbnail source with the `/thumb` path segment
    #   and the final path component removed
    def main_image_url
      source = main_image_thumburl
      source.sub(%r{/thumb}, '').sub(%r{/[^/]*$}, '') if source
    end

    def main_image_thumburl
      thumbnail = page_attribute('thumbnail')
      thumbnail['source'] if thumbnail
    end

    # @return [Array, nil] values of the first coordinates entry
    def coordinates
      entries = page_attribute('coordinates')
      entries.first.values if entries
    end

    # @return [Object] the parsed JSON document
    def raw_data
      @data
    end

    # Looks up every image title with a jpg/jpeg/png/gif/svg extension (except
    # titles containing 'LinkFA-star') through Wikipedia.find_image.
    #
    # @param options [Hash] passed through to Wikipedia.find_image
    # @return [Array, nil] one lookup result per image, nil without images
    def image_metadata(options = {})
      titles = images
      return if titles.nil?

      # Results are cached per options hash so that, for example, thumbnails
      # requested with different widths do not share one cached lookup.
      cache = (@cached_image_metadata ||= {})
      cache.fetch(options) do
        wanted = titles.select do |name|
          name =~ /:.+\.(jpg|jpeg|png|gif|svg)$/i && !name.include?('LinkFA-star')
        end
        cache[options.dup] = wanted.map { |name| Wikipedia.find_image(name, options) }
      end
    end

    def templates
      titles_in('templates')
    end

    # rubocop:disable Metrics/MethodLength
    # rubocop:disable Metrics/AbcSize

    # Converts wiki markup into simple HTML paragraphs.
    #
    # @param s [String, nil] wiki markup; the argument itself is not modified
    # @return [String, nil] HTML, or nil when +s+ is nil
    def self.sanitize(s)
      return unless s

      # Every step below substitutes in place, so work on a private copy to
      # leave the caller's string (possibly frozen or shared) untouched.
      s = s.dup

      # Transform punctuation templates
      # Em dash (https://en.wikipedia.org/wiki/Template:Em_dash)
      s.gsub!(/\{\{(em dash|emdash)\}\}/i, '—')
      # En dash (https://en.wikipedia.org/wiki/Template:En_dash)
      s.gsub!(/\{\{(en dash|ndash|nsndns)\}\}/i, '–')
      # Spaced en dashes (https://en.wikipedia.org/wiki/Template:Spaced_en_dash_space)
      s.gsub!(/\{\{(spaced e?n\s?dash( space)?|snds?|spndsp|sndashs|spndashsp)\}\}/i, ' – ')
      # Bold middot
      s.gsub!(/\{\{(·|dot|middot|\,)\}\}/i, ' ·')
      # Bullets
      s.gsub!(/\{\{(•|bull(et)?)\}\}/i, ' •')
      # Forward Slashes (https://en.wikipedia.org/wiki/Template:%5C)
      s.gsub!(/\{\{\\\}\}/i, ' /')

      # Transform language specific blocks
      s.gsub!(/\{\{lang[\-\|]([a-z]+)\|([^\|\{\}]+)(\|[^\{\}]+)?\}\}/i, '\2')

      # Parse Old Style Date template blocks
      # Old Style Dates (https://en.wikipedia.org/wiki/Template:OldStyleDate)
      s.gsub!(/\{\{OldStyleDate\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [O.S. \3] \2')
      # Old Style Dates with different years (https://en.wikipedia.org/wiki/Template:OldStyleDateDY)
      s.gsub!(/\{\{OldStyleDateDY\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 \2 [O.S. \3]')
      # Old Style Dates with no year (https://en.wikipedia.org/wiki/Template:OldStyleDateNY)
      s.gsub!(/\{\{OldStyleDateNY\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [O.S. \2]')

      # Strip anything else inside curly braces. Only innermost templates
      # match, so repeat until a pass removes nothing (gsub! then returns nil).
      loop { break unless s.gsub!(/\{\{[^\{\}]+?\}\}[\;\,]?/, '') }

      # strip info box
      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links
      s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # Strip images and file links. This runs after link stripping so links
      # nested inside a caption are already flattened and the first `]]`
      # really closes the image link.
      s.gsub!(/\[\[Image:(.*?(?=\]\]))??\]\]/, '')
      s.gsub!(/\[\[File:(.*?(?=\]\]))??\]\]/, '')

      # convert bold/italic to html
      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      s.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      # a reference squeezed between two digits becomes a dash
      s.gsub!(/(\d)<ref[^<>]*>[\s\S]*?<\/ref>(\d)/, '\1 – \2')
      # remaining paired and self-closing references
      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      s.gsub!(/<ref(.*?(?=\/>))??\/>/, '')
      # HTML comments
      s.gsub!(/<!--[^>]+?-->/, '')
      s.gsub!(/\(\s+/, '(')
      # collapse doubled spaces left behind by removed markup
      s.gsub!('  ', ' ')
      s.strip!

      # create paragraphs
      sections = s.split("\n\n")
      if sections.size > 1
        sections.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
      else
        "<p>#{s}</p>"
      end
    end
    # rubocop:enable Metrics/AbcSize
    # rubocop:enable Metrics/MethodLength

    private

    # Value stored under +key+ in the page record, nil when there is no record.
    def page_attribute(key)
      record = page
      record && record[key]
    end

    # +field+ of the first element of the list stored under +key+.
    def first_entry(key, field)
      entries = page_attribute(key)
      entry = entries && entries.first
      entry && entry[field]
    end

    # `title` of every element of the list stored under +key+.
    def titles_in(key)
      entries = page_attribute(key)
      entries.map { |entry| entry['title'] } if entries
    end
  end
end