module Wikipedia
  # Wraps a JSON response string and exposes fields of the first entry found
  # under data['query']['pages'].
  #
  # Accessors return nil when the response has no page or the page lacks the
  # requested key; the image list helpers return an empty array instead.
  class Page
    # The raw JSON string this page was built from.
    attr_reader :json

    # json - String containing the JSON document to wrap.
    #
    # Raises JSON::ParserError when the string is not valid JSON.
    def initialize(json)
      require 'json'
      @json = json
      @data = JSON.parse(json)
    end

    # Returns the first page hash of the response, or nil when the response
    # has no 'query' or no 'pages' entry.
    def page
      query = @data['query']
      query['pages'].values.first if query && query['pages']
    end

    # Returns the text stored under '*' of the first revision, or nil.
    def content
      revisions = page_field('revisions')
      first_revision = revisions && revisions.first
      first_revision && first_revision['*']
    end

    # Returns #content run through Page.sanitize (nil when there is no content).
    def sanitized_content
      self.class.sanitize(content)
    end

    # Returns the MatchData for a "#REDIRECT [[Target]]" directive in the
    # content (truthy), or nil when the page is not a redirect.
    def redirect?
      text = content
      text && text.match(/\#REDIRECT\s*\[\[(.*?)\]\]/i)
    end

    # Returns the redirect target title, or nil when the page is not a redirect.
    def redirect_title
      match = redirect?
      match[1] if match
    end

    def title
      page_field('title')
    end

    def fullurl
      page_field('fullurl')
    end

    def editurl
      page_field('editurl')
    end

    def text
      page_field('extract')
    end

    # Returns the part of the extract that precedes the first '==' marker,
    # stripped, or nil when the extract is missing or empty.
    def summary
      extract = page_field('extract')
      return if extract.nil? || extract == ''

      # split can yield an empty array (extract consisting only of '=='),
      # so fall back to an empty string rather than calling strip on nil.
      (extract.split('==').first || '').strip
    end

    def categories
      titles_of('categories')
    end

    def links
      titles_of('links')
    end

    # Returns the '*' value of every entry under 'extlinks', or nil.
    def extlinks
      entries = page_field('extlinks')
      entries.map { |entry| entry['*'] } if entries
    end

    def images
      titles_of('images')
    end

    def image_url
      info = page_field('imageinfo')
      info.first['url'] if info
    end

    def image_descriptionurl
      info = page_field('imageinfo')
      info.first['descriptionurl'] if info
    end

    # Returns #image_url of every entry in #image_metadata ([] when none).
    def image_urls
      image_metadata.map(&:image_url)
    end

    # Returns #image_descriptionurl of every entry in #image_metadata ([] when none).
    def image_descriptionurls
      image_metadata.map(&:image_descriptionurl)
    end

    # Returns the values of the first entry under 'coordinates', or nil.
    def coordinates
      coords = page_field('coordinates')
      coords.first.values if coords
    end

    # Returns the parsed JSON document.
    def raw_data
      @data
    end

    # Looks up every image title of this page that ends in a known image
    # extension (and is not the 'LinkFA-star' icon) via Wikipedia.find_image.
    # The result is memoized after the first successful lookup.
    #
    # Always returns an Array; it is empty when the page lists no images.
    def image_metadata
      return @cached_image_metadata if @cached_image_metadata

      image_titles = images
      # Nothing to look up; do not memoize so the array is never shared.
      return [] if image_titles.nil?

      wanted = image_titles.select do |image_title|
        image_title =~ /:.+\.(jpg|jpeg|png|gif|svg)$/i && !image_title.include?('LinkFA-star')
      end
      @cached_image_metadata = wanted.map { |image_title| Wikipedia.find_image(image_title) }
    end

    def templates
      titles_of('templates')
    end

    # rubocop:disable Metrics/MethodLength
    # rubocop:disable Metrics/AbcSize

    # Converts wiki markup into lightly formatted HTML.
    #
    # s - String of wiki markup, or nil. The argument itself is never
    #     modified; all edits are applied to a private copy.
    #
    # Returns a new String, or nil when s is nil.
    def self.sanitize(s)
      return unless s

      # Copy first: the in-place edits below must not leak into the caller's
      # string (for instance the revision text held in the parsed response),
      # and must not fail on frozen input.
      s = s.dup

      # strip anything inside curly braces! Each pass removes the innermost
      # {{...}} groups; gsub! returns nil once nothing is left to remove.
      loop { break unless s.gsub!(/\{\{[^\{\}]+?\}\}/, '') }

      # strip info box
      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links, keeping the label (or the target when unlabeled)
      s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # strip images and file links (runs after link conversion, so any
      # nested links inside a caption have already been flattened)
      s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
      s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

      # convert bold/italic to html
      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      s.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc: drop <ref>...</ref> footnotes and HTML comments, collapse
      # double spaces
      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      s.gsub!(/<!--[\s\S]*?-->/, '')
      s.gsub!('  ', ' ')
      s.strip!

      # create paragraphs
      sections = s.split("\n\n")
      if sections.size > 1
        s = sections.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
      end

      s
    end
    # rubocop:enable Metrics/AbcSize
    # rubocop:enable Metrics/MethodLength

    private

    # Returns page[key], or nil when the response contains no page.
    def page_field(key)
      current = page
      current && current[key]
    end

    # Returns the 'title' of every entry stored under key, or nil when the
    # page has no such key.
    def titles_of(key)
      entries = page_field(key)
      entries.map { |entry| entry['title'] } if entries
    end
  end
end