#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Command-line front end for WP2TXT.
#
# Parses the command-line options with Trollop, builds a Wp2txt::Runner and,
# for every article the runner yields, assembles the text that should be
# emitted for it (title, optional category line, and the element kinds the
# user asked for).

$: << File.join(File.dirname(__FILE__))
$: << File.join(File.dirname(__FILE__), '..', 'lib')

$DEBUG_MODE = false
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')

require 'wp2txt'
require 'wp2txt/utils'
require 'wp2txt/version'
require 'trollop'

include Wp2txt

opts = Trollop::options do
  version Wp2txt::VERSION
  banner <<-EOS
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.

Usage:
       wp2txt [options]
where [options] are:
EOS

  opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
  opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
  opt :convert, "Output in plain text (converting from XML)", :default => true
  opt :list, "Show list items in output", :default => true
  opt :heading, "Show section titles in output", :default => true, :short => "-d"
  opt :title, "Show page titles in output", :default => true
  opt :table, "Show table source code in output", :default => false
  opt :template, "Show template specifications in output", :default => false
  opt :redirect, "Show redirect destination", :default => false
  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
  opt :category, "Show article category information", :default => false
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
end

# The key passed to Trollop::die must be a declared option (:file_size, not
# :size), otherwise the error message for that option cannot be built.
# The message states the check that is actually performed (0 is accepted).
Trollop::die :file_size, "must be 0 or larger" unless opts[:file_size] >= 0
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])

# NOTE(review): the input path is taken from the first argument left in ARGV,
# not from opts[:input_file]; :input_file is declared without a :type, so it
# presumably acts as a plain flag -- confirm against Trollop's behavior.
input_file = ARGV[0]
output_dir = opts[:output_dir]
tfile_size = opts[:file_size]
convert = opts[:convert]
strip_tmarker = !opts[:marker]

# Per-element switches consulted while assembling each article.
opt_array = [:title, :list, :heading, :table, :template, :redirect]
config = opt_array.each_with_object({}) { |name, memo| memo[name] = opts[name] }

# a "parent" is either commandline progress bar or
# a gui window (not available for now)
parent = Wp2txt::CmdProgbar.new

wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)

# The value of the block (its last expression) is the text for one article.
wpconv.extract_text do |article|
  title = format_wiki article.title
  title = "[[#{title}]]\n"

  if opts[:category] && !article.categories.empty?
    contents = "\nCATEGORIES: "
    contents += article.categories.join(", ")
    contents += "\n\n"
  else
    contents = ""
  end

  # Each element is inspected through e.first (its kind) and e.last (its text).
  article.elements.each do |e|
    case e.first
    when :mw_heading
      next unless config[:heading]
      line = format_wiki(e.last)
      line += "+HEADING+" if $DEBUG_MODE
    when :mw_paragraph
      # paragraphs are always emitted (no config[:paragraph] switch is checked)
      line = format_wiki(e.last)
      line += "+PARAGRAPH+" if $DEBUG_MODE
    when :mw_table, :mw_htable
      next unless config[:table]
      line = format_wiki(e.last)
      line += "+TABLE+" if $DEBUG_MODE
    when :mw_pre
      # NOTE(review): :pre is never copied into config, so this branch always
      # skips the element -- confirm whether that is intended.
      next unless config[:pre]
      line = e.last
      line += "+PRE+" if $DEBUG_MODE
    when :mw_quote
      # quotes are always emitted (no config[:quote] switch is checked)
      line = format_wiki(e.last)
      line += "+QUOTE+" if $DEBUG_MODE
    when :mw_unordered, :mw_ordered, :mw_definition
      next unless config[:list]
      line = format_wiki(e.last)
      line += "+LIST+" if $DEBUG_MODE
    when :mw_redirect
      next unless config[:redirect]
      line = format_wiki(e.last)
      line += "+REDIRECT+" if $DEBUG_MODE
      line += "\n\n"
    else
      # Any other element kind is only emitted in debug mode.
      next unless $DEBUG_MODE
      line = format_wiki(e.last)
      line += "+OTHER+"
    end

    contents += line
    contents = remove_templates(contents) unless config[:template]
  end

  ##### cleanup #####
  if /\A\s*\z/m =~ contents
    # Nothing but whitespace was collected: emit no title either.
    result = ""
  else
    result = config[:title] ? title + "\n" + contents : contents
  end

  # Drop empty [ref][/ref] pairs, then collapse runs of 3+ newlines to 2.
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m, "")
  result = result.gsub(/\n\n\n+/m, "\n\n") + "\n"
end