#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Command-line front end for WP2TXT.
#
# Parses and validates the command-line options, then hands the input file to
# Wp2txt::Runner. The block passed to Runner#extract_text receives one article
# at a time and evaluates to the text built for that article.

$: << File.join(File.dirname(__FILE__))
$: << File.join(File.dirname(__FILE__), '..', 'lib')

# When true, every emitted element is suffixed with a marker naming its type.
$DEBUG_MODE = false
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')

require 'wp2txt'
require 'wp2txt/utils'
require 'wp2txt/version'
require 'trollop'

include Wp2txt

opts = Trollop::options do
  version Wp2txt::VERSION
  banner <<-EOS
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.

Usage:
       wp2txt [options]
where [options] are:
EOS

  opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
  opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
  opt :convert, "Output in plain text (converting from XML)", :default => true
  opt :list, "Show list items in output", :default => true
  opt :heading, "Show section titles in output", :default => true, :short => "-d"
  opt :title, "Show page titles in output", :default => true
  opt :table, "Show table source code in output", :default => false
  opt :template, "leave inline template notations unmodified", :default => false
  opt :redirect, "Show redirect destination", :default => false
  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
  opt :category, "Show article category information", :default => false
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
  opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
end

# --- option validation -------------------------------------------------------
# Trollop::die must be given the name of a declared option; the size option is
# declared as :file_size. A value of 0 is accepted, as it always has been.
Trollop::die :file_size, "must be 0 or larger" unless opts[:file_size] >= 0
# A regular file is not a usable output location, so require a directory.
Trollop::die :output_dir, "must be an existing directory" unless File.directory?(opts[:output_dir])
# The option is documented as accepting 0 to 10, so enforce both bounds.
Trollop::die :limit_recur, "must be 0 or larger" if opts[:limit_recur] < 0
Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10

# The path is read from the arguments left in ARGV after option parsing.
# NOTE(review): :input_file is declared without a :type, so it appears to be
# parsed as a plain flag, leaving the path itself in ARGV -- confirm.
input_file = ARGV[0]
Trollop::die :input_file, "must be followed by the path to a dump file" if input_file.nil?

output_dir = opts[:output_dir]
tfile_size = opts[:file_size]
limit_recur = opts[:limit_recur]
convert = opts[:convert]
strip_tmarker = !opts[:marker]

# Read below to decide whether the template clean-up passes run.
$leave_template = true if opts[:template]

# Per-element switches consulted while assembling each article.
opt_array = [:title, :list, :heading, :table, :redirect]
config = {}
opt_array.each { |opt| config[opt] = opts[opt] }

parent = Wp2txt::CmdProgbar.new
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)

wpconv.extract_text do |article|
  format_wiki!(article.title)
  title = "[[#{article.title}]]\n"

  if opts[:category] && !article.categories.empty?
    contents = "\nCATEGORIES: "
    contents << article.categories.join(", ")
    contents << "\n\n"
  else
    contents = ""
  end

  # Each element is used as a pair: its first item is the element type and its
  # last item is the text, which is formatted in place and appended to contents.
  article.elements.each do |e|
    case e.first
    when :mw_heading
      next unless config[:heading]
      format_wiki!(e.last)
      line = e.last
      line << "+HEADING+" if $DEBUG_MODE
    when :mw_paragraph
      # Paragraphs are always emitted; there is no switch for them.
      format_wiki!(e.last)
      line = e.last
      line << "+PARAGRAPH+" if $DEBUG_MODE
    when :mw_table, :mw_htable
      next unless config[:table]
      format_wiki!(e.last)
      line = e.last
      line << "+TABLE+" if $DEBUG_MODE
    when :mw_pre
      # NOTE(review): config never receives a :pre key (it is not listed in
      # opt_array and no such option is declared), so these elements are always
      # skipped. Confirm whether that is intended.
      next unless config[:pre]
      line = e.last
      line << "+PRE+" if $DEBUG_MODE
    when :mw_quote
      # Quotes are always emitted; there is no switch for them.
      format_wiki!(e.last)
      line = e.last
      line << "+QUOTE+" if $DEBUG_MODE
    when :mw_unordered, :mw_ordered, :mw_definition
      next unless config[:list]
      format_wiki!(e.last)
      line = e.last
      line << "+LIST+" if $DEBUG_MODE
    when :mw_redirect
      next unless config[:redirect]
      format_wiki!(e.last)
      line = e.last
      line << "+REDIRECT+" if $DEBUG_MODE
      line << "\n\n"
    else
      # Any other element type is only emitted in debug mode.
      next unless $DEBUG_MODE
      format_wiki!(e.last)
      line = e.last
      line << "+OTHER+"
    end
    contents << line
  end

  # Clean-up passes applied in place to the assembled text, in this order.
  remove_directive!(contents)
  remove_emphasis!(contents)
  mndash!(contents)
  make_reference!(contents)
  format_ref!(contents)
  remove_hr!(contents)
  remove_tag!(contents)
  special_chr!(contents)
  unless $leave_template
    correct_inline_template!(contents)
    remove_templates!(contents)
  end

  ##### cleanup #####
  # An article whose text is only whitespace yields no title either.
  if /\A\s*\z/m =~ contents
    result = ""
  else
    result = config[:title] ? title + "\n" + contents : contents
  end
  # Drop empty [ref][/ref] pairs and collapse runs of three or more newlines.
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m, "")
  # Last expression: the value of the block for this article.
  result.gsub(/\n\n\n+/m, "\n\n") + "\n"
end