# frozen_string_literal: true

require "nokogiri"
require_relative "wp2txt/article"
require_relative "wp2txt/utils"

module Wp2txt
  # Reads a MediaWiki XML dump (plain or bzip2-compressed), cuts it into
  # chunk files of roughly +tfile_size+ megabytes without ever breaking a
  # <page> element, and can turn the pages it reads into plain text.
  #
  # NOTE(review): this file reached review with its line breaks collapsed and
  # with every angle-bracketed literal stripped. The markers <page>, </page>,
  # the HTML-comment pattern and the <mediawiki> wrapper below were restored
  # from how the surrounding code uses them; the tail of #split_file was
  # rebuilt from the state #prepare sets up. Confirm all of these against the
  # upstream source before merging.
  #
  # NOTE(review): #get_page and #extract_text read @strip_tmarker and
  # @del_interfile, which nothing in this file assigns. That suggests a class
  # header and initializer were lost together with the end of #split_file;
  # presumably these methods belong to a separate class - verify.
  class Splitter
    include Wp2txt

    # Number of bytes requested from the input stream per read (10 MiB).
    READ_CHUNK_BYTES = 10_485_760

    # Opening and closing tags of one page element.
    PAGE_OPEN = /<page>/.freeze
    PAGE_CLOSE = %r{</page>}.freeze

    # HTML comments, possibly spanning several lines.
    HTML_COMMENT = /<!--.*?-->/m.freeze

    # Root element wrapped around a single page so that the XPath
    # "//xmlns:text" has a default namespace to resolve against.
    # NOTE(review): the namespace URI is a reconstruction (the original literal
    # was stripped); any URI satisfies the XPath, but confirm the intended one.
    MEDIAWIKI_OPEN = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/">'
    MEDIAWIKI_CLOSE = "</mediawiki>"

    # @param input_file [String] path to the dump (".bz2" or plain XML)
    # @param output_dir [String, nil] directory for the chunk files; when nil
    #   the directory of +input_file+ is used
    # @param tfile_size [Numeric] approximate size of each chunk in megabytes
    # @param bz2_gem [Boolean] decompress with the bzip2-ruby gem instead of an
    #   external command
    def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
      @fp = nil
      @input_file = input_file
      @output_dir = output_dir
      @tfile_size = tfile_size
      # Loaded lazily on purpose: the gem is optional.
      require "bzip2-ruby" if bz2_gem
      @bz2_gem = bz2_gem
      prepare
    end

    # Reads +file+ to its end and returns how many bytes were read.
    #
    # @param file [#read] an open stream; it is consumed by this call
    # @return [Integer] total size of everything read
    def file_size(file)
      size = 0
      loop do
        chunk = begin
          file.read(READ_CHUNK_BYTES)
        rescue StandardError
          nil
        end
        # A reader that signals EOF with "" instead of nil must not spin forever.
        break if chunk.nil? || chunk.empty?

        size += chunk.size
      end
      size
    end

    # Checks whether a given command exists, looking it up first as given and
    # then by its basename.
    #
    # @param command [String] command name or path
    # @return [String, false] the resolved path, or false when it is not found
    def command_exist?(command)
      basename = File.basename(command)
      print "Checking #{basename}: "
      path = locate_command(command) || locate_command(basename)
      if path
        puts "detected [#{path}]"
        path
      else
        puts "not found"
        false
      end
    end

    # Opens the input stream (decompressing when the file name ends in ".bz2")
    # and the first output chunk file, and resets the size/index counters.
    #
    # @return [true]
    # @raise [RuntimeError] when a ".bz2" input cannot be decompressed because
    #   no bzip2 command is available
    def prepare
      # if output_dir is not specified, output in the same directory
      # as the input file
      @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

      file = open_input
      # Fail before creating any output file; continuing with a nil stream
      # would silently produce an empty result.
      raise "Cannot read #{@input_file}: none of lbzip2, pbzip2 or bzip2 was found" unless file

      # create basename of output file
      @outfile_base = File.basename(@input_file, ".*") + "-"
      @total_size = 0
      @file_index = 1
      outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
      @outfiles = []
      @outfiles << outfilename
      @fp = File.open(outfilename, "w")
      @file_pointer = file
      true
    end

    # Reads from the input stream in 10 MiB chunks until @buffer holds at least
    # one complete line. The last element of @buffer is always the (possibly
    # empty) beginning of a line that has not been terminated yet.
    #
    # @return [true, nil] true when a complete line is available, nil at end of
    #   input or when reading fails
    def fill_buffer
      loop do
        begin
          new_lines = @file_pointer.read(READ_CHUNK_BYTES)
        rescue StandardError
          return nil
        end
        return nil if new_lines.nil? || new_lines.empty?

        # Each piece keeps its trailing "\n"; only the last one may lack it.
        pieces = new_lines.lines
        first_piece = pieces.shift
        # Complete the fragment left over from the previous chunk.
        @buffer.last << first_piece
        @buffer << +"" if first_piece.end_with?("\n")
        @buffer.concat(pieces)
        # Keep the invariant: the last element is an unterminated fragment.
        @buffer << +"" if @buffer.last.end_with?("\n")
        break if @buffer.size > 1
      end
      true
    end

    # Returns the next line of the input, including its newline.
    #
    # @return [String, nil] the next line, or nil when the input is exhausted
    def get_newline
      @buffer ||= [+""]
      if @buffer.size == 1 && !fill_buffer
        # End of input: hand out a final line that has no trailing newline
        # instead of dropping it.
        tail = @buffer.shift
        return tail.empty? ? nil : tail
      end
      @buffer.shift
    end

    # Copies the input into numbered chunk files, starting a new file once the
    # current one exceeds @tfile_size megabytes and the current page has ended.
    #
    # NOTE(review): everything after the </page> test was lost in the reviewed
    # copy; the rotation and final flush below are reconstructed from the state
    # prepared in #prepare. Upstream may do more work after the loop (for
    # example removing an empty last chunk or renaming the chunks) - restore
    # that from the original source.
    #
    # @return [true]
    def split_file
      output_text = +""
      end_flag = false
      while (text = get_newline)
        @count ||= 0
        @count += 1
        @size_read ||= 0
        @size_read += text.bytesize
        @total_size += text.bytesize
        output_text << text
        end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
        # never close the file until the end of the page even if end_flag is on
        next unless end_flag && PAGE_CLOSE.match?(text)

        @fp.puts(output_text)
        @fp.close
        @file_index += 1
        outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
        @outfiles << outfilename
        @fp = File.open(outfilename, "w")
        @total_size = 0
        output_text = +""
        end_flag = false
      end
      @fp.puts(output_text) unless output_text.empty?
      @fp.close
      true
    end

    # Reads lines up to and including the next closing page tag.
    #
    # @return [String, false] the page markup encoded as UTF-8, or false when
    #   no further page could be read
    def get_page
      inside_page = false
      page = +""
      while (line = get_newline)
        case line
        when PAGE_OPEN
          page << line
          inside_page = true
          next
        when PAGE_CLOSE
          page << line
          inside_page = false
          break
        end
        page << line if inside_page
      end
      page.empty? ? false : page.force_encoding("utf-8")
    rescue StandardError
      # Return what was collected so far, but never an empty (truthy) string:
      # callers loop until this method returns a falsy value.
      page.empty? ? false : page
    end

    # Reads every remaining page, converts each main-namespace article through
    # the given block and writes the combined result to
    # "<output_dir>/<outfile_base>.txt".
    #
    # @yieldparam article [Article] the article built from one page
    # @yieldreturn [String] text to append to the output
    # @return [nil]
    def extract_text(&block)
      pages = []
      while (new_page = get_page)
        pages << new_page
      end

      output_text = +""
      pages.each do |page_xml|
        xml = "#{MEDIAWIKI_OPEN}\n#{page_xml}#{MEDIAWIKI_CLOSE}"
        document = Nokogiri::XML(xml, nil, "UTF-8")
        text_node = document.xpath("//xmlns:text").first
        next unless text_node

        title_node = text_node.parent.parent.at_css("title")
        next unless title_node

        title = title_node.content
        # Titles containing a colon are skipped.
        next if title.include?(":")

        text = text_node.content
        # Drop HTML comments but keep their newlines so line counts stay stable.
        text.gsub!(HTML_COMMENT) { |comment| "\n" * comment.count("\n") }
        article = Article.new(text, title, @strip_tmarker)
        output_text << block.call(article)
      end

      output_text = cleanup(output_text)
      unless output_text.empty?
        outfilename = File.join(@output_dir, @outfile_base + ".txt")
        File.open(outfilename, "w") { |out| out.puts(output_text) }
      end
      File.delete(@input_file) if @del_interfile
      nil
    end

    private

    # Opens the input as a readable stream; returns nil when a ".bz2" input
    # cannot be decompressed because no suitable command was found.
    def open_input
      unless @input_file.end_with?(".bz2")
        # meaning that it is a text file
        @infile_size = File.stat(@input_file).size
        return File.open(@input_file)
      end

      # Array-form popen bypasses the shell, so paths with spaces or shell
      # metacharacters are passed through untouched.
      if @bz2_gem
        Bzip2::Reader.new(File.open(@input_file, "r:UTF-8"))
      elsif RUBY_PLATFORM.index("win32")
        IO.popen(["bunzip2.exe", "-c", @input_file])
      elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
        IO.popen([bzpath, "-c", "-d", @input_file])
      end
    end

    # Asks `which` for the location of +name+; returns nil when it is unknown
    # or when `which` itself cannot be run.
    def locate_command(name)
      found = IO.popen(["which", name], err: File::NULL, &:gets)
      found = found&.strip
      found unless found.nil? || found.empty?
    rescue SystemCallError
      nil
    end
  end
end