# frozen_string_literal: true
require "nokogiri"
require_relative "wp2txt/article"
require_relative "wp2txt/utils"
module Wp2txt
class Splitter
include Wp2txt
# Set up a splitter for +input_file+ and immediately open the streams.
#
# input_file - path of the dump to split
# output_dir - directory that receives the chunks (default ".")
# tfile_size - chunk size threshold; split_file compares the bytes written
#              against tfile_size * 1024 * 1024
# bz2_gem    - when truthy, load "bzip2-ruby" and use it for decompression
def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
  @fp = nil
  @input_file, @output_dir, @tfile_size = input_file, output_dir, tfile_size
  # the gem is only loaded on request so it stays an optional dependency
  require "bzip2-ruby" if bz2_gem
  @bz2_gem = bz2_gem
  prepare
end
# Count the bytes that can still be read from +file+ by draining it in
# 10 MiB chunks.
#
# file - any object responding to read(length)
#
# Returns the number of bytes read as an Integer. A read that raises a
# StandardError is treated as end of input, so the count gathered so far
# is returned instead of propagating the error.
def file_size(file)
  size = 0
  loop do
    chunk =
      begin
        file.read(10_485_760)
      rescue StandardError
        nil
      end
    # an empty chunk is also treated as end of input so that a reader which
    # returns "" instead of nil at EOF cannot spin forever
    break if chunk.nil? || chunk.empty?

    size += chunk.bytesize
  end
  size
end
# Check whether a given command exists.
#
# command - command name or path; the path as given is tried first, then
#           its basename
#
# Prints "Checking <basename>: " followed by "detected [<path>]" or
# "not found". Returns the path reported by `which` as a String, or false
# when neither lookup finds anything.
def command_exist?(command)
  basename = File.basename(command)
  print "Checking #{basename}: "
  path = nil
  [command, basename].uniq.each do |name|
    path = locate_command(name)
    break if path
  end
  if path
    puts "detected [#{path}]"
    path
  else
    puts "not found"
    false
  end
end

# Ask `which` for +name+; returns the first line of its output, or nil when
# it prints nothing or cannot be run.
private def locate_command(name)
  # the array form bypasses the shell, so the name is never interpreted
  found = IO.popen(["which", name], err: File::NULL, &:gets).to_s.strip
  found.empty? ? nil : found
rescue SystemCallError
  nil
end
# Open the input file (bz2 or plain xml) for reading and create the first
# output chunk.
#
# Sets @file_pointer (input stream), @fp (current output file), @outfiles,
# @outfile_base, @file_index and @total_size; @infile_size is recorded for
# plain-text input only. Returns true.
#
# Raises RuntimeError when the input ends in ".bz2" and no decompressor is
# available.
def prepare
  # if output_dir is not specified, output in the same directory
  # as the input file
  @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

  file =
    if @input_file.to_s.end_with?(".bz2")
      open_bz2_input
    else # meaning that it is a text file
      @infile_size = File.stat(@input_file).size
      # File.open never treats a leading "|" in the path as a command
      File.open(@input_file)
    end

  # create basename of output file
  @outfile_base = File.basename(@input_file, ".*") + "-"
  @total_size = 0
  @file_index = 1
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
  @outfiles = [outfilename]
  @fp = File.open(outfilename, "w")
  @file_pointer = file
  true
end

# Return a readable stream that yields the decompressed content of @input_file.
private def open_bz2_input
  if @bz2_gem
    Bzip2::Reader.new(File.open(@input_file, "r:UTF-8"))
  elsif RUBY_PLATFORM.index("win32")
    # array form: the path is passed as one argument, spaces and all
    IO.popen(["bunzip2.exe", "-c", @input_file])
  elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
    IO.popen([bzpath, "-c", "-d", @input_file])
  else
    # without a stream every later read would silently produce empty output
    raise "no bzip2 decompressor (lbzip2, pbzip2 or bzip2) found for #{@input_file}"
  end
end
# Read the input stream in chunks of 10 MiB (10_485_760 bytes) and append the
# lines found to @buffer.
#
# The last element of @buffer is always the line still being assembled: the
# first line of every chunk is glued onto it, and a fresh empty string is
# pushed whenever the buffer ends in a complete line. Reading continues until
# @buffer holds more than one element.
#
# Returns true once lines are available, nil when the stream is exhausted or
# a read raises a StandardError.
def fill_buffer
  loop do
    chunk =
      begin
        @file_pointer.read(10_485_760)
      rescue StandardError
        return nil
      end
    return nil unless chunk

    # split on "\n", keeping the terminator on every complete line
    head, *tail = chunk.lines
    @buffer.last << head
    @buffer << +"" if head.end_with?("\n")
    @buffer.concat(tail) unless tail.empty?
    @buffer << +"" if @buffer.last.end_with?("\n")
    break if @buffer.size > 1
  end
  true
end
# Return the next line from @buffer, refilling it through fill_buffer when
# only the line still being assembled is left.
#
# Returns the line as a String (including its "\n" when it has one), or nil
# once the input is exhausted. When the input ends without a trailing
# newline, the unterminated remainder is returned as the final line instead
# of being dropped.
def get_newline
  @buffer ||= [+""]
  if @buffer.size == 1 && !fill_buffer
    # nothing more can be read: whatever is left is the last, unterminated line
    leftover = @buffer.shift
    return leftover.empty? ? nil : leftover
  end
  # shift yields nil on an empty buffer, which signals end of input
  @buffer.shift
end
# Copy the input line by line into numbered chunk files, starting a new
# chunk once more than @tfile_size MiB have been collected, but only at a
# line containing "</page>" so that no page is cut in half.
#
# Relies on @fp, @outfiles, @file_index, @outfile_base, @output_dir and
# @total_size as set up by prepare. Returns true.
#
# NOTE(review): in this file the statement after "next unless end_flag &&"
# was truncated in the middle of a regex literal; the rotation logic below is
# reconstructed from the comment preceding it and from the state prepare
# creates — confirm against the upstream source.
def split_file
  output_text = +""
  end_flag = false
  while (text = get_newline)
    @count ||= 0
    @count += 1
    @size_read ||= 0
    @size_read += text.bytesize
    @total_size += text.bytesize
    output_text << text
    end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
    # never close the file until the end of the page even if end_flag is on
    next unless end_flag && text.include?("</page>")

    # flush the finished chunk and open the next numbered output file
    @fp.puts(output_text)
    @fp.close
    output_text = +""
    @total_size = 0
    end_flag = false
    @file_index += 1
    outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
    @outfiles << outfilename
    @fp = File.open(outfilename, "w")
  end
  @fp.puts(output_text) unless output_text.empty?
  @fp.close

  # a rotation on the very last page leaves an empty trailing chunk behind
  last_file = @outfiles.last
  if last_file && File.exist?(last_file) && File.zero?(last_file)
    File.delete(last_file)
    @outfiles.pop
  end
  true
end
# Return the next line from @buffer, refilling it through fill_buffer when
# only the line still being assembled is left.
#
# Returns the line as a String (including its "\n" when it has one), or nil
# once the input is exhausted. When the input ends without a trailing
# newline, the unterminated remainder is returned as the final line instead
# of being dropped.
#
# NOTE(review): an identical get_newline is defined earlier in this class;
# being the later definition, this one is the one that takes effect.
def get_newline
  @buffer ||= [+""]
  needs_refill = @buffer.size == 1
  if needs_refill && !fill_buffer
    # nothing more can be read: whatever is left is the last, unterminated line
    remainder = @buffer.shift
    return remainder.empty? ? nil : remainder
  end
  # shift yields nil on an empty buffer, which signals end of input
  @buffer.shift
end
# Collect the lines of the next <page> ... </page> element from the input.
#
# Lines before the opening tag are skipped; the lines carrying the opening
# and closing tags are included in the result.
#
# Returns the page as a UTF-8 tagged String, or false when no further page
# could be read. If a StandardError is raised while collecting, whatever was
# gathered so far is returned.
def get_page
  inside_page = false
  page = +""
  while (line = get_newline)
    case line
    # the tag patterns must be non-empty: an empty regex matches every line,
    # which would swallow the whole input into a single "page"
    when /<page>/
      page << line
      inside_page = true
      next
    when %r{</page>}
      page << line
      inside_page = false
      break
    end
    page << line if inside_page
  end
  if page.empty?
    false
  else
    page.force_encoding("utf-8")
  end
rescue StandardError
  page
end
# Convert every page of the input to text and write the result to a single
# ".txt" file in @output_dir.
#
# All pages are read through get_page first. Each one is then parsed, pages
# whose title contains ":" are skipped, XML comments in the text are replaced
# by the newlines they contained, and the block is called with an Article
# built from the text and title. The block's return values are concatenated,
# passed through cleanup and written out when non-empty. @input_file is
# deleted afterwards when @del_interfile is set.
#
# Returns nil.
#
# NOTE(review): in this file the wrapper strings and the comment regex were
# empty literals, which leaves "//xmlns:text" without a default namespace to
# bind to and turns the gsub! into a no-op. They are restored here as a
# <mediawiki> root element and an XML comment pattern — confirm the namespace
# URI against the dumps being processed (the XPath only needs some default
# namespace on the root).
def extract_text(&block)
  pages = []
  while (new_page = get_page)
    pages << new_page
  end

  output_text = +""
  pages.each do |page|
    xml = %(<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/">\n#{page}</mediawiki>)
    input = Nokogiri::XML(xml, nil, "UTF-8")
    text_node = input.xpath("//xmlns:text").first
    # a page without a text or title element cannot be converted; skip it
    # instead of failing the whole run on nil
    next unless text_node

    title_node = text_node.parent.parent.at_css("title")
    next unless title_node

    title = title_node.content
    next if /:/ =~ title

    text = text_node.content
    # drop comments but keep their newlines so the line structure is unchanged
    text.gsub!(/<!--(.*?)-->/m) { |content| "\n" * content.count("\n") }
    article = Article.new(text, title, @strip_tmarker)
    output_text << block.call(article)
  end

  output_text = cleanup(output_text)
  unless output_text.empty?
    outfilename = File.join(@output_dir, @outfile_base + ".txt")
    @fp = File.open(outfilename, "w")
    begin
      @fp.puts(output_text)
    ensure
      # close the file even when the write fails
      @fp.close
    end
  end
  File.delete(@input_file) if @del_interfile
  nil
end
end
end