# coding: ASCII-8BIT
#
# Copyright (c) 2011-2017 NAITOH Jun
# Released under the MIT license
# http://www.opensource.org/licenses/MIT
require 'test_helper'
class RbpdfHtmlTest < Test::Unit::TestCase
class MYPDF < RBPDF
def getPageBuffer(page)
super
end
# get text count and x_pos from pdf page
def get_html_text_position_x(page, regrep_text, x_pos_exp=nil)
count_line, count_text, x_pos, y_pos = get_html_text_position(page, regrep_text, x_pos_exp)
return count_line, count_text, x_pos
end
# get text count and y_pos from pdf page
def get_html_text_position_y(page, regrep_text)
count_line, count_text, x_pos, y_pos = get_html_text_position(page, regrep_text)
return count_line, count_text, y_pos
end
# get text count and pos from pdf page
def get_html_text_position(page, regrep_text, x_pos_exp=nil)
content = []
contents = getPageBuffer(page)
contents.each_line {|line| content.push line.chomp }
count_line = count_text = 0
x_pos = y_pos = -1
content.each do |line|
count_line += 1 if line =~ /TJ ET Q$/ # Text Line Count
if line =~ regrep_text
count_text += 1
line =~ /BT ([0-9.]+) ([0-9.]+) Td/
x_pos = $1
y_pos = $2 if y_pos == -1 # y first position only
if x_pos.nil? or y_pos.nil? # Error
return count_line, count_text, nil, nil
end
if !x_pos_exp.nil? and x_pos != x_pos_exp # Error
return count_line, count_text, x_pos, y_pos
end
end
end
return count_line, count_text, x_pos, y_pos
end
# get text from pdf page
def get_html_text(page)
content = []
contents = getPageBuffer(page)
contents.each_line {|line| content.push line.chomp }
pdf_text = ''
content.each do |line|
if line =~ /\[\((.*)\)\] TJ ET/
pdf_text << $1
end
end
return pdf_text
end
end
test "write_html Basic test" do
pdf = RBPDF.new
pdf.add_page()
htmlcontent = '
HTML Example
'
pdf.write_html(htmlcontent, true, 0, true, 0)
htmlcontent = 'abcdefghijklmnopgrstuvwxyz01234567890 abcdefghijklmnopgrstuvwxyz01234567890 abcdefghijklmnopgrstuvwxyz01234567890 abcdefghijklmnopgrstuvwxyz01234567890 abcdefghijklmnopgrstuvwxyz01234567890'
pdf.write_html(htmlcontent, true, 0, true, 0)
htmlcontent = '1
2
3
4
5
6
7
8
9
10
11
'
pdf.write_html(htmlcontent, true, 0, true, 0)
pno = pdf.get_page
assert_equal 3, pno
end
test "write_html Table test 1" do
pdf = RBPDF.new
pdf.add_page()
tablehtml = ''
pdf.write_html(tablehtml, true, 0, true, 0)
htmlcontent = '1
2
3
4
5
6
7
8
9
10
11
'
tablehtml = ''
pdf.write_html(tablehtml, true, 0, true, 0)
pno = pdf.get_page
assert_equal 3, pno
end
test "write_html Table test 2" do
pdf = MYPDF.new
pdf.add_page()
htmlcontent = '1
2
3
4
5
6
7
8
9
10
11
'
tablehtml = 'ABCD | EFGH | IJKL |
abcd | efgh | ijkl |
' + htmlcontent + ' |
'
pdf.write_html(tablehtml, true, 0, true, 0)
pno = pdf.get_page
assert_equal 3, pno
# Page 1
count_line, count_text, xpos1 = pdf.get_html_text_position_x(1, /ABCD/) # Header
assert_not_nil xpos1
assert_equal 1, count_text
assert_equal 13, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(1, /abcd/)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 13, count_line
# Page 2
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /\([6-9]\)/, xpos1)
assert_not_nil xpos2
assert_equal xpos1, xpos2
assert_equal 7, count_line
end
test "write_html Table thead tag test 1" do
pdf = MYPDF.new
pdf.add_page()
tablehtml = '
ABCD | EFGH | IJKL |
abcd | efgh | ijkl |
'
pdf.write_html(tablehtml, true, 0, true, 0)
page = pdf.get_page
assert_equal 1, page
count_line, count_text, xpos = pdf.get_html_text_position_x(1, /ABCD/) # Header
assert_equal 1, count_text
end
test "write_html Table thead tag test 2" do
pdf = MYPDF.new
pdf.add_page()
htmlcontent = '1
2
3
4
5
6
7
8
9
10
11
'
tablehtml = 'ABCD | EFGH | IJKL |
abcd | efgh | ijkl |
' + htmlcontent + ' |
'
pdf.write_html(tablehtml, true, 0, true, 0)
page = pdf.get_page
assert_equal 3, page
# Page 1
count_line, count_text, xpos1 = pdf.get_html_text_position_x(1, /ABCD/) # Header
assert_not_nil xpos1
assert_equal 1, count_text
assert_equal 13, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(1, /abcd/)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 13, count_line
# Page 2
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /ABCD/, xpos1) # Header
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 10, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /abcd/)
assert_equal 0, count_text
assert_equal 10, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /\([6-9]\)/, xpos1)
assert_not_nil xpos2
assert_equal xpos1, xpos2
assert_equal 10, count_line
# Page 3
count_line, count_text, xpos2 = pdf.get_html_text_position_x(3, /ABCD/, xpos1) # Header
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 5, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(3, /abcd/)
assert_equal 0, count_text
assert_equal 5, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(3, /\(11\)/, xpos1)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 5, count_line
end
test "write_html_cell Table thead tag test" do
pdf = MYPDF.new
pdf.add_page()
htmlcontent = '
1
2
3
4
5
6
7
8
9
10
11
'
tablehtml ='
Left align |
Right align |
Center align |
left' + htmlcontent + ' |
right |
center |
'
pdf.write_html_cell(0, 0, '', '',tablehtml)
page = pdf.get_page
assert_equal 1, page
# Page 1
count_line, count_text, xpos1 = pdf.get_html_text_position_x(1, /Left align/) # Header
assert_not_nil xpos1
assert_equal 1, count_text
assert_equal 13, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(1, /left/)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal 13, count_line
assert_equal xpos1, xpos2
# Page 2
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /Left align/, xpos1) # Header
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 10, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /\(6\)/, xpos1)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 10, count_line
# Page 3
count_line, count_text, xpos2 = pdf.get_html_text_position_x(3, /Left align/, xpos1) # Header
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 5, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(3, /\(11\)/, xpos1)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 5, count_line
end
test "write_html_cell Table thead tag cellpadding x position test" do
pdf = MYPDF.new
pdf.add_page()
htmlcontent = '
1
2
3
4
5
6
7
8
9
10
11
'
tablehtml ='
Left align |
Center align |
Right align |
left |
center |
right' + htmlcontent + ' |
'
pdf.write_html_cell(0, 0, '', '',tablehtml)
page = pdf.get_page
assert_equal 1, page
# Page 1
count_line, count_text, xpos1 = pdf.get_html_text_position_x(1, /Right align/) # Header
assert_not_nil xpos1
assert_equal 1, count_text
assert_equal 13, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(1, /right/)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 13, count_line
# Page 2
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /Right align/, xpos1) # Header
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 10, count_line
count_line, count_text, xpos2 = pdf.get_html_text_position_x(2, /\(6\)/, xpos1)
assert_not_nil xpos2
assert_equal 1, count_text
assert_equal xpos1, xpos2
assert_equal 10, count_line
end
test "write_html_cell Table thead tag cellpadding y position test 1" do
pdf = MYPDF.new
pdf.add_page()
table_start='
Left align | Center align | Right align |
'
table_col='AAA | BBB | CCC |
'
table_end='
'
tablehtml= table_start + table_col * 30 + table_end
pdf.write_html_cell(0, 0, '', '',tablehtml)
# Page 1
count_line, count_text, ypos1 = pdf.get_html_text_position_y(1, /Left align/) # Header
assert_not_nil ypos1
assert_equal 1, count_text
assert_equal 65, count_line
count_line, count_text, ypos2 = pdf.get_html_text_position_y(1, /AAA/)
assert_not_nil ypos2
assert_equal 20, count_text
assert_equal 65, count_line
base_pos = ypos1.to_i - ypos2.to_i
# Page 2
count_line, count_text, ypos1 = pdf.get_html_text_position_y(2, /Left align/) # Header
assert_not_nil ypos2
assert_equal 1, count_text
assert_equal 34, count_line
count_line, count_text, ypos2 = pdf.get_html_text_position_y(2, /AAA/)
assert_not_nil ypos2
assert_equal 10, count_text
assert_equal 34, count_line
assert_equal base_pos, ypos1.to_i - ypos2.to_i
end
test "write_html_cell Table thead tag cellpadding y position test 2" do
pdf = MYPDF.new
pdf.add_page()
table_start='abc
Left align | Center align | Right align |
'
table_col='AAA | BBB | CCC |
'
table_end='
'
tablehtml= table_start + table_col * 30 + table_end
pdf.write_html_cell(0, 0, '', '',tablehtml)
# Page 1
count_line, count_text, ypos1 = pdf.get_html_text_position_y(1, /Left align/) # Header
assert_not_nil ypos1
assert_equal 1, count_text
assert_equal 66, count_line
count_line, count_text, ypos2 = pdf.get_html_text_position_y(1, /AAA/)
assert_not_nil ypos2
assert_equal 20, count_text
assert_equal 66, count_line
base_pos = ypos1.to_i - ypos2.to_i
# Page 2
count_line, count_text, ypos1 = pdf.get_html_text_position_y(2, /Left align/) # Header
assert_not_nil ypos2
assert_equal 1, count_text
assert_equal 34, count_line
count_line, count_text, ypos2 = pdf.get_html_text_position_y(2, /AAA/)
assert_not_nil ypos2
assert_equal 10, count_text
assert_equal 34, count_line
assert_equal base_pos, ypos1.to_i - ypos2.to_i
end
test "write_html ASCII text test" do
pdf = MYPDF.new
pdf.add_page()
text = 'HTML Example'
htmlcontent = '' + text + '
'
pdf.write_html(htmlcontent, true, 0, true, 0)
page = pdf.get_page
assert_equal 1, page
content = []
contents = pdf.getPageBuffer(1)
contents.each_line {|line| content.push line.chomp }
count_text = 0
content.each do |line|
count_text += 1 unless line.scan(text).empty?
end
assert_equal 1, count_text
end
test "write_html Non ASCII text test" do
pdf = MYPDF.new
pdf.add_page()
text = 'HTML Example ' + "\xc2\x83\xc2\x86"
htmlcontent = '' + text + '
'
pdf.write_html(htmlcontent, true, 0, true, 0)
page = pdf.get_page
assert_equal 1, page
content = []
contents = pdf.getPageBuffer(1)
contents.each_line {|line| content.push line.chomp }
text = 'HTML Example ' + "\x83\x86"
text.force_encoding('ASCII-8BIT') if text.respond_to?(:force_encoding)
count_text = 0
content.each do |line|
line.force_encoding('ASCII-8BIT') if line.respond_to?(:force_encoding)
count_text += 1 unless line.scan(text).empty?
end
assert_equal 1, count_text
end
test "works internal links out of page range" do
pdf = RBPDF.new
pdf.add_page()
htmlcontent = 'FooLink'
pdf.write_html(htmlcontent, true, 0, true, 0)
assert_nothing_raised do
pdf.Close
end
assert_nothing_raised do
pdf.Output
end
end
test "write_html no tag text test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' abc def '
pdf.write_html(text, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal 'abc def', pdf_text
end
test "write_html no tag back slash test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = " abc \\def "
pdf.write_html(text, true, 0, true, 0) # use escape() method in getCellCode()
pdf_text = pdf.get_html_text(1)
assert_equal "abc \\\\def", pdf_text
end
test "write_html tag test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' ' + 'A' * 70
htmlcontent = '' + text + ''
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal 'A' * 70, pdf_text
end
test "write_html tag test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' ' + 'A' * 70
htmlcontent = '' + text + ''
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal 'A' * 70, pdf_text
end
test "write_html tag test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' ' + 'A' * 70
htmlcontent = '' + text + ''
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal 'A' * 70, pdf_text
end
test "write_html tag space 1 test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' ' + 'A' * 70
htmlcontent = '' + text + '
'
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal "\xa0" + 'A' * 70, pdf_text
end
test "write_html tag space 2 test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = ' ' + 'A' * 70
htmlcontent = '' + text + '
'
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal "\xa0" * 2 + 'A' * 70, pdf_text
end
test "write_html
tag text test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = "abc"
htmlcontent = ''
pdf.write_html(htmlcontent, true, 0, true, 0)
pdf_text = pdf.get_html_text(1)
assert_equal 'abc', pdf_text
end
test "write_html tag back slash test" do
pdf = MYPDF.new
pdf.set_print_header(false)
pdf.add_page()
text = "a\\bc"
htmlcontent = ''
pdf.write_html(htmlcontent, true, 0, true, 0) # use escape() method in getCellCode()
pdf_text = pdf.get_html_text(1)
assert_equal 'a\\\\bc', pdf_text
end
test "write_html Character Entities test" do
pdf = MYPDF.new
pdf.set_print_header(false)
character_entities = {
'<' => '<',
'>' => '>',
'&' => '&',
'"' => '"',
' ' => "\xa0",
'¢' => "\xa2",
'£' => "\xa3",
'¥' => "\xa5",
'©' => "\xa9",
'®' => "\xae",
'€' => "\x80",
}
character_entities.each {|ce, c|
pdf.add_page()
page = pdf.get_page
pdf.write_html(ce, true, 0, true, 0)
pdf_text = pdf.get_html_text(page)
assert_equal '[' + ce + ']:' + c, '[' + ce + ']:' + pdf_text
}
end
test "write_html Character Entities test pre mode" do
pdf = MYPDF.new
pdf.set_print_header(false)
character_entities = {
'<' => '<',
'>' => '>',
'&' => '&',
'"' => '"',
' ' => "\xa0",
'¢' => "\xa2",
'£' => "\xa3",
'¥' => "\xa5",
'©' => "\xa9",
'®' => "\xae",
'€' => "\x80",
}
character_entities.each {|ce, c|
pdf.add_page()
page = pdf.get_page
pdf.write_html('' + ce + '
', true, 0, true, 0)
pdf_text = pdf.get_html_text(page)
assert_equal '[' + ce + ']:' + c, '[' + ce + ']:' + pdf_text
}
end
test "unhtmlentities test" do
pdf = RBPDF.new
character_entities = {
'<' => '<',
'>' => '>',
'&' => '&',
'"' => '"',
' ' => "\xc2\xa0",
'¢' => "\xc2\xa2",
'£' => "\xc2\xa3",
'¥' => "\xc2\xa5",
'©' => "\xc2\xa9",
'®' => "\xc2\xae",
'€' => "\xe2\x82\xac",
}
character_entities.each {|ce, c|
text = pdf.unhtmlentities(ce)
text.force_encoding('ASCII-8BIT') if text.respond_to?(:force_encoding)
assert_equal '[' + ce + ']:' + c, '[' + ce + ']:' + text
}
end
end