# -*- coding: utf-8 -*-
require File.expand_path(File.join(File.dirname(__FILE__), "helper"))
# require 'hpricot'
# Mechanize.html_parser = Hpricot
require 'test/unit'
require 'kconv'
require 'iconv'
module EncodingTestPage
# Body-encoding key => Ruby Encoding object.
# The values are ::Encoding constants, so the table is only built when
# RUBY_VERSION is at least 1.9.0; on older rubies the constant stays undefined.
BODY_ENC_PAIR = {
  :utf8   => ::Encoding::UTF_8,
  :ascii  => ::Encoding::US_ASCII,
  :latin  => ::Encoding::ISO_8859_2,
  :cp1252 => ::Encoding::CP1252,
  :sjis   => ::Encoding::SHIFT_JIS,
  :euc    => ::Encoding::EUC_JP,
  :cp932  => ::Encoding::CP932,
  :bin    => ::Encoding::ASCII_8BIT
} if RUBY_VERSION >= "1.9.0"
# Encoding key => charset label as it is written into the Content-Type
# header by #page. :utf8_upcase and :unknown exist so tests can check that a
# label is reported back exactly as it was given.
ENC_NAME = {
  :utf8        => 'utf-8',
  :ascii       => 'us-ascii',
  :latin       => 'iso-8859-2',
  :cp1252      => 'cp1252',
  :sjis        => 'shift_jis',
  :euc         => 'euc-jp',
  :cp932       => 'cp932',
  :utf8_upcase => 'UTF-8',
  :unknown     => '*unknown*'
}
# Japanese sentence, written as the octal escapes of its UTF-8 bytes, that
# TITLE uses for the :utf8, :sjis and :euc entries.
JP_TITLE = # Another JP string may cause FAILURE, but that would be due to misdetection by NKF.guess itself.
# This is a Mechanize test, so don't modify me.
"\346\235\245\343\202\213\346\227\245\343\202\202\346\235\245\343\202\213\346\227\245" +
"\343\202\202\346\211\213\345\211\215\343\201\251\343\202\202\343\201\257\351\211\204" +
"\346\235\277\343\201\256\344\270\212\343\201\253\343\201\246\347\204\274\343\201\213" +
"\343\202\214\343\201\246\345\253\214\343\201\253\343\201\252\343\201\243\343\201\241" +
"\343\202\203\343\201\206\343\201\247\343\201\224\343\201\226\343\202\213"
# Page title per body-encoding key, each given as UTF-8 source bytes.
# Non-ASCII strings that are too short don't work well in the NKF.guess
# misdetection tests (presumably why several titles are repeated).
TITLE = {
  :ascii  => "test page",
  # "Bialystok" in UTF-8; 'puts' me on a UTF-8 or latin-2 font console.
  :latin  => "Bia\305\202ystok" * 100,
  # Dagger mark; pure iso-8859-1 doesn't contain it.
  #   irb1.9_on_utf8> "\342\200\240".encode('iso-8859-1') #=> UndefinedConversionError
  #   irb1.9_on_utf8> "\342\200\240".encode('cp1252')     #=> "\x86"
  :cp1252 => "dagger mark dagger mark dagger mark dagger mark \342\200\240" * 5,
  :utf8   => JP_TITLE,
  :sjis   => JP_TITLE,
  # Circled integers ("marutuki-suuji" in Japanese); pure SHIFT_JIS doesn't know them.
  #   irb1.9_on_utf8> s.encode('shift_jis') #=> Encoding::UndefinedConversionError
  :cp932  => "\342\221\240\342\221\241\342\221\242\342\221\243\342\221\244" * 3,
  :euc    => JP_TITLE
}
# Builds a Mechanize::Page for http://www.example.com/ whose body encoding,
# <meta> charset and HTTP charset can each be chosen independently.
#
# h - Hash of options:
#     :body  - key into TITLE; also the encoding key handed to #convert
#     :meta  - key into ENC_NAME announced by a <meta http-equiv> tag;
#              no tag is emitted when absent
#     :http  - key into ENC_NAME appended to the Content-Type header;
#              the header is plain 'text/html' when absent
#     :agent - agent handed to the page; defaults to Mechanize.new
#
# Returns the new Mechanize::Page.
def page(h)
  content_type = h[:http] ? "text/html; charset=#{ENC_NAME[h[:http]]}" : 'text/html'

  # The charset declaration and the <title> element have to be real markup:
  # the tests read them back through page.meta_encoding and page.at('title').
  meta = if h[:meta]
    %(<meta http-equiv="content-type" content="text/html; charset=#{ENC_NAME[h[:meta]]}">)
  else
    ''
  end
  markup = "<html><head>#{meta}\n<title>#{TITLE[h[:body]]}</title></head></html>"
  html = convert(markup, h[:body])

  Mechanize::Page.new(
    URI.parse('http://www.example.com/'),
    { 'content-type' => content_type },
    html,
    200,
    h[:agent] || Mechanize.new)
end
# Re-encodes a UTF-8 string into the encoding named by +enc+.
#
# str - the UTF-8 source String
# enc - encoding key; :ascii, :latin, :cp932 and :cp1252 go through Iconv,
#       :utf8, :sjis and :euc go through NKF
#
# Returns the converted String, or +str+ itself for any other key.
def convert(str, enc)
  iconv_targets = {
    :ascii  => 'ASCII',
    :latin  => 'ISO-8859-2',
    :cp932  => 'CP932',
    :cp1252 => 'CP1252'
  }
  nkf_options = {
    :utf8 => '-Wm0w',
    :sjis => '-Wm0s',
    :euc  => '-Wm0e'
  }

  if iconv_targets.key?(enc)
    Iconv.conv(iconv_targets[enc], 'UTF-8', str)
  elsif nkf_options.key?(enc)
    NKF.nkf(nkf_options[enc], str)
  else
    str
  end
end
# NOTE(review): this span looks truncated. `return <= "1.9.0"` is not a
# complete expression, the `else`/`end` lines below do not pair up with
# anything opened here, and err_msg2, assert_SUCCESS and assert_FAILURE are
# called in this file although no `def` for them is visible. Presumably a
# heredoc and several method definitions were lost between `return <` and
# `>= "1.9.0"` -- TODO restore them from the upstream file.
def err_msg1(page, mes_name)
return <= "1.9.0"
# Expects the parsed title, relabelled as UTF-8, to differ from TITLE[@enc].
assert_equal(false, TITLE[@enc] == page.at('title').inner_text.force_encoding(::Encoding::UTF_8), err_msg2(page, 'FAILURE 2'))
end
else
# Hpricot just returns the "same" byte string, so this is never a "FAILURE".
assert(convert(TITLE[@enc], @enc) == page.parser.at('title').inner_text, err_msg2(page, 'FAILURE'))
end
end
end
# Charset precedence exercised by the shared tests (M = meta, H = HTTP,
# D = detection):
#
#          M  H  D
#   Meta   -  t  t   meta works every time
#   HTTP   f  -  t   HTTP works only when meta doesn't exist
#   Detect f  f  -   detection works only when neither meta nor HTTP exists
#
# Including test cases set @enc (the body's real encoding) and @bad (a wrong
# one) in their #setup.
module EncodingTest
  include EncodingTestPage
  attr_reader :bad

  def test_with_no_meta_no_http
    assert_SUCCESS(page(:body => @enc))
  end

  def test_with_right_meta_any_http
    right_meta = { :body => @enc, :meta => @enc }
    [{}, { :http => bad }, { :http => @enc }].each do |http_opt|
      assert_SUCCESS(page(right_meta.merge(http_opt)))
    end
  end

  def test_with_no_meta_right_http
    assert_SUCCESS(page(:body => @enc, :http => @enc))
  end

  def test_failure_with_bad_meta_any_http
    bad_meta = { :body => @enc, :meta => bad }
    [{}, { :http => bad }, { :http => @enc }].each do |http_opt|
      assert_FAILURE(page(bad_meta.merge(http_opt)))
    end
  end

  def test_failure_with_no_meta_bad_http
    assert_FAILURE(page(:body => @enc, :http => bad))
  end

  # Setting Page#encoding by hand overrides a wrong meta charset.
  def test_overwrite_encoding
    mislabeled = page(:body => @enc, :meta => bad)
    mislabeled.encoding = ENC_NAME[@enc]
    assert_SUCCESS(mislabeled)
  end
end
# Runs the shared EncodingTest cases with an :ascii body and :utf8 as the
# wrong encoding.
class ASCIITest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :ascii
    @bad = :utf8
  end

  # ASCII succeeds in every case, so "failure" is asserted as success.
  alias_method :assert_FAILURE, :assert_SUCCESS
end
# Runs the shared EncodingTest cases with a :latin body and :utf8 as the
# wrong encoding.
class LatinTest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :latin
    @bad = :utf8
  end

  # Latin chars are misdetected as Japanese 'Shift_JIS' by NKF.guess, so the
  # shared no-meta/no-HTTP success test is replaced by a failure expectation.
  undef_method :test_with_no_meta_no_http

  def test_failure_with_no_meta_no_http_cause_of_detect_charset_mistake
    assert_FAILURE(page(:body => @enc))
  end
end
# Runs the shared EncodingTest cases with a :cp1252 body and :utf8 as the
# wrong encoding.
class CP1252Test < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :cp1252
    @bad = :utf8
  end
end
# Runs the shared EncodingTest cases with a :utf8 body and :sjis as the
# wrong encoding.
class UTF8Test < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :utf8
    @bad = :sjis
  end
end
# Runs the shared EncodingTest cases with a :sjis body and :utf8 as the
# wrong encoding.
class ShiftJISTest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :sjis
    @bad = :utf8
  end
end
# Runs the shared EncodingTest cases with a :cp932 body and :utf8 as the
# wrong encoding.
class CP932Test < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :cp932
    @bad = :utf8
  end
end
# Runs the shared EncodingTest cases with an :euc body and :utf8 as the
# wrong encoding.
class EUCJPTest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc = :euc
    @bad = :utf8
  end
end
# =====================================
# Tests of the individual encoding accessors on Mechanize::Page
# (meta_encoding, http_encoding, body_encoding), of fixing the encoding from a
# post-page hook, and of Page#encoding= resetting already parsed data.
class Etc_Test < Test::Unit::TestCase
  include EncodingTestPage

  def setup
    @agent = Mechanize.new
  end

  def test_page_meta_encoding
    page = page(:body => :latin, :meta => :utf8, :http => :latin)
    assert_equal('utf-8', page.meta_encoding)
  end

  # The meta charset label is reported exactly as written, whatever its case
  # and even when it names no known encoding.
  def test_page_meta_encoding_as_is
    page = page(:body => :latin, :meta => :utf8_upcase, :http => :latin)
    assert_equal('UTF-8', page.meta_encoding)
    page = page(:body => :latin, :meta => :unknown, :http => :latin)
    assert_equal('*unknown*', page.meta_encoding)
  end

  def test_page_http_encoding
    page = page(:body => :latin, :http => :utf8)
    assert_equal('utf-8', page.http_encoding)
  end

  # The HTTP charset label is reported exactly as written, whatever its case
  # and even when it names no known encoding.
  def test_page_http_encoding_as_is
    page = page(:body => :latin, :http => :utf8_upcase)
    assert_equal('UTF-8', page.http_encoding)
    page = page(:body => :latin, :http => :unknown)
    assert_equal('*unknown*', page.http_encoding)
  end

  def test_page_body_encoding
    page = page(:body => :utf8)
    assert_equal('UTF-8', page.body_encoding)
  end

  # A latin body without any charset hint fails; a post-page hook that
  # assigns the page encoding makes the same body succeed.
  def test_post_page_hook
    @enc = :latin
    page = page(:body => :latin)
    assert_FAILURE(page)
    @agent.post_page_hooks << lambda { |p| p.encoding = p.http_charset }
    page = page(:body => :latin, :http => :latin, :agent => @agent)
    assert_SUCCESS(page)
  end

  # Assigning Page#encoding must reset the title and links that were parsed
  # under the previous (wrong) encoding.
  def test_reset_parser
    data = {
      :title => "\343\202\277\343\202\244\343\203\210\343\203\253",
      :link  => "\343\203\252\343\203\263\343\202\257"
    } # "title", "link" in japanese utf-8
    # The body needs a real <title> and <a> element; otherwise page.title and
    # page.links[0] below have nothing to read.
    body = "<html><head><title>#{data[:title]}</title></head>" \
           "<body><a href=\"/\">#{data[:link]}</a></body></html>"
    page = Mechanize::Page.new(
      URI.parse('http://www.example.com/'),
      { 'content-type' => 'text/html; charset=SHIFT_JIS' }, # deliberately wrong
      body,
      200,
      @agent)
    bad = [page.title, page.links[0].text]
    page.encoding = 'utf-8' # correct encoding
    good = [page.title, page.links[0].text]
    assert_not_equal(good, bad) # Page#encoding resets title and links
    assert_equal([data[:title], data[:link]], good) # correct encoding
  end
end