# -*- coding: utf-8 -*- require File.expand_path(File.join(File.dirname(__FILE__), "helper")) # require 'hpricot' # Mechanize.html_parser = Hpricot require 'test/unit' require 'kconv' require 'iconv' module EncodingTestPage if RUBY_VERSION >= "1.9.0" BODY_ENC_PAIR = { :utf8 => ::Encoding::UTF_8, :ascii => ::Encoding::US_ASCII, :latin => ::Encoding::ISO_8859_2, :cp1252 => ::Encoding::CP1252, :sjis => ::Encoding::SHIFT_JIS, :euc => ::Encoding::EUC_JP, :cp932 => ::Encoding::CP932, :bin => ::Encoding::ASCII_8BIT} end ENC_NAME = { :utf8 => 'utf-8', :ascii => 'us-ascii', :latin => 'iso-8859-2', :cp1252 => 'cp1252', :sjis => 'shift_jis', :euc => 'euc-jp', :cp932 => 'cp932', :utf8_upcase => 'UTF-8', :unknown => '*unknown*'} JP_TITLE = # other JP string may causes FAILURE, but it's due to misdetection of NKF.guess itself. # This is Mechanize test, so don't modify me. "\346\235\245\343\202\213\346\227\245\343\202\202\346\235\245\343\202\213\346\227\245" + "\343\202\202\346\211\213\345\211\215\343\201\251\343\202\202\343\201\257\351\211\204" + "\346\235\277\343\201\256\344\270\212\343\201\253\343\201\246\347\204\274\343\201\213" + "\343\202\214\343\201\246\345\253\214\343\201\253\343\201\252\343\201\243\343\201\241" + "\343\202\203\343\201\206\343\201\247\343\201\224\343\201\226\343\202\213" # too short non-ascii strings don't work well at NKF.guess misdeteting test TITLE = { :ascii => "test page", # "Bialystok" in UTF-8, 'puts' me on UTF-8 and latin-2 font console :latin => "Bia\305\202ystok"*100, # dagger mark, pure iso-8859-1 doesn't contain it. # irb1.9_on_utf8> "\342\200\240".encode('iso-8859-1') #=> UndefinedConversionError # irb1.9_on_utf8> "\342\200\240".encode('cp1252') #=> "\x86" :cp1252 => "dagger mark dagger mark dagger mark dagger mark \342\200\240"*5, :utf8 => JP_TITLE, :sjis => JP_TITLE, # circled integer, "marutuki-suuji" in Japanese. pure SHIFT_JIS doesn't know them. 
# irb1.9_on_utf8> s.encode('shift_jis') #=> Encoding::UndefinedConversionError :cp932 => "\342\221\240\342\221\241\342\221\242\342\221\243\342\221\244"*3, :euc => JP_TITLE, } def page(h) content_type = if h[:http] "text/html; charset=#{ENC_NAME[h[:http]]}" else 'text/html' end meta = if h[:meta] "" else '' end html = convert("#{meta}#{TITLE[h[:body]]}", h[:body]) return Mechanize::Page.new( URI.parse('http://www.example.com/'), { 'content-type' => content_type }, html, 200, h[:agent]|| Mechanize.new) end def convert(str, enc) case enc when :ascii then Iconv::conv('ASCII', 'UTF-8', str) when :latin then Iconv::conv('ISO-8859-2', 'UTF-8', str) when :utf8 then NKF.nkf('-Wm0w', str) when :sjis then NKF.nkf('-Wm0s', str) when :euc then NKF.nkf('-Wm0e', str) when :cp932 then Iconv::conv('CP932', 'UTF-8', str) when :cp1252 then Iconv::conv('CP1252', 'UTF-8', str) else str end end def err_msg1(page, mes_name) return <= "1.9.0" assert_equal(false, TITLE[@enc] == page.at('title').inner_text.force_encoding(::Encoding::UTF_8), err_msg2(page, 'FAILURE 2')) end else # Hpricot just returns "same" byte string, so never "FAILURE". 
assert(convert(TITLE[@enc], @enc) == page.parser.at('title').inner_text, err_msg2(page, 'FAILURE')) end end end # M H D # Meta - t t meta works everytime # HTTP f - t HTTP works only when meta doesn't exist # Dete f f - Detect works only when both of meta and HTTP don't exist module EncodingTest include EncodingTestPage attr_reader :bad def test_with_no_meta_no_http page = page(:body => @enc) assert_SUCCESS(page) end def test_with_right_meta_any_http page = page(:body => @enc, :meta => @enc) assert_SUCCESS(page) page = page(:body => @enc, :meta => @enc, :http => bad) assert_SUCCESS(page) page = page(:body => @enc, :meta => @enc, :http => @enc) assert_SUCCESS(page) end def test_with_no_meta_right_http page = page(:body => @enc, :http => @enc) assert_SUCCESS(page) end def test_failure_with_bad_meta_any_http page = page(:body => @enc, :meta => bad) assert_FAILURE(page) page = page(:body => @enc, :meta => bad, :http => bad) assert_FAILURE(page) page = page(:body => @enc, :meta => bad, :http => @enc) assert_FAILURE(page) end def test_failure_with_no_meta_bad_http page = page(:body => @enc, :http => bad) assert_FAILURE(page) end def test_overwrite_encoding page = page(:body => @enc, :meta => bad) page.encoding = ENC_NAME[@enc] assert_SUCCESS(page) end end class ASCIITest < Test::Unit::TestCase include EncodingTest def setup ; @enc, @bad = :ascii, :utf8 ; end # ASCII successes at all the case alias :assert_FAILURE :assert_SUCCESS end class LatinTest < Test::Unit::TestCase include EncodingTest def setup ; @enc, @bad = :latin, :utf8 ; end # Latin chars are misdetected to Japanese 'Shift_JIS' by NKF.guess undef :test_with_no_meta_no_http def test_failure_with_no_meta_no_http_cause_of_detect_charset_mistake page = page(:body => @enc) assert_FAILURE(page) end end class CP1252Test < Test::Unit::TestCase include EncodingTest def setup ; @enc, @bad = :cp1252, :utf8 ; end end class UTF8Test < Test::Unit::TestCase include EncodingTest def setup ; @enc, @bad = :utf8, :sjis ; end end 
class ShiftJISTest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc, @bad = :sjis, :utf8
  end
end

class CP932Test < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc, @bad = :cp932, :utf8
  end
end

class EUCJPTest < Test::Unit::TestCase
  include EncodingTest

  def setup
    @enc, @bad = :euc, :utf8
  end
end

# =====================================

# Tests for the individual encoding readers of a page and for re-parsing
# after the encoding is changed.
class Etc_Test < Test::Unit::TestCase
  include EncodingTestPage

  def setup
    @agent = Mechanize.new
  end

  def test_page_meta_encoding
    doc = page(:body => :latin, :meta => :utf8, :http => :latin)
    assert_equal('utf-8', doc.meta_encoding)
  end

  # The meta charset is reported exactly as written: case is kept and
  # unknown names are passed through.
  def test_page_meta_encoding_as_is
    doc = page(:body => :latin, :meta => :utf8_upcase, :http => :latin)
    assert_equal('UTF-8', doc.meta_encoding)

    doc = page(:body => :latin, :meta => :unknown, :http => :latin)
    assert_equal('*unknown*', doc.meta_encoding)
  end

  def test_page_http_encoding
    doc = page(:body => :latin, :http => :utf8)
    assert_equal('utf-8', doc.http_encoding)
  end

  # The HTTP charset is reported exactly as written, like the meta charset.
  def test_page_http_encoding_as_is
    doc = page(:body => :latin, :http => :utf8_upcase)
    assert_equal('UTF-8', doc.http_encoding)

    doc = page(:body => :latin, :http => :unknown)
    assert_equal('*unknown*', doc.http_encoding)
  end

  def test_page_body_encoding
    doc = page(:body => :utf8)
    assert_equal('UTF-8', doc.body_encoding)
  end

  def test_post_page_hook
    @enc = :latin # read by assert_SUCCESS / assert_FAILURE

    doc = page(:body => :latin)
    assert_FAILURE(doc)

    # NOTE(review): the other tests in this class read Page#http_encoding;
    # confirm that Page#http_charset is the intended reader here.
    @agent.post_page_hooks << lambda { |p| p.encoding = p.http_charset }
    doc = page(:body => :latin, :http => :latin, :agent => @agent)
    assert_SUCCESS(doc)
  end

  # Changing Page#encoding must discard the cached title and links so they
  # are re-read from the re-parsed document.
  def test_reset_parser
    # "title" and "link" in Japanese, as UTF-8 bytes
    data = {
      :title => "\343\202\277\343\202\244\343\203\210\343\203\253",
      :link  => "\343\203\252\343\203\263\343\202\257"
    }

    # The body needs a real <title> and <a>; with the bare text alone
    # page.title is nil and page.links is empty, so links[0].text raises
    # before any assertion is reached.
    html = "<html><head><title>#{data[:title]}</title></head>" \
           "<body><a href=\"/\">#{data[:link]}</a></body></html>"

    page = Mechanize::Page.new(
      URI.parse('http://www.example.com/'),
      { 'content-type' => 'text/html; charset=SHIFT_JIS' }, # deliberately wrong
      html,
      200,
      @agent)

    bad = [page.title, page.links[0].text]
    page.encoding = 'utf-8' # correct encoding
    good = [page.title, page.links[0].text]

    assert_not_equal(good, bad) # Page#encoding= resets title and links
    assert_equal([data[:title], data[:link]], good) # correct encoding
  end
end