# -*- coding: utf-8 -*-
require 'mechanize/test_case'

# Tests for Mechanize::Page charset extraction (response header and <meta>),
# the list of candidate encodings a page exposes, and parsing behaviour that
# depends on the chosen encoding.
class TestMechanizePageEncoding < Mechanize::TestCase
  MECH_ASCII_ENCODING = 'US-ASCII'

  # Prepares the default URI, response headers and body used by util_page.
  def setup
    super

    @uri = URI('http://localhost/')
    @response_headers = { 'content-type' => 'text/html' }
    @body = 'hi'
  end

  # Builds a Mechanize::Page for @uri with status 200 and the test agent.
  #
  # body    - page body String, or nil for a page without a body
  #           (defaults to @body).
  # headers - response header Hash (defaults to @response_headers).
  #
  # A non-nil body is handed over as a binary-encoded copy (String#b), so
  # the caller's string -- including the shared @body fixture -- is never
  # re-tagged in place.
  def util_page body = @body, headers = @response_headers
    Mechanize::Page.new @uri, headers, body&.b, 200, @mech
  end

  # The charset value is returned with its case preserved, with or without
  # whitespace around the separators, and ignoring a trailing ", text/html".
  def test_page_charset
    charset = Mechanize::Page.charset 'text/html;charset=vAlue'
    assert_equal 'vAlue', charset

    charset = Mechanize::Page.charset 'text/html;charset=vaLue, text/html'
    assert_equal 'vaLue', charset

    charset = Mechanize::Page.charset 'text/html ; charset = valUe, text/html'
    assert_equal 'valUe', charset
  end

  # An upper-case "CHARSET" parameter name is still recognised.
  def test_page_charset_upcase
    charset = Mechanize::Page.charset 'TEXT/HTML;CHARSET=UTF-8'
    assert_equal 'UTF-8', charset
  end

  # A trailing semicolon is not part of the charset value.
  def test_page_charset_semicolon
    charset = Mechanize::Page.charset 'text/html;charset=UTF-8;'
    assert_equal 'UTF-8', charset
  end

  def test_page_charset_no_chaset_token
    charset = Mechanize::Page.charset 'text/html'
    assert_nil charset
  end

  # The literal value "none" is treated as "no charset given".
  def test_page_charset_returns_nil_when_charset_says_none
    charset = Mechanize::Page.charset 'text/html;charset=none'
    assert_nil charset
  end

  # When several charset parameters are present the first one wins.
  def test_page_charset_multiple
    charset = Mechanize::Page.charset 'text/html;charset=111;charset=222'
    assert_equal '111', charset
  end

  def test_page_response_header_charset
    headers = { 'content-type' => 'text/html;charset=HEADER' }
    charsets = Mechanize::Page.response_header_charset(headers)

    assert_equal ['HEADER'], charsets
  end

  # No charset parameter, or no content-type header at all, yields [].
  def test_page_response_header_charset_no_token
    headers = { 'content-type' => 'text/html' }
    charsets = Mechanize::Page.response_header_charset(headers)

    assert_equal [], charsets

    headers = { 'X-My-Header' => 'hello' }
    charsets = Mechanize::Page.response_header_charset(headers)

    assert_equal [], charsets
  end

  # A charset carried by a header other than content-type is ignored.
  def test_page_response_header_charset_wrong_header
    headers = { 'x-content-type' => 'text/html;charset=bogus' }
    charsets = Mechanize::Page.response_header_charset(headers)

    assert_equal [], charsets
  end

  # Instance-level counterpart of the class method, on a page without a body.
  def test_response_header_charset
    page = util_page nil, { 'content-type' => 'text/html;charset=HEADER' }

    assert_equal ['HEADER'], page.response_header_charset
  end

  def test_page_meta_charset
    body = '<meta http-equiv="content-type" content="text/html;charset=META">'
    charsets = Mechanize::Page.meta_charset(body)

    assert_equal ['META'], charsets
  end

  # A <meta> tag that is not a content-type declaration contributes nothing.
  def test_page_meta_charset_is_empty_when_no_charset_meta
    body = '<meta http-equiv="refresh" content="5; url=index.html">'
    charsets = Mechanize::Page.meta_charset(body)

    assert_equal [], charsets
  end

  # A content-type <meta> tag without a content attribute contributes nothing.
  def test_page_meta_charset_no_content
    body = '<meta http-equiv="content-type">'
    charsets = Mechanize::Page.meta_charset(body)

    assert_empty charsets
  end

  # Test to fix issue: https://github.com/sparklemotion/mechanize/issues/143
  def test_page_meta_charset_handles_whitespace
    # Whitespace around "=" inside the tag must not prevent detection.
    body = '<meta http-equiv = "Content-Type" content = "text/html; charset=iso-8859-1">'
    charsets = Mechanize::Page.meta_charset(body)

    assert_equal ["iso-8859-1"], charsets
  end

  # Instance-level counterpart of Mechanize::Page.meta_charset.
  def test_meta_charset
    body = '<meta http-equiv="content-type" content="text/html;charset=META">'
    page = util_page body

    assert_equal ['META'], page.meta_charset
  end

  def test_detected_encoding
    page = util_page

    assert_equal MECH_ASCII_ENCODING, page.detected_encoding
  end

  # The candidate list contains the header, <meta>, detected and agent
  # default encodings.
  def test_encodings
    response = { 'content-type' => 'text/html;charset=HEADER' }
    body = '<meta http-equiv="content-type" content="text/html;charset=META">'
    @mech.default_encoding = 'DEFAULT'
    page = util_page body, response

    assert_includes page.encodings, 'HEADER'
    assert_includes page.encodings, 'META'
    assert_includes page.encodings, MECH_ASCII_ENCODING
    assert_includes page.encodings, 'DEFAULT'
  end

  def test_parser_with_default_encoding
    # pre test
    refute_includes util_page.encodings, 'Windows-1252'

    @mech.default_encoding = 'Windows-1252'
    page = util_page

    assert_includes page.encodings, 'Windows-1252'
  end

  def test_parser_force_default_encoding
    @mech.default_encoding = 'Windows-1252'
    @mech.force_default_encoding = true
    page = util_page

    assert_includes page.encodings, 'Windows-1252'
  end

  # An explicit page.encoding= takes precedence over the forced default.
  def test_parser_encoding_equals_overwrites_force_default_encoding
    @mech.default_encoding = 'Windows-1252'
    @mech.force_default_encoding = true
    page = util_page

    assert_equal 'Windows-1252', page.encoding

    page.encoding = 'ISO-8859-2'

    assert_equal 'ISO-8859-2', page.encoding
  end

  # Text extracted from search results is UTF-8.
  def test_parser_encoding_when_searching_elements
    skip "Encoding not implemented" unless have_encoding?

    # The element must exist for the '#latin1' lookup below to be meaningful.
    body = '<span id="latin1">hi</span>'
    page = util_page body, 'content-type' => 'text/html,charset=ISO-8859-1'
    result = page.search('#latin1')

    assert_equal Encoding::UTF_8, result.text.encoding
  end

  def test_parser_error_message_containing_encoding_errors
    skip if RUBY_ENGINE == 'jruby' # this is a libxml2-specific condition

    # https://github.com/sparklemotion/mechanize/issues/553
    # NOTE(review): this fixture is empty, so no "Comment not terminated"
    # error can be produced and the assertions under `if error` never run.
    # It presumably needs an unterminated HTML comment containing non-ASCII
    # text -- restore it from the issue above and confirm.
    body = <<~EOF
    EOF

    page = util_page body

    # this should not raise an "invalid byte sequence in UTF-8" error while processing parsing errors
    page.search("body")

    # let's assert on the setup: a libxml2-returned parsing error itself contains an invalid character
    # note that this problem only appears in libxml <= 2.9.10
    error = page.parser.errors.find { |e| e.message.include?("Comment not terminated") }
    if error
      exception = assert_raises(ArgumentError) do
        error.message =~ /any regex just to trigger encoding error/
      end
      assert_includes(exception.message, "invalid byte sequence in UTF-8")
    end
  end
end