# encoding: UTF-8
require 'spec_helper'
describe "GuessHtmlEncoding" do
describe "#guess" do
it "can use headers" do
guess = GuessHtmlEncoding.guess("
hi!
",
"Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")
guess.should == "ISO-8859-1"
end
it "accepts headers as a hash as well" do
guess = GuessHtmlEncoding.guess("hi!
",
{"Hello" => "world", "Content-Type" => "text/html; charset=LATIN1", "Foo" => "bar"})
guess.should == "ISO-8859-1"
end
it "accepts meta tags" do
guess = GuessHtmlEncoding.guess('hi!
')
guess.should == "ISO-8859-1"
end
it "works okay when there is a semi-colon after the encoding with headers" do
guess = GuessHtmlEncoding.guess("hi!
",
"Hello: world\nContent-Type: text/html; charset=utf-8;\nFoo: bar")
guess.should == "UTF-8"
end
it "works okay when there is a semi-colon after the encoding with meta-tags" do
guess = GuessHtmlEncoding.guess('hi!
')
guess.should == "UTF-8"
end
it "converts UTF8 to UTF-8" do
guess = GuessHtmlEncoding.guess('hi!
')
guess.should == "UTF-8"
end
it "converts CP-1251 to CP1251" do
guess = GuessHtmlEncoding.guess('hi!
')
guess.should == "CP1251"
end
it "skips the header content type if it's invalid" do
guess = GuessHtmlEncoding.guess('hi!
',
"Hello: world\nContent-Type: text/html; charset=RU;\nFoo: bar")
guess.should == "UTF-8"
end
end
describe "#encode" do
it "should work on correctly encoded pages" do
data = "hi!♥
"
data.force_encoding("ASCII-8BIT")
data.should be_valid_encoding # everything is valid in binary
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
data.force_encoding("UTF-8").should be_valid_encoding # because it really is utf-8
encoded = GuessHtmlEncoding.encode(data)
encoded.encoding.to_s.should == "UTF-8"
encoded.should be_valid_encoding
end
it "should work on incorrectly encoded pages" do
data = "hi!\xc2
"
data.force_encoding("ASCII-8BIT")
data.should be_valid_encoding # everything is valid in binary
GuessHtmlEncoding.guess(data).should == "UTF-8" # because the page says so!
data.force_encoding("UTF-8").should_not be_valid_encoding # because of the bad byte sequence \xc2 which is not valid UTF-8
encoded = GuessHtmlEncoding.encode(data)
encoded.encoding.to_s.should == "UTF-8"
encoded.should be_valid_encoding
end
it "should work on pages encoded with an unknown encoding by forcing them to utf8" do
data = "hi!
"
data.force_encoding("ASCII-8BIT")
data.should be_valid_encoding # everything is valid in binary
GuessHtmlEncoding.guess(data).should == "X-MAC-ROMAN" # because the page says so!
encoded = GuessHtmlEncoding.encode(data)
encoded.encoding.to_s.should == "UTF-8"
encoded.should be_valid_encoding
data.encoding.to_s.should == "ASCII-8BIT"
end
end
describe "#encoding_loaded?" do
it 'returns true for all loaded encodings' do
Encoding.name_list.each do |name|
GuessHtmlEncoding.encoding_loaded?(name).should be_true
end
end
it 'returns false for irregular or unloaded encoding' do
GuessHtmlEncoding.encoding_loaded?('_WHY').should be_false
end
end
end