/* ****************************************************************************
*
* Copyright (c) Microsoft Corporation.
*
* This source code is subject to terms and conditions of the Apache License, Version 2.0. A
* copy of the license can be found in the License.html file at the root of this distribution. If
* you cannot locate the Apache License, Version 2.0, please send an email to
* ironruby@microsoft.com. By using this source code in any fashion, you are agreeing to be bound
* by the terms of the Apache License, Version 2.0.
*
* You must not remove this notice, or any other, from this software.
*
*
* ***************************************************************************/
#if !CLR2
using System.Linq.Expressions;
#else
using Microsoft.Scripting.Ast;
#endif
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Threading;
using System.Runtime.Serialization;
using System.Security.Permissions;
using Microsoft.Scripting.Utils;
using Microsoft.Scripting.Runtime;
using IronRuby.Runtime;
using IronRuby.Compiler;
namespace IronRuby.Builtins {
/// <summary>
/// All encodings in Ruby are represented by instances of a single class. Therefore we need to wrap the .NET Encoding class in RubyEncoding.
/// Instances of this class are app-domain singletons. That is all right as long as the class is read-only and doesn't implement IRubyObject.
/// Taint and frozen flags and instance variables need to be stored in a per-runtime lookaside table.
/// </summary>
[Serializable]
public class RubyEncoding : ISerializable, IExpressionSerializable {
#region Singletons
// Code page numbers of the encodings this class refers to by name.
// Code page 0 is the value GetRubyEncoding(int) maps to the Binary singleton.
public const int CodePageBinary = 0;
public const int CodePageSJIS = 932;
public const int CodePageBig5 = 950;
public const int CodePageAscii = 20127;
// Windows returns 2 EUC-JP encodings (CP 20932 and CP 51932). Mono implements EUC-JP as 51932 and doesn't support 20932.
public const int CodePageEUCJP = 51932;
public const int CodePageUTF7 = 65000;
public const int CodePageUTF8 = 65001;
public const int CodePageUTF16BE = 1201;
public const int CodePageUTF16LE = 1200;
public const int CodePageUTF32BE = 12001;
public const int CodePageUTF32LE = 12000;
// TODO: how does MRI sort encodings?
// Predefined encodings. The constructor arguments are (lenient encoding, strict encoding, ordinal);
// the ordinal is what GetHashCode returns and what CompareTo orders by. The predefined encodings use ordinals -4..0.
public static readonly RubyEncoding/*!*/ Binary = new RubyEncoding(BinaryEncoding.Instance, BinaryEncoding.Instance, -4);
public static readonly RubyEncoding/*!*/ UTF8 = new RubyEncoding(CreateEncoding(CodePageUTF8, false), CreateEncoding(CodePageUTF8, true), -3);
#if SILVERLIGHT
// Under SILVERLIGHT, Ascii is the same instance as UTF8 and no EUCJP/SJIS singletons are defined.
public static readonly RubyEncoding/*!*/ Ascii = UTF8;
#else
public static readonly RubyEncoding/*!*/ Ascii = new RubyEncoding(CreateEncoding(CodePageAscii, false), CreateEncoding(CodePageAscii, true), -2);
public static readonly RubyEncoding/*!*/ EUCJP = new RubyEncoding(CreateEncoding(CodePageEUCJP, false), CreateEncoding(CodePageEUCJP, true), -1);
public static readonly RubyEncoding/*!*/ SJIS = new RubyEncoding(CreateEncoding(CodePageSJIS, false), CreateEncoding(CodePageSJIS, true), 0);
#endif
#endregion
// Wrapped .NET encoding; for encodings built via CreateEncoding this is the throwOnError == false variant.
private readonly Encoding/*!*/ _encoding;
// Second wrapped encoding; for encodings built via CreateEncoding this is the throwOnError == true variant.
private readonly Encoding/*!*/ _strictEncoding;
// Lazily created constant expression referring to this instance (see the Expression property).
private Expression _expression;
// Value returned by GetHashCode and compared by CompareTo.
private readonly int _ordinal;
// TODO: combine into a single integer (tables could be merged)
// Cached _strictEncoding.GetMaxByteCount(1), computed in the constructor.
private readonly int _maxBytesPerChar;
// Cached AsciiIdentity(_encoding), computed in the constructor.
private readonly bool _isAsciiIdentity;
#if !SILVERLIGHT
// Lazily computed results of IsSBCS(CodePage) / IsDBCS(CodePage); null until the matching property is first read.
private bool? _isSingleByteCharacterSet;
private bool? _isDoubleByteCharacterSet;
#endif
/// <summary>
/// Wraps a pair of .NET encodings and caches the characteristics derived from them.
/// </summary>
/// <param name="encoding">Encoding exposed through <c>Encoding</c>; also the one tested for ASCII identity.</param>
/// <param name="strictEncoding">Encoding exposed through <c>StrictEncoding</c>; used to compute <c>MaxBytesPerChar</c>.</param>
/// <param name="ordinal">Value used by <c>GetHashCode</c> and <c>CompareTo</c>.</param>
private RubyEncoding(Encoding/*!*/ encoding, Encoding/*!*/ strictEncoding, int ordinal) {
    Assert.NotNull(encoding, strictEncoding);

    _encoding = encoding;
    _strictEncoding = strictEncoding;
    _ordinal = ordinal;

    // Derived values are computed once; the instance is immutable afterwards.
    _maxBytesPerChar = strictEncoding.GetMaxByteCount(1);
    _isAsciiIdentity = AsciiIdentity(encoding);
}
/// <summary>
/// Returns the ordinal assigned to this encoding at construction.
/// </summary>
public override int GetHashCode()
{
    return _ordinal;
}
/// <summary>
/// Constant expression whose value is this instance; created on first access and then reused.
/// </summary>
internal Expression/*!*/ Expression {
    get {
        if (_expression == null) {
            _expression = Expression.Constant(this);
        }
        return _expression;
    }
}
/// <summary>
/// The result of <c>AsciiIdentity</c> for the wrapped encoding, computed once in the constructor.
/// </summary>
public bool IsAsciiIdentity
{
    get
    {
        return _isAsciiIdentity;
    }
}
/// <summary>
/// Creates the .NET encoding for a code page.
/// </summary>
/// <param name="codepage">Code page to look up (not used under SILVERLIGHT, where UTF-8 is always returned).</param>
/// <param name="throwOnError">
/// True to get an encoding with exception fallbacks; false to get one with a replacement encoder fallback
/// and <c>BinaryDecoderFallback.Instance</c> as the decoder fallback.
/// </param>
private static Encoding/*!*/ CreateEncoding(int codepage, bool throwOnError) {
#if SILVERLIGHT
    return new UTF8Encoding(false, throwOnError);
#else
    if (!throwOnError) {
        return Encoding.GetEncoding(codepage, EncoderFallback.ReplacementFallback, BinaryDecoderFallback.Instance);
    }

    return Encoding.GetEncoding(codepage, EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
#endif
}
#region Serialization
#if !SILVERLIGHT
/// <summary>
/// Deserialization constructor. It always throws <c>Assert.Unreachable</c>:
/// <c>GetObjectData</c> sets the serialized type to <c>Deserializer</c>, so this type is not the one deserialized.
/// </summary>
private RubyEncoding(SerializationInfo/*!*/ info, StreamingContext context)
{
    throw Assert.Unreachable;
}
/// <summary>
/// Stand-in type written to the serialization stream in place of <c>RubyEncoding</c>.
/// It reads the "CodePage" value and resolves it through <c>GetRubyEncoding(int)</c>.
/// </summary>
[Serializable]
internal sealed class Deserializer : ISerializable, IObjectReference {
    private readonly int _codePage;

    // Reads the code page stored by RubyEncoding's ISerializable.GetObjectData.
    private Deserializer(SerializationInfo/*!*/ info, StreamingContext context) {
        int storedCodePage = info.GetInt32("CodePage");
        _codePage = storedCodePage;
    }

    // Returns the RubyEncoding that GetRubyEncoding yields for the stored code page.
    public object GetRealObject(StreamingContext context) {
        RubyEncoding resolved = GetRubyEncoding(_codePage);
        return resolved;
    }

    // This type is only ever read from a stream, never written.
    void ISerializable.GetObjectData(SerializationInfo/*!*/ info, StreamingContext context) {
        throw Assert.Unreachable;
    }
}
/// <summary>
/// Serializes this encoding as its code page and redirects deserialization to <c>Deserializer</c>.
/// </summary>
void ISerializable.GetObjectData(SerializationInfo/*!*/ info, StreamingContext context) {
    int codePage = CodePage;
    info.AddValue("CodePage", codePage);

    // The stream records Deserializer as the object's type.
    info.SetType(typeof(Deserializer));
}
#endif
#endregion
/// <summary>
/// The value of <c>GetMaxByteCount(1)</c> on the strict encoding, cached at construction.
/// </summary>
public int MaxBytesPerChar
{
    get
    {
        return _maxBytesPerChar;
    }
}
/// <summary>
/// The first of the two wrapped .NET encodings (the one passed as <c>encoding</c> to the constructor).
/// </summary>
public Encoding/*!*/ Encoding
{
    get
    {
        return _encoding;
    }
}
/// <summary>
/// The second of the two wrapped .NET encodings (the one passed as <c>strictEncoding</c> to the constructor).
/// </summary>
public Encoding/*!*/ StrictEncoding
{
    get
    {
        return _strictEncoding;
    }
}
/// <summary>
/// Name as displayed by MRI: the Ruby-specific name for the code page when one is defined,
/// otherwise the wrapped encoding's <c>WebName</c>.
/// </summary>
public string/*!*/ Name {
    get {
        string rubyName = GetRubySpecificName(CodePage);
        if (rubyName != null) {
            return rubyName;
        }
        return _encoding.WebName;
    }
}
/// <summary>
/// Maps a code page to the name Ruby uses for it.
/// </summary>
/// <param name="codepage">Code page number.</param>
/// <returns>The Ruby-specific name, or null when the code page has no special name here.</returns>
public static string GetRubySpecificName(int codepage) {
    string name;
    switch (codepage) {
        case CodePageUTF8: name = "UTF-8"; break;
#if !SILVERLIGHT
        case CodePageUTF7: name = "UTF-7"; break;
        case CodePageUTF16BE: name = "UTF-16BE"; break;
        case CodePageUTF16LE: name = "UTF-16LE"; break;
        case CodePageUTF32BE: name = "UTF-32BE"; break;
        case CodePageUTF32LE: name = "UTF-32LE"; break;
        case CodePageSJIS: name = "Shift_JIS"; break;
        case CodePageAscii: name = "US-ASCII"; break;

        // disambiguates CP 20932 and CP 51932:
        case CodePageEUCJP: name = "EUC-JP"; break;
        case 20932: name = "CP20932"; break;

        case 50220: name = "ISO-2022-JP"; break;
        case 50222: name = "CP50222"; break;
#endif
        default: name = null; break;
    }
    return name;
}
/// <summary>
/// Code page of the wrapped encoding, as computed by <c>GetCodePage(Encoding)</c>.
/// </summary>
public int CodePage
{
    get
    {
        return GetCodePage(_encoding);
    }
}
/// <summary>
/// Returns <c>Name</c>.
/// </summary>
public override string/*!*/ ToString()
{
    return Name;
}
/// <summary>
/// Orders encodings by ordinal.
/// </summary>
/// <param name="other">Encoding to compare with; must not be null.</param>
/// <returns>This encoding's ordinal minus <paramref name="other"/>'s ordinal.</returns>
public int CompareTo(RubyEncoding/*!*/ other) {
    int mine = _ordinal;
    int theirs = other._ordinal;
    return mine - theirs;
}
/// <summary>
/// Translates an encoding into the regex option flag that represents it.
/// </summary>
/// <param name="encoding">Encoding to translate; may be null.</param>
/// <returns>
/// NONE for null, FIXED for <c>Binary</c>, and SJIS / EUC / UTF8 for the matching code pages
/// (SJIS and EUC only outside SILVERLIGHT).
/// </returns>
/// <remarks>Any other encoding reaches <c>throw Assert.Unreachable</c>.</remarks>
public static RubyRegexOptions ToRegexOption(RubyEncoding encoding) {
    if (encoding == null) {
        return RubyRegexOptions.NONE;
    }
    if (encoding == Binary) {
        return RubyRegexOptions.FIXED;
    }

    int codepage = encoding.CodePage;
#if !SILVERLIGHT
    if (codepage == CodePageSJIS) {
        return RubyRegexOptions.SJIS;
    }
    if (codepage == CodePageEUCJP) {
        return RubyRegexOptions.EUC;
    }
#endif
    if (codepage == CodePageUTF8) {
        return RubyRegexOptions.UTF8;
    }

    throw Assert.Unreachable;
}
/// <summary>
/// Translates the encoding bits of a regex option set into the corresponding predefined encoding.
/// </summary>
/// <param name="options">Regex options; only the bits selected by <c>RubyRegexOptions.EncodingMask</c> are examined.</param>
/// <returns>The matching predefined encoding, or null when the masked value matches none of the handled flags.</returns>
public static RubyEncoding GetRegexEncoding(RubyRegexOptions options) {
    RubyRegexOptions encodingBits = options & RubyRegexOptions.EncodingMask;

    RubyEncoding result;
    switch (encodingBits) {
#if !SILVERLIGHT
        case RubyRegexOptions.EUC: result = EUCJP; break;
        case RubyRegexOptions.SJIS: result = SJIS; break;
#endif
        case RubyRegexOptions.UTF8: result = UTF8; break;
        case RubyRegexOptions.FIXED: result = Binary; break;
        default: result = null; break;
    }
    return result;
}
/// <summary>
/// Maps the first letter of an encoding name (either case) to a code page.
/// </summary>
/// <param name="nameInitial">Character value of the initial: E, S or U (E and S only outside SILVERLIGHT).</param>
/// <returns>The EUC-JP, Shift-JIS or UTF-8 code page, or -1 for any other value.</returns>
internal static int GetCodePage(int nameInitial) {
#if !SILVERLIGHT
    if (nameInitial == 'E' || nameInitial == 'e') {
        return CodePageEUCJP;
    }
    if (nameInitial == 'S' || nameInitial == 's') {
        return CodePageSJIS;
    }
#endif
    if (nameInitial == 'U' || nameInitial == 'u') {
        return CodePageUTF8;
    }
    return -1;
}
/// <summary>
/// Resolves an encoding from the first letter of its name.
/// </summary>
/// <param name="initial">Character value of the initial, as accepted by <c>GetCodePage(int)</c>.</param>
/// <returns>The encoding for the resolved code page, or null when the resolved code page is not positive.</returns>
public static RubyEncoding GetEncodingByNameInitial(int initial) {
    int codepage = GetCodePage(initial);
    if (codepage <= 0) {
        return null;
    }
    return GetRubyEncoding(codepage);
}
/// <summary>
/// Throws unless this encoding is an ASCII identity (see <c>IsAsciiIdentity</c>).
/// </summary>
/// <exception cref="NotSupportedException">The encoding is not an ASCII identity.</exception>
public void RequireAsciiIdentity() {
    if (_isAsciiIdentity) {
        return;
    }

    throw new NotSupportedException(String.Format("Encoding {0} (code page {1}) is not supported", Name, CodePage));
}
#if !SILVERLIGHT
// Cache of wrappers for code pages other than the predefined singletons, keyed by code page.
// Created lazily by GetRubyEncoding(int); its contents are only read and written under lock (_Encodings).
private static Dictionary<int, RubyEncoding> _Encodings;

/// <summary>
/// Returns the RubyEncoding that wraps the code page of the given .NET encoding.
/// </summary>
/// <param name="encoding">A .NET encoding; must not be null.</param>
/// <returns>
/// <c>Binary</c> for <c>BinaryEncoding.Instance</c>; otherwise the result of
/// <c>GetRubyEncoding(encoding.CodePage)</c>.
/// </returns>
public static RubyEncoding/*!*/ GetRubyEncoding(Encoding/*!*/ encoding) {
    ContractUtils.RequiresNotNull(encoding, "encoding");

    if (encoding.CodePage == 0) {
        if (encoding == BinaryEncoding.Instance) {
            return Binary;
        }

        // TODO: allow custom encodings (without codepage)
    }

    return GetRubyEncoding(encoding.CodePage);
}

/// <summary>
/// Returns the RubyEncoding for a code page: one of the predefined singletons when the code page
/// matches, otherwise a wrapper that is created on first request and cached for later calls.
/// </summary>
/// <param name="codepage">Code page number.</param>
/// <returns>The encoding wrapper for <paramref name="codepage"/>.</returns>
public static RubyEncoding/*!*/ GetRubyEncoding(int codepage) {
    switch (codepage) {
        case CodePageBinary: return Binary;
        case CodePageAscii: return Ascii;
        case CodePageUTF8: return UTF8;
        case CodePageSJIS: return SJIS;
        case CodePageEUCJP: return EUCJP;
    }

    if (_Encodings == null) {
        // Only the first thread to get here installs its dictionary; the others discard theirs.
        Interlocked.CompareExchange(ref _Encodings, new Dictionary<int, RubyEncoding>(), null);
    }

    RubyEncoding result;
    lock (_Encodings) {
        if (!_Encodings.TryGetValue(codepage, out result)) {
            // The code page doubles as the ordinal for non-predefined encodings.
            result = new RubyEncoding(
                CreateEncoding(codepage, false),
                CreateEncoding(codepage, true),
                codepage
            );
            _Encodings.Add(codepage, result);
        }
    }

    return result;
}
/// <summary>
/// Returns the code page reported by the given .NET encoding.
/// </summary>
private static int GetCodePage(Encoding/*!*/ encoding)
{
    return encoding.CodePage;
}
/// <summary>
/// Determines whether the encoding is treated as an ASCII identity.
/// BinaryEncoding.Instance and the code pages in the table below answer true without further work
/// (debug builds assert that IsAsciiIdentityFallback agrees); any other code page is decided by
/// IsAsciiIdentityFallback.
/// </summary>
/// <param name="encoding">Encoding to classify; must not be null.</param>
public static bool AsciiIdentity(Encoding/*!*/ encoding) {
if (encoding == BinaryEncoding.Instance) {
return true;
}
switch (encoding.CodePage) {
case 437: // OEM United States
case 708: // Arabic (ASMO 708)
case 720: // Arabic (DOS)
case 737: // Greek (DOS)
case 775: // Baltic (DOS)
case 850: // Western European (DOS)
case 852: // Central European (DOS)
case 855: // OEM Cyrillic
case 857: // Turkish (DOS)
case 858: // OEM Multilingual Latin I
case 860: // Portuguese (DOS)
case 861: // Icelandic (DOS)
case 862: // Hebrew (DOS)
case 863: // French Canadian (DOS)
case 864: // Arabic (864)
case 865: // Nordic (DOS)
case 866: // Cyrillic (DOS)
case 869: // Greek, Modern (DOS)
case 874: // Thai (Windows)
case 932: // Japanese (Shift-JIS)
case 936: // Chinese Simplified (GB2312)
case 949: // Korean
case 950: // Chinese Traditional (Big5)
case 1250: // Central European (Windows)
case 1251: // Cyrillic (Windows)
case 1252: // Western European (Windows)
case 1253: // Greek (Windows)
case 1254: // Turkish (Windows)
case 1255: // Hebrew (Windows)
case 1256: // Arabic (Windows)
case 1257: // Baltic (Windows)
case 1258: // Vietnamese (Windows)
case 1361: // Korean (Johab)
case 10000: // Western European (Mac)
case 10001: // Japanese (Mac)
case 10002: // Chinese Traditional (Mac)
case 10003: // Korean (Mac)
case 10004: // Arabic (Mac)
case 10005: // Hebrew (Mac)
case 10006: // Greek (Mac)
case 10007: // Cyrillic (Mac)
case 10008: // Chinese Simplified (Mac)
case 10010: // Romanian (Mac)
case 10017: // Ukrainian (Mac)
case 10029: // Central European (Mac)
case 10079: // Icelandic (Mac)
case 10081: // Turkish (Mac)
case 10082: // Croatian (Mac)
case 20000: // Chinese Traditional (CNS)
case 20001: // TCA Taiwan
case 20002: // Chinese Traditional (Eten)
case 20003: // IBM5550 Taiwan
case 20004: // TeleText Taiwan
case 20005: // Wang Taiwan
case 20127: // US-ASCII
case 20866: // Cyrillic (KOI8-R)
case 20932: // Japanese (JIS 0208-1990 and 0212-1990)
case 20936: // Chinese Simplified (GB2312-80)
case 20949: // Korean Wansung
case 21866: // Cyrillic (KOI8-U)
case 28591: // Western European (ISO)
case 28592: // Central European (ISO)
case 28593: // Latin 3 (ISO)
case 28594: // Baltic (ISO)
case 28595: // Cyrillic (ISO)
case 28596: // Arabic (ISO)
case 28597: // Greek (ISO)
case 28598: // Hebrew (ISO-Visual)
case 28599: // Turkish (ISO)
case 28603: // Estonian (ISO)
case 28605: // Latin 9 (ISO)
case 38598: // Hebrew (ISO-Logical)
case 50220: // Japanese (JIS)
case 50221: // Japanese (JIS-Allow 1 byte Kana)
case 50222: // Japanese (JIS-Allow 1 byte Kana - SO/SI)
case 50225: // Korean (ISO)
case 50227: // Chinese Simplified (ISO-2022)
case 51932: // Japanese (EUC)
case 51936: // Chinese Simplified (EUC)
case 51949: // Korean (EUC)
case 54936: // Chinese Simplified (GB18030)
case 57002: // ISCII Devanagari
case 57003: // ISCII Bengali
case 57004: // ISCII Tamil
case 57005: // ISCII Telugu
case 57006: // ISCII Assamese
case 57007: // ISCII Oriya
case 57008: // ISCII Kannada
case 57009: // ISCII Malayalam
case 57010: // ISCII Gujarati
case 57011: // ISCII Punjabi
case 65001: // Unicode (UTF-8)
// Debug-only cross-check of the hard-coded table against the computed answer.
Debug.Assert(IsAsciiIdentityFallback(encoding));
return true;
default:
// Code page not in the table: encode the ASCII range and inspect the bytes.
return IsAsciiIdentityFallback(encoding);
}
}
// Lazily built string containing the characters U+0000..U+007F in order.
private static string _AllAscii;

/// <summary>
/// Encodes all 128 ASCII characters with the given encoding and checks that the output is exactly
/// 128 bytes, each equal to the code of the character it came from.
/// </summary>
/// <param name="encoding">Encoding to test; must not be null.</param>
/// <returns>True when every ASCII character encodes to the single byte of the same value.</returns>
private static bool IsAsciiIdentityFallback(Encoding/*!*/ encoding) {
    if (_AllAscii == null) {
        // all ASCII characters:
        char[] asciiChars = new char[0x80];
        for (int code = 0; code < asciiChars.Length; code++) {
            asciiChars[code] = (char)code;
        }
        _AllAscii = new string(asciiChars);
    }

    byte[] encoded = encoding.GetBytes(_AllAscii);
    if (encoded.Length != _AllAscii.Length) {
        return false;
    }

    int position = 0;
    while (position < _AllAscii.Length) {
        if ((int)encoded[position] != (int)_AllAscii[position]) {
            return false;
        }
        position++;
    }

    return true;
}
/// <summary>
/// Whether <c>IsSBCS</c> classifies this encoding's code page as a single-byte character set.
/// Computed on first access and cached.
/// </summary>
public bool IsSingleByteCharacterSet {
    get {
        bool? cached = _isSingleByteCharacterSet;
        if (!cached.HasValue) {
            cached = IsSBCS(CodePage);
            _isSingleByteCharacterSet = cached;
        }
        return cached.Value;
    }
}
/// <summary>
/// Whether <c>IsDBCS</c> classifies this encoding's code page as a double-byte character set.
/// Computed on first access and cached.
/// </summary>
public bool IsDoubleByteCharacterSet {
    get {
        bool? cached = _isDoubleByteCharacterSet;
        if (!cached.HasValue) {
            cached = IsDBCS(CodePage);
            _isDoubleByteCharacterSet = cached;
        }
        return cached.Value;
    }
}
// Lazily created, ascending tables of code pages; they must stay sorted because they are
// searched with Array.BinarySearch.
private static int[] _sbsc;
private static int[] _dbsc;

/// <summary>
/// Tells whether a code page is listed as a single-byte character set.
/// </summary>
/// <param name="codepage">Code page number.</param>
/// <returns>True when <paramref name="codepage"/> is in the single-byte table.</returns>
private static bool IsSBCS(int codepage) {
    if (_sbsc == null) {
        // 28591 (ISO-8859-1) was missing from this table although it is a single-byte code page.
        _sbsc = new int[] {
            0, 37, 437, 500, 708, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 1026,
            1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258,
            10000, 10004, 10005, 10006, 10007, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 20105, 20106, 20107, 20108, 20127,
            20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880,
            20905, 20924, 21025, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598
        };
    }

    return Array.BinarySearch(_sbsc, codepage) >= 0;
}
/// <summary>
/// Tells whether a code page is listed as a double-byte character set.
/// </summary>
/// <param name="codepage">Code page number.</param>
/// <returns>True when <paramref name="codepage"/> is in the double-byte table.</returns>
private static bool IsDBCS(int codepage) {
    if (_dbsc == null) {
        // Ascending order is required by the binary search below.
        int[] table = new int[] {
            932, 936, 949, 950, 1361, 10001, 10002, 10003, 10008, 20000, 20001, 20002, 20003, 20004, 20005, 20261, 20932, 20936,
            20949, 50227, 51936, 51949
        };
        _dbsc = table;
    }

    int index = Array.BinarySearch(_dbsc, codepage);
    return index >= 0;
}
/// <summary>
/// True only for the <c>Binary</c> and <c>Ascii</c> singletons.
/// </summary>
public bool InUnicodeBasicPlane {
    get {
        // TODO: others
        if (this == Binary) {
            return true;
        }
        return this == Ascii;
    }
}
/// <summary>
/// True when the code page is one of UTF-7, UTF-8, UTF-16 (BE/LE) or UTF-32 (BE/LE).
/// </summary>
public bool IsUnicodeEncoding {
    get {
        int codepage = CodePage;
        return codepage == CodePageUTF7
            || codepage == CodePageUTF8
            || codepage == CodePageUTF16BE
            || codepage == CodePageUTF16LE
            || codepage == CodePageUTF32BE
            || codepage == CodePageUTF32LE;
    }
}
// Lazily created alias table; see Aliases.
private static ReadOnlyDictionary<string, string> _aliases;

/// <summary>
/// Read-only map from alias to encoding name. Keys are compared with
/// <c>StringComparer.InvariantCultureIgnoreCase</c>. Built on first access and then reused.
/// </summary>
public static ReadOnlyDictionary<string, string> Aliases {
    get { return _aliases ?? (_aliases = CreateAliases()); }
}

/// <summary>
/// Builds the alias table. Entries that are commented out are aliases kept for reference but not registered.
/// </summary>
private static ReadOnlyDictionary<string, string> CreateAliases() {
    return new ReadOnlyDictionary<string, string>(new Dictionary<string, string>(StringComparer.InvariantCultureIgnoreCase) {
        { "646", "US-ASCII" },
        { "ASCII", "US-ASCII" },
        { "ANSI_X3.4-1968", "US-ASCII" },
        { "BINARY", "ASCII-8BIT" },
        { "CP437", "IBM437" },
        { "CP737", "IBM737" },
        { "CP775", "IBM775" },
        { "CP857", "IBM857" },
        { "CP860", "IBM860" },
        { "CP861", "IBM861" },
        { "CP862", "IBM862" },
        { "CP863", "IBM863" },
        { "CP864", "IBM864" },
        { "CP865", "IBM865" },
        { "CP866", "IBM866" },
        { "CP869", "IBM869" },
        { "CP874", "Windows-874" },
        { "CP878", "KOI8-R" },
        { "CP932", "Windows-31J" },
        { "CP936", "GBK" },
        { "CP950", "Big5" },
        { "CP951", "Big5-HKSCS" },
        { "CP1258", "Windows-1258" },
        { "CP1252", "Windows-1252" },
        { "CP1250", "Windows-1250" },
        { "CP1256", "Windows-1256" },
        { "CP1251", "Windows-1251" },
        { "CP1253", "Windows-1253" },
        { "CP1255", "Windows-1255" },
        { "CP1254", "Windows-1254" },
        { "CP1257", "Windows-1257" },
        { "CP65000", "UTF-7" },
        { "CP65001", "UTF-8" },
        { "IBM850", "CP850" },
        { "eucJP", "EUC-JP" },
        { "eucKR", "EUC-KR" },
        // { "eucTW", "EUC-TW" },
        { "ISO2022-JP", "ISO-2022-JP" },
        // { "ISO2022-JP2", "ISO-2022-JP-2" },
        { "ISO8859-1", "ISO-8859-1" },
        { "ISO8859-2", "ISO-8859-2" },
        { "ISO8859-3", "ISO-8859-3" },
        { "ISO8859-4", "ISO-8859-4" },
        { "ISO8859-5", "ISO-8859-5" },
        { "ISO8859-6", "ISO-8859-6" },
        { "ISO8859-7", "ISO-8859-7" },
        { "ISO8859-8", "ISO-8859-8" },
        { "ISO8859-9", "ISO-8859-9" },
        // { "ISO8859-10", "ISO-8859-10" },
        { "ISO8859-11", "ISO-8859-11" },
        { "ISO8859-13", "ISO-8859-13" },
        // { "ISO8859-14", "ISO-8859-14" },
        { "ISO8859-15", "ISO-8859-15" },
        // { "ISO8859-16", "ISO-8859-16" },
        { "SJIS", "Shift_JIS" },
        { "csWindows31J", "Windows-31J" },
        // { "MacJapan", "MacJapanese" },
        // { "UTF-8-MAC", "UTF8-MAC" },
        // { "UTF-8-HFS", "UTF8-MAC" },
        { "UCS-2BE", "UTF-16BE" },
        { "UCS-4BE", "UTF-32BE" },
        { "UCS-4LE", "UTF-32LE" },
    });
}
/// <summary>
/// Produces an expression that calls <c>Methods.CreateEncoding</c> with this encoding's code page as a constant.
/// </summary>
Expression/*!*/ IExpressionSerializable.CreateExpression() {
    // TODO: use static fields
    var codePageConstant = Expression.Constant(CodePage);
    return Methods.CreateEncoding.OpCall(codePageConstant);
}
#else
/// <summary>
/// SILVERLIGHT variant: true only when <c>GetCodePage</c> reports the binary or UTF-8 code page.
/// </summary>
/// <param name="encoding">Encoding to classify; must not be null.</param>
public static bool AsciiIdentity(Encoding/*!*/ encoding) {
    int codepage = GetCodePage(encoding);
    return codepage == CodePageBinary || codepage == CodePageUTF8;
}
/// <summary>
/// SILVERLIGHT variant: true only for the <c>Binary</c> singleton.
/// </summary>
public bool IsSingleByteCharacterSet
{
    get { return this == Binary; }
}
/// <summary>
/// SILVERLIGHT variant: always false.
/// </summary>
public bool IsDoubleByteCharacterSet
{
    get { return false; }
}
/// <summary>
/// SILVERLIGHT variant: true only for the <c>Binary</c> singleton.
/// </summary>
public bool InUnicodeBasicPlane
{
    get { return this == Binary; }
}
/// <summary>
/// SILVERLIGHT variant: maps a .NET encoding to one of the predefined singletons via the code page
/// that <c>GetCodePage</c> reports for it.
/// </summary>
/// <param name="encoding">A .NET encoding; must not be null.</param>
/// <exception cref="ArgumentException">The reported code page is not binary, US-ASCII or UTF-8.</exception>
public static RubyEncoding/*!*/ GetRubyEncoding(Encoding/*!*/ encoding) {
    ContractUtils.RequiresNotNull(encoding, "encoding");

    //RHO
    // Dispatches on the code page; the previous implementation compared the instance
    // against BinaryEncoding.Instance and Encoding.UTF8 directly.
    int codepage = GetCodePage(encoding);
    if (codepage == CodePageBinary) {
        return Binary;
    }
    if (codepage == CodePageAscii) {
        return Ascii;
    }
    if (codepage == CodePageUTF8) {
        return UTF8;
    }

    throw new ArgumentException(String.Format("Unknown encoding: '{0}'", encoding));
    //RHO
}
/// <summary>
/// SILVERLIGHT variant: returns the predefined singleton for the binary or UTF-8 code page.
/// </summary>
/// <param name="codepage">Code page number.</param>
/// <exception cref="ArgumentException">Any other code page.</exception>
internal static RubyEncoding/*!*/ GetRubyEncoding(int codepage) {
    if (codepage == CodePageBinary) {
        return Binary;
    }
    if (codepage == CodePageUTF8) {
        return UTF8;
    }

    throw new ArgumentException(String.Format("Unknown encoding codepage: {0}", codepage));
}
/// <summary>
/// SILVERLIGHT variant: derives a code page from the encoding's identity or its upper-cased <c>WebName</c>.
/// </summary>
/// <param name="encoding">Encoding to inspect; must not be null.</param>
/// <returns>The binary, UTF-8, UTF-16LE or UTF-16BE code page.</returns>
/// <exception cref="ArgumentException">The encoding is none of the recognized ones.</exception>
private static int GetCodePage(Encoding/*!*/ encoding) {
    Debug.Assert(encoding != null);

    if (encoding == BinaryEncoding.Instance) {
        return CodePageBinary;
    }

    string webName = encoding.WebName.ToUpperInvariant();
    if (webName == "UTF-8") {
        return CodePageUTF8;
    }
    if (webName == "UTF-16") {
        return CodePageUTF16LE;
    }
    if (webName == "UTF-16BE") {
        return CodePageUTF16BE;
    }

    throw new ArgumentException(String.Format("Unknown encoding: {0}", encoding));
}
/// <summary>
/// SILVERLIGHT variant: always produces a constant expression holding the <c>UTF8</c> singleton.
/// </summary>
Expression/*!*/ IExpressionSerializable.CreateExpression() {
    // TODO: use a static fields, deal with KCODEs
    var utf8Constant = Expression.Constant(UTF8);
    return utf8Constant;
}
#endif
}
}