Files
ChanSort/source/ChanSort.Loader.SamsungJ/Utf16InsideUtf8EnvelopeEncoding.cs
Horst Beham e472e6976b - Philips: older models which export a Repair/*.BIN file can now be loaded, when there is an invisible .xml file in the same
directory.
  (Philips exports the .xml file with file attributes "hidden" and "system", which makes them invisible to Windows Explorer)
- upgrade to DevExpress 20.1.6
2020-08-03 00:35:57 +02:00

101 lines
3.9 KiB
C#

using System.IO;
using System.Text;
namespace ChanSort.Loader.SamsungJ
{
// Samsung 1242 format does not store UTF16 characters directly, but instead wraps 16 data bits inside a UTF-8 lead + continuation byte sequence.
// A 3 byte UTF-8 sequence is used to encode 16 bits of utf-16 big endian input: 1110aaaa 10bbbbcc 10ccdddd represents the 16bit big endian integer ccccddddaaaabbbb, i.e. 0xE4, 0x84, 0x80 => 0x00, 0x41 => "A" in UTF-16 BE
// The Samsung encoder seems to create some illegal UTF-8 sequences at the end of the string as a result of padding and operating on 32bit inputs (2 characters) with big-endianness, which
// this decoder has to take care of. 0xFFFD can appear both in the raw input bytes (0xFF, 0xFD) as well as already encoded into UTF-8 wrappings (0xEF,0xBF,0xBD)
// This implementation here decodes the UTF-8 byte sequence into UTF-16 Little Endian for the sake of simplicity: aaaa=4, bbbb=1, cccc=0, dddd=0 => 0xE4, 0x84, 0x80 => 0x41, 0x00 => "A" in UTF-16 LE.
// The encoder here operates on 16bit characters and not 32bit 2-characters, so there is no need for padding and no invalid UTF-8 sequences.
/// <summary>
/// Text encoding that wraps the two bytes of each UTF-16 code unit inside a UTF-8 style lead + continuation byte sequence.
/// Encoding always emits one 3-byte envelope (1110aaaa 10bbbbcc 10ccdddd) per UTF-16 code unit, where aaaabbbb is the
/// low byte and ccccdddd is the high byte of the code unit.
/// Decoding additionally accepts 1-byte and 2-byte envelopes, envelopes that are cut short, and drops padding bytes.
/// </summary>
public class Utf16InsideUtf8EnvelopeEncoding : Encoding
{
    /// <summary>Returns the number of bytes needed for <paramref name="charCount"/> characters: 3 bytes per UTF-16 code unit.</summary>
    public override int GetMaxByteCount(int charCount)
    {
        return charCount * 3;
    }

    /// <summary>Returns the number of bytes produced for <paramref name="count"/> characters: 3 bytes per UTF-16 code unit.</summary>
    public override int GetByteCount(char[] chars, int index, int count)
    {
        return count * 3;
    }

    /// <summary>
    /// Encodes <paramref name="charCount"/> characters starting at <paramref name="charIndex"/> into 3-byte envelopes
    /// written to <paramref name="bytes"/> starting at <paramref name="byteIndex"/>.
    /// </summary>
    /// <returns>The number of bytes written.</returns>
    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
    {
        // little endian UTF-16: [low byte, high byte] for every code unit
        var utf16Le = Unicode.GetBytes(chars, charIndex, charCount);
        int o = byteIndex;
        for (int i = 0; i + 1 < utf16Le.Length; i += 2, o += 3)
        {
            var low = utf16Le[i + 0];
            var high = utf16Le[i + 1];
            bytes[o + 0] = (byte) (0xE0 + (low >> 4));                        // 1110aaaa
            bytes[o + 1] = (byte) (0x80 + ((low & 0x0F) << 2) + (high >> 6)); // 10bbbbcc
            bytes[o + 2] = (byte) (0x80 + (high & 0x3F));                     // 10ccdddd
        }
        return o - byteIndex;
    }

    /// <summary>
    /// Returns an upper bound for the number of characters decoded from <paramref name="byteCount"/> bytes.
    /// The worst case is input made of 1-byte envelopes only, which decode to one character per byte.
    /// </summary>
    public override int GetMaxCharCount(int byteCount)
    {
        return byteCount;
    }

    /// <summary>
    /// Returns the exact number of characters that <see cref="GetChars(byte[], int, int, char[], int)"/> produces for the
    /// same input. The count depends on the envelope sizes and on dropped padding, so the input is actually decoded.
    /// </summary>
    public override int GetCharCount(byte[] bytes, int index, int count)
    {
        using MemoryStream utf16Le = new MemoryStream();
        DecodeToUtf16Le(bytes, index, count, utf16Le);
        return Unicode.GetCharCount(utf16Le.GetBuffer(), 0, (int) utf16Le.Length);
    }

    /// <summary>
    /// Decodes <paramref name="byteCount"/> bytes starting at <paramref name="byteIndex"/> and stores the resulting
    /// characters in <paramref name="chars"/> starting at <paramref name="charIndex"/>.
    /// </summary>
    /// <returns>The number of characters written.</returns>
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        using MemoryStream utf16Le = new MemoryStream();
        DecodeToUtf16Le(bytes, byteIndex, byteCount, utf16Le);
        return Unicode.GetChars(utf16Le.GetBuffer(), 0, (int) utf16Le.Length, chars, charIndex);
    }

    /// <summary>
    /// Unwraps the envelopes in the given byte range and appends the contained data to <paramref name="output"/>
    /// as little endian UTF-16 (always two bytes per decoded character).
    /// Bytes outside of the range [byteIndex, byteIndex + byteCount) are never read.
    /// </summary>
    private static void DecodeToUtf16Le(byte[] bytes, int byteIndex, int byteCount, MemoryStream output)
    {
        int end = byteIndex + byteCount;
        int i = byteIndex;
        while (i < end)
        {
            int b0 = bytes[i];
            if (b0 == 0 && i == end - 1) // satellite names end with a single trailing 0x00 byte
                break;

            if (b0 > 0xF7) // invalid UTF-8 lead byte. (0xFF, 0xFD) can appear unencoded at the end of the byte stream, likely as a padding
            {
                i++;
            }
            else if (b0 >= 0xE0) // 3-byte envelope for 2 input bytes
            {
                if (i + 1 >= end) // lead byte without any continuation byte inside the range: nothing left to decode
                    break;

                int b1 = bytes[i + 1];
                // A missing or invalid 2nd continuation byte means only a single byte is encoded as 1110aaaa 10bbbbcc => aaaabbbb.
                // In that case the 3rd byte is not consumed and is looked at again as the start of the next sequence.
                bool hasThirdByte = i + 2 < end && (bytes[i + 2] & 0xC0) == 0x80;
                int b2 = hasThirdByte ? bytes[i + 2] : 0;

                int ch1 = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                int ch2 = ((b1 & 0x03) << 6) | (b2 & 0x3F);
                // 0xEF,0xBF,0xBD (the regular UTF-8 form of the replacement character U+FFFD) unwraps to 0xFF,0xFD and is dropped
                if (ch1 != 0xFF || ch2 != 0xFD)
                {
                    output.WriteByte((byte) ch1);
                    output.WriteByte((byte) ch2);
                }
                i += hasThirdByte ? 3 : 2;
            }
            else if (b0 >= 0xC0) // 2-byte envelope for 1 input byte as 110xaaaa 10bbbbcc => aaaabbbb
            {
                if (i + 1 >= end) // lead byte without its continuation byte inside the range
                    break;

                int b1 = bytes[i + 1];
                int ch = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                output.WriteByte((byte) ch);
                output.WriteByte(0);
                i += 2;
            }
            else
            {
                if (b0 < 0x80) // 1-byte envelope for 1 input byte < 0x80
                {
                    output.WriteByte((byte) b0);
                    output.WriteByte(0);
                }
                // 0x80..0xBF: stray continuation byte without a lead byte, skipped
                i++;
            }
        }
    }
}
}