using System.IO;
using System.Text;

namespace ChanSort.Loader.Samsung.Zip
{
  // The Samsung 1242 format does not store UTF-16 characters directly, but instead wraps 16 data bits inside a UTF-8 lead + continuation byte sequence.
  // A 3-byte UTF-8 sequence is used to encode 16 bits of UTF-16 big-endian input: 1110aaaa 10bbbbcc 10ccdddd represents the 16-bit big-endian integer ccccddddaaaabbbb, e.g. 0xE4, 0x84, 0x80 => 0x00, 0x41 => "A" in UTF-16 BE.
  // The Samsung encoder seems to create some illegal UTF-8 sequences at the end of the string, as a result of padding and of operating on 32-bit inputs (2 characters) with big-endianness, which
  // this decoder has to take care of. 0xFFFD can appear both in the raw input bytes (0xFF, 0xFD) and already encoded into a UTF-8 wrapping (0xEF, 0xBF, 0xBD).

  // This implementation decodes the UTF-8 byte sequence into UTF-16 little-endian for the sake of simplicity: aaaa=4, bbbb=1, cccc=0, dddd=0 => 0xE4, 0x84, 0x80 => 0x41, 0x00 => "A" in UTF-16 LE.
  // The encoder here operates on 16-bit characters and not on 32-bit 2-character pairs, so there is no need for padding and no invalid UTF-8 sequences are produced.

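  // Illustrative round trip (a documentation sketch added here, not part of the original loader code), using the "A" example above:
  //   var enc = new Utf16InsideUtf8EnvelopeEncoding();
  //   enc.GetString(new byte[] { 0xE4, 0x84, 0x80 });   // -> "A"
  //   enc.GetBytes("A");                                 // -> 0xE4, 0x84, 0x80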
  public class Utf16InsideUtf8EnvelopeEncoding : Encoding
  {
    public override int GetMaxByteCount(int charCount)
    {
      return charCount * 3;
    }

    public override int GetByteCount(char[] chars, int index, int count)
    {
      return count * 3;
    }

    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
    {
      var utf16Le = Unicode.GetBytes(chars, charIndex, charCount);
      int o = byteIndex;
      int c = utf16Le.Length;
      for (int i = 0; i < c; i += 2, o += 3)
      {
        var b0 = utf16Le[i + 0];
        var b1 = utf16Le[i + 1];
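        // pack one UTF-16 code unit into the 3-byte envelope 1110aaaa 10bbbbcc 10ccdddd described above:
        // b0 is the low byte (aaaabbbb) and b1 the high byte (ccccdddd) of the little-endian pair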
        bytes[o + 0] = (byte) (0xE0 + (b0 >> 4));
        bytes[o + 1] = (byte) (0x80 + ((b0 & 0x0F) << 2) + (b1 >> 6));
        bytes[o + 2] = (byte) (0x80 + (b1 & 0x3F));
      }

      return charCount * 3;
    }

    public override int GetMaxCharCount(int byteCount)
    {
      return (byteCount + 2) / 3;
    }

    public override int GetCharCount(byte[] bytes, int index, int count)
    {
      // upper bound: the decoder may emit fewer chars when it skips padding or replacement characters
      return (count + 2) / 3;
    }

    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
      using MemoryStream ms = new MemoryStream(40);
      for (int i = byteIndex, c = byteIndex + byteCount; i < c; i++)
      {
        int b0 = bytes[i + 0];
        if (b0 == 0 && i == c - 1) // satellite names end with a single trailing 0x00 byte
          break;
        if (b0 > 0xF7) // invalid UTF-8 lead byte; (0xFF, 0xFD) = 0xFFFD in big-endian can appear unencoded at the end of the byte stream, likely as padding
          continue;
        if (b0 >= 0xE0) // 3-byte UTF-8 envelope for 2 input bytes
        {
          int b1 = bytes[i + 1];
          int b2 = bytes[i + 2];
          if ((b2 & 0xC0) != 0x80) // invalid 2nd UTF-8 continuation byte; only a single byte is encoded as 1110aaaa 10bbbbcc => aaaabbbb
          {
            b2 = 0;
            --i;
          }
          int ch1 = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
          int ch2 = ((b1 & 0x03) << 6) | (b2 & 0x3F);
          if (ch1 != 0xFF || ch2 != 0xFD) // ignore the UTF-16 "replacement character" U+FFFD
          {
            ms.WriteByte((byte) ch1);
            ms.WriteByte((byte) ch2);
          }
          i += 2;
        }
        else if (b0 >= 0xC0) // 2-byte UTF-8 envelope for 1 input byte as 110xaaaa 10bbbbcc => aaaabbbb
        {
          int b1 = bytes[i + 1];
          int ch = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
          ms.WriteByte((byte) ch);
          ms.WriteByte(0);
          i++;
        }
        else if (b0 < 0x80) // 1-byte UTF-8 envelope for 1 input byte < 0x80
        {
          ms.WriteByte(bytes[i]);
          ms.WriteByte(0);
        }
      }

      return Encoding.Unicode.GetChars(ms.GetBuffer(), 0, (int) ms.Length, chars, charIndex);
    }
  }
}