Windows kernel中不存在Unicode/UTF-8转换函数, 因此通过分析UTF-8编码表自己实现转换函数
UTF-8编码字符理论上可以最多到6个字节长,然而16位BMP(Basic Multilingual Plane)字符最多只用到3字节长。下面看一下UTF-8编码表:
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
/*
 * Named thresholds taken from the UTF-8 encoding table.
 * The byte-sized values are lead-byte bit patterns; the larger values are the
 * first code point of each longer encoding range.
 */
enum Hex
{
    /* Byte patterns */
    Hex80 = 0x80,             /* 10xxxxxx: first non-ASCII byte / continuation byte */
    HexC0 = 0xC0,             /* 110xxxxx: lead byte of a 2-byte sequence */
    HexE0 = 0xE0,             /* 1110xxxx: lead byte of a 3-byte sequence */
    HexF0 = 0xF0,             /* 11110xxx: lead byte of a 4-byte sequence */
    HexF8 = 0xF8,             /* 111110xx: lead byte of a 5-byte sequence */
    HexFC = 0xFC,             /* 1111110x: lead byte of a 6-byte sequence */
    HexFE = 0xFE,
    HexFF = 0xFF,

    /* Code point range boundaries */
    Hex800 = 0x800,           /* first code point that needs 3 bytes */
    Hex10000 = 0x10000,       /* first code point that needs 4 bytes */
    Hex200000 = 0x200000,     /* first code point that needs 5 bytes */
    Hex4000000 = 0x4000000,   /* first code point that needs 6 bytes */
    /* NOTE(review): larger than INT_MAX on 32-bit int; some compilers warn. */
    Hex80000000 = 0x80000000
};
/*
 * Decode a UTF-8 byte sequence into 16-bit Unicode code units.
 *
 * des:    Destination buffer that receives the decoded characters.
 * src:    Source buffer holding the UTF-8 bytes.
 * size_d: Capacity of des, counted in characters (not bytes).
 * size_s: Number of UTF-8 bytes to read from src.
 *
 * Only 1-, 2- and 3-byte sequences are decoded. Stray continuation bytes and
 * lead bytes of longer sequences (>= 0xF0) are skipped. Decoding stops when
 * either buffer is exhausted or when a multi-byte sequence is cut off by the
 * end of src. No terminator is written and the number of characters produced
 * is not reported to the caller.
 */
VOID UTF82Unicode(PWCHAR des, PBYTE src, int size_d, int size_s)
{
    int s = 0, d = 0;

    /* Nothing sensible can be done without both buffers. */
    if (!des || !src)
        return;

    /*
     * Exactly one branch runs per iteration, so the loop condition re-checks
     * both indices before src[s] is read or des[d] is written again.
     */
    while (s < size_s && d < size_d)
    {
        unsigned int lead = src[s];

        if (lead < Hex80)
        {
            // 1 byte UTF-8 (plain ASCII)
            des[d++] = src[s++];
        }
        else if (lead < HexC0)
        {
            // Continuation byte: either already consumed by a lead byte
            // handled earlier, or stray input. Drop it.
            s++;
        }
        else if (lead < HexE0)
        {
            // 2 bytes UTF-8
            if (s + 1 >= size_s)
                break;          // sequence truncated by end of input
            des[d] = (lead & 0x1F) << 6;
            des[d] |= src[s + 1] & 0x3F;
            d++;
            s++;                // trailing byte is skipped by the branch above
        }
        else if (lead < HexF0)
        {
            // 3 bytes UTF-8
            if (s + 2 >= size_s)
                break;          // sequence truncated by end of input
            des[d] = (lead & 0x0F) << 12;
            des[d] |= (src[s + 1] & 0x3F) << 6;
            des[d] |= src[s + 2] & 0x3F;
            d++;
            s++;                // trailing bytes are skipped by the branch above
        }
        else
        {
            // Lead byte of a 4+ byte sequence (or 0xFE/0xFF): the result would
            // not fit into one 16-bit character. Skip it so the loop always
            // makes progress instead of spinning on this byte forever.
            s++;
        }
    }
}
/*
 * Encode 16-bit Unicode code units as UTF-8.
 *
 * des:    Destination buffer that receives the UTF-8 bytes.
 * src:    Source buffer holding the Unicode characters.
 * size_d: Capacity of des in bytes.
 * size_s: Number of characters to read from src.
 *
 * Each source character becomes 1, 2 or 3 bytes. Encoding stops when either
 * buffer is exhausted or when the next character's encoding would not fit
 * completely into des. No terminator is written and the number of bytes
 * produced is not reported to the caller.
 *
 * Every 16-bit unit is encoded on its own; values in the surrogate range are
 * not combined into a single 4-byte sequence.
 */
VOID Unicode2UTF8(PBYTE des, PWCHAR src, int size_d, int size_s)
{
    int s = 0, d = 0;

    /* Nothing sensible can be done without both buffers. */
    if (!des || !src)
        return;

    /*
     * Exactly one character is consumed per iteration, so src[s] is never
     * read again without the loop condition re-checking s against size_s.
     */
    while (s < size_s && d < size_d)
    {
        unsigned int unit = src[s];

        if (unit < Hex80)
        {
            // 1 byte UTF-8
            des[d] = (BYTE)unit;
            d++;
        }
        else if (unit < Hex800)
        {
            // 2 bytes UTF-8
            if (d + 1 >= size_d)
                break;          // not enough room for the whole sequence
            des[d] = (BYTE)(0xC0 | (unit >> 6));
            des[d + 1] = (BYTE)(0x80 | (unit & 0x003F));
            d += 2;
        }
        else if (unit < Hex10000)
        {
            // 3 bytes UTF-8
            if (d + 2 >= size_d)
                break;          // not enough room for the whole sequence
            des[d] = (BYTE)(0xE0 | (unit >> 12));
            des[d + 1] = (BYTE)(0x80 | ((unit >> 6) & 0x003F));
            des[d + 2] = (BYTE)(0x80 | (unit & 0x003F));
            d += 3;
        }
        // Anything else cannot occur for a 16-bit character; it is skipped so
        // the loop still makes progress.

        s++;
    }
}
本文介绍了一种在Windows内核环境下实现UTF-8与Unicode编码相互转换的方法。通过解析UTF-8编码规则,提供了两个核心函数:UTF82Unicode用于将UTF-8编码转换为Unicode,而Unicode2UTF8则完成相反过程。这些函数适用于处理不同编码间的转换需求。
2579

被折叠的 条评论
为什么被折叠?



