这是一个典型的「字符集解决编码方案问题」的例子。
简单来说,取 U+D800..U+DBFF 内的一个码位同 U+DC00..U+DFFF 内的一个码位构成序偶,该序偶在 UTF-16 下会被解析为 U+10000..U+10FFFF 内的一个码位,全部 1024 × 1024 个这样的序偶恰好与该范围内的所有码位一一对应。
GB/T 13000 将 U+D800..U+DFFF 内的任一码位称为一个「RC 元素」,将 U+D800..U+DBFF 称为「高半区」,将 U+DC00..U+DFFF 称为「低半区」。
最后给出一段 C 代码:
#include <stdint.h>

/* Fixed-width types for one UTF-16 code unit and one UTF-32 code unit. */
typedef uint16_t UTF16;
typedef uint32_t UTF32;

/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
const UTF16 HI_SURROGATE_START = 0xD800;
const UTF16 HI_SURROGATE_END   = 0xDBFF;
const UTF16 LO_SURROGATE_START = 0xDC00;
const UTF16 LO_SURROGATE_END   = 0xDFFF;

/*
 * C -> (HiSurrogate, LoSurrogate)
 *
 *         xxxxxxxxxxxxxxxx -> xxxxxxxxxxxxxxxx
 * 000uuuuuxxxxxxxxxxxxxxxx -> 110110wwwwxxxxxx 110111xxxxxxxxxx
 *
 * where wwww = uuuuu - 1.
 *
 * Encodes the code point C as UTF-16 into out[] and returns the number of
 * code units written:
 *   1 - C is below 0x10000 and is stored unchanged in out[0];
 *   2 - C is in 0x10000..0x10FFFF; out[0] is the high surrogate and
 *       out[1] is the low surrogate;
 *   0 - C cannot be encoded (above 0x10FFFF, or itself a surrogate code
 *       point in 0xD800..0xDFFF); out[] is left untouched.
 */
int EncodeUTF16(UTF32 C, UTF16 out[2])
{
    if (C > 0x10FFFF) {
        return 0;
    }
    if (C < 0x10000) {
        /* A lone surrogate code point has no UTF-16 representation. */
        if (C >= HI_SURROGATE_START && C <= LO_SURROGATE_END) {
            return 0;
        }
        out[0] = (UTF16) C;
        return 1;
    }

    /* X: low 16 bits of C; U: the 5 plane bits above them (1..16 here). */
    UTF16 X = (UTF16) C;
    UTF32 U = (C >> 16) & ((1 << 5) - 1);
    /* U is at least 1 on this path, so U - 1 fits in the 4 bits "wwww". */
    UTF16 W = (UTF16) (U - 1);
    UTF16 HiSurrogate = (UTF16) (HI_SURROGATE_START | (W << 6) | (X >> 10));
    UTF16 LoSurrogate = (UTF16) (LO_SURROGATE_START | (X & ((1 << 10) - 1)));

    out[0] = HiSurrogate;
    out[1] = LoSurrogate;
    return 2;
}

/*
 * (hi, lo) -> C
 *
 * Combines a high surrogate and a low surrogate into the code point they
 * encode and stores it in *C.
 *
 * Returns 1 on success. Returns 0, leaving *C untouched, when hi is not in
 * 0xD800..0xDBFF or lo is not in 0xDC00..0xDFFF.
 */
int DecodeSurrogatePair(UTF16 hi, UTF16 lo, UTF32 *C)
{
    if (hi < HI_SURROGATE_START || hi > HI_SURROGATE_END) {
        return 0;
    }
    if (lo < LO_SURROGATE_START || lo > LO_SURROGATE_END) {
        return 0;
    }

    /* Widen before shifting so the arithmetic never happens in a narrow int. */
    UTF32 X = ((UTF32) (hi & ((1 << 6) - 1)) << 10)
            | (UTF32) (lo & ((1 << 10) - 1));
    /* The 4 bits "wwww" sit directly above the 6 payload bits of hi. */
    UTF32 W = ((UTF32) hi >> 6) & ((1 << 4) - 1);
    UTF32 U = W + 1;

    *C = (U << 16) | X;
    return 1;
}