CUnicodeUtils
#pragma once
#include <stdint.h>
#include <string>

// Static helpers for validating, counting and converting UTF-8 / UTF-16 / GBK text.
//
// Conventions shared by every function:
//   - `size` is the input length in BYTES. The default value (size_t)-1 means
//     "no explicit length": the input is read up to its NUL terminator
//     (a zero byte for UTF-8/GBK, a zero 16-bit unit for UTF-16).
//   - Parsing also stops at the first NUL when an explicit length is given.
class CUnicodeUtils
{
public:
    //
    // @brief: Count the characters in a UTF-16 buffer.
    // @param: pData UTF-16 data, little- or big-endian, optionally starting with a BOM
    // @param: size  length of the data in bytes
    // @ret:   int32_t >= 0: number of characters; < 0: the data is not valid UTF-16
    static int32_t GetUtf16Count(const void* pData, size_t size = static_cast<size_t>(-1));

    //
    // @brief: Count the characters in a UTF-8 buffer.
    // @param: pData UTF-8 data, optionally starting with a BOM
    // @param: size  length of the data in bytes
    // @ret:   int32_t >= 0: number of characters; < 0: the data is not valid UTF-8
    static int32_t GetUtf8Count(const void* pData, size_t size = static_cast<size_t>(-1));

    //
    // @brief: Count the characters in a GBK buffer.
    // @param: pData GBK data (single-byte ASCII and double-byte characters)
    // @param: size  length of the data in bytes
    // @ret:   int32_t >= 0: number of characters; < 0: the data is not valid GBK
    static int32_t GetGbkCount(const void* pData, size_t size = static_cast<size_t>(-1));

    //
    // @brief: Convert UTF-8 text to a UTF-16 string.
    // @param: pData UTF-8 data, optionally starting with a BOM
    // @param: size  length of the data in bytes
    // @ret:   std::wstring the UTF-16 code units; empty when the input is invalid
    static std::wstring Utf8ToUtf16(const void* pData, size_t size = static_cast<size_t>(-1));

    //
    // @brief: Convert UTF-16 text to a UTF-8 string.
    // @param: pData UTF-16 data, little- or big-endian, optionally starting with a BOM
    // @param: size  length of the data in bytes
    // @ret:   std::string the UTF-8 bytes; empty when the input is invalid
    static std::string Utf16ToUtf8(const void* pData, size_t size = static_cast<size_t>(-1));

private:
    // Encodes code point `cp32` as a NUL-terminated UTF-8 sequence into `pBuf`
    // (the buffer must hold at least 7 bytes).
    static void _CodePointToUtf8(uint32_t cp32, uint8_t* pBuf);

    // Validates UTF-8 input and returns the character count (< 0 when invalid).
    // On success, optionally stores the text re-encoded as UTF-8 and/or UTF-16.
    static int32_t _Utf8ToUtf16(const void* pData, size_t size = static_cast<size_t>(-1), std::string* pUtf8 = nullptr, std::wstring* pUtf16 = nullptr);

    // Validates UTF-16 input and returns the character count (< 0 when invalid).
    // On success, optionally stores the text re-encoded as UTF-8 and/or UTF-16.
    static int32_t _Utf16ToUtf8(const void* pData, size_t size = static_cast<size_t>(-1), std::string* pUtf8 = nullptr, std::wstring* pUtf16 = nullptr);

    // Validates GBK input and returns the character count (< 0 when invalid).
    // On success, optionally stores a copy of the validated bytes.
    static int32_t _GetGbkCount(const void* pData, size_t size = static_cast<size_t>(-1), std::string* pGbk = nullptr);
};
CUnicodeUtils.cpp
#include "CUnicodeUtils.h"// ANSI GBK 编码标准
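// First (lead) byte range:   0x81 - 0xFE
// Second (trail) byte range: 0x40 - 0xFE (excluding 0x7F)
//
// Hanzi areas
// GBK/2: 0xB0A1-0xF7FE, the 6763 GB 2312 hanzi in their original order
// GBK/3: 0x8140-0xA0FE, 6080 CJK hanzi
// GBK/4: 0xAA40-0xFEA0, 8160 CJK hanzi and supplementary hanzi
//
// Graphic symbol areas
// GBK/1: 0xA1A1-0xA9FE, the GB 2312 symbols plus additional symbols
// GBK/5: 0xA840-0xA9A0, extension non-hanzi area
//
// User-defined areas
// The unassigned ranges inside the GBK code space; users may define their own characters there.

// UTF-8 encoding
//
// 1 byte   U+00000000 - U+0000007F   0xxxxxxx
// 2 bytes  U+00000080 - U+000007FF   110xxxxx 10xxxxxx
// 3 bytes  U+00000800 - U+0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
// 4 bytes  U+00010000 - U+001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// 5 bytes  U+00200000 - U+03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// 6 bytes  U+04000000 - U+7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// (The 5- and 6-byte forms come from the original ISO 10646 definition;
//  RFC 3629 limits UTF-8 to 4 bytes and to code points up to U+10FFFF.)

// UTF-16 encoding
//
// Basic Multilingual Plane (U+0000 - U+FFFF)
// U+000000 - U+00D7FF
// U+00D800 - U+00DFFF   reserved (surrogate) range
// U+00E000 - U+00FFFF
// U+010000 - U+10FFFF
//
// Supplementary planes (U+10000 - U+10FFFF)
// 1. Subtract 0x10000 from the code point to get a 20-bit value (0x00000 - 0xFFFFF).
// 2. Add the high 10 bits (range 0 - 0x3FF) to 0xD800 to get the high surrogate (0xD800 - 0xDBFF).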
// 3.低10位(范围0 - 0x3FF)加 0xDC00 得到低位代理(0xDC00 - 0xDFFF)int32_t CUnicodeUtils::GetUtf16Count(const void* pData, size_t size/* = -1*/)
{return _Utf16ToUtf8(pData, size);
}int32_t CUnicodeUtils::GetUtf8Count(const void* pData, size_t size/* = -1*/)
{return _Utf8ToUtf16(pData, size);
}int32_t CUnicodeUtils::GetGbkCount(const void* pData, size_t size/* = -1*/)
{std::string strResult8;int32_t nLength = _GetGbkCount(pData, size, &strResult8);return nLength;
}std::wstring CUnicodeUtils::Utf8ToUtf16(const void* pData, size_t size/* = -1*/)
{std::string strResult8;std::wstring strResult16;int32_t nLength = _Utf8ToUtf16(pData, size, nullptr, &strResult16);return strResult16;
}std::string CUnicodeUtils::Utf16ToUtf8(const void* pData, size_t size/* = -1*/)
{std::string strResult8;std::wstring strResult16;int32_t nLength = _Utf16ToUtf8(pData, size, &strResult8, nullptr);return strResult8;
}void CUnicodeUtils::_CodePointToUtf8(uint32_t cp32, uint8_t* pBuf)
{// 1字节 0xxxxxxxif (cp32 >= 0x00000000 && cp32 <= 0x0000007F){pBuf[0] = (uint8_t)cp32;pBuf[1] = 0;}// 2字节 110xxxxx 10xxxxxxif (cp32 >= 0x00000080 && cp32 <= 0x000007FF){pBuf[0] = ((cp32 >> 6) & 0x1F) | 0xC0;pBuf[1] = ((cp32 & 0x3F)) | 0x80;pBuf[2] = 0;}// 3字节 1110xxxx 10xxxxxx 10xxxxxxif (cp32 >= 0x00000800 && cp32 <= 0x0000FFFF){pBuf[0] = ((cp32 >> 12) & 0x0F) | 0xE0;pBuf[1] = ((cp32 >> 6) & 0x3F) | 0x80;pBuf[2] = ((cp32 & 0x3F)) | 0x80;pBuf[3] = 0;}// 4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxxif (cp32 >= 0x00010000 && cp32 <= 0x001FFFFF){pBuf[0] = ((cp32 >> 18) & 0x07) | 0xF0;pBuf[1] = ((cp32 >> 12) & 0x3F) | 0x80;pBuf[2] = ((cp32 >> 6) & 0x3F) | 0x80;pBuf[3] = ((cp32 & 0x3F)) | 0x80;pBuf[4] = 0;}// 5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxxif (cp32 >= 0x00200000 && cp32 <= 0x03FFFFFF){pBuf[0] = ((cp32 >> 24) & 0x03) | 0xF8;pBuf[1] = ((cp32 >> 18) & 0x3F) | 0x80;pBuf[2] = ((cp32 >> 12) & 0x3F) | 0x80;pBuf[3] = ((cp32 >> 6) & 0x3F) | 0x80;pBuf[4] = ((cp32 & 0x3F)) | 0x80;pBuf[5] = 0;}// 6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxxif (cp32 >= 0x04000000 && cp32 <= 0x7FFFFFFF){pBuf[0] = ((cp32 >> 30) & 0x01) | 0xFC;pBuf[1] = ((cp32 >> 24) & 0x3F) | 0x80;pBuf[2] = ((cp32 >> 18) & 0x3F) | 0x80;pBuf[3] = ((cp32 >> 12) & 0x3F) | 0x80;pBuf[4] = ((cp32 >> 6) & 0x3F) | 0x80;pBuf[5] = ((cp32 & 0x3F)) | 0x80;pBuf[6] = 0;}
}int32_t CUnicodeUtils::_Utf8ToUtf16(const void* pData, size_t size/* = -1*/, std::string* pUtf8/* = nullptr*/, std::wstring* pUtf16/* = nullptr*/)
{const uint8_t* pCpData = (const uint8_t*)pData;std::wstring strOut16; // 输出UTF16std::string strOut8; // 输出UTF8uint32_t cp32 = 0; // UNICODE码点int32_t nByteCount = 0; // 字节计数int32_t nChCount = 0; // 字符计数bool fResult = true; // 操作结果bool fBom = true; // BOM(Byte Order Mark)while ((0 != *pCpData) && (0 != size)){uint8_t ch = *pCpData;// 普通 Ascii 也是 UTF-8 一部分if (ch < 0x7F){cp32 = ch;nChCount++;}else{// 检查 UTF-8 首字节if (0 == nByteCount){cp32 = 0;if (ch >= 0xC0){uint8_t u8CodeMask = 0xC0; // 11000000uint8_t u8DataMask = 0x1F; // 000xxxxxint nCount = 2; // 有效字节数量: 2-6// 检索字符使用的字节数量while(u8CodeMask <= 0xFC){uint8_t u8MaskMax = u8CodeMask | u8DataMask;if (ch >= u8CodeMask && ch <= u8MaskMax){cp32 = ch & u8DataMask;nByteCount = nCount;break;}u8CodeMask = (u8CodeMask >> 1) | 0x80;u8DataMask = u8DataMask >> 1;nCount++;}if (0 == nByteCount){fResult = false;break;}if (0xEF == ch && 3 == nByteCount){fBom = true;}nByteCount--;}else{fResult = false;break;}}else{// 非首字节掩码: 10xxxxxxif (0x80 != (ch & 0xC0)){fResult = false;break;}// BOM处理if (fBom){if (0xBB != ch && 2 == nByteCount){fBom = false;}if (0xBF != ch && 1 == nByteCount){fBom = false;}}cp32 = cp32 << 6;cp32 |= ch & 0x3F;nByteCount--;if (0 == nByteCount){// 跳过BOMif (fBom){fBom = false;pCpData++;continue;}nChCount++;}}}if (0 == nByteCount){uint8_t szBuf[7] = { 0 };if (pUtf8){_CodePointToUtf8(cp32, szBuf);strOut8 += (const char*)szBuf;}if (pUtf16){if (cp32 < 0x10000){strOut16.push_back((uint16_t)(cp32 & 0xFFFF));}else{uint16_t cp = (uint16_t)(cp32 - 0x10000);uint16_t cp32Hi = (uint16_t)(cp >> 10) + 0xD800;uint16_t cp32Lo = (uint16_t)(cp & 0x3FF) + 0xDC00;strOut16.push_back(cp32Hi);strOut16.push_back(cp32Lo);}}}pCpData++;if (-1 != size){size--;}}if (!fResult){return -1;}if (pUtf8){*pUtf8 = std::move(strOut8);}if (pUtf16){*pUtf16 = std::move(strOut16);}return nChCount;
}int32_t CUnicodeUtils::_Utf16ToUtf8(const void* pData, size_t size/* = -1*/, std::string* pUtf8/* = nullptr*/, std::wstring* pUtf16/* = nullptr*/)
{const uint16_t* pCpData = (const uint16_t*)pData;std::wstring strOut16; // 输出UTF16std::string strOut8; // 输出UTF8uint32_t cp32 = 0; // 32位码点uint16_t cp32Hi = 0; // 32位码点高10位uint16_t cp32Lo = 0; // 32位码点低10位uint16_t cp16 = 0; // 16位码点int32_t nByteCount = 0; // 字节计数int32_t nChCount = 0; // 字符计数bool fBigEndian = false; // 是否大端字节序bool fLittleEndian = false; // 是否小端字节序bool fResult = true; // 操作结果if (-1 != size){if ((size < 2) || (0 != (size % 2))){return -1;}}while ((0 != *pCpData) && (0 != size)){cp16 = *pCpData;// BOM检查if (0xFFFE == cp16 || 0xFEFF == cp16){if (0 == nByteCount){if (0xFFFE == cp16) // 大端字节序 (Big Endian){fBigEndian = true;}if (0xFEFF == cp16) // 小端字节序 (Little Endian){fLittleEndian = true;}}else{fResult = false;break;}// 不可能同时存在两种字节序if (fBigEndian && fLittleEndian){fResult = false;break;}pCpData++;if (-1 != size){size -= 2;}continue;}if (fBigEndian){cp16 = ((cp16 >> 8) | (cp16 << 8));}//检查是否为基本多语言平面(U+0000 - U+FFFF)if (!(cp16 >= 0xD800 && cp16 <= 0xDFFF)){if (cp32Hi > 0) // 高位码点后必须跟着低位码点{fResult = false;break;}cp32 = cp16;nChCount++;}else{if (0 == nByteCount){//检查是否为辅助平面(U+10000 - U+10FFFF)if (cp16 >= 0xD800 && cp16 <= 0xDBFF) //检查高位代理(0xD800 - 0xDBFF){cp32Hi = (cp16 - 0xD800);nByteCount = 1;}else{fResult = false;break;}}else{if (1 == nByteCount) // 高位码点后必须接着低位码点{if (cp16 >= 0xDC00 && cp16 <= 0xDFFF) //检查低位代理(0xDC00 - 0xDFFF){cp32Lo = (cp16 - 0xDC00);cp32 = 0x10000 + ((uint32_t)cp32Hi << 10 | cp32Lo);cp32Lo = 0;cp32Hi = 0;}else{fResult = false;break;}}nByteCount--;if (0 == nByteCount){nChCount++;}}}// 转换为 UTF 编码if (0 == nByteCount){uint8_t szBuf[7] = { 0 };if (pUtf8){_CodePointToUtf8(cp32, szBuf);strOut8 += (const char*)szBuf;}if (pUtf16){if (cp32 < 0x10000){strOut16.push_back((uint16_t)(cp32 & 0xFFFF));}else{uint16_t cp = (uint16_t)(cp32 - 0x10000);uint16_t cpHi = (uint16_t)(cp >> 10) + 0xD800;uint16_t cpLo = (uint16_t)(cp & 0x3FF) + 0xDC00;strOut16.push_back(cpHi);strOut16.push_back(cpLo);}}}pCpData++;if (-1 != size){size -= 2;}}if (!fResult){return 
-1;}if (pUtf8){*pUtf8 = std::move(strOut8);}if (pUtf16){*pUtf16 = std::move(strOut16);}return nChCount;
}int32_t CUnicodeUtils::_GetGbkCount(const void* pData, size_t size/* = -1*/, std::string* pGbk/* = nullptr*/)
{const uint8_t* pCpData = (const uint8_t*)pData;std::string strOutGbk; // 输出UTF8uint16_t gbkCode = 0; // GBK编码int32_t nByteCount = 0; // 字节计数int32_t nChCount = 0; // 字符计数bool fResult = true; // 操作结果while ((0 != *pCpData) && (0 != size)){uint8_t ch = *pCpData;if (ch < 0x7F){gbkCode = ch;nChCount++;}else{// 检查 UTF-8 首字节if (0 == nByteCount){gbkCode = 0;// 第1字节: 0x81 - 0xFEif (ch >= 0x81 && ch<=0xFE){gbkCode = ch;nByteCount = 1;}else{fResult = false;break;}}else{if (1 == nByteCount){// 第2字节: 0x40 - 0xFE (不包括0x7F)if (!(ch >= 0x40 && ch<=0xFE) || 0x7F == ch){fResult = false;break;}}gbkCode = gbkCode << 8;gbkCode |= ch;nByteCount--;if (0 == nByteCount){nChCount++;}}}if (0 == nByteCount){if (gbkCode <= 0x7F){strOutGbk.push_back((uint8_t)gbkCode);}else{strOutGbk.push_back(gbkCode >> 8);strOutGbk.push_back(gbkCode & 0xFF);}}pCpData++;if (-1 != size){size--;}}if (!fResult){return -1;}if (pGbk){*pGbk = std::move(strOutGbk);}return nChCount;
}
main.cpp
// CUnicodeUtils.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//#include <iostream>
#include "CUnicodeUtils.h"int main()
{char szUtf16Little[] = { 0x3C,0xD8,0x0D,0xDF,0x55,0x00,0x6E,0x00,0x69,0x00,0x63,0x00,0x6F,0x00,0x64,0x00,0x65,0x00,0x16,0x7F,0x01,0x78,0x4B,0x6D,0xD5,0x8B, 0x00,0x00 };char szUtf16Big[] = { 0xD8,0x3C,0xDF,0x0D,0x00,0x55,0x00,0x6E,0x00,0x69,0x00,0x63,0x00,0x6F,0x00,0x64,0x00,0x65,0x7F,0x16,0x78,0x01,0x6D,0x4B,0x8B,0xD5,0xD8,0x3C,0xDF,0x0D };char szUtf8[] = u8"🌍Unicode编码测试";int nUtf16Length = CUnicodeUtils::GetUtf16Count(szUtf16Little);nUtf16Length = CUnicodeUtils::GetUtf16Count(szUtf16Big);int nUtf8Length = CUnicodeUtils::GetUtf8Count(szUtf8);wchar_t* lpStr = (wchar_t*)szUtf16Little;std::string str8 = CUnicodeUtils::Utf16ToUtf8(szUtf16Little);std::wstring str16 = CUnicodeUtils::Utf8ToUtf16(szUtf8);int nGbkLength = CUnicodeUtils::GetGbkCount("789\xCC\x80");std::string str8Test;str8Test.push_back(0xEF);str8Test.push_back(0xBB);str8Test.push_back(0xBF);str8Test += str8;for (int i = 0; i < 10000; i++){str8 = CUnicodeUtils::Utf16ToUtf8(szUtf16Little);str16 = CUnicodeUtils::Utf8ToUtf16(szUtf8);}return 0;
}