1 UTF8转UTF16----UTF16---本系统单字节字符,字符串
// Converts UTF-8 text to UTF-16 and then to the multi-byte encoding used by
// the current system (the ANSI code page on Windows, the LC_CTYPE locale
// encoding on Linux).

#include <stdio.h>
#include <string>

#if defined(_WIN32)
#include <windows.h>
#include <iostream>
#endif // WIN32

#ifdef __linux__
#include <string.h>
#include <iconv.h>
#include <locale>
#include <cstring>
#include <stdlib.h>
#include <stdint.h>
#include <climits>
#include <clocale>
#include <cwchar>
#include <vector>
#endif

using namespace std;

#if defined(_WIN32)
/**
 * @brief Decodes UTF-8 bytes into a string in the system ANSI code page.
 *
 * The input is first converted to UTF-16 and then to CP_ACP. Conversion stops
 * at the first NUL character, so a terminator counted in bytelen does not end
 * up inside the result.
 *
 * @param URLcode  UTF-8 encoded bytes; may be NULL
 * @param bytelen  number of bytes to read from URLcode
 * @return the converted string, or an empty string when the input is NULL,
 *         empty, or cannot be converted
 */
std::string UTF8_URL_DECODE(char* URLcode, int bytelen)
{
    if (URLcode == nullptr || bytelen <= 0)
    {
        return "";
    }

    // UTF-8 -> UTF-16: ask for the required size first, then convert.
    const int wideCapacity = ::MultiByteToWideChar(CP_UTF8, 0, URLcode, bytelen, nullptr, 0);
    if (wideCapacity <= 0)
    {
        return "";
    }
    std::wstring wide(static_cast<size_t>(wideCapacity), L'\0');
    const int wideWritten = ::MultiByteToWideChar(CP_UTF8, 0, URLcode, bytelen, &wide[0], wideCapacity);
    if (wideWritten <= 0)
    {
        return "";
    }
    wide.resize(static_cast<size_t>(wideWritten));

    // Treat an embedded NUL as the end of the text.
    const std::wstring::size_type terminator = wide.find(L'\0');
    if (terminator != std::wstring::npos)
    {
        wide.resize(terminator);
    }
    if (wide.empty())
    {
        return "";
    }

    // UTF-16 -> ANSI code page. The source length is counted in wide
    // characters and the destination size comes from a sizing call, so
    // neither buffer is overrun and the result carries no padding.
    const int wideLength = static_cast<int>(wide.size());
    const int ansiCapacity = ::WideCharToMultiByte(CP_ACP, 0, wide.c_str(), wideLength,
                                                   nullptr, 0, nullptr, nullptr);
    if (ansiCapacity <= 0)
    {
        return "";
    }
    std::string ansi(static_cast<size_t>(ansiCapacity), '\0');
    const int ansiWritten = ::WideCharToMultiByte(CP_ACP, 0, wide.c_str(), wideLength,
                                                  &ansi[0], ansiCapacity, nullptr, nullptr);
    if (ansiWritten <= 0)
    {
        return "";
    }
    ansi.resize(static_cast<size_t>(ansiWritten));
    return ansi;
}
#endif

#ifdef __linux__
// Value returned by UTF8ToUnicode on failure. The function returns
// unsigned char, so the historical "-1" has always reached callers as 0xFF.
static const unsigned char kUtf8DecodeError = 0xFF;

/**
 * @brief Decodes one UTF-8 sequence (1 to 4 bytes) into a code point.
 *
 * No length is passed in: the bytes at utf8 must be terminated by a byte that
 * is not a continuation byte (for example a NUL), otherwise a truncated
 * sequence would be read past its end.
 *
 * @param utf8     first byte of the sequence
 * @param unicode  receives the code point; set to 0 on failure
 * @return number of bytes consumed (1..4), or kUtf8DecodeError (0xFF) when a
 *         pointer is NULL or the sequence is malformed, overlong, a
 *         surrogate, or above U+10FFFF
 */
unsigned char UTF8ToUnicode(unsigned char* utf8, unsigned int* unicode)
{
    if (unicode == nullptr)
    {
        return kUtf8DecodeError;
    }
    *unicode = 0;
    if (utf8 == nullptr)
    {
        return kUtf8DecodeError;
    }

    const unsigned char lead = utf8[0];

    // UTF-8 is ASCII compatible: 0xxxxxxx encodes U+0000..U+007F directly.
    if (lead < 0x80)
    {
        *unicode = lead;
        return 1;
    }

    unsigned char length = 0;
    unsigned int value = 0;
    unsigned int smallest = 0; // smallest code point allowed for this length
    if (lead < 0xC0)
    {
        // 10xxxxxx is a continuation byte and cannot start a sequence.
        return kUtf8DecodeError;
    }
    else if (lead < 0xE0)
    {
        length = 2;
        value = lead & 0x1Fu;
        smallest = 0x80;
    }
    else if (lead < 0xF0)
    {
        length = 3;
        value = lead & 0x0Fu;
        smallest = 0x800;
    }
    else if (lead < 0xF8)
    {
        length = 4;
        value = lead & 0x07u;
        smallest = 0x10000;
    }
    else
    {
        // Sequences longer than four bytes are not parsed.
        return kUtf8DecodeError;
    }

    // Each following byte must be 10xxxxxx and contributes six bits.
    for (unsigned char i = 1; i < length; ++i)
    {
        const unsigned char next = utf8[i];
        if ((next & 0xC0) != 0x80)
        {
            return kUtf8DecodeError;
        }
        value = (value << 6) | (next & 0x3Fu);
    }

    // Reject overlong forms, UTF-16 surrogates and values beyond Unicode.
    if (value < smallest || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
    {
        return kUtf8DecodeError;
    }

    *unicode = value;
    return length;
}

/**
 * @brief Encodes a code point (UCS-4 value) as UTF-16.
 *
 * Values in the surrogate range U+D800..U+DFFF are stored unchanged as a
 * single unit.
 *
 * @param unicode  code point to encode
 * @param utf16    receives one or two code units; may be NULL to only query
 *                 the length
 * @return number of 16-bit units (1 or 2), or 0 when unicode is above U+10FFFF
 */
unsigned char UnicodeToUTF16(unsigned int unicode, unsigned short* utf16)
{
    // U+0000..U+FFFF: one unit whose value equals the code point.
    if (unicode <= 0xFFFF)
    {
        if (utf16 != nullptr)
        {
            utf16[0] = static_cast<unsigned short>(unicode);
        }
        return 1;
    }

    // U+10000..U+10FFFF: surrogate pair built from the 20-bit offset.
    if (unicode <= 0x10FFFF)
    {
        if (utf16 != nullptr)
        {
            const unsigned int offset = unicode - 0x10000;
            utf16[0] = static_cast<unsigned short>(0xD800 + (offset >> 10));   // high 10 bits
            utf16[1] = static_cast<unsigned short>(0xDC00 + (offset & 0x3FF)); // low 10 bits
        }
        return 2;
    }

    return 0;
}

/**
 * @brief Converts UTF-16 code units to the multi-byte encoding of the
 *        current locale and prints the result.
 *
 * Calls setlocale(LC_CTYPE, "") so the encoding follows the environment.
 * Conversion stops at the first NUL unit. The output buffer grows as needed,
 * so there is no limit on lens.
 *
 * @param utf16   UTF-16 code units
 * @param lens    number of code units available at utf16
 * @param strout  receives the converted text; left untouched on failure
 * @return 1 on success; -1 when the input is NULL or empty, contains an
 *         unpaired surrogate, or a character has no representation in the
 *         locale encoding
 */
int UTF16toStr(unsigned short* utf16, int lens, string& strout)
{
    if (utf16 == nullptr)
    {
        printf("error UTF16toStr %d\n", -1);
        return -1;
    }

    std::setlocale(LC_CTYPE, "");

    std::string converted;
    std::mbstate_t state = std::mbstate_t();
    char bytes[MB_LEN_MAX];

    int index = 0;
    while (index < lens)
    {
        unsigned int codePoint = utf16[index++];
        if (codePoint == 0)
        {
            break;
        }

        if (codePoint >= 0xD800 && codePoint <= 0xDBFF)
        {
            // A high surrogate must be followed by a low surrogate.
            if (index >= lens || utf16[index] < 0xDC00 || utf16[index] > 0xDFFF)
            {
                printf("error UTF16toStr %d\n", -1);
                return -1;
            }
            const unsigned int low = utf16[index++];
            codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
        }
        else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF)
        {
            printf("error UTF16toStr %d\n", -1);
            return -1;
        }

        // wchar_t is wider than a UTF-16 unit here, which is why the units
        // are decoded one by one instead of casting the buffer to wchar_t*.
        const size_t written = std::wcrtomb(bytes, static_cast<wchar_t>(codePoint), &state);
        if (written == static_cast<size_t>(-1))
        {
            printf("error UTF16toStr %d\n", -1);
            return -1;
        }
        converted.append(bytes, written);
    }

    if (converted.empty())
    {
        printf("error UTF16toStr %d\n", 0);
        return -1;
    }

    // For stateful encodings, emit whatever is needed to return to the
    // initial shift state; the trailing NUL that wcrtomb writes is dropped.
    const size_t closing = std::wcrtomb(bytes, L'\0', &state);
    if (closing != static_cast<size_t>(-1) && closing > 1)
    {
        converted.append(bytes, closing - 1);
    }

    printf("%s\n", converted.c_str());
    strout = converted;
    return 1;
}

/**
 * @brief Decodes UTF-8 bytes into a string in the current locale encoding.
 *
 * The bytes are decoded with UTF8ToUnicode, re-encoded as UTF-16 with
 * UnicodeToUTF16 and handed to UTF16toStr. Decoding stops at the first NUL
 * byte or after bytelen bytes, whichever comes first.
 *
 * @param URLcode  UTF-8 encoded bytes; may be NULL
 * @param bytelen  number of bytes available at URLcode
 * @return the converted string, or an empty string when the input is NULL,
 *         empty, not valid UTF-8, or not representable in the locale encoding
 */
string L_UTF8_URL_DECODE(char* URLcode, int bytelen)
{
    if (URLcode == nullptr || bytelen <= 0)
    {
        return "";
    }

    // Work on a private copy: std::string guarantees a NUL after the last
    // byte, so a truncated trailing sequence is rejected by UTF8ToUnicode
    // instead of being read past the end of the caller's buffer.
    std::string input(URLcode, static_cast<size_t>(bytelen));
    unsigned char* const bytes = reinterpret_cast<unsigned char*>(&input[0]);

    // UTF-8 -> UTF-16
    std::vector<unsigned short> utf16;
    utf16.reserve(input.size());
    size_t pos = 0;
    while (pos < input.size() && bytes[pos] != 0)
    {
        unsigned int codePoint = 0;
        const unsigned char consumed = UTF8ToUnicode(bytes + pos, &codePoint);
        if (consumed == kUtf8DecodeError)
        {
            printf("error invalid UTF-8 at byte %d\n", static_cast<int>(pos));
            return "";
        }

        unsigned short units[2] = { 0, 0 };
        const unsigned char unitCount = UnicodeToUTF16(codePoint, units);
        utf16.insert(utf16.end(), units, units + unitCount);
        pos += consumed;
    }
    printf(" wcslen:%d\n", static_cast<int>(utf16.size()));

    // UTF-16 -> locale multi-byte string
    std::string OutStr;
    if (UTF16toStr(utf16.data(), static_cast<int>(utf16.size()), OutStr) != 1)
    {
        return "";
    }
    return OutStr;
}
#endif

/**
 * @brief Demo: decodes the UTF-8 bytes of U+4E25 written twice
 *        (E4 B8 A5 E4 B8 A5) with the decoder of the current platform.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    printf("Hello world!\n");

    // U+4E25 is E4 B8 A5 in UTF-8.
    unsigned char utf8[] = { 0xE4, 0xB8, 0xA5, 0xE4, 0xB8, 0xA5, '\0' };
    const int payloadLength = static_cast<int>(sizeof(utf8)) - 1; // without the terminator

#if defined(_WIN32)
    const std::string decoded = UTF8_URL_DECODE(reinterpret_cast<char*>(utf8), payloadLength);
    printf("%s\n", decoded.c_str());
#elif defined(__linux__)
    // UTF16toStr already prints the converted text.
    const std::string decoded = L_UTF8_URL_DECODE(reinterpret_cast<char*>(utf8), payloadLength);
    (void)decoded;
#else
    (void)utf8;
    (void)payloadLength;
#endif
    return 0;
}
标签:return,进制,16,int,UTF8,unicode,utf8,include From: https://www.cnblogs.com/8335IT/p/18650607