UTF8编码转Unicode编码(C语言)

发布时间: 2023-07-21 08:51:19  作者: SymPny

/**
 * Convert a NUL-terminated UTF-8 string to UTF-16 code units stored
 * low byte first (little-endian) in dst.
 *
 * Behaviour:
 *   - 1-, 2- and 3-byte sequences produce one 16-bit code unit.
 *   - 4-byte sequences that decode to U+10000..U+10FFFF produce a
 *     surrogate pair (two 16-bit code units, high surrogate first).
 *   - U+0020 characters that appear before any output has been written
 *     are skipped (leading spaces are dropped).
 *   - On the first malformed sequence (bad lead byte, bad continuation
 *     byte, or a 4-byte value above U+10FFFF) a message is shown through
 *     INFOBOX_Show() and conversion stops; what was converted so far is
 *     kept.
 *   - Overlong encodings are not rejected; they decode to their value.
 *
 * @param dst  Output buffer. Each source byte yields at most two output
 *             bytes, so 2 * strlen(src) + 1 bytes is always sufficient.
 * @param src  NUL-terminated UTF-8 input.
 * @return     Number of bytes written to dst, not counting the terminator.
 *             The counter is a WORD, so it wraps if the output exceeds
 *             the range of that type.
 *
 * NOTE(review): only ONE zero byte is written after the data. Consumers
 * that scan for a 16-bit terminator must append another zero byte or use
 * the returned length -- confirm what the callers expect.
 */
WORD UTF8_to_Unicode(BYTE *dst, BYTE *src)
{
    WORD i = 0; /* bytes written to dst so far */

    while (*src)
    {
        /* A code point needs up to 21 bits, so it must not be held in a WORD. */
        unsigned long cp = 0;
        int codeLen = 0; /* stays 0 when the sequence is malformed */

        /*
         * The && chains short-circuit, so a continuation byte is only read
         * after every byte before it has been checked; a NUL byte never
         * passes the continuation test, so the reads stop at the terminator.
         */
        if (0 == (src[0] & 0x80))
        {
            /* 0xxxxxxx: single byte */
            cp = src[0];
            codeLen = 1;
        }
        else if (0xC0 == (src[0] & 0xE0) && 0x80 == (src[1] & 0xC0))
        {
            /* 110xxxxx 10xxxxxx: two bytes */
            cp = ((unsigned long)(src[0] & 0x1F) << 6)
               |  (unsigned long)(src[1] & 0x3F);
            codeLen = 2;
        }
        else if (0xE0 == (src[0] & 0xF0) && 0x80 == (src[1] & 0xC0)
                 && 0x80 == (src[2] & 0xC0))
        {
            /* 1110xxxx 10xxxxxx 10xxxxxx: three bytes */
            cp = ((unsigned long)(src[0] & 0x0F) << 12)
               | ((unsigned long)(src[1] & 0x3F) << 6)
               |  (unsigned long)(src[2] & 0x3F);
            codeLen = 3;
        }
        else if (0xF0 == (src[0] & 0xF8) && 0x80 == (src[1] & 0xC0)
                 && 0x80 == (src[2] & 0xC0) && 0x80 == (src[3] & 0xC0))
        {
            /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four bytes.
             * The lead mask is 0xF8 so that 0xF8..0xFF are not accepted. */
            cp = ((unsigned long)(src[0] & 0x07) << 18)
               | ((unsigned long)(src[1] & 0x3F) << 12)
               | ((unsigned long)(src[2] & 0x3F) << 6)
               |  (unsigned long)(src[3] & 0x3F);

            /* Values above U+10FFFF cannot be represented in UTF-16. */
            if (cp <= 0x10FFFFUL)
            {
                codeLen = 4;
            }
        }

        if (0 == codeLen)
        {
            INFOBOX_Show("超出4字节的Unicode码", 100);
            break;
        }

        src += codeLen;

        /* Drop spaces that precede the first emitted character. */
        if (0 == i && 0x20UL == cp)
        {
            continue;
        }

        if (cp >= 0x10000UL)
        {
            /* Supplementary plane: split into a surrogate pair. */
            unsigned long offset = cp - 0x10000UL;
            unsigned long high = 0xD800UL | (offset >> 10);
            unsigned long low = 0xDC00UL | (offset & 0x3FFUL);

            *dst++ = (BYTE)(high & 0xFF);
            *dst++ = (BYTE)((high >> 8) & 0xFF);
            *dst++ = (BYTE)(low & 0xFF);
            *dst++ = (BYTE)((low >> 8) & 0xFF);
            i += 4;
        }
        else
        {
            *dst++ = (BYTE)(cp & 0xFF);
            *dst++ = (BYTE)((cp >> 8) & 0xFF);
            i += 2;
        }
    } /* end while */

    *dst = 0;

    return i;
}