用icu探测字符集

发布时间 2023-12-13 15:01:51作者: 北京开发

g++ str-charset.cpp -licui18n

 

#include <iostream>
#include <stdio.h>
#include <string.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
//#include <unicode/urename.h>
#include <unicode/ucsdet.h>

bool what_charset(const char *data, int len, char **detected)
{
UCharsetDetector* csd;
const UCharsetMatch **csm;
int match, matchCount = 0;
UErrorCode status = U_ZERO_ERROR;
csd = ucsdet_open(&status);
if(status != U_ZERO_ERROR)
return false;
ucsdet_setText(csd, data, len, &status);
if(status != U_ZERO_ERROR)
return false;

csm = ucsdet_detectAll(csd, &matchCount, &status);
if(status != U_ZERO_ERROR)
return false;
#if 1 //打印出探测的可能的编码
for(match = 0; match < matchCount; match += 1)
{
const char *name = ucsdet_getName(csm[match], &status);
const char *lang = ucsdet_getLanguage(csm[match], &status);
int confidence = ucsdet_getConfidence(csm[match], &status);
if (lang == NULL || strlen(lang) == 0)
lang = "**";
printf("%s (%s) %d\n", name, lang, confidence);
}
#endif
if(matchCount > 0)
{
*detected = strdup(ucsdet_getName(csm[0], &status)); //分配了内
if(status != U_ZERO_ERROR)
return false;
}

printf("charset = %s n", *detected);
ucsdet_close(csd);

return true;
}

int main(int argc, char* argv[])
{
std::string data = "This is a 测试用数据";
char buf[128];
char *str[1];
str[0] = buf;

what_charset((const char *)data.c_str(), data.length(), str);

return 0;
}