/* * icu.c: Example how to use ICU for character encoding conversion * * This example shows how to use ICU by installing a custom character * encoding converter with xmlCtxtSetCharEncConvImpl, available * since libxml2 2.14. * * This approach makes it possible to use ICU even if libxml2 is * compiled without ICU support. It also makes sure that *only* ICU * is used. Many Linux distros currently ship libxml2 with support * for both ICU and iconv which makes the library's behavior hard to * predict. * * The long-term plan is to make libxml2 only support a single * conversion library internally (iconv on POSIX). */ #include #include #include #define ICU_PIVOT_BUF_SIZE 1024 typedef struct { UConverter *uconv; /* for conversion between an encoding and UTF-16 */ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ UChar *pivot_source; UChar *pivot_target; int isInput; UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; } myConvCtxt; static int icuConvert(void *vctxt, unsigned char *out, int *outlen, const unsigned char *in, int *inlen, int flush) { myConvCtxt *cd = vctxt; const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; UConverter *target, *source; UErrorCode err = U_ZERO_ERROR; int ret; if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { if (outlen != NULL) *outlen = 0; return XML_ENC_ERR_INTERNAL; } /* * The ICU API can consume input, including partial sequences, * even if the output buffer would overflow. The remaining input * must be processed by calling ucnv_convertEx with a possibly * empty input buffer. */ if (cd->isInput) { source = cd->uconv; target = cd->utf8; } else { source = cd->utf8; target = cd->uconv; } ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, cd->pivot_buf + ICU_PIVOT_BUF_SIZE, /* reset */ 0, flush, &err); *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; if (U_SUCCESS(err)) { ret = XML_ENC_ERR_SUCCESS; } else { switch (err) { case U_TRUNCATED_CHAR_FOUND: /* Should only happen with flush */ ret = XML_ENC_ERR_INPUT; break; case U_BUFFER_OVERFLOW_ERROR: ret = XML_ENC_ERR_SPACE; break; case U_INVALID_CHAR_FOUND: case U_ILLEGAL_CHAR_FOUND: case U_ILLEGAL_ESCAPE_SEQUENCE: case U_UNSUPPORTED_ESCAPE_SEQUENCE: ret = XML_ENC_ERR_INPUT; break; case U_MEMORY_ALLOCATION_ERROR: ret = XML_ENC_ERR_MEMORY; break; default: ret = XML_ENC_ERR_INTERNAL; break; } } return ret; } static int icuOpen(const char* name, int isInput, myConvCtxt **out) { UErrorCode status; myConvCtxt *cd; *out = NULL; cd = xmlMalloc(sizeof(myConvCtxt)); if (cd == NULL) return XML_ERR_NO_MEMORY; cd->isInput = isInput; cd->pivot_source = cd->pivot_buf; cd->pivot_target = cd->pivot_buf; status = U_ZERO_ERROR; cd->uconv = ucnv_open(name, &status); if (U_FAILURE(status)) goto error; status = U_ZERO_ERROR; if (isInput) { ucnv_setToUCallBack(cd->uconv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status); } else { ucnv_setFromUCallBack(cd->uconv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); } if (U_FAILURE(status)) goto error; status = U_ZERO_ERROR; cd->utf8 = ucnv_open("UTF-8", &status); if (U_FAILURE(status)) goto error; *out = cd; return 0; error: if (cd->uconv) ucnv_close(cd->uconv); xmlFree(cd); if (status == U_FILE_ACCESS_ERROR) return XML_ERR_UNSUPPORTED_ENCODING; if (status == U_MEMORY_ALLOCATION_ERROR) return XML_ERR_NO_MEMORY; return XML_ERR_SYSTEM; } static void icuClose(myConvCtxt *cd) { if (cd == NULL) return; ucnv_close(cd->uconv); ucnv_close(cd->utf8); xmlFree(cd); } static void icuConvCtxtDtor(void *vctxt) { icuClose(vctxt); } static int icuConvImpl(void *vctxt, const char *name, xmlCharEncFlags flags, xmlCharEncodingHandler **result) { xmlCharEncConvFunc inFunc = NULL, outFunc = NULL; myConvCtxt *inputCtxt = NULL; myConvCtxt *outputCtxt = NULL; int ret; if (flags & XML_ENC_INPUT) { ret = icuOpen(name, 1, &inputCtxt); if (ret != 0) goto error; inFunc = icuConvert; } if (flags & XML_ENC_OUTPUT) { ret = icuOpen(name, 0, &outputCtxt); if (ret != 0) goto error; outFunc = icuConvert; } return xmlCharEncNewCustomHandler(name, inFunc, outFunc, icuConvCtxtDtor, inputCtxt, outputCtxt, result); error: if (inputCtxt != NULL) icuClose(inputCtxt); if (outputCtxt != NULL) icuClose(outputCtxt); return ret; } int main(void) { xmlParserCtxtPtr ctxt; xmlDocPtr doc; const char *xml; xmlChar *content; int ret = 0; /* * We use IBM-1051, an alias for HP Roman, as a simple example that * ICU supports, but iconv (typically) doesn't. * * Character code 0xDE is U+00DF Latin Small Letter Sharp S. */ xml = "\xDE"; ctxt = xmlNewParserCtxt(); xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL); doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0); xmlFreeParserCtxt(ctxt); content = xmlNodeGetContent((xmlNodePtr) doc); printf("content: %s\n", content); if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) { fprintf(stderr, "conversion failed\n"); ret = 1; } xmlFree(content); xmlFreeDoc(doc); return ret; }