mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00

Make it possible to open an encoding handler only for input or output. This avoids the creation of unnecessary converters. Should also fix #863.
242 lines
6.1 KiB
C
242 lines
6.1 KiB
C
/*
 * icu.c: Example of how to use ICU for character encoding conversion
 *
 * This example shows how to use ICU by installing a custom character
 * encoding converter with xmlCtxtSetCharEncConvImpl, available
 * since libxml2 2.14.
 *
 * This approach makes it possible to use ICU even if libxml2 is
 * compiled without ICU support. It also makes sure that *only* ICU
 * is used. Many Linux distros currently ship libxml2 with support
 * for both ICU and iconv which makes the library's behavior hard to
 * predict.
 *
 * The long-term plan is to make libxml2 only support a single
 * conversion library internally (iconv on POSIX).
 */
|
|
|
|
#include <stdio.h>
|
|
#include <libxml/parser.h>
|
|
#include <unicode/ucnv.h>
|
|
|
|
#define ICU_PIVOT_BUF_SIZE 1024
|
|
|
|
typedef struct {
|
|
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
|
|
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
|
|
UChar *pivot_source;
|
|
UChar *pivot_target;
|
|
int isInput;
|
|
UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
|
|
} myConvCtxt;
|
|
|
|
static int
|
|
icuConvert(void *vctxt, unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen, int flush) {
|
|
myConvCtxt *cd = vctxt;
|
|
const char *ucv_in = (const char *) in;
|
|
char *ucv_out = (char *) out;
|
|
UConverter *target, *source;
|
|
UErrorCode err = U_ZERO_ERROR;
|
|
int ret;
|
|
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
|
|
if (outlen != NULL)
|
|
*outlen = 0;
|
|
return XML_ENC_ERR_INTERNAL;
|
|
}
|
|
|
|
/*
|
|
* The ICU API can consume input, including partial sequences,
|
|
* even if the output buffer would overflow. The remaining input
|
|
* must be processed by calling ucnv_convertEx with a possibly
|
|
* empty input buffer.
|
|
*/
|
|
if (cd->isInput) {
|
|
source = cd->uconv;
|
|
target = cd->utf8;
|
|
} else {
|
|
source = cd->utf8;
|
|
target = cd->uconv;
|
|
}
|
|
|
|
ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
|
|
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
|
&cd->pivot_source, &cd->pivot_target,
|
|
cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
|
|
/* reset */ 0, flush, &err);
|
|
|
|
*inlen = ucv_in - (const char*) in;
|
|
*outlen = ucv_out - (char *) out;
|
|
|
|
if (U_SUCCESS(err)) {
|
|
ret = XML_ENC_ERR_SUCCESS;
|
|
} else {
|
|
switch (err) {
|
|
case U_TRUNCATED_CHAR_FOUND:
|
|
/* Should only happen with flush */
|
|
ret = XML_ENC_ERR_INPUT;
|
|
break;
|
|
|
|
case U_BUFFER_OVERFLOW_ERROR:
|
|
ret = XML_ENC_ERR_SPACE;
|
|
break;
|
|
|
|
case U_INVALID_CHAR_FOUND:
|
|
case U_ILLEGAL_CHAR_FOUND:
|
|
case U_ILLEGAL_ESCAPE_SEQUENCE:
|
|
case U_UNSUPPORTED_ESCAPE_SEQUENCE:
|
|
ret = XML_ENC_ERR_INPUT;
|
|
break;
|
|
|
|
case U_MEMORY_ALLOCATION_ERROR:
|
|
ret = XML_ENC_ERR_MEMORY;
|
|
break;
|
|
|
|
default:
|
|
ret = XML_ENC_ERR_INTERNAL;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int
|
|
icuOpen(const char* name, int isInput, myConvCtxt **out)
|
|
{
|
|
UErrorCode status;
|
|
myConvCtxt *cd;
|
|
|
|
*out = NULL;
|
|
|
|
cd = xmlMalloc(sizeof(myConvCtxt));
|
|
if (cd == NULL)
|
|
return XML_ERR_NO_MEMORY;
|
|
|
|
cd->isInput = isInput;
|
|
cd->pivot_source = cd->pivot_buf;
|
|
cd->pivot_target = cd->pivot_buf;
|
|
|
|
status = U_ZERO_ERROR;
|
|
cd->uconv = ucnv_open(name, &status);
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
status = U_ZERO_ERROR;
|
|
if (isInput) {
|
|
ucnv_setToUCallBack(cd->uconv, UCNV_TO_U_CALLBACK_STOP,
|
|
NULL, NULL, NULL, &status);
|
|
}
|
|
else {
|
|
ucnv_setFromUCallBack(cd->uconv, UCNV_FROM_U_CALLBACK_STOP,
|
|
NULL, NULL, NULL, &status);
|
|
}
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
status = U_ZERO_ERROR;
|
|
cd->utf8 = ucnv_open("UTF-8", &status);
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
*out = cd;
|
|
return 0;
|
|
|
|
error:
|
|
if (cd->uconv)
|
|
ucnv_close(cd->uconv);
|
|
xmlFree(cd);
|
|
|
|
if (status == U_FILE_ACCESS_ERROR)
|
|
return XML_ERR_UNSUPPORTED_ENCODING;
|
|
if (status == U_MEMORY_ALLOCATION_ERROR)
|
|
return XML_ERR_NO_MEMORY;
|
|
return XML_ERR_SYSTEM;
|
|
}
|
|
|
|
static void
|
|
icuClose(myConvCtxt *cd)
|
|
{
|
|
if (cd == NULL)
|
|
return;
|
|
ucnv_close(cd->uconv);
|
|
ucnv_close(cd->utf8);
|
|
xmlFree(cd);
|
|
}
|
|
|
|
static void
|
|
icuConvCtxtDtor(void *vctxt) {
|
|
icuClose(vctxt);
|
|
}
|
|
|
|
static int
|
|
icuConvImpl(void *vctxt, const char *name, xmlCharEncFlags flags,
|
|
xmlCharEncodingHandler **result) {
|
|
xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
|
|
myConvCtxt *inputCtxt = NULL;
|
|
myConvCtxt *outputCtxt = NULL;
|
|
int ret;
|
|
|
|
if (flags & XML_ENC_INPUT) {
|
|
ret = icuOpen(name, 1, &inputCtxt);
|
|
if (ret != 0)
|
|
goto error;
|
|
inFunc = icuConvert;
|
|
}
|
|
|
|
if (flags & XML_ENC_OUTPUT) {
|
|
ret = icuOpen(name, 0, &outputCtxt);
|
|
if (ret != 0)
|
|
goto error;
|
|
outFunc = icuConvert;
|
|
}
|
|
|
|
return xmlCharEncNewCustomHandler(name, inFunc, outFunc, icuConvCtxtDtor,
|
|
inputCtxt, outputCtxt, result);
|
|
|
|
error:
|
|
if (inputCtxt != NULL)
|
|
icuClose(inputCtxt);
|
|
if (outputCtxt != NULL)
|
|
icuClose(outputCtxt);
|
|
return ret;
|
|
}
|
|
|
|
int
|
|
main(void) {
|
|
xmlParserCtxtPtr ctxt;
|
|
xmlDocPtr doc;
|
|
const char *xml;
|
|
xmlChar *content;
|
|
int ret = 0;
|
|
|
|
/*
|
|
* We use IBM-1051, an alias for HP Roman, as a simple example that
|
|
* ICU supports, but iconv (typically) doesn't.
|
|
*
|
|
* Character code 0xDE is U+00DF Latin Small Letter Sharp S.
|
|
*/
|
|
xml = "<doc>\xDE</doc>";
|
|
|
|
ctxt = xmlNewParserCtxt();
|
|
xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL);
|
|
doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0);
|
|
xmlFreeParserCtxt(ctxt);
|
|
|
|
content = xmlNodeGetContent((xmlNodePtr) doc);
|
|
|
|
printf("content: %s\n", content);
|
|
|
|
if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) {
|
|
fprintf(stderr, "conversion failed\n");
|
|
ret = 1;
|
|
}
|
|
|
|
xmlFree(content);
|
|
xmlFreeDoc(doc);
|
|
|
|
return ret;
|
|
}
|
|
|