diff --git a/HTMLparser.c b/HTMLparser.c
index 9daa6274..2260ef13 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4385,6 +4385,11 @@ htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
htmlParseContent(ctxt);
+ /*
+ * Only check for truncated multi-byte sequences
+ */
+ xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
+
/* TODO: Use xmlCtxtIsCatastrophicError */
if (ctxt->errNo != XML_ERR_NO_MEMORY) {
xmlNodePtr cur;
@@ -4509,11 +4514,9 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
htmlParseContent(ctxt);
/*
- * autoclose
+ * Only check for truncated multi-byte sequences
*/
- if (CUR == 0)
- htmlAutoCloseOnEnd(ctxt);
-
+ xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
/*
* SAX: end of the document processing.
@@ -5237,12 +5240,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
int terminate) {
- if ((ctxt == NULL) || (ctxt->input == NULL))
+ if ((ctxt == NULL) ||
+ (ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
+ (size < 0) ||
+ ((size > 0) && (chunk == NULL)))
return(XML_ERR_ARGUMENT);
if (PARSER_STOPPED(ctxt) != 0)
return(ctxt->errNo);
- if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
- (ctxt->input->buf != NULL)) {
+
+ if (size > 0) {
size_t pos = ctxt->input->cur - ctxt->input->base;
int res;
@@ -5261,6 +5267,11 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
htmlAutoCloseOnEnd(ctxt);
+ /*
+ * Only check for truncated multi-byte sequences
+ */
+ xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
+
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
ctxt->sax->endDocument(ctxt->userData);
diff --git a/doc/libxml2-api.xml b/doc/libxml2-api.xml
index 82593e21..a6058350 100644
--- a/doc/libxml2-api.xml
+++ b/doc/libxml2-api.xml
@@ -8107,13 +8107,14 @@ crash if you try to modify the tree)'/>
- Convert between character encodings. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.
-
+ Convert between character encodings. The value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced. If the converter can consume partial multi-byte sequences, the @flush flag can be used to detect truncated sequences at EOF. Otherwise, the flag can be ignored.
+
+
-
+
If this function returns XML_ERR_OK, it must fill the @out pointer with an encoding handler. The handler can be obtained from xmlCharEncNewCustomHandler.
diff --git a/encoding.c b/encoding.c
index 1996ef3f..f172c68e 100644
--- a/encoding.c
+++ b/encoding.c
@@ -113,35 +113,35 @@ static const xmlEncTableEntry xmlEncTable[] = {
};
static int
-asciiToAscii(unsigned char* out, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+asciiToAscii(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF8ToUTF8(unsigned char* out, int *outlen,
- const unsigned char* inb, int *inlenb, void *vctxt);
+UTF8ToUTF8(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* inb, int *inlenb, int flush);
static int
-latin1ToUTF8(unsigned char* out, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+latin1ToUTF8(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF16LEToUTF8(unsigned char* out, int *outlen,
- const unsigned char* inb, int *inlenb, void *vctxt);
+UTF16LEToUTF8(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* inb, int *inlenb, int flush);
static int
-UTF16BEToUTF8(unsigned char* out, int *outlen,
- const unsigned char* inb, int *inlenb, void *vctxt);
+UTF16BEToUTF8(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* inb, int *inlenb, int flush);
#ifdef LIBXML_OUTPUT_ENABLED
static int
-UTF8ToLatin1(unsigned char* outb, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToLatin1(void *vctxt, unsigned char* outb, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF8ToUTF16(unsigned char* outb, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16(void *vctxt, unsigned char* outb, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF8ToUTF16LE(unsigned char* outb, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16LE(void *vctxt, unsigned char* outb, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF8ToUTF16BE(unsigned char* outb, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16BE(void *vctxt, unsigned char* outb, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
#else /* LIBXML_OUTPUT_ENABLED */
@@ -154,8 +154,8 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int
-UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt);
+UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen, int flush);
#else
#define UTF8ToHtmlWrapper NULL
#endif
@@ -166,11 +166,11 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
#include "iso8859x.inc"
static int
-ISO8859xToUTF8(unsigned char* out, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt);
+ISO8859xToUTF8(void *vctxt, unsigned char* out, int *outlen,
+ const unsigned char* in, int *inlen, int flush);
static int
-UTF8ToISO8859x(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt);
+UTF8ToISO8859x(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen, int flush);
#define MAKE_ISO_HANDLER(name, n) \
{ (char *) name, { ISO8859xToUTF8 }, { UTF8ToISO8859x }, \
@@ -1073,6 +1073,7 @@ typedef struct {
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
+ * @flush: end of input (ignored by this converter)
*
* Returns an XML_ENC_ERR code.
*
@@ -1081,8 +1082,9 @@ typedef struct {
* The value of @outlen after return is the number of octets produced.
*/
static int
-xmlIconvConvert(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt) {
+xmlIconvConvert(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen,
+ int flush ATTRIBUTE_UNUSED) {
xmlIconvCtxt *ctxt = vctxt;
size_t icv_inlen, icv_outlen;
const char *icv_in = (const char *) in;
@@ -1293,6 +1295,7 @@ struct _uconv_t {
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
+ * @flush: end of input, passed on to ucnv_convertEx
*
* Returns an XML_ENC_ERR code.
*
@@ -1301,8 +1304,8 @@ struct _uconv_t {
* The value of @outlen after return is the number of octets produced.
*/
static int
-xmlUconvConvert(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt) {
+xmlUconvConvert(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen, int flush) {
xmlUconvCtxt *cd = vctxt;
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
@@ -1317,14 +1320,10 @@ xmlUconvConvert(unsigned char *out, int *outlen,
}
/*
- * Note that the ICU API is stateful. It can always consume a certain
- * amount of input even if the output buffer would overflow. The
- * remaining input must be processed by calling ucnv_convertEx with a
- * possibly empty input buffer.
- *
- * ucnv_convertEx is always called with reset and flush set to 0,
- * so we don't mess up the state. This should never generate
- * U_TRUNCATED_CHAR_FOUND errors.
+ * The ICU API can consume input, including partial sequences,
+ * even if the output buffer would overflow. The remaining input
+ * must be processed by calling ucnv_convertEx with a possibly
+ * empty input buffer.
*/
if (cd->isInput) {
source = cd->uconv;
@@ -1337,7 +1336,8 @@ xmlUconvConvert(unsigned char *out, int *outlen,
ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
- cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
+ /* reset */ 0, flush, &err);
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
@@ -1347,8 +1347,8 @@ xmlUconvConvert(unsigned char *out, int *outlen,
} else {
switch (err) {
case U_TRUNCATED_CHAR_FOUND:
- /* Shouldn't happen without flush */
- ret = XML_ENC_ERR_SUCCESS;
+ /* Should only happen with flush */
+ ret = XML_ENC_ERR_INPUT;
break;
case U_BUFFER_OVERFLOW_ERROR:
@@ -1510,6 +1510,7 @@ xmlEncConvertError(int code) {
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
+ * @flush: end of input; non-legacy handlers then fail on unconsumed input
*
* The value of @inlen after return is the number of octets consumed
* as the return value is 0, else unpredictable.
@@ -1519,7 +1520,8 @@ xmlEncConvertError(int code) {
*/
int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
- int *outlen, const unsigned char *in, int *inlen) {
+ int *outlen, const unsigned char *in, int *inlen,
+ int flush) {
int ret;
if (handler->flags & XML_HANDLER_LEGACY) {
@@ -1534,6 +1536,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
ret = func(out, outlen, in, inlen);
} else {
xmlCharEncConvFunc func = handler->input.func;
+ int oldInlen;
if (func == NULL) {
*outlen = 0;
@@ -1541,7 +1544,14 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
return(XML_ENC_ERR_INTERNAL);
}
- ret = func(out, outlen, in, inlen, handler->inputCtxt);
+ oldInlen = *inlen;
+ ret = func(handler->inputCtxt, out, outlen, in, inlen, flush);
+
+ /*
+ * Check for truncated multi-byte sequence.
+ */
+ if ((flush) && (ret == XML_ENC_ERR_SUCCESS) && (*inlen != oldInlen))
+ ret = XML_ENC_ERR_INPUT;
}
if (ret > 0)
@@ -1588,7 +1598,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
return(XML_ENC_ERR_INTERNAL);
}
- ret = func(out, outlen, in, inlen, handler->outputCtxt);
+ ret = func(handler->outputCtxt, out, outlen, in, inlen, /* flush */ 0);
}
if (ret > 0)
@@ -1617,6 +1627,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
* xmlCharEncInput:
* @input: a parser input buffer
* @sizeOut: pointer to output size
+ * @flush: end of input
*
* @sizeOut should be set to the maximum output size (or SIZE_MAX).
* After return, it is set to the number of bytes written.
@@ -1626,7 +1637,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
* Returns an XML_ENC_ERR code.
*/
int
-xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
+xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush)
{
xmlBufPtr out, in;
const xmlChar *dataIn;
@@ -1644,7 +1655,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
*sizeOut = 0;
availIn = xmlBufUse(in);
- if (availIn == 0)
+ if ((availIn == 0) && (!flush))
return(0);
dataIn = xmlBufContent(in);
totalIn = 0;
@@ -1675,7 +1686,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
}
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
- dataIn, &c_in);
+ dataIn, &c_in, flush && completeIn);
totalIn += c_in;
dataIn += c_in;
@@ -1750,7 +1761,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
written = out->size - out->use - 1;
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
- in->content, &toconv);
+ in->content, &toconv, /* flush */ 0);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
@@ -2077,9 +2088,10 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
************************************************************************/
static int
-asciiToAscii(unsigned char* out, int *poutlen,
+asciiToAscii(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char* out, int *poutlen,
const unsigned char* in, int *pinlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *inend;
const unsigned char *instart = in;
int inlen, outlen, ret;
@@ -2121,9 +2133,10 @@ asciiToAscii(unsigned char* out, int *poutlen,
}
static int
-latin1ToUTF8(unsigned char* out, int *outlen,
+latin1ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char* out, int *outlen,
const unsigned char* in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
unsigned char* outstart = out;
const unsigned char* instart = in;
unsigned char* outend;
@@ -2180,13 +2193,15 @@ done:
int
xmlIsolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
- return(latin1ToUTF8(out, outlen, in, inlen, NULL));
+ return(latin1ToUTF8(/* ctxt */ NULL, out, outlen, in, inlen,
+ /* flush */ 0));
}
static int
-UTF8ToUTF8(unsigned char* out, int *outlen,
+UTF8ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char* out, int *outlen,
const unsigned char* in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
int len;
int ret;
@@ -2214,9 +2229,10 @@ UTF8ToUTF8(unsigned char* out, int *outlen,
#ifdef LIBXML_OUTPUT_ENABLED
static int
-UTF8ToLatin1(unsigned char* out, int *outlen,
+UTF8ToLatin1(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char* out, int *outlen,
const unsigned char* in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
@@ -2286,14 +2302,16 @@ xmlUTF8ToIsolat1(unsigned char* out, int *outlen,
if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
- return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
+ return(UTF8ToLatin1(/* ctxt */ NULL, out, outlen, in, inlen,
+ /* flush */ 0));
}
#endif /* LIBXML_OUTPUT_ENABLED */
static int
-UTF16LEToUTF8(unsigned char *out, int *outlen,
+UTF16LEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char *out, int *outlen,
const unsigned char *in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *instart = in;
const unsigned char *inend = in + (*inlen & ~1);
unsigned char *outstart = out;
@@ -2360,9 +2378,10 @@ done:
#ifdef LIBXML_OUTPUT_ENABLED
static int
-UTF8ToUTF16LE(unsigned char *out, int *outlen,
+UTF8ToUTF16LE(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char *out, int *outlen,
const unsigned char *in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *instart = in;
const unsigned char *inend;
unsigned char *outstart = out;
@@ -2462,9 +2481,10 @@ done:
}
static int
-UTF8ToUTF16(unsigned char* outb, int *outlen,
+UTF8ToUTF16(void *vctxt,
+ unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush) {
if (in == NULL) {
/*
* initialization, add the Byte Order Mark for UTF-16LE
@@ -2480,14 +2500,15 @@ UTF8ToUTF16(unsigned char* outb, int *outlen,
*inlen = 0;
return(0);
}
- return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
+ return (UTF8ToUTF16LE(vctxt, outb, outlen, in, inlen, flush));
}
#endif /* LIBXML_OUTPUT_ENABLED */
static int
-UTF16BEToUTF8(unsigned char *out, int *outlen,
+UTF16BEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char *out, int *outlen,
const unsigned char *in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *instart = in;
const unsigned char *inend = in + (*inlen & ~1);
unsigned char *outstart = out;
@@ -2554,9 +2575,10 @@ done:
#ifdef LIBXML_OUTPUT_ENABLED
static int
-UTF8ToUTF16BE(unsigned char *out, int *outlen,
+UTF8ToUTF16BE(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char *out, int *outlen,
const unsigned char *in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *instart = in;
const unsigned char *inend;
unsigned char *outstart = out;
@@ -2657,10 +2679,11 @@ done:
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int
-UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
+UTF8ToHtmlWrapper(void *vctxt ATTRIBUTE_UNUSED,
+ unsigned char *out, int *outlen,
const unsigned char *in, int *inlen,
- void *vctxt ATTRIBUTE_UNUSED) {
- return(UTF8ToHtml(out, outlen, in, inlen));
+ int flush ATTRIBUTE_UNUSED) {
+ return(htmlUTF8ToHtml(out, outlen, in, inlen));
}
#endif
@@ -2668,8 +2691,10 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
defined(LIBXML_ISO8859X_ENABLED)
static int
-UTF8ToISO8859x(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt) {
+UTF8ToISO8859x(void *vctxt,
+ unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen,
+ int flush ATTRIBUTE_UNUSED) {
const unsigned char *xlattable = vctxt;
const unsigned char *instart = in;
const unsigned char *inend;
@@ -2748,8 +2773,10 @@ done:
}
static int
-ISO8859xToUTF8(unsigned char* out, int *outlen,
- const unsigned char* in, int *inlen, void *vctxt) {
+ISO8859xToUTF8(void *vctxt,
+ unsigned char* out, int *outlen,
+ const unsigned char* in, int *inlen,
+ int flush ATTRIBUTE_UNUSED) {
unsigned short const *unicodetable = vctxt;
const unsigned char* instart = in;
const unsigned char* inend;
diff --git a/example/icu.c b/example/icu.c
index 0e93b671..59105e9b 100644
--- a/example/icu.c
+++ b/example/icu.c
@@ -31,8 +31,8 @@ typedef struct {
} myConvCtxt;
static int
-icuConvert(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt) {
+icuConvert(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen, int flush) {
myConvCtxt *cd = vctxt;
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
@@ -47,14 +47,10 @@ icuConvert(unsigned char *out, int *outlen,
}
/*
- * Note that the ICU API is stateful. It can always consume a certain
- * amount of input even if the output buffer would overflow. The
- * remaining input must be processed by calling ucnv_convertEx with a
- * possibly empty input buffer.
- *
- * ucnv_convertEx is always called with reset and flush set to 0,
- * so we don't mess up the state. This should never generate
- * U_TRUNCATED_CHAR_FOUND errors.
+ * The ICU API can consume input, including partial sequences,
+ * even if the output buffer would overflow. The remaining input
+ * must be processed by calling ucnv_convertEx with a possibly
+ * empty input buffer.
*/
if (cd->isInput) {
source = cd->uconv;
@@ -67,7 +63,8 @@ icuConvert(unsigned char *out, int *outlen,
ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
- cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
+ /* reset */ 0, flush, &err);
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
@@ -77,8 +74,8 @@ icuConvert(unsigned char *out, int *outlen,
} else {
switch (err) {
case U_TRUNCATED_CHAR_FOUND:
- /* Shouldn't happen without flush */
- ret = XML_ENC_ERR_SUCCESS;
+ /* Should only happen with flush */
+ ret = XML_ENC_ERR_INPUT;
break;
case U_BUFFER_OVERFLOW_ERROR:
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 8a3cddd8..03c67b12 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -126,17 +126,22 @@ typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
+ * @flush: end of input
*
* Convert between character encodings.
*
- * On success, the value of @inlen after return is the number of
- * bytes consumed and @outlen is the number of bytes produced.
+ * The value of @inlen after return is the number of bytes consumed
+ * and @outlen is the number of bytes produced.
*
- * Returns the number of bytes written or an XML_ENC_ERR code.
+ * If the converter can consume partial multi-byte sequences, the
+ * @flush flag can be used to detect truncated sequences at EOF.
+ * Otherwise, the flag can be ignored.
+ *
+ * Returns a non-negative number on success or an XML_ENC_ERR code.
*/
typedef int
-(*xmlCharEncConvFunc)(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt);
+(*xmlCharEncConvFunc)(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen, int flush);
/**
* xmlCharEncConvCtxtDtor:
diff --git a/include/private/enc.h b/include/private/enc.h
index 864025f8..de575582 100644
--- a/include/private/enc.h
+++ b/include/private/enc.h
@@ -9,9 +9,10 @@ xmlInitEncodingInternal(void);
XML_HIDDEN int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
- int *outlen, const unsigned char *in, int *inlen);
+ int *outlen, const unsigned char *in, int *inlen,
+ int flush);
XML_HIDDEN int
-xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut);
+xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush);
XML_HIDDEN int
xmlCharEncOutput(xmlOutputBufferPtr output, int init);
diff --git a/include/private/parser.h b/include/private/parser.h
index 1c92edc8..bae9f55b 100644
--- a/include/private/parser.h
+++ b/include/private/parser.h
@@ -140,4 +140,7 @@ XML_HIDDEN xmlChar *
xmlExpandEntitiesInAttValue(xmlParserCtxtPtr ctxt, const xmlChar *str,
int normalize);
+XML_HIDDEN void
+xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code);
+
#endif /* XML_PARSER_H_PRIVATE__ */
diff --git a/parser.c b/parser.c
index 8a26968a..aacaf1f8 100644
--- a/parser.c
+++ b/parser.c
@@ -7300,9 +7300,7 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
while (ctxt->inputNr > oldInputNr)
xmlPopPE(ctxt);
- if (RAW != 0) {
- xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL);
- }
+ xmlParserCheckEOF(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED);
}
/**
@@ -9875,8 +9873,7 @@ xmlParseContent(xmlParserCtxtPtr ctxt) {
xmlParseContentInternal(ctxt);
- if (ctxt->input->cur < ctxt->input->end)
- xmlFatalErr(ctxt, XML_ERR_NOT_WELL_BALANCED, NULL);
+ xmlParserCheckEOF(ctxt, XML_ERR_NOT_WELL_BALANCED);
}
/**
@@ -10737,16 +10734,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
*/
xmlParseMisc(ctxt);
- if (ctxt->input->cur < ctxt->input->end) {
- if (ctxt->wellFormed)
- xmlFatalErr(ctxt, XML_ERR_DOCUMENT_END, NULL);
- } else if ((ctxt->input->buf != NULL) &&
- (ctxt->input->buf->encoder != NULL) &&
- (ctxt->input->buf->error == 0) &&
- (!xmlBufIsEmpty(ctxt->input->buf->raw))) {
- xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR,
- "Truncated multi-byte sequence at EOF\n");
- }
+ xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END);
}
ctxt->instate = XML_PARSER_EOF;
@@ -11596,11 +11584,8 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
xmlFatalErrMsg(ctxt, XML_ERR_DOCUMENT_EMPTY,
"Start tag expected, '<' not found\n");
}
- } else if ((ctxt->input->buf->encoder != NULL) &&
- (ctxt->input->buf->error == 0) &&
- (!xmlBufIsEmpty(ctxt->input->buf->raw))) {
- xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR,
- "Truncated multi-byte sequence at EOF\n");
+ } else {
+ xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END);
}
if (ctxt->instate != XML_PARSER_EOF) {
ctxt->instate = XML_PARSER_EOF;
diff --git a/parserInternals.c b/parserInternals.c
index e24af956..7d9bcb27 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -596,6 +596,49 @@ xmlParserGrow(xmlParserCtxtPtr ctxt) {
return(ret);
}
+/**
+ * xmlParserCheckEOF:
+ * @ctxt: parser ctxt
+ * @code: error code raised when unconsumed input remains
+ *
+ * Does nothing if @ctxt already has an error. Otherwise raises @code on
+ * leftover input, or an I/O error if flushing the input decoder fails.
+ */
+void
+xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code) {
+ xmlParserInputPtr in = ctxt->input;
+ xmlParserInputBufferPtr buf;
+
+ if (ctxt->errNo != XML_ERR_OK)
+ return; /* an earlier error is already recorded */
+
+ if (in->cur < in->end) {
+ xmlFatalErr(ctxt, code, NULL);
+ return;
+ }
+
+ buf = in->buf;
+ if ((buf != NULL) && (buf->encoder != NULL)) {
+ size_t curBase = in->cur - in->base;
+ size_t sizeOut = 64; /* maximum output size for the flush */
+ int ret;
+
+ /*
+ * Flush the decoder to detect a truncated multi-byte sequence
+ */
+ ret = xmlCharEncInput(buf, &sizeOut, /* flush */ 1);
+ xmlBufUpdateInput(buf->buffer, in, curBase);
+ if (ret < 0) {
+ xmlCtxtErrIO(ctxt, buf->error, NULL);
+ return;
+ }
+
+ /* Shouldn't happen */
+ if (in->cur < in->end)
+ xmlFatalErr(ctxt, XML_ERR_INTERNAL_ERROR, "expected EOF");
+ }
+}
+
/**
* xmlParserInputGrow:
* @in: an XML parser input
@@ -1105,7 +1148,8 @@ xmlDetectEBCDIC(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr *hout) {
return(res);
outlen = sizeof(out) - 1;
inlen = input->end - input->cur;
- res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen);
+ res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen,
+ /* flush */ 0);
/*
* Return the EBCDIC handler if decoding failed. The error will
* be reported later.
@@ -1354,7 +1398,7 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
nbchars = SIZE_MAX;
else
nbchars = 4000 /* MINLEN */;
- res = xmlCharEncInput(in, &nbchars);
+ res = xmlCharEncInput(in, &nbchars, /* flush */ 0);
if (res < 0)
code = in->error;
}
diff --git a/result/errors/truncated-utf16.xml.ent b/result/errors/truncated-utf16.xml.ent
index f5be53cb..25edf0f4 100644
--- a/result/errors/truncated-utf16.xml.ent
+++ b/result/errors/truncated-utf16.xml.ent
@@ -1,3 +1,3 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
^
diff --git a/result/errors/truncated-utf16.xml.err b/result/errors/truncated-utf16.xml.err
index f5be53cb..25edf0f4 100644
--- a/result/errors/truncated-utf16.xml.err
+++ b/result/errors/truncated-utf16.xml.err
@@ -1,3 +1,3 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
^
diff --git a/result/errors/truncated-utf16.xml.str b/result/errors/truncated-utf16.xml.str
index e45c5788..3e37f30e 100644
--- a/result/errors/truncated-utf16.xml.str
+++ b/result/errors/truncated-utf16.xml.str
@@ -1,4 +1,4 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
^
./test/errors/truncated-utf16.xml : failed to parse
diff --git a/testparser.c b/testparser.c
index 2f2456e0..6495c43e 100644
--- a/testparser.c
+++ b/testparser.c
@@ -952,11 +952,88 @@ testWindowsUri(void) {
}
#endif /* WIN32 */
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
+static int
+testTruncatedMultiByte(void) { /* returns 0 if every check passes, else 1 */
+ const char xml[] = /* well-formed document, then a lone EUC-JP lead byte */
+ "<?xml version=\"1.0\" encoding=\"EUC-JP\"?>\n"
+ "<doc/>\xC3";
+#ifdef LIBXML_HTML_ENABLED
+ const char html[] = /* same truncated sequence for the HTML parser */
+ "<meta charset=\"EUC-JP\">\n"
+ "<div>\xC3";
+#endif
+ xmlDocPtr doc;
+ const xmlError *error;
+ int err = 0; /* set to 1 as soon as any check fails */
+
+ xmlResetLastError();
+ doc = xmlReadDoc(BAD_CAST xml, NULL, NULL, XML_PARSE_NOERROR);
+ error = xmlGetLastError();
+ if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) {
+ fprintf(stderr, "xml, pull: expected XML_ERR_INVALID_ENCODING\n");
+ err = 1;
+ }
+ xmlFreeDoc(doc);
+
+#ifdef LIBXML_HTML_ENABLED
+ xmlResetLastError();
+ doc = htmlReadDoc(BAD_CAST html, NULL, NULL, XML_PARSE_NOERROR);
+ error = xmlGetLastError();
+ if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) {
+ fprintf(stderr, "html, pull: expected XML_ERR_INVALID_ENCODING\n");
+ err = 1;
+ }
+ xmlFreeDoc(doc);
+#endif /* LIBXML_HTML_ENABLED */
+
+#ifdef LIBXML_PUSH_ENABLED
+ {
+ xmlParserCtxtPtr ctxt;
+
+ ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
+ xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR);
+
+ xmlParseChunk(ctxt, xml, sizeof(xml) - 1, 0);
+ xmlParseChunk(ctxt, "", 0, 1); /* empty chunk, terminate set */
+
+ if (ctxt->errNo != XML_ERR_INVALID_ENCODING) {
+ fprintf(stderr, "xml, push: expected XML_ERR_INVALID_ENCODING\n");
+ err = 1;
+ }
+
+ xmlFreeDoc(ctxt->myDoc);
+ xmlFreeParserCtxt(ctxt);
+
+#ifdef LIBXML_HTML_ENABLED
+ ctxt = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL,
+ XML_CHAR_ENCODING_NONE);
+ xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR);
+
+ htmlParseChunk(ctxt, html, sizeof(html) - 1, 0);
+ htmlParseChunk(ctxt, "", 0, 1); /* empty chunk, terminate set */
+
+ if (ctxt->errNo != XML_ERR_INVALID_ENCODING) {
+ fprintf(stderr, "html, push: expected XML_ERR_INVALID_ENCODING\n");
+ err = 1;
+ }
+
+ xmlFreeDoc(ctxt->myDoc);
+ htmlFreeParserCtxt(ctxt);
+#endif /* LIBXML_HTML_ENABLED */
+ }
+#endif /* LIBXML_PUSH_ENABLED */
+
+ return err;
+}
+#endif /* iconv || icu */
+
static int charEncConvImplError;
static int
-rot13Convert(unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, void *vctxt) {
+rot13Convert(void *vctxt, unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen,
+ int flush ATTRIBUTE_UNUSED) {
int *ctxt = vctxt;
int inSize = *inlen;
int outSize = *outlen;
@@ -1075,6 +1152,9 @@ main(void) {
err |= testBuildRelativeUri();
#if defined(_WIN32) || defined(__CYGWIN__)
err |= testWindowsUri();
+#endif
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
+ err |= testTruncatedMultiByte();
#endif
err |= testCharEncConvImpl();
diff --git a/xmlIO.c b/xmlIO.c
index b2d86ec9..c02ac9c5 100644
--- a/xmlIO.c
+++ b/xmlIO.c
@@ -2201,7 +2201,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
* convert as much as possible to the parser reading buffer.
*/
nbchars = SIZE_MAX;
- if (xmlCharEncInput(in, &nbchars) < 0)
+ if (xmlCharEncInput(in, &nbchars, /* flush */ 0) < 0)
return(-1);
if (nbchars > INT_MAX)
nbchars = INT_MAX;
@@ -2312,7 +2312,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
else
sizeOut = SIZE_MAX;
- if (xmlCharEncInput(in, &sizeOut) < 0)
+ if (xmlCharEncInput(in, &sizeOut, /* flush */ 0) < 0)
return(-1);
res = sizeOut;
}