diff --git a/HTMLparser.c b/HTMLparser.c index 9daa6274..2260ef13 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4385,6 +4385,11 @@ htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) { htmlParseContent(ctxt); + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + /* TODO: Use xmlCtxtIsCatastrophicError */ if (ctxt->errNo != XML_ERR_NO_MEMORY) { xmlNodePtr cur; @@ -4509,11 +4514,9 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { htmlParseContent(ctxt); /* - * autoclose + * Only check for truncated multi-byte sequences */ - if (CUR == 0) - htmlAutoCloseOnEnd(ctxt); - + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); /* * SAX: end of the document processing. @@ -5237,12 +5240,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate) { - if ((ctxt == NULL) || (ctxt->input == NULL)) + if ((ctxt == NULL) || + (ctxt->input == NULL) || (ctxt->input->buf == NULL) || + (size < 0) || + ((size > 0) && (chunk == NULL))) return(XML_ERR_ARGUMENT); if (PARSER_STOPPED(ctxt) != 0) return(ctxt->errNo); - if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && - (ctxt->input->buf != NULL)) { + + if (size > 0) { size_t pos = ctxt->input->cur - ctxt->input->base; int res; @@ -5261,6 +5267,11 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) { htmlAutoCloseOnEnd(ctxt); + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) ctxt->sax->endDocument(ctxt->userData); diff --git a/doc/libxml2-api.xml b/doc/libxml2-api.xml index 82593e21..a6058350 100644 --- a/doc/libxml2-api.xml +++ b/doc/libxml2-api.xml @@ -8107,13 +8107,14 @@ crash if you try to modify the tree)'/> - Convert between character encodings. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced. - + Convert between character encodings. The value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced. If the converter can consume partial multi-byte sequences, the @flush flag can be used to detect truncated sequences at EOF. Otherwise, the flag can be ignored. + + - + If this function returns XML_ERR_OK, it must fill the @out pointer with an encoding handler. The handler can be obtained from xmlCharEncNewCustomHandler. diff --git a/encoding.c b/encoding.c index 1996ef3f..f172c68e 100644 --- a/encoding.c +++ b/encoding.c @@ -113,35 +113,35 @@ static const xmlEncTableEntry xmlEncTable[] = { }; static int -asciiToAscii(unsigned char* out, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +asciiToAscii(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF8ToUTF8(unsigned char* out, int *outlen, - const unsigned char* inb, int *inlenb, void *vctxt); +UTF8ToUTF8(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* inb, int *inlenb, int flush); static int -latin1ToUTF8(unsigned char* out, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +latin1ToUTF8(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF16LEToUTF8(unsigned char* out, int *outlen, - const unsigned char* inb, int *inlenb, void *vctxt); +UTF16LEToUTF8(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* inb, int *inlenb, int flush); static int -UTF16BEToUTF8(unsigned char* out, int *outlen, - const unsigned char* inb, int *inlenb, void *vctxt); +UTF16BEToUTF8(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* inb, int *inlenb, int flush); #ifdef LIBXML_OUTPUT_ENABLED static int -UTF8ToLatin1(unsigned char* outb, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +UTF8ToLatin1(void *vctxt, unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF8ToUTF16(unsigned char* outb, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +UTF8ToUTF16(void *vctxt, unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF8ToUTF16LE(unsigned char* outb, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +UTF8ToUTF16LE(void *vctxt, unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF8ToUTF16BE(unsigned char* outb, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +UTF8ToUTF16BE(void *vctxt, unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen, int flush); #else /* LIBXML_OUTPUT_ENABLED */ @@ -154,8 +154,8 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen, #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED) static int -UTF8ToHtmlWrapper(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt); +UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, int flush); #else #define UTF8ToHtmlWrapper NULL #endif @@ -166,11 +166,11 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen, #include "iso8859x.inc" static int -ISO8859xToUTF8(unsigned char* out, int *outlen, - const unsigned char* in, int *inlen, void *vctxt); +ISO8859xToUTF8(void *vctxt, unsigned char* out, int *outlen, + const unsigned char* in, int *inlen, int flush); static int -UTF8ToISO8859x(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt); +UTF8ToISO8859x(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, int flush); #define MAKE_ISO_HANDLER(name, n) \ { (char *) name, { ISO8859xToUTF8 }, { UTF8ToISO8859x }, \ @@ -1073,6 +1073,7 @@ typedef struct { * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in + * @flush: end of input * * Returns an XML_ENC_ERR code. * @@ -1081,8 +1082,9 @@ typedef struct { * The value of @outlen after return is the number of octets produced. */ static int -xmlIconvConvert(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt) { +xmlIconvConvert(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, + int flush ATTRIBUTE_UNUSED) { xmlIconvCtxt *ctxt = vctxt; size_t icv_inlen, icv_outlen; const char *icv_in = (const char *) in; @@ -1293,6 +1295,7 @@ struct _uconv_t { * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in + * @flush: end of input * * Returns an XML_ENC_ERR code. * @@ -1301,8 +1304,8 @@ struct _uconv_t { * The value of @outlen after return is the number of octets produced. */ static int -xmlUconvConvert(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt) { +xmlUconvConvert(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, int flush) { xmlUconvCtxt *cd = vctxt; const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; @@ -1317,14 +1320,10 @@ xmlUconvConvert(unsigned char *out, int *outlen, } /* - * Note that the ICU API is stateful. It can always consume a certain - * amount of input even if the output buffer would overflow. The - * remaining input must be processed by calling ucnv_convertEx with a - * possibly empty input buffer. - * - * ucnv_convertEx is always called with reset and flush set to 0, - * so we don't mess up the state. This should never generate - * U_TRUNCATED_CHAR_FOUND errors. + * The ICU API can consume input, including partial sequences, + * even if the output buffer would overflow. The remaining input + * must be processed by calling ucnv_convertEx with a possibly + * empty input buffer. */ if (cd->isInput) { source = cd->uconv; @@ -1337,7 +1336,8 @@ xmlUconvConvert(unsigned char *out, int *outlen, ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, - cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, + /* reset */ 0, flush, &err); *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; @@ -1347,8 +1347,8 @@ xmlUconvConvert(unsigned char *out, int *outlen, } else { switch (err) { case U_TRUNCATED_CHAR_FOUND: - /* Shouldn't happen without flush */ - ret = XML_ENC_ERR_SUCCESS; + /* Should only happen with flush */ + ret = XML_ENC_ERR_INPUT; break; case U_BUFFER_OVERFLOW_ERROR: @@ -1510,6 +1510,7 @@ xmlEncConvertError(int code) { * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in + * @flush: end of input * * The value of @inlen after return is the number of octets consumed * as the return value is 0, else unpredictable. @@ -1519,7 +1520,8 @@ xmlEncConvertError(int code) { */ int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, - int *outlen, const unsigned char *in, int *inlen) { + int *outlen, const unsigned char *in, int *inlen, + int flush) { int ret; if (handler->flags & XML_HANDLER_LEGACY) { @@ -1534,6 +1536,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, ret = func(out, outlen, in, inlen); } else { xmlCharEncConvFunc func = handler->input.func; + int oldInlen; if (func == NULL) { *outlen = 0; @@ -1541,7 +1544,14 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, return(XML_ENC_ERR_INTERNAL); } - ret = func(out, outlen, in, inlen, handler->inputCtxt); + oldInlen = *inlen; + ret = func(handler->inputCtxt, out, outlen, in, inlen, flush); + + /* + * Check for truncated multi-byte sequence. + */ + if ((flush) && (ret == XML_ENC_ERR_SUCCESS) && (*inlen != oldInlen)) + ret = XML_ENC_ERR_INPUT; } if (ret > 0) @@ -1588,7 +1598,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, return(XML_ENC_ERR_INTERNAL); } - ret = func(out, outlen, in, inlen, handler->outputCtxt); + ret = func(handler->outputCtxt, out, outlen, in, inlen, /* flush */ 0); } if (ret > 0) @@ -1617,6 +1627,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, * xmlCharEncInput: * @input: a parser input buffer * @sizeOut: pointer to output size + * @flush: end of input * * @sizeOut should be set to the maximum output size (or SIZE_MAX). * After return, it is set to the number of bytes written. @@ -1626,7 +1637,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, * Returns an XML_ENC_ERR code. */ int -xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut) +xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush) { xmlBufPtr out, in; const xmlChar *dataIn; @@ -1644,7 +1655,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut) *sizeOut = 0; availIn = xmlBufUse(in); - if (availIn == 0) + if ((availIn == 0) && (!flush)) return(0); dataIn = xmlBufContent(in); totalIn = 0; @@ -1675,7 +1686,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut) } ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - dataIn, &c_in); + dataIn, &c_in, flush && completeIn); totalIn += c_in; dataIn += c_in; @@ -1750,7 +1761,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, written = out->size - out->use - 1; } ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv); + in->content, &toconv, /* flush */ 0); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; @@ -2077,9 +2088,10 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { ************************************************************************/ static int -asciiToAscii(unsigned char* out, int *poutlen, +asciiToAscii(void *vctxt ATTRIBUTE_UNUSED, + unsigned char* out, int *poutlen, const unsigned char* in, int *pinlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char *inend; const unsigned char *instart = in; int inlen, outlen, ret; @@ -2121,9 +2133,10 @@ asciiToAscii(unsigned char* out, int *poutlen, } static int -latin1ToUTF8(unsigned char* out, int *outlen, +latin1ToUTF8(void *vctxt ATTRIBUTE_UNUSED, + unsigned char* out, int *outlen, const unsigned char* in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { unsigned char* outstart = out; const unsigned char* instart = in; unsigned char* outend; @@ -2180,13 +2193,15 @@ done: int xmlIsolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { - return(latin1ToUTF8(out, outlen, in, inlen, NULL)); + return(latin1ToUTF8(/* ctxt */ NULL, out, outlen, in, inlen, + /* flush */ 0)); } static int -UTF8ToUTF8(unsigned char* out, int *outlen, +UTF8ToUTF8(void *vctxt ATTRIBUTE_UNUSED, + unsigned char* out, int *outlen, const unsigned char* in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { int len; int ret; @@ -2214,9 +2229,10 @@ UTF8ToUTF8(unsigned char* out, int *outlen, #ifdef LIBXML_OUTPUT_ENABLED static int -UTF8ToLatin1(unsigned char* out, int *outlen, +UTF8ToLatin1(void *vctxt ATTRIBUTE_UNUSED, + unsigned char* out, int *outlen, const unsigned char* in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char* outend; const unsigned char* outstart = out; const unsigned char* instart = in; @@ -2286,14 +2302,16 @@ xmlUTF8ToIsolat1(unsigned char* out, int *outlen, if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL)) return(XML_ENC_ERR_INTERNAL); - return(UTF8ToLatin1(out, outlen, in, inlen, NULL)); + return(UTF8ToLatin1(/* ctxt */ NULL, out, outlen, in, inlen, + /* flush */ 0)); } #endif /* LIBXML_OUTPUT_ENABLED */ static int -UTF16LEToUTF8(unsigned char *out, int *outlen, +UTF16LEToUTF8(void *vctxt ATTRIBUTE_UNUSED, + unsigned char *out, int *outlen, const unsigned char *in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend = in + (*inlen & ~1); unsigned char *outstart = out; @@ -2360,9 +2378,10 @@ done: #ifdef LIBXML_OUTPUT_ENABLED static int -UTF8ToUTF16LE(unsigned char *out, int *outlen, +UTF8ToUTF16LE(void *vctxt ATTRIBUTE_UNUSED, + unsigned char *out, int *outlen, const unsigned char *in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend; unsigned char *outstart = out; @@ -2462,9 +2481,10 @@ done: } static int -UTF8ToUTF16(unsigned char* outb, int *outlen, +UTF8ToUTF16(void *vctxt, + unsigned char* outb, int *outlen, const unsigned char* in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush) { if (in == NULL) { /* * initialization, add the Byte Order Mark for UTF-16LE @@ -2480,14 +2500,15 @@ UTF8ToUTF16(unsigned char* outb, int *outlen, *inlen = 0; return(0); } - return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL)); + return (UTF8ToUTF16LE(vctxt, outb, outlen, in, inlen, flush)); } #endif /* LIBXML_OUTPUT_ENABLED */ static int -UTF16BEToUTF8(unsigned char *out, int *outlen, +UTF16BEToUTF8(void *vctxt ATTRIBUTE_UNUSED, + unsigned char *out, int *outlen, const unsigned char *in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend = in + (*inlen & ~1); unsigned char *outstart = out; @@ -2554,9 +2575,10 @@ done: #ifdef LIBXML_OUTPUT_ENABLED static int -UTF8ToUTF16BE(unsigned char *out, int *outlen, +UTF8ToUTF16BE(void *vctxt ATTRIBUTE_UNUSED, + unsigned char *out, int *outlen, const unsigned char *in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { + int flush ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend; unsigned char *outstart = out; @@ -2657,10 +2679,11 @@ done: #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED) static int -UTF8ToHtmlWrapper(unsigned char *out, int *outlen, +UTF8ToHtmlWrapper(void *vctxt ATTRIBUTE_UNUSED, + unsigned char *out, int *outlen, const unsigned char *in, int *inlen, - void *vctxt ATTRIBUTE_UNUSED) { - return(UTF8ToHtml(out, outlen, in, inlen)); + int flush ATTRIBUTE_UNUSED) { + return(htmlUTF8ToHtml(out, outlen, in, inlen)); } #endif @@ -2668,8 +2691,10 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen, defined(LIBXML_ISO8859X_ENABLED) static int -UTF8ToISO8859x(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt) { +UTF8ToISO8859x(void *vctxt, + unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, + int flush ATTRIBUTE_UNUSED) { const unsigned char *xlattable = vctxt; const unsigned char *instart = in; const unsigned char *inend; @@ -2748,8 +2773,10 @@ done: } static int -ISO8859xToUTF8(unsigned char* out, int *outlen, - const unsigned char* in, int *inlen, void *vctxt) { +ISO8859xToUTF8(void *vctxt, + unsigned char* out, int *outlen, + const unsigned char* in, int *inlen, + int flush ATTRIBUTE_UNUSED) { unsigned short const *unicodetable = vctxt; const unsigned char* instart = in; const unsigned char* inend; diff --git a/example/icu.c b/example/icu.c index 0e93b671..59105e9b 100644 --- a/example/icu.c +++ b/example/icu.c @@ -31,8 +31,8 @@ typedef struct { } myConvCtxt; static int -icuConvert(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt) { +icuConvert(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, int flush) { myConvCtxt *cd = vctxt; const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; @@ -47,14 +47,10 @@ icuConvert(unsigned char *out, int *outlen, } /* - * Note that the ICU API is stateful. It can always consume a certain - * amount of input even if the output buffer would overflow. The - * remaining input must be processed by calling ucnv_convertEx with a - * possibly empty input buffer. - * - * ucnv_convertEx is always called with reset and flush set to 0, - * so we don't mess up the state. This should never generate - * U_TRUNCATED_CHAR_FOUND errors. + * The ICU API can consume input, including partial sequences, + * even if the output buffer would overflow. The remaining input + * must be processed by calling ucnv_convertEx with a possibly + * empty input buffer. */ if (cd->isInput) { source = cd->uconv; @@ -67,7 +63,8 @@ icuConvert(unsigned char *out, int *outlen, ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, - cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, + /* reset */ 0, flush, &err); *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; @@ -77,8 +74,8 @@ icuConvert(unsigned char *out, int *outlen, } else { switch (err) { case U_TRUNCATED_CHAR_FOUND: - /* Shouldn't happen without flush */ - ret = XML_ENC_ERR_SUCCESS; + /* Should only happen with flush */ + ret = XML_ENC_ERR_INPUT; break; case U_BUFFER_OVERFLOW_ERROR: diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index 8a3cddd8..03c67b12 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -126,17 +126,22 @@ typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in + * @flush: end of input * * Convert between character encodings. * - * On success, the value of @inlen after return is the number of - * bytes consumed and @outlen is the number of bytes produced. + * The value of @inlen after return is the number of bytes consumed + * and @outlen is the number of bytes produced. * - * Returns the number of bytes written or an XML_ENC_ERR code. + * If the converter can consume partial multi-byte sequences, the + * @flush flag can be used to detect truncated sequences at EOF. + * Otherwise, the flag can be ignored. + * + * Returns a non-negative number on success or an XML_ENC_ERR code. */ typedef int -(*xmlCharEncConvFunc)(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt); +(*xmlCharEncConvFunc)(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, int flush); /** * xmlCharEncConvCtxtDtor: diff --git a/include/private/enc.h b/include/private/enc.h index 864025f8..de575582 100644 --- a/include/private/enc.h +++ b/include/private/enc.h @@ -9,9 +9,10 @@ xmlInitEncodingInternal(void); XML_HIDDEN int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, - int *outlen, const unsigned char *in, int *inlen); + int *outlen, const unsigned char *in, int *inlen, + int flush); XML_HIDDEN int -xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut); +xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush); XML_HIDDEN int xmlCharEncOutput(xmlOutputBufferPtr output, int init); diff --git a/include/private/parser.h b/include/private/parser.h index 1c92edc8..bae9f55b 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -140,4 +140,7 @@ XML_HIDDEN xmlChar * xmlExpandEntitiesInAttValue(xmlParserCtxtPtr ctxt, const xmlChar *str, int normalize); +XML_HIDDEN void +xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code); + #endif /* XML_PARSER_H_PRIVATE__ */ diff --git a/parser.c b/parser.c index 8a26968a..aacaf1f8 100644 --- a/parser.c +++ b/parser.c @@ -7300,9 +7300,7 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, while (ctxt->inputNr > oldInputNr) xmlPopPE(ctxt); - if (RAW != 0) { - xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL); - } + xmlParserCheckEOF(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED); } /** @@ -9875,8 +9873,7 @@ xmlParseContent(xmlParserCtxtPtr ctxt) { xmlParseContentInternal(ctxt); - if (ctxt->input->cur < ctxt->input->end) - xmlFatalErr(ctxt, XML_ERR_NOT_WELL_BALANCED, NULL); + xmlParserCheckEOF(ctxt, XML_ERR_NOT_WELL_BALANCED); } /** @@ -10737,16 +10734,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { */ xmlParseMisc(ctxt); - if (ctxt->input->cur < ctxt->input->end) { - if (ctxt->wellFormed) - xmlFatalErr(ctxt, XML_ERR_DOCUMENT_END, NULL); - } else if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder != NULL) && - (ctxt->input->buf->error == 0) && - (!xmlBufIsEmpty(ctxt->input->buf->raw))) { - xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR, - "Truncated multi-byte sequence at EOF\n"); - } + xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END); } ctxt->instate = XML_PARSER_EOF; @@ -11596,11 +11584,8 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, xmlFatalErrMsg(ctxt, XML_ERR_DOCUMENT_EMPTY, "Start tag expected, '<' not found\n"); } - } else if ((ctxt->input->buf->encoder != NULL) && - (ctxt->input->buf->error == 0) && - (!xmlBufIsEmpty(ctxt->input->buf->raw))) { - xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR, - "Truncated multi-byte sequence at EOF\n"); + } else { + xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END); } if (ctxt->instate != XML_PARSER_EOF) { ctxt->instate = XML_PARSER_EOF; diff --git a/parserInternals.c b/parserInternals.c index e24af956..7d9bcb27 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -596,6 +596,49 @@ xmlParserGrow(xmlParserCtxtPtr ctxt) { return(ret); } +/** + * xmlParserCheckEOF: + * @ctxt: parser ctxt + * @code: error code + * + * Raises an error with @code if the input wasn't consumed + * completely. + */ +void +xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code) { + xmlParserInputPtr in = ctxt->input; + xmlParserInputBufferPtr buf; + + if (ctxt->errNo != XML_ERR_OK) + return; + + if (in->cur < in->end) { + xmlFatalErr(ctxt, code, NULL); + return; + } + + buf = in->buf; + if ((buf != NULL) && (buf->encoder != NULL)) { + size_t curBase = in->cur - in->base; + size_t sizeOut = 64; + int ret; + + /* + * Check for truncated multi-byte sequence + */ + ret = xmlCharEncInput(buf, &sizeOut, /* flush */ 1); + xmlBufUpdateInput(buf->buffer, in, curBase); + if (ret < 0) { + xmlCtxtErrIO(ctxt, buf->error, NULL); + return; + } + + /* Shouldn't happen */ + if (in->cur < in->end) + xmlFatalErr(ctxt, XML_ERR_INTERNAL_ERROR, "expected EOF"); + } +} + /** * xmlParserInputGrow: * @in: an XML parser input @@ -1105,7 +1148,8 @@ xmlDetectEBCDIC(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr *hout) { return(res); outlen = sizeof(out) - 1; inlen = input->end - input->cur; - res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen); + res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, + /* flush */ 0); /* * Return the EBCDIC handler if decoding failed. The error will * be reported later. @@ -1354,7 +1398,7 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input, nbchars = SIZE_MAX; else nbchars = 4000 /* MINLEN */; - res = xmlCharEncInput(in, &nbchars); + res = xmlCharEncInput(in, &nbchars, /* flush */ 0); if (res < 0) code = in->error; } diff --git a/result/errors/truncated-utf16.xml.ent b/result/errors/truncated-utf16.xml.ent index f5be53cb..25edf0f4 100644 --- a/result/errors/truncated-utf16.xml.ent +++ b/result/errors/truncated-utf16.xml.ent @@ -1,3 +1,3 @@ -./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF +./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding ^ diff --git a/result/errors/truncated-utf16.xml.err b/result/errors/truncated-utf16.xml.err index f5be53cb..25edf0f4 100644 --- a/result/errors/truncated-utf16.xml.err +++ b/result/errors/truncated-utf16.xml.err @@ -1,3 +1,3 @@ -./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF +./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding ^ diff --git a/result/errors/truncated-utf16.xml.str b/result/errors/truncated-utf16.xml.str index e45c5788..3e37f30e 100644 --- a/result/errors/truncated-utf16.xml.str +++ b/result/errors/truncated-utf16.xml.str @@ -1,4 +1,4 @@ -./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF +./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding ^ ./test/errors/truncated-utf16.xml : failed to parse diff --git a/testparser.c b/testparser.c index 2f2456e0..6495c43e 100644 --- a/testparser.c +++ b/testparser.c @@ -952,11 +952,88 @@ testWindowsUri(void) { } #endif /* WIN32 */ +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) +static int +testTruncatedMultiByte(void) { + const char xml[] = + "\n" + "\xC3"; +#ifdef LIBXML_HTML_ENABLED + const char html[] = + "\n" + "
\xC3"; +#endif + xmlDocPtr doc; + const xmlError *error; + int err = 0; + + xmlResetLastError(); + doc = xmlReadDoc(BAD_CAST xml, NULL, NULL, XML_PARSE_NOERROR); + error = xmlGetLastError(); + if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) { + fprintf(stderr, "xml, pull: expected XML_ERR_INVALID_ENCODING\n"); + err = 1; + } + xmlFreeDoc(doc); + +#ifdef LIBXML_HTML_ENABLED + xmlResetLastError(); + doc = htmlReadDoc(BAD_CAST html, NULL, NULL, XML_PARSE_NOERROR); + error = xmlGetLastError(); + if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) { + fprintf(stderr, "html, pull: expected XML_ERR_INVALID_ENCODING\n"); + err = 1; + } + xmlFreeDoc(doc); +#endif /* LIBXML_HTML_ENABLED */ + +#ifdef LIBXML_PUSH_ENABLED + { + xmlParserCtxtPtr ctxt; + + ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL); + xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR); + + xmlParseChunk(ctxt, xml, sizeof(xml) - 1, 0); + xmlParseChunk(ctxt, "", 0, 1); + + if (ctxt->errNo != XML_ERR_INVALID_ENCODING) { + fprintf(stderr, "xml, push: expected XML_ERR_INVALID_ENCODING\n"); + err = 1; + } + + xmlFreeDoc(ctxt->myDoc); + xmlFreeParserCtxt(ctxt); + +#ifdef LIBXML_HTML_ENABLED + ctxt = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, + XML_CHAR_ENCODING_NONE); + xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR); + + htmlParseChunk(ctxt, html, sizeof(html) - 1, 0); + htmlParseChunk(ctxt, "", 0, 1); + + if (ctxt->errNo != XML_ERR_INVALID_ENCODING) { + fprintf(stderr, "html, push: expected XML_ERR_INVALID_ENCODING\n"); + err = 1; + } + + xmlFreeDoc(ctxt->myDoc); + htmlFreeParserCtxt(ctxt); +#endif /* LIBXML_HTML_ENABLED */ + } +#endif /* LIBXML_PUSH_ENABLED */ + + return err; +} +#endif /* iconv || icu */ + static int charEncConvImplError; static int -rot13Convert(unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, void *vctxt) { +rot13Convert(void *vctxt, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen, + int flush ATTRIBUTE_UNUSED) { int *ctxt = vctxt; int inSize = *inlen; int outSize = *outlen; @@ -1075,6 +1152,9 @@ main(void) { err |= testBuildRelativeUri(); #if defined(_WIN32) || defined(__CYGWIN__) err |= testWindowsUri(); +#endif +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) + err |= testTruncatedMultiByte(); #endif err |= testCharEncConvImpl(); diff --git a/xmlIO.c b/xmlIO.c index b2d86ec9..c02ac9c5 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -2201,7 +2201,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, * convert as much as possible to the parser reading buffer. */ nbchars = SIZE_MAX; - if (xmlCharEncInput(in, &nbchars) < 0) + if (xmlCharEncInput(in, &nbchars, /* flush */ 0) < 0) return(-1); if (nbchars > INT_MAX) nbchars = INT_MAX; @@ -2312,7 +2312,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) { else sizeOut = SIZE_MAX; - if (xmlCharEncInput(in, &sizeOut) < 0) + if (xmlCharEncInput(in, &sizeOut, /* flush */ 0) < 0) return(-1); res = sizeOut; }