566012 autodetected encoding and encoding conflict

* encoding.c parser.c parserInternals.c: when we autodetect an encoding
  but it's actually not completely compatible with the one declared
  great care must be taken to not convert more than just the first line.
  Led to some refactoring, more private functions and a bit of cleanup.
This commit is contained in:
Daniel Veillard 2009-08-26 11:38:49 +02:00
parent 59f5308591
commit 7e385bd4e2
3 changed files with 145 additions and 39 deletions

View File

@ -1737,24 +1737,28 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
int
xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in, int len);
/**
* xmlCharEncFirstLine:
* xmlCharEncFirstLineInt:
* @handler: char enconding transformation data structure
* @out: an xmlBuffer for the output.
* @in: an xmlBuffer for the input
*
* @len: number of bytes to convert for the first line, or -1
*
* Front-end for the encoding handler input function, but handle only
* the very first line, i.e. limit itself to 45 chars.
*
* Returns the number of byte written if success, or
*
* Returns the number of byte written if success, or
* -1 general error
* -2 if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want), or
*/
int
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in) {
xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in, int len) {
int ret = -2;
int written;
int toconv;
@ -1771,9 +1775,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
* 45 chars should be sufficient to reach the end of the encoding
* declaration without going too far inside the document content.
* on UTF-16 this means 90bytes, on UCS4 this means 180
* The actual value depending on guessed encoding is passed as @len
* if provided
*/
if (toconv > 180)
toconv = 180;
if (len >= 0) {
if (toconv > len)
toconv = len;
} else {
if (toconv > 180)
toconv = 180;
}
if (toconv * 2 >= written) {
xmlBufferGrow(out, toconv);
written = out->size - out->use - 1;
@ -1827,15 +1838,35 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
return(ret);
}
/**
* xmlCharEncFirstLine:
* @handler: char enconding transformation data structure
* @out: an xmlBuffer for the output.
* @in: an xmlBuffer for the input
*
* Front-end for the encoding handler input function, but handle only
* the very first line, i.e. limit itself to 45 chars.
*
* Returns the number of byte written if success, or
* -1 general error
* -2 if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want), or
*/
int
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in) {
return(xmlCharEncFirstLineInt(handler, out, in, -1));
}
/**
* xmlCharEncInFunc:
* @handler: char encoding transformation data structure
* @out: an xmlBuffer for the output.
* @in: an xmlBuffer for the input
*
*
* Generic front-end for the encoding handler input function
*
* Returns the number of byte written if success, or
*
* Returns the number of byte written if success, or
* -1 general error
* -2 if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want), or

View File

@ -10109,8 +10109,9 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
/*
* Check for the XMLDecl in the Prolog.
* do not GROW here to avoid the detected encoder to decode more
* than just the first line
*/
GROW;
if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) {
/*

View File

@ -945,6 +945,17 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
* *
************************************************************************/
/* defined in encoding.c, not public */
int
xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in, int len);
static int
xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
xmlCharEncodingHandlerPtr handler, int len);
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler, int len);
/**
* xmlSwitchEncoding:
* @ctxt: the parser context
@ -959,6 +970,7 @@ int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
xmlCharEncodingHandlerPtr handler;
int len = -1;
if (ctxt == NULL) return(-1);
switch (enc) {
@ -1002,9 +1014,33 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
break ;
default:
break;
len = 90;
break;
case XML_CHAR_ENCODING_UCS2:
len = 90;
break;
case XML_CHAR_ENCODING_UCS4BE:
case XML_CHAR_ENCODING_UCS4LE:
case XML_CHAR_ENCODING_UCS4_2143:
case XML_CHAR_ENCODING_UCS4_3412:
len = 180;
break;
case XML_CHAR_ENCODING_EBCDIC:
case XML_CHAR_ENCODING_8859_1:
case XML_CHAR_ENCODING_8859_2:
case XML_CHAR_ENCODING_8859_3:
case XML_CHAR_ENCODING_8859_4:
case XML_CHAR_ENCODING_8859_5:
case XML_CHAR_ENCODING_8859_6:
case XML_CHAR_ENCODING_8859_7:
case XML_CHAR_ENCODING_8859_8:
case XML_CHAR_ENCODING_8859_9:
case XML_CHAR_ENCODING_ASCII:
case XML_CHAR_ENCODING_2022_JP:
case XML_CHAR_ENCODING_SHIFT_JIS:
case XML_CHAR_ENCODING_EUC_JP:
len = 45;
break;
}
handler = xmlGetCharEncodingHandler(enc);
if (handler == NULL) {
@ -1095,7 +1131,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
if (handler == NULL)
return(-1);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(xmlSwitchToEncoding(ctxt, handler));
return(xmlSwitchToEncodingInt(ctxt, handler, len));
}
/**
@ -1103,15 +1139,16 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
* @ctxt: the parser context
* @input: the input stream
* @handler: the encoding handler
* @len: the number of bytes to convert for the first line or -1
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler)
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler, int len)
{
int nbchars;
@ -1208,9 +1245,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
* parsed with the autodetected encoding
* into the parser reading buffer.
*/
nbchars = xmlCharEncFirstLine(input->buf->encoder,
input->buf->buffer,
input->buf->raw);
nbchars = xmlCharEncFirstLineInt(input->buf->encoder,
input->buf->buffer,
input->buf->raw,
len);
}
if (nbchars < 0) {
xmlErrInternal(ctxt,
@ -1235,6 +1273,58 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
return (0);
}
/**
* xmlSwitchInputEncoding:
* @ctxt: the parser context
* @input: the input stream
* @handler: the encoding handler
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler) {
return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
}
/**
* xmlSwitchToEncodingInt:
* @ctxt: the parser context
* @handler: the encoding handler
* @len: the lenght to convert or -1
*
* change the input functions when discovering the character encoding
* of a given entity, and convert only @len bytes of the output, this
* is needed on auto detect to allows any declared encoding later to
* convert the actual content after the xmlDecl
*
* Returns 0 in case of success, -1 otherwise
*/
static int
xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
xmlCharEncodingHandlerPtr handler, int len) {
int ret = 0;
if (handler != NULL) {
if (ctxt->input != NULL) {
ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
} else {
xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
NULL);
return(-1);
}
/*
* The parsing is now done in UTF8 natively
*/
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else
return(-1);
return(ret);
}
/**
* xmlSwitchToEncoding:
* @ctxt: the parser context
@ -1248,23 +1338,7 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
int
xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
{
int ret = 0;
if (handler != NULL) {
if (ctxt->input != NULL) {
ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
} else {
xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
NULL);
return(-1);
}
/*
* The parsing is now done in UTF8 natively
*/
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else
return(-1);
return(ret);
return (xmlSwitchToEncodingInt(ctxt, handler, -1));
}
/************************************************************************