diff --git a/HTMLparser.c b/HTMLparser.c index 00b64c13..00c30edb 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { xmlDetectEncoding(ctxt); /* - * This is wrong but matches long-standing behavior. In most cases, - * a document starting with an XML declaration will specify UTF-8. + * TODO: Implement HTML5 prescan algorithm + */ + + /* + * This is wrong but matches long-standing behavior. In most + * cases, a document starting with an XML declaration will + * specify UTF-8. The HTML5 prescan algorithm handles + * XML declarations in a better way. */ if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && (xmlStrncmp(ctxt->input->cur, BAD_CAST "input->flags & XML_INPUT_HAS_ENCODING) == 0) && (xmlStrncmp(ctxt->input->cur, BAD_CAST "html) && + (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) { enc = XML_CHAR_ENCODING_UCS4BE; autoFlag = XML_INPUT_AUTO_OTHER; } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) { + /* + * TODO: The HTML5 spec requires to check that the + * next codepoint is an 'x'. + */ enc = XML_CHAR_ENCODING_UTF16BE; autoFlag = XML_INPUT_AUTO_UTF16BE; } @@ -1467,10 +1478,15 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) { case 0x3C: if (in[1] == 0x00) { - if ((in[2] == 0x00) && (in[3] == 0x00)) { + if ((!ctxt->html) && + (in[2] == 0x00) && (in[3] == 0x00)) { enc = XML_CHAR_ENCODING_UCS4LE; autoFlag = XML_INPUT_AUTO_OTHER; } else if ((in[2] == 0x3F) && (in[3] == 0x00)) { + /* + * TODO: The HTML5 spec requires to check that the + * next codepoint is an 'x'. + */ enc = XML_CHAR_ENCODING_UTF16LE; autoFlag = XML_INPUT_AUTO_UTF16LE; } @@ -1478,7 +1494,8 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) { break; case 0x4C: - if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { + if ((!ctxt->html) && + (in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { enc = XML_CHAR_ENCODING_EBCDIC; autoFlag = XML_INPUT_AUTO_OTHER; }