diff --git a/HTMLparser.c b/HTMLparser.c
index 00b64c13..00c30edb 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
xmlDetectEncoding(ctxt);
/*
- * This is wrong but matches long-standing behavior. In most cases,
- * a document starting with an XML declaration will specify UTF-8.
+ * TODO: Implement HTML5 prescan algorithm
+ */
+
+ /*
+ * This is wrong but matches long-standing behavior. In most
+ * cases, a document starting with an XML declaration will
+ * specify UTF-8. The HTML5 prescan algorithm handles
+ * XML declarations in a better way.
*/
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
(xmlStrncmp(ctxt->input->cur, BAD_CAST "input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
(xmlStrncmp(ctxt->input->cur, BAD_CAST "html) &&
+ (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
enc = XML_CHAR_ENCODING_UCS4BE;
autoFlag = XML_INPUT_AUTO_OTHER;
} else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
+ /*
+ * TODO: The HTML5 spec requires checking that the
+ * next codepoint is an 'x'.
+ */
enc = XML_CHAR_ENCODING_UTF16BE;
autoFlag = XML_INPUT_AUTO_UTF16BE;
}
@@ -1467,10 +1478,15 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
case 0x3C:
if (in[1] == 0x00) {
- if ((in[2] == 0x00) && (in[3] == 0x00)) {
+ if ((!ctxt->html) &&
+ (in[2] == 0x00) && (in[3] == 0x00)) {
enc = XML_CHAR_ENCODING_UCS4LE;
autoFlag = XML_INPUT_AUTO_OTHER;
} else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
+ /*
+ * TODO: The HTML5 spec requires checking that the
+ * next codepoint is an 'x'.
+ */
enc = XML_CHAR_ENCODING_UTF16LE;
autoFlag = XML_INPUT_AUTO_UTF16LE;
}
@@ -1478,7 +1494,8 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
break;
case 0x4C:
- if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
+ if ((!ctxt->html) &&
+ (in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
enc = XML_CHAR_ENCODING_EBCDIC;
autoFlag = XML_INPUT_AUTO_OTHER;
}