diff --git a/ChangeLog b/ChangeLog index 26107dfc..8a8411fb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Wed Jul 16 23:15:53 CEST 2003 Daniel Veillard + + * parserInternals.c: patch from Dodji Seketeli about UTF16 BOM + when using the push XML parser. + * result/utf16bom.xml result/noent/utf16bom.xml test/utf16bom.xml: + added the test to the regression suite. + Tue Jul 15 22:03:13 CEST 2003 Daniel Veillard * globals.c: add xmlThrDefMutex = NULL in xmlCleanupGlobals() diff --git a/parserInternals.c b/parserInternals.c index 8e57cdaa..9c71570a 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1621,6 +1621,23 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) ctxt->input->cur += 3; } return(0); + case XML_CHAR_ENCODING_UTF16LE: + case XML_CHAR_ENCODING_UTF16BE: + /* The raw input characters are encoded + * in UTF-16. As we expect this function + * to be called after xmlCharEncInFunc, we expect + * ctxt->input->cur to contain UTF-8 encoded characters. + * So the raw UTF-16 Byte Order Mark + * has also been converted into + * a UTF-8 BOM. Let's skip that BOM. + */ + if ((ctxt->input != NULL) && + (ctxt->input->cur[0] == 0xEF) && + (ctxt->input->cur[1] == 0xBB) && + (ctxt->input->cur[2] == 0xBF)) { + ctxt->input->cur += 3; + } + break ; default: break; } diff --git a/result/noent/utf16bom.xml b/result/noent/utf16bom.xml new file mode 100644 index 00000000..6ea296e2 Binary files /dev/null and b/result/noent/utf16bom.xml differ diff --git a/result/utf16bom.xml b/result/utf16bom.xml new file mode 100644 index 00000000..6ea296e2 Binary files /dev/null and b/result/utf16bom.xml differ diff --git a/test/utf16bom.xml b/test/utf16bom.xml new file mode 100644 index 00000000..1916dc1e Binary files /dev/null and b/test/utf16bom.xml differ