mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00
html: Reenable buggy detection of XML declarations
Switch the input encoding to UTF-8 if a document starts with '<?xm' and no encoding has been set yet, to match the old behavior. Also enable this check in the push parser. Fixes #637.
This commit is contained in:
parent
e2ce828c9b
commit
e395946194
18
HTMLparser.c
18
HTMLparser.c
@ -4851,6 +4851,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
||||
|
||||
xmlDetectEncoding(ctxt);
|
||||
|
||||
/*
|
||||
* This is wrong but matches long-standing behavior. In most cases,
|
||||
* a document starting with an XML declaration will specify UTF-8.
|
||||
*/
|
||||
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
|
||||
|
||||
/*
|
||||
* Wipe out everything which is before the first '<'
|
||||
*/
|
||||
@ -5408,6 +5416,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
*/
|
||||
goto done;
|
||||
case XML_PARSER_START:
|
||||
/*
|
||||
* This is wrong but matches long-standing behavior. In most
|
||||
* cases, a document starting with an XML declaration will
|
||||
* specify UTF-8.
|
||||
*/
|
||||
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
|
||||
}
|
||||
|
||||
/*
|
||||
* Very first chars read from the document flow.
|
||||
*/
|
||||
|
4
result/HTML/xml-declaration-1.html
Normal file
4
result/HTML/xml-declaration-1.html
Normal file
@ -0,0 +1,4 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<?xml encoding="UTF-8"><html><body>
|
||||
<p>öäüß</p>
|
||||
</body></html>
|
13
result/HTML/xml-declaration-1.html.sax
Normal file
13
result/HTML/xml-declaration-1.html.sax
Normal file
@ -0,0 +1,13 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.processingInstruction(xml, encoding="UTF-8")
|
||||
SAX.startElement(html)
|
||||
SAX.startElement(body)
|
||||
SAX.startElement(p)
|
||||
SAX.characters(öäüß, 8)
|
||||
SAX.endElement(p)
|
||||
SAX.characters(
|
||||
, 1)
|
||||
SAX.endElement(body)
|
||||
SAX.endElement(html)
|
||||
SAX.endDocument()
|
@ -2136,6 +2136,12 @@ pushBoundaryTest(const char *filename, const char *result,
|
||||
int cur = 0;
|
||||
unsigned long avail, oldConsumed, consumed;
|
||||
|
||||
/*
|
||||
* HTML encoding detection doesn't work when data is fed bytewise.
|
||||
*/
|
||||
if (strcmp(filename, "./test/HTML/xml-declaration-1.html") == 0)
|
||||
return(0);
|
||||
|
||||
/*
|
||||
* If the parser made progress, check that exactly one construct was
|
||||
* processed and that the input buffer is (almost) empty.
|
||||
|
2
test/HTML/xml-declaration-1.html
Normal file
2
test/HTML/xml-declaration-1.html
Normal file
@ -0,0 +1,2 @@
|
||||
<?xml encoding="UTF-8">
|
||||
<p>öäüß</p>
|
Loading…
x
Reference in New Issue
Block a user