SAX2: Ignore namespaces in HTML documents

In commit 21ca8829, we started to ignore namespaces in HTML element
names, but we still called xmlSplitQName, effectively stripping the
namespace prefix. This caused elements like <o:p> to be parsed
as <p>. Now we leave the name untouched.

Fixes #508.
This commit is contained in:
Nick Wellnhofer 2023-03-31 16:47:48 +02:00
parent 0e42adce77
commit d7d0bc6581
5 changed files with 43 additions and 6 deletions

15
SAX2.c
View File

@ -1632,12 +1632,15 @@ xmlSAX2StartElement(void *ctx, const xmlChar *fullname, const xmlChar **atts)
ctxt->validate = 0;
}
/*
* Split the full name into a namespace prefix and the tag name
*/
name = xmlSplitQName(ctxt, fullname, &prefix);
if (ctxt->html) {
prefix = NULL;
name = xmlStrdup(fullname);
} else {
/*
* Split the full name into a namespace prefix and the tag name
*/
name = xmlSplitQName(ctxt, fullname, &prefix);
}
/*
* Note : the namespace resolution is deferred until the end of the

6
result/HTML/names.html Normal file
View File

@ -0,0 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<body>
<o:p></o:p>
</body>
</html>

View File

@ -0,0 +1,3 @@
./test/HTML/names.html:3: HTML parser error : Tag o:p invalid
<o:p></o:p>
^

View File

@ -0,0 +1,20 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
SAX.characters(
, 1)
SAX.startElement(body)
SAX.characters(
, 3)
SAX.startElement(o:p)
SAX.error: Tag o:p invalid
SAX.endElement(o:p)
SAX.characters(
, 1)
SAX.endElement(body)
SAX.characters(
, 1)
SAX.endElement(html)
SAX.characters(
, 1)
SAX.endDocument()

5
test/HTML/names.html Normal file
View File

@ -0,0 +1,5 @@
<html>
<body>
<o:p></o:p>
</body>
</html>