html: Stop implying <p> start tags

Only the <html>, <head> and <body> start tags should be implied by the parser.
Implicitly opening extra <p> elements before character data has always been a libxml2 quirk.
This commit is contained in:
Nick Wellnhofer 2025-02-13 18:20:46 +01:00
parent 71122421a1
commit 8cf6129bbd
8 changed files with 22 additions and 67 deletions

View File

@ -1164,19 +1164,6 @@ static const htmlStartCloseEntry htmlStartClose[] = {
{ "xmp", "ul" } { "xmp", "ul" }
}; };
/*
* The list of HTML elements which are supposed not to have
* CDATA content and where a p element will be implied
*
* TODO: extend that list by reading the HTML SGML DTD on
* implied paragraph
*/
static const char *const htmlNoContentElements[] = {
"html",
"head",
NULL
};
/* /*
* The list of HTML attributes which are of content %Script; * The list of HTML attributes which are of content %Script;
* NOTE: when adding ones, check htmlIsScriptAttribute() since * NOTE: when adding ones, check htmlIsScriptAttribute() since
@ -1568,48 +1555,22 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
} }
/** /**
* htmlCheckParagraph * htmlStartCharData
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
* *
* Check whether a p element need to be implied before inserting * Prepare for non-whitespace character data.
* characters in the current element.
*
* Returns 1 if a paragraph has been inserted, 0 if not and -1
* in case of error.
*/ */
static int static void
htmlCheckParagraph(htmlParserCtxtPtr ctxt) { htmlStartCharData(htmlParserCtxtPtr ctxt) {
const xmlChar *tag; if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
int i; return;
if (ctxt == NULL)
return(-1);
if (ctxt->options & HTML_PARSE_HTML5)
return(0);
tag = ctxt->name;
if (tag == NULL) {
htmlAutoClose(ctxt, BAD_CAST"p");
htmlCheckImplied(ctxt, BAD_CAST"p");
htmlnamePush(ctxt, BAD_CAST"p");
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
return(1);
}
if (!htmlOmittedDefaultValue) if (!htmlOmittedDefaultValue)
return(0); return;
for (i = 0; htmlNoContentElements[i] != NULL; i++) {
if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { if (xmlStrEqual(ctxt->name, BAD_CAST "head"))
htmlAutoClose(ctxt, BAD_CAST"p"); htmlAutoClose(ctxt, BAD_CAST "p");
htmlCheckImplied(ctxt, BAD_CAST"p"); htmlCheckImplied(ctxt, BAD_CAST "p");
htmlnamePush(ctxt, BAD_CAST"p");
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
return(1);
}
}
return(0);
} }
/** /**
@ -2972,7 +2933,7 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
/* /*
* Add leading whitespace to html or head elements before * Add leading whitespace to html or head elements before
* calling htmlCheckParagraph. * calling htmlStartCharData.
*/ */
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
if (!IS_WS_HTML(buf[i])) if (!IS_WS_HTML(buf[i]))
@ -2994,7 +2955,7 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
if (size <= 0) if (size <= 0)
return; return;
htmlCheckParagraph(ctxt); htmlStartCharData(ctxt);
} }
if ((mode == 0) && if ((mode == 0) &&
@ -4084,7 +4045,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
SKIP(2); SKIP(2);
if (ctxt->input->cur >= ctxt->input->end) { if (ctxt->input->cur >= ctxt->input->end) {
htmlCheckParagraph(ctxt); htmlStartCharData(ctxt);
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
(ctxt->sax->characters != NULL)) (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, ctxt->sax->characters(ctxt->userData,
@ -4243,7 +4204,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
} else if (IS_ASCII_LETTER(NXT(1))) { } else if (IS_ASCII_LETTER(NXT(1))) {
htmlParseElementInternal(ctxt); htmlParseElementInternal(ctxt);
} else { } else {
htmlCheckParagraph(ctxt); htmlStartCharData(ctxt);
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
(ctxt->sax->characters != NULL)) (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1); ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
@ -5187,7 +5148,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->checkIndex = 0; ctxt->checkIndex = 0;
} else { } else {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
htmlCheckParagraph(ctxt); htmlStartCharData(ctxt);
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
(ctxt->sax->characters != NULL)) (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, ctxt->sax->characters(ctxt->userData,

View File

@ -1,2 +1,2 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p>&amp;j&Ugrave;</p></body></html> <html><body>&amp;j&Ugrave;</body></html>

View File

@ -2,10 +2,8 @@ SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.startElement(html) SAX.startElement(html)
SAX.startElement(body) SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;j, 2) SAX.characters(&amp;j, 2)
SAX.characters(&Ugrave;, 2) SAX.characters(&Ugrave;, 2)
SAX.endElement(p)
SAX.endElement(body) SAX.endElement(body)
SAX.endElement(html) SAX.endElement(html)
SAX.endDocument() SAX.endDocument()

View File

@ -1,3 +1,3 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p>&amp;:&ecirc; <html><body>&amp;:&ecirc;
</p></body></html> </body></html>

View File

@ -2,11 +2,9 @@ SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.startElement(html) SAX.startElement(html)
SAX.startElement(body) SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;:, 2) SAX.characters(&amp;:, 2)
SAX.characters(&ecirc; SAX.characters(&ecirc;
, 3) , 3)
SAX.endElement(p)
SAX.endElement(body) SAX.endElement(body)
SAX.endElement(html) SAX.endElement(html)
SAX.endDocument() SAX.endDocument()

View File

@ -1,6 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html> <html>
<head> </head> <head> </head>
<body><p>x <body>x
</p></body> </body>
</html> </html>

View File

@ -5,10 +5,8 @@ SAX.startElement(head)
SAX.characters( , 3) SAX.characters( , 3)
SAX.endElement(head) SAX.endElement(head)
SAX.startElement(body) SAX.startElement(body)
SAX.startElement(p)
SAX.characters(x SAX.characters(x
, 2) , 2)
SAX.endElement(p)
SAX.endElement(body) SAX.endElement(body)
SAX.endElement(html) SAX.endElement(html)
SAX.endDocument() SAX.endDocument()

View File

@ -474,7 +474,7 @@ testHtmlPushWithEncoding(void) {
err = 1; err = 1;
} }
node = xmlDocGetRootElement(doc)->children->children->children; node = xmlDocGetRootElement(doc)->children->children;
if (!xmlStrEqual(node->content, BAD_CAST "-\xC3\xA4-")) { if (!xmlStrEqual(node->content, BAD_CAST "-\xC3\xA4-")) {
fprintf(stderr, "testHtmlPushWithEncoding failed\n"); fprintf(stderr, "testHtmlPushWithEncoding failed\n");
err = 1; err = 1;