From 71122421a11a9de368f1d5ead42db920e3cb1e31 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Thu, 13 Feb 2025 14:04:10 +0100 Subject: [PATCH] html: Make implied

<p> tags more deterministic

libxml2's HTML parser adds <p>

start tags in some situations. This behavior, which doesn't follow any standard, was added in 2000, see here: http://veillard.com/XML/messages/0655.html Text nodes that only contain whitespace don't imply a <p>

tag, but the whitespace check cannot work reliably if we're parsing partial text data which can happen with both pull and push parser. The logic in `areBlanks` is hard to follow. The checks involving `CUR` depend on the position of the input pointer and seem dubious. It's also possible that the behavior changed inadvertently with a later commit. As a result, it's hard to come up with good test cases. We now process leading whitespace before creating implied tags. This is more in line with HTML5 and should avoid at least some issues with partial text data. For example, parsing the string " x" used to result in:

x

And now results in:

x

Except for the implied <p>

tag, this matches HTML5. --- HTMLparser.c | 42 +++++++++++++++++++++++++++++------ result/HTML/implied1.html | 6 +++++ result/HTML/implied1.html.sax | 14 ++++++++++++ test/HTML/implied1.html | 1 + 4 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 result/HTML/implied1.html create mode 100644 result/HTML/implied1.html.sax create mode 100644 test/HTML/implied1.html diff --git a/HTMLparser.c b/HTMLparser.c index 83d70de9..eac9964a 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2965,16 +2965,44 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf, if ((mode == 0) || (mode == DATA_RCDATA) || (ctxt->sax->cdataBlock == NULL)) { - int blank = areBlanks(ctxt, buf, size); + if ((ctxt->name == NULL) || + (xmlStrEqual(ctxt->name, BAD_CAST "html")) || + (xmlStrEqual(ctxt->name, BAD_CAST "head"))) { + int i; - if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) { + /* + * Add leading whitespace to html or head elements before + * calling htmlCheckParagraph. + */ + for (i = 0; i < size; i++) + if (!IS_WS_HTML(buf[i])) + break; + + if (i > 0) { + if (!ctxt->keepBlanks) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, i); + } + + buf += i; + size -= i; + } + + if (size <= 0) + return; + + htmlCheckParagraph(ctxt); + } + + if ((mode == 0) && + (!ctxt->keepBlanks) && + (areBlanks(ctxt, buf, size))) { if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, size); + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size); } else { - if ((mode == 0) && (blank < 0)) - htmlCheckParagraph(ctxt); - if (ctxt->sax->characters != NULL) ctxt->sax->characters(ctxt->userData, buf, size); } diff --git a/result/HTML/implied1.html b/result/HTML/implied1.html new file mode 100644 index 00000000..6c860658 --- /dev/null +++ b/result/HTML/implied1.html @@ -0,0 +1,6 @@ + + 
+ +

x +

+ diff --git a/result/HTML/implied1.html.sax b/result/HTML/implied1.html.sax new file mode 100644 index 00000000..b3d71651 --- /dev/null +++ b/result/HTML/implied1.html.sax @@ -0,0 +1,14 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(html) +SAX.startElement(head) +SAX.characters( , 3) +SAX.endElement(head) +SAX.startElement(body) +SAX.startElement(p) +SAX.characters(x +, 2) +SAX.endElement(p) +SAX.endElement(body) +SAX.endElement(html) +SAX.endDocument() diff --git a/test/HTML/implied1.html b/test/HTML/implied1.html new file mode 100644 index 00000000..d288ee72 --- /dev/null +++ b/test/HTML/implied1.html @@ -0,0 +1 @@ + x