From 7a8722f55783d09f690f59cd4eea426a930ac8a2 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer
Date: Fri, 31 Jan 2025 14:55:29 +0100
Subject: [PATCH] parser: Document that XML_PARSE_NOBLANKS is broken

Long text content can generate multiple "characters" callbacks which
can lead to NOBLANKS removing whitespace in non-whitespace text nodes.
So the NOBLANKS option doesn't even work reliably with the pull parser.
This would be extremely hard to fix.

Unfortunately, `xmllint --format` relies on this option which is
another reason why this feature never really worked.
---
 doc/xmllint.xml |  4 ++++
 parser.c        | 13 ++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/doc/xmllint.xml b/doc/xmllint.xml
index 547bf678..3de2b875 100644
--- a/doc/xmllint.xml
+++ b/doc/xmllint.xml
@@ -283,6 +283,10 @@
     environment variable controls the indentation. The default value is two spaces " ").
+
+    Especially in the absence of a DTD, this feature has never worked reliably
+    and is fundamentally broken.
+

diff --git a/parser.c b/parser.c
index ccfa9e53..3e034d47 100644
--- a/parser.c
+++ b/parser.c
@@ -4914,6 +4914,11 @@ get_more_space:
                     (ctxt->disableSAX == 0) &&
                     (ctxt->sax->ignorableWhitespace !=
                      ctxt->sax->characters)) {
+                    /*
+                     * Calling areBlanks with only parts of a text node
+                     * is fundamentally broken, making the NOBLANKS option
+                     * essentially unusable.
+                     */
                     if (areBlanks(ctxt, tmp, nbchar, 1)) {
                         if (ctxt->sax->ignorableWhitespace != NULL)
                             ctxt->sax->ignorableWhitespace(ctxt->userData,
@@ -13715,11 +13720,9 @@ xmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
  *
  * XML_PARSE_NOBLANKS
  *
- * Remove some text nodes containing only whitespace from the
- * result document. Which nodes are removed depends on DTD
- * element declarations or a conservative heuristic. The
- * reindenting feature of the serialization code relies on this
- * option to be set when parsing. Use of this option is
+ * Remove some whitespace from the result document. Where to
+ * remove whitespace depends on DTD element declarations or a
+ * broken heuristic with unfixable bugs. Use of this option is
  * DISCOURAGED.
  *
  * Not supported by the push parser.