html: Make data parsing modes work with push parser

This can't be solved with a simple scan for a terminator. Instead, we
make htmlParseCharData handle incomplete data if the "partial" flag is
set.
This commit is contained in:
Nick Wellnhofer 2025-02-01 18:21:47 +01:00
parent 4be1e8befb
commit 080285724b
4 changed files with 65 additions and 32 deletions

View File

@ -2985,13 +2985,15 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
/**
* htmlParseCharData:
* @ctxt: an HTML parser context
* @partial: true if the input buffer is incomplete (more data may follow), false if it is complete
*
* Parse character data and references.
*
* Returns 1 if all data was parsed, 0 otherwise.
*/
static int
htmlParseCharData(htmlParserCtxtPtr ctxt) {
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
xmlParserInputPtr input = ctxt->input;
xmlChar utf8Char[4];
int complete = 0;
@ -3044,6 +3046,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
if (avail == 0) {
if ((partial) && (ncr)) {
in -= ncrSize;
ncrSize = 0;
}
done = 1;
break;
}
@ -3162,8 +3169,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
}
if ((mode != 0) && (PARSER_PROGRESSIVE(ctxt))) {
in += 1;
if ((partial) && (j >= avail)) {
done = 1;
goto next_chunk;
}
@ -3183,6 +3189,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
mode = DATA_SCRIPT;
}
if ((partial) && (j >= avail)) {
done = 1;
goto next_chunk;
}
break;
case '&':
@ -3210,6 +3221,26 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
}
} else {
if (partial) {
int terminated = 0;
size_t i;
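/*
* The longest named character reference,
* &CounterClockwiseContourIntegral;, has 33 bytes including the
* leading '&' and the trailing ';'. After 31 letters no longer
* name is possible, so the scan below treats the reference as
* terminated instead of waiting for more input.
*/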
for (i = 1; i < avail; i++) {
if ((i >= 32) || !IS_ASCII_LETTER(in[i])) {
terminated = 1;
break;
}
}
if (!terminated) {
done = 1;
goto next_chunk;
}
}
repl = htmlFindEntityPrefix(in + j,
avail - j,
/* isAttr */ 0,
@ -3222,6 +3253,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
skip = 0;
}
if ((partial) && (j >= avail)) {
done = 1;
goto next_chunk;
}
break;
case '\0':
@ -3236,6 +3272,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
break;
case '\r':
if (partial && avail < 2) {
done = 1;
goto next_chunk;
}
skip = 1;
if (in[1] != 0x0A) {
repl = BAD_CAST "\x0A";
@ -3250,6 +3291,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
xmlChar * guess;
if (in > chunk)
goto next_chunk;
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@ -3262,11 +3306,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
goto restart;
}
/*
* We should handle partial data to allow the push
* parser to pass incomplete chunks.
*/
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
size = htmlValidateUtf8(ctxt, in, avail, partial);
if ((partial) && (size == 0)) {
done = 1;
goto next_chunk;
}
if (size <= 0) {
skip = 1;
@ -4154,7 +4199,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
SKIP(1);
}
} else {
htmlParseCharData(ctxt);
htmlParseCharData(ctxt, /* partial */ 0);
}
SHRINK;
@ -5027,23 +5072,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
mode = ctxt->endCheckState;
if (mode != 0) {
while ((PARSER_STOPPED(ctxt) == 0) &&
(in->cur < in->end)) {
size_t extra;
extra = strlen((const char *) ctxt->name) + 2;
if ((!terminate) &&
(htmlParseLookupString(ctxt, 0, "<", 1,
extra) < 0))
return;
ctxt->checkIndex = 0;
if (htmlParseCharData(ctxt))
break;
}
break;
if (htmlParseCharData(ctxt, !terminate) == 0)
return;
} else if (in->cur[0] == '<') {
int next;
@ -5125,7 +5155,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
return;
ctxt->checkIndex = 0;
htmlParseCharData(ctxt);
htmlParseCharData(ctxt, /* partial */ 0);
}
break;

View File

@ -3,7 +3,8 @@ SAX.startDocument()
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;j&Ugrave;, 4)
SAX.characters(&amp;j, 2)
SAX.characters(&Ugrave;, 2)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)

View File

@ -3,8 +3,9 @@ SAX.startDocument()
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;:&ecirc;
, 5)
SAX.characters(&amp;:, 2)
SAX.characters(&ecirc;
, 3)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)

View File

@ -2013,7 +2013,8 @@ SAX.characters(
, 1)
SAX.startElement(font, size='2', face='Arial, Helvetica, sans-serif', color='#000000')
SAX.startElement(b)
SAX.characters(F&uuml;hrer Furor, 13)
SAX.characters(F, 1)
SAX.characters(&uuml;hrer Furor, 12)
SAX.endElement(b)
SAX.endElement(font)
SAX.startElement(br)