mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00
html: Make data parsing modes work with push parser
This can't be solved with a simple scan for a terminator. Instead, we make htmlParseCharData handle incomplete data if the "partial" flag is set.
This commit is contained in:
parent
4be1e8befb
commit
080285724b
86
HTMLparser.c
86
HTMLparser.c
@ -2985,13 +2985,15 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
|
||||
/**
|
||||
* htmlParseCharData:
|
||||
* @ctxt: an HTML parser context
|
||||
* @terminate: true if the input buffer is complete
|
||||
* @partial: true if the input buffer is incomplete
|
||||
*
|
||||
* Parse character data and references.
|
||||
*
|
||||
* Returns 1 if all data was parsed, 0 otherwise.
|
||||
*/
|
||||
|
||||
static int
|
||||
htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
|
||||
xmlParserInputPtr input = ctxt->input;
|
||||
xmlChar utf8Char[4];
|
||||
int complete = 0;
|
||||
@ -3044,6 +3046,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
|
||||
if (avail == 0) {
|
||||
if ((partial) && (ncr)) {
|
||||
in -= ncrSize;
|
||||
ncrSize = 0;
|
||||
}
|
||||
|
||||
done = 1;
|
||||
break;
|
||||
}
|
||||
@ -3162,8 +3169,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
}
|
||||
|
||||
if ((mode != 0) && (PARSER_PROGRESSIVE(ctxt))) {
|
||||
in += 1;
|
||||
if ((partial) && (j >= avail)) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
@ -3183,6 +3189,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
mode = DATA_SCRIPT;
|
||||
}
|
||||
|
||||
if ((partial) && (j >= avail)) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case '&':
|
||||
@ -3210,6 +3221,26 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (partial) {
|
||||
int terminated = 0;
|
||||
size_t i;
|
||||
|
||||
/*
|
||||
* &CounterClockwiseContourIntegral; has 33 bytes.
|
||||
*/
|
||||
for (i = 1; i < avail; i++) {
|
||||
if ((i >= 32) || !IS_ASCII_LETTER(in[i])) {
|
||||
terminated = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!terminated) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
}
|
||||
|
||||
repl = htmlFindEntityPrefix(in + j,
|
||||
avail - j,
|
||||
/* isAttr */ 0,
|
||||
@ -3222,6 +3253,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
skip = 0;
|
||||
}
|
||||
|
||||
if ((partial) && (j >= avail)) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case '\0':
|
||||
@ -3236,6 +3272,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
break;
|
||||
|
||||
case '\r':
|
||||
if (partial && avail < 2) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
|
||||
skip = 1;
|
||||
if (in[1] != 0x0A) {
|
||||
repl = BAD_CAST "\x0A";
|
||||
@ -3250,6 +3291,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
|
||||
xmlChar * guess;
|
||||
|
||||
if (in > chunk)
|
||||
goto next_chunk;
|
||||
|
||||
guess = htmlFindEncoding(ctxt);
|
||||
if (guess == NULL) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
@ -3262,11 +3306,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
||||
goto restart;
|
||||
}
|
||||
|
||||
/*
|
||||
* We should handle partial data to allow the push
|
||||
* parser to pass incomplete chunks.
|
||||
*/
|
||||
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
|
||||
size = htmlValidateUtf8(ctxt, in, avail, partial);
|
||||
|
||||
if ((partial) && (size == 0)) {
|
||||
done = 1;
|
||||
goto next_chunk;
|
||||
}
|
||||
|
||||
if (size <= 0) {
|
||||
skip = 1;
|
||||
@ -4154,7 +4199,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
||||
SKIP(1);
|
||||
}
|
||||
} else {
|
||||
htmlParseCharData(ctxt);
|
||||
htmlParseCharData(ctxt, /* partial */ 0);
|
||||
}
|
||||
|
||||
SHRINK;
|
||||
@ -5027,23 +5072,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
mode = ctxt->endCheckState;
|
||||
|
||||
if (mode != 0) {
|
||||
while ((PARSER_STOPPED(ctxt) == 0) &&
|
||||
(in->cur < in->end)) {
|
||||
size_t extra;
|
||||
|
||||
extra = strlen((const char *) ctxt->name) + 2;
|
||||
|
||||
if ((!terminate) &&
|
||||
(htmlParseLookupString(ctxt, 0, "<", 1,
|
||||
extra) < 0))
|
||||
return;
|
||||
ctxt->checkIndex = 0;
|
||||
|
||||
if (htmlParseCharData(ctxt))
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
if (htmlParseCharData(ctxt, !terminate) == 0)
|
||||
return;
|
||||
} else if (in->cur[0] == '<') {
|
||||
int next;
|
||||
|
||||
@ -5125,7 +5155,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
(htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
|
||||
return;
|
||||
ctxt->checkIndex = 0;
|
||||
htmlParseCharData(ctxt);
|
||||
htmlParseCharData(ctxt, /* partial */ 0);
|
||||
}
|
||||
|
||||
break;
|
||||
|
@ -3,7 +3,8 @@ SAX.startDocument()
|
||||
SAX.startElement(html)
|
||||
SAX.startElement(body)
|
||||
SAX.startElement(p)
|
||||
SAX.characters(&jÙ, 4)
|
||||
SAX.characters(&j, 2)
|
||||
SAX.characters(Ù, 2)
|
||||
SAX.endElement(p)
|
||||
SAX.endElement(body)
|
||||
SAX.endElement(html)
|
||||
|
@ -3,8 +3,9 @@ SAX.startDocument()
|
||||
SAX.startElement(html)
|
||||
SAX.startElement(body)
|
||||
SAX.startElement(p)
|
||||
SAX.characters(&:ê
|
||||
, 5)
|
||||
SAX.characters(&:, 2)
|
||||
SAX.characters(ê
|
||||
, 3)
|
||||
SAX.endElement(p)
|
||||
SAX.endElement(body)
|
||||
SAX.endElement(html)
|
||||
|
@ -2013,7 +2013,8 @@ SAX.characters(
|
||||
, 1)
|
||||
SAX.startElement(font, size='2', face='Arial, Helvetica, sans-serif', color='#000000')
|
||||
SAX.startElement(b)
|
||||
SAX.characters(Führer Furor, 13)
|
||||
SAX.characters(F, 1)
|
||||
SAX.characters(ührer Furor, 12)
|
||||
SAX.endElement(b)
|
||||
SAX.endElement(font)
|
||||
SAX.startElement(br)
|
||||
|
Loading…
x
Reference in New Issue
Block a user