parser: Rewrite push parser boundary checks

Remove inaccurate xmlParseCheckTransition check.

Remove non-incremental xmlParseGetLasts check.

Add functions that check for several boundary constructs more
accurately, keeping track of progress in ctxt->checkIndex.

Fixes #439.
This commit is contained in:
Nick Wellnhofer 2022-11-15 18:23:33 +01:00
parent 2059df5358
commit 68a6518c45
7 changed files with 362 additions and 475 deletions

View File

@ -5334,30 +5334,17 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
int base, len;
htmlParserInputPtr in;
const xmlChar *buf;
int invalue = 0;
char valdellim = 0x0;
int quote;
in = ctxt->input;
if (in == NULL)
return (-1);
base = in->cur - in->base;
if (base < 0)
return (-1);
base = ctxt->checkIndex;
quote = ctxt->endCheckState;
if (ctxt->checkIndex > base) {
base = ctxt->checkIndex;
/* Abuse hasPErefs member to restore current state. */
invalue = ctxt->hasPErefs & 1 ? 1 : 0;
}
if (in->buf == NULL) {
buf = in->base;
len = in->length;
} else {
buf = xmlBufContent(in->buf->buffer);
len = xmlBufUse(in->buf->buffer);
}
buf = in->cur;
len = in->end - in->cur;
/* take into account the sequence length */
if (third)
@ -5366,18 +5353,13 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
len--;
for (; base < len; base++) {
if (ignoreattrval) {
if (quote) {
if (buf[base] == quote)
quote = 0;
continue;
}
if (buf[base] == '"' || buf[base] == '\'') {
if (invalue) {
if (buf[base] == valdellim) {
invalue = 0;
continue;
}
} else {
valdellim = buf[base];
invalue = 1;
continue;
}
} else if (invalue) {
quote = buf[base];
continue;
}
}
@ -5390,29 +5372,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
continue;
}
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
"HPP: lookup '%c' found at %d\n",
first, base);
else if (third == 0)
xmlGenericError(xmlGenericErrorContext,
"HPP: lookup '%c%c' found at %d\n",
first, next, base);
else
xmlGenericError(xmlGenericErrorContext,
"HPP: lookup '%c%c%c' found at %d\n",
first, next, third, base);
#endif
return (base - (in->cur - in->base));
ctxt->endCheckState = 0;
return (base);
}
}
ctxt->checkIndex = base;
/* Abuse hasPErefs member to track current state. */
if (invalue)
ctxt->hasPErefs |= 1;
else
ctxt->hasPErefs &= ~1;
ctxt->endCheckState = quote;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
@ -5446,7 +5411,6 @@ static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
{
int mark = 0;
int cur = CUR_PTR - BASE_PTR;
while (mark >= 0) {
mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
@ -5455,7 +5419,7 @@ htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
return mark;
}
ctxt->checkIndex = cur + mark + 1;
ctxt->checkIndex = mark + 1;
}
return mark;
}
@ -6806,6 +6770,7 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
ctxt->vctxt.warning = xmlParserValidityWarning;
ctxt->record_info = 0;
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;

5
SAX2.c
View File

@ -387,6 +387,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
xmlCharEncoding enc;
int oldcharset;
const xmlChar *oldencoding;
int oldprogressive;
/*
* Ask the Entity resolver to load the damn thing
@ -409,7 +410,9 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
oldinputTab = ctxt->inputTab;
oldcharset = ctxt->charset;
oldencoding = ctxt->encoding;
oldprogressive = ctxt->progressive;
ctxt->encoding = NULL;
ctxt->progressive = 0;
ctxt->inputTab = (xmlParserInputPtr *)
xmlMalloc(5 * sizeof(xmlParserInputPtr));
@ -422,6 +425,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->inputTab = oldinputTab;
ctxt->charset = oldcharset;
ctxt->encoding = oldencoding;
ctxt->progressive = oldprogressive;
return;
}
ctxt->inputNr = 0;
@ -472,6 +476,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
(!xmlDictOwns(ctxt->dict, ctxt->encoding))))
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = oldencoding;
ctxt->progressive = oldprogressive;
/* ctxt->wellFormed = oldwellFormed; */
}
}

View File

@ -311,6 +311,8 @@ struct _xmlParserCtxt {
int input_id; /* we need to label inputs */
unsigned long sizeentcopy; /* volume of entity copy */
int endCheckState; /* quote state for push parser */
};
/**

679
parser.c
View File

@ -11074,142 +11074,231 @@ xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) {
************************************************************************/
/**
* xmlParseLookupSequence:
* xmlParseLookupChar:
* @ctxt: an XML parser context
* @first: the first char to lookup
* @next: the next char to lookup or zero
* @third: the next char to lookup or zero
* @c: character
*
* Try to find if a sequence (first, next, third) or just (first next) or
* (first) is available in the input stream.
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally.
*
* Returns the index to the current parsing point if the full sequence
* is available, -1 otherwise.
* Check whether the input buffer contains a character.
*/
static int
xmlParseLookupSequence(xmlParserCtxtPtr ctxt, xmlChar first,
xmlChar next, xmlChar third) {
int base, len;
xmlParserInputPtr in;
const xmlChar *buf;
xmlParseLookupChar(xmlParserCtxtPtr ctxt, int c) {
const xmlChar *cur;
in = ctxt->input;
if (in == NULL) return(-1);
base = in->cur - in->base;
if (base < 0) return(-1);
if (ctxt->checkIndex > base)
base = ctxt->checkIndex;
if (in->buf == NULL) {
buf = in->base;
len = in->length;
if (ctxt->checkIndex == 0) {
cur = ctxt->input->cur + 1;
} else {
buf = xmlBufContent(in->buf->buffer);
len = xmlBufUse(in->buf->buffer);
cur = ctxt->input->cur + ctxt->checkIndex;
}
/* take into account the sequence length */
if (third) len -= 2;
else if (next) len --;
for (;base < len;base++) {
if (buf[base] == first) {
if (third != 0) {
if ((buf[base + 1] != next) ||
(buf[base + 2] != third)) continue;
} else if (next != 0) {
if (buf[base + 1] != next) continue;
}
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c' found at %d\n",
first, base);
else if (third == 0)
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c%c' found at %d\n",
first, next, base);
else
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c%c%c' found at %d\n",
first, next, third, base);
#endif
return(base - (in->cur - in->base));
}
if (memchr(cur, c, ctxt->input->end - cur) == NULL) {
ctxt->checkIndex = ctxt->input->end - ctxt->input->cur;
return(0);
} else {
ctxt->checkIndex = 0;
return(1);
}
ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c' failed\n", first);
else if (third == 0)
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c%c' failed\n", first, next);
else
xmlGenericError(xmlGenericErrorContext,
"PP: lookup '%c%c%c' failed\n", first, next, third);
#endif
return(-1);
}
/**
* xmlParseGetLasts:
* xmlParseLookupString:
* @ctxt: an XML parser context
* @lastlt: pointer to store the last '<' from the input
* @lastgt: pointer to store the last '>' from the input
* @startDelta: delta to apply at the start
* @str: string
* @strLen: length of string
*
* Lookup the last < and > in the current chunk
* Check whether the input buffer contains a string.
*/
static void
xmlParseGetLasts(xmlParserCtxtPtr ctxt, const xmlChar **lastlt,
const xmlChar **lastgt) {
const xmlChar *tmp;
static const xmlChar *
xmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
const char *str, size_t strLen) {
const xmlChar *cur, *term;
if ((ctxt == NULL) || (lastlt == NULL) || (lastgt == NULL)) {
xmlGenericError(xmlGenericErrorContext,
"Internal error: xmlParseGetLasts\n");
return;
}
if ((ctxt->progressive != 0) && (ctxt->inputNr == 1)) {
tmp = ctxt->input->end;
tmp--;
while ((tmp >= ctxt->input->base) && (*tmp != '<')) tmp--;
if (tmp < ctxt->input->base) {
*lastlt = NULL;
*lastgt = NULL;
} else {
*lastlt = tmp;
tmp++;
while ((tmp < ctxt->input->end) && (*tmp != '>')) {
if (*tmp == '\'') {
tmp++;
while ((tmp < ctxt->input->end) && (*tmp != '\'')) tmp++;
if (tmp < ctxt->input->end) tmp++;
} else if (*tmp == '"') {
tmp++;
while ((tmp < ctxt->input->end) && (*tmp != '"')) tmp++;
if (tmp < ctxt->input->end) tmp++;
} else
tmp++;
}
if (tmp < ctxt->input->end)
*lastgt = tmp;
else {
tmp = *lastlt;
tmp--;
while ((tmp >= ctxt->input->base) && (*tmp != '>')) tmp--;
if (tmp >= ctxt->input->base)
*lastgt = tmp;
else
*lastgt = NULL;
}
}
if (ctxt->checkIndex == 0) {
cur = ctxt->input->cur + startDelta;
} else {
*lastlt = NULL;
*lastgt = NULL;
cur = ctxt->input->cur + ctxt->checkIndex;
}
term = BAD_CAST strstr((const char *) cur, str);
if (term == NULL) {
const xmlChar *end = ctxt->input->end;
/* Rescan (strLen - 1) characters. */
if ((size_t) (end - cur) < strLen)
end = cur;
else
end -= strLen - 1;
ctxt->checkIndex = end - ctxt->input->cur;
} else {
ctxt->checkIndex = 0;
}
return(term);
}
/**
 * xmlParseLookupCharData:
 * @ctxt:  an XML parser context
 *
 * Look for the end of a run of character data in the input buffer, i.e.
 * for the next '<' or '&'. Scanning resumes ctxt->checkIndex bytes past
 * the current input position so bytes inspected by an earlier call are
 * not looked at again.
 *
 * Returns 1 if a terminating character was found (ctxt->checkIndex is
 * reset to 0), 0 if the end of the buffer was reached first
 * (ctxt->checkIndex then records how far the scan got).
 */
static int
xmlParseLookupCharData(xmlParserCtxtPtr ctxt) {
    const xmlChar *base = ctxt->input->cur;
    const xmlChar *limit = ctxt->input->end;
    const xmlChar *p;

    for (p = base + ctxt->checkIndex; p < limit; p++) {
        if ((*p != '<') && (*p != '&'))
            continue;

        /* Terminator found, the next lookup starts from scratch. */
        ctxt->checkIndex = 0;
        return(1);
    }

    /* Remember the scanned distance for the next chunk. */
    ctxt->checkIndex = p - base;
    return(0);
}
/**
 * xmlParseLookupGt:
 * @ctxt:  an XML parser context
 *
 * Look for a '>' that is not enclosed in single or double quotes, which
 * is what is needed to finish parsing a start tag. The scan is
 * resumable: the distance already covered is kept in ctxt->checkIndex
 * and the currently open quote character (or 0) in ctxt->endCheckState.
 * A fresh scan (checkIndex == 0) skips the byte at the current position.
 *
 * Returns 1 if an unquoted '>' was found (both progress fields are reset
 * to 0), 0 if the end of the buffer was reached first.
 */
static int
xmlParseLookupGt(xmlParserCtxtPtr ctxt) {
    const xmlChar *base = ctxt->input->cur;
    const xmlChar *limit = ctxt->input->end;
    const xmlChar *p;
    int quote = ctxt->endCheckState;

    if (ctxt->checkIndex != 0)
        p = base + ctxt->checkIndex;
    else
        p = base + 1;

    for (; p < limit; p++) {
        if (quote != 0) {
            /* Inside a quoted value: only the matching quote matters. */
            if (*p == quote)
                quote = 0;
            continue;
        }

        switch (*p) {
            case '\'':
            case '"':
                quote = *p;
                break;
            case '>':
                ctxt->checkIndex = 0;
                ctxt->endCheckState = 0;
                return(1);
            default:
                break;
        }
    }

    /* Out of data: save position and quote state for the next call. */
    ctxt->checkIndex = p - base;
    ctxt->endCheckState = quote;
    return(0);
}
/**
* xmlParseLookupInternalSubset:
* @ctxt: an XML parser context
*
* Check whether there's enough data in the input buffer to finish parsing
* the internal subset.
*
* The scan is resumable: the distance already covered is kept in
* ctxt->checkIndex (an offset from ctxt->input->cur) and the scanner state
* in ctxt->endCheckState.
*
* Returns 1 if the terminating "']' S? '>'" was found (both progress
* fields are reset to 0), 0 if the end of the buffer was reached first.
*/
static int
xmlParseLookupInternalSubset(xmlParserCtxtPtr ctxt) {
/*
* Sorry, but progressive parsing of the internal subset is not
* supported. We first check that the full content of the internal
* subset is available and parsing is launched only at that point.
* Internal subset ends with "']' S? '>'" in an unescaped section and
* not in a ']]>' sequence which are conditional sections.
*/
/*
* Values held in `state`:
*   0          - not inside any tracked construct
*   '-'        - inside a comment, waiting for "-->"
*   ']'        - the previous character was ']'
*   ' '        - a ']' followed by one or more blanks was seen
*   '"' / '\'' - inside a quoted literal, waiting for the same quote
*
* `start` marks the earliest position the final rescan may back up to.
*/
const xmlChar *cur, *start;
const xmlChar *end = ctxt->input->end;
int state = ctxt->endCheckState;
if (ctxt->checkIndex == 0) {
/* Fresh scan: skip the byte at the current position. */
cur = ctxt->input->cur + 1;
} else {
cur = ctxt->input->cur + ctxt->checkIndex;
}
start = cur;
while (cur < end) {
if (state == '-') {
/*
* Inside a comment: only "-->" ends it.
* NOTE(review): cur[1] and cur[2] may lie at or past `end`;
* presumably the input buffer is NUL-terminated - confirm.
*/
if ((*cur == '-') &&
(cur[1] == '-') &&
(cur[2] == '>')) {
state = 0;
cur += 3;
start = cur;
continue;
}
}
else if (state == ']') {
/* After ']': a '>' ends the subset. */
if (*cur == '>') {
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
return(1);
}
if (IS_BLANK_CH(*cur)) {
state = ' ';
} else if (*cur != ']') {
/*
* Neither blank nor another ']': go back to state 0 and
* re-examine this same character (cur is not advanced).
*/
state = 0;
start = cur;
continue;
}
/* Another ']' keeps the state and falls through to cur++. */
}
else if (state == ' ') {
/* After "]" plus blanks: '>' ends the subset. */
if (*cur == '>') {
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
return(1);
}
if (!IS_BLANK_CH(*cur)) {
/* Re-examine this character in state 0 (cur not advanced). */
state = 0;
start = cur;
continue;
}
}
else if (state != 0) {
/* Inside a quoted literal: wait for the matching quote. */
if (*cur == state) {
state = 0;
start = cur + 1;
}
}
else if (*cur == '<') {
/*
* State 0: detect the start of a comment.
* NOTE(review): the cur[1..3] lookahead has the same
* end-of-buffer assumption as the "-->" check - confirm.
*/
if ((cur[1] == '!') &&
(cur[2] == '-') &&
(cur[3] == '-')) {
state = '-';
cur += 4;
/* Don't treat <!--> as comment */
start = cur;
continue;
}
}
else if ((*cur == '"') || (*cur == '\'') || (*cur == ']')) {
/* State 0: the character itself becomes the new state. */
state = *cur;
}
cur++;
}
/*
* Rescan the three last characters to detect "<!--" and "-->"
* split across chunks.
*/
/*
* Only states 0 and '-' use multi-character lookahead, so only they
* back up, and never to a position before `start`.
*/
if ((state == 0) || (state == '-')) {
if (cur - start < 3)
cur = start;
else
cur -= 3;
}
/* Save progress so the next call resumes here. */
ctxt->checkIndex = cur - ctxt->input->cur;
ctxt->endCheckState = state;
return(0);
}
/**
* xmlCheckCdataPush:
* @cur: pointer to the block of characters
@ -11292,7 +11381,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
int ret = 0;
int avail, tlen;
xmlChar cur, next;
const xmlChar *lastlt, *lastgt;
if (ctxt->input == NULL)
return(0);
@ -11353,9 +11441,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if ((ctxt->input != NULL) &&
(ctxt->input->cur - ctxt->input->base > 4096)) {
xmlParserInputShrink(ctxt->input);
ctxt->checkIndex = 0;
}
xmlParseGetLasts(ctxt, &lastlt, &lastgt);
while (ctxt->instate != XML_PARSER_EOF) {
if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1))
@ -11442,10 +11528,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
}
if ((cur == '<') && (next == '?')) {
/* PI or XML decl */
if (avail < 5) return(ret);
if (avail < 5) goto done;
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '?', '>', 0) < 0))
return(ret);
(!xmlParseLookupString(ctxt, 2, "?>", 2)))
goto done;
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
&xmlDefaultSAXLocator);
@ -11526,15 +11612,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
ctxt->sax->endDocument(ctxt->userData);
goto done;
}
if (!terminate) {
if (ctxt->progressive) {
/* > can be found unescaped in attribute values */
if ((lastgt == NULL) || (ctxt->input->cur >= lastgt))
goto done;
} else if (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0) {
goto done;
}
}
if ((!terminate) && (!xmlParseLookupGt(ctxt)))
goto done;
if (ctxt->spaceNr == 0)
spacePush(ctxt, -1);
else if (*ctxt->space == -2)
@ -11599,7 +11678,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
} else {
ctxt->instate = XML_PARSER_CONTENT;
}
ctxt->progressive = 1;
break;
}
if (RAW == '>') {
@ -11614,7 +11692,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
nameNsPush(ctxt, name, prefix, URI, line, ctxt->nsNr - nsNr);
ctxt->instate = XML_PARSER_CONTENT;
ctxt->progressive = 1;
break;
}
case XML_PARSER_CONTENT: {
@ -11628,33 +11705,21 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
break;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) {
ctxt->progressive = XML_PARSER_PI;
(!xmlParseLookupString(ctxt, 2, "?>", 2)))
goto done;
}
xmlParsePI(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
ctxt->progressive = 1;
} else if ((cur == '<') && (next != '!')) {
ctxt->instate = XML_PARSER_START_TAG;
break;
} else if ((cur == '<') && (next == '!') &&
(ctxt->input->cur[2] == '-') &&
(ctxt->input->cur[3] == '-')) {
int term;
if (avail < 4)
goto done;
ctxt->input->cur += 4;
term = xmlParseLookupSequence(ctxt, '-', '-', '>');
ctxt->input->cur -= 4;
if ((!terminate) && (term < 0)) {
ctxt->progressive = XML_PARSER_COMMENT;
if ((!terminate) &&
(!xmlParseLookupString(ctxt, 4, "-->", 3)))
goto done;
}
xmlParseComment(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
ctxt->progressive = 1;
} else if ((cur == '<') && (ctxt->input->cur[1] == '!') &&
(ctxt->input->cur[2] == '[') &&
(ctxt->input->cur[3] == 'C') &&
@ -11674,8 +11739,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
"detected an error in element content\n");
SKIP(1);
} else if (cur == '&') {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
if ((!terminate) && (!xmlParseLookupChar(ctxt, ';')))
goto done;
xmlParseReference(ctxt);
} else {
@ -11693,18 +11757,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
*/
if ((ctxt->inputNr == 1) &&
(avail < XML_PARSER_BIG_BUFFER_SIZE)) {
if (!terminate) {
if (ctxt->progressive) {
if ((lastlt == NULL) ||
(ctxt->input->cur > lastlt))
goto done;
} else if (xmlParseLookupSequence(ctxt,
'<', 0, 0) < 0) {
goto done;
}
}
if ((!terminate) && (!xmlParseLookupCharData(ctxt)))
goto done;
}
ctxt->checkIndex = 0;
ctxt->checkIndex = 0;
xmlParseCharData(ctxt, 0);
}
break;
@ -11712,15 +11768,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
case XML_PARSER_END_TAG:
if (avail < 2)
goto done;
if (!terminate) {
if (ctxt->progressive) {
/* > can be found unescaped in attribute values */
if ((lastgt == NULL) || (ctxt->input->cur >= lastgt))
goto done;
} else if (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0) {
goto done;
}
}
if ((!terminate) && (!xmlParseLookupChar(ctxt, '>')))
goto done;
if (ctxt->sax2) {
xmlParseEndTag2(ctxt, &ctxt->pushTab[ctxt->nameNr - 1]);
nameNsPop(ctxt);
@ -11742,35 +11791,35 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
* The Push mode need to have the SAX callback for
* cdataBlock merge back contiguous callbacks.
*/
int base;
const xmlChar *term;
base = xmlParseLookupSequence(ctxt, ']', ']', '>');
if (base < 0) {
if (avail >= XML_PARSER_BIG_BUFFER_SIZE + 2) {
int tmp;
term = xmlParseLookupString(ctxt, 0, "]]>", 3);
if (term == NULL) {
int tmp;
tmp = xmlCheckCdataPush(ctxt->input->cur,
XML_PARSER_BIG_BUFFER_SIZE, 0);
if (tmp < 0) {
tmp = -tmp;
ctxt->input->cur += tmp;
goto encoding_error;
}
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
ctxt->sax->cdataBlock(ctxt->userData,
ctxt->input->cur, tmp);
else if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, tmp);
}
if (ctxt->instate == XML_PARSER_EOF)
goto done;
SKIPL(tmp);
ctxt->checkIndex = 0;
}
goto done;
if (avail < XML_PARSER_BIG_BUFFER_SIZE + 2)
goto done;
ctxt->checkIndex = 0;
tmp = xmlCheckCdataPush(ctxt->input->cur,
XML_PARSER_BIG_BUFFER_SIZE, 0);
if (tmp < 0) {
tmp = -tmp;
ctxt->input->cur += tmp;
goto encoding_error;
}
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
ctxt->sax->cdataBlock(ctxt->userData,
ctxt->input->cur, tmp);
else if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, tmp);
}
if (ctxt->instate == XML_PARSER_EOF)
goto done;
SKIPL(tmp);
} else {
int base = term - CUR_PTR;
int tmp;
tmp = xmlCheckCdataPush(ctxt->input->cur, base, 1);
@ -11804,7 +11853,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
SKIPL(base + 3);
ctxt->checkIndex = 0;
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@ -11827,10 +11875,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
next = ctxt->input->cur[1];
if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) {
ctxt->progressive = XML_PARSER_PI;
(!xmlParseLookupString(ctxt, 2, "?>", 2)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing PI\n");
@ -11839,16 +11885,12 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_MISC;
ctxt->progressive = 1;
ctxt->checkIndex = 0;
} else if ((cur == '<') && (next == '!') &&
(ctxt->input->cur[2] == '-') &&
(ctxt->input->cur[3] == '-')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) {
ctxt->progressive = XML_PARSER_COMMENT;
(!xmlParseLookupString(ctxt, 4, "-->", 3)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing Comment\n");
@ -11857,8 +11899,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_MISC;
ctxt->progressive = 1;
ctxt->checkIndex = 0;
} else if ((cur == '<') && (next == '!') &&
(ctxt->input->cur[2] == 'D') &&
(ctxt->input->cur[3] == 'O') &&
@ -11867,18 +11907,13 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
(ctxt->input->cur[6] == 'Y') &&
(ctxt->input->cur[7] == 'P') &&
(ctxt->input->cur[8] == 'E')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) {
ctxt->progressive = XML_PARSER_DTD;
goto done;
}
if ((!terminate) && (!xmlParseLookupGt(ctxt)))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing internal subset\n");
#endif
ctxt->inSubset = 1;
ctxt->progressive = 0;
ctxt->checkIndex = 0;
xmlParseDocTypeDecl(ctxt);
if (ctxt->instate == XML_PARSER_EOF)
goto done;
@ -11911,8 +11946,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
goto done;
} else {
ctxt->instate = XML_PARSER_START_TAG;
ctxt->progressive = XML_PARSER_START_TAG;
xmlParseGetLasts(ctxt, &lastlt, &lastgt);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: entering START_TAG\n");
@ -11932,10 +11965,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
next = ctxt->input->cur[1];
if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) {
ctxt->progressive = XML_PARSER_PI;
(!xmlParseLookupString(ctxt, 2, "?>", 2)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing PI\n");
@ -11944,14 +11975,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_PROLOG;
ctxt->progressive = 1;
} else if ((cur == '<') && (next == '!') &&
(ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) {
ctxt->progressive = XML_PARSER_COMMENT;
(!xmlParseLookupString(ctxt, 4, "-->", 3)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing Comment\n");
@ -11960,15 +11988,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_PROLOG;
ctxt->progressive = 1;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
} else {
ctxt->instate = XML_PARSER_START_TAG;
if (ctxt->progressive == 0)
ctxt->progressive = XML_PARSER_START_TAG;
xmlParseGetLasts(ctxt, &lastlt, &lastgt);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: entering START_TAG\n");
@ -11988,10 +12012,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
next = ctxt->input->cur[1];
if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) {
ctxt->progressive = XML_PARSER_PI;
(!xmlParseLookupString(ctxt, 2, "?>", 2)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing PI\n");
@ -12000,14 +12022,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_EPILOG;
ctxt->progressive = 1;
} else if ((cur == '<') && (next == '!') &&
(ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) {
if ((!terminate) &&
(xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) {
ctxt->progressive = XML_PARSER_COMMENT;
(!xmlParseLookupString(ctxt, 4, "-->", 3)))
goto done;
}
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: Parsing Comment\n");
@ -12016,7 +12035,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_EPILOG;
ctxt->progressive = 1;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
@ -12033,117 +12051,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
}
break;
case XML_PARSER_DTD: {
/*
* Sorry but progressive parsing of the internal subset
* is not expected to be supported. We first check that
* the full content of the internal subset is available and
* the parsing is launched only at that point.
* Internal subset ends up with "']' S? '>'" in an unescaped
* section and not in a ']]>' sequence which are conditional
* sections (whoever argued to keep that crap in XML deserve
* a place in hell !).
*/
int base, i;
xmlChar *buf;
xmlChar quote = 0;
size_t use;
base = ctxt->input->cur - ctxt->input->base;
if (base < 0) return(0);
if (ctxt->checkIndex > base)
base = ctxt->checkIndex;
buf = xmlBufContent(ctxt->input->buf->buffer);
use = xmlBufUse(ctxt->input->buf->buffer);
for (;(unsigned int) base < use; base++) {
if (quote != 0) {
if (buf[base] == quote)
quote = 0;
continue;
}
if ((quote == 0) && (buf[base] == '<')) {
int found = 0;
/* special handling of comments */
if (((unsigned int) base + 4 < use) &&
(buf[base + 1] == '!') &&
(buf[base + 2] == '-') &&
(buf[base + 3] == '-')) {
for (;(unsigned int) base + 3 < use; base++) {
if ((buf[base] == '-') &&
(buf[base + 1] == '-') &&
(buf[base + 2] == '>')) {
found = 1;
base += 2;
break;
}
}
if (!found) {
#if 0
fprintf(stderr, "unfinished comment\n");
#endif
break; /* for */
}
continue;
}
}
if (buf[base] == '"') {
quote = '"';
continue;
}
if (buf[base] == '\'') {
quote = '\'';
continue;
}
if (buf[base] == ']') {
#if 0
fprintf(stderr, "%c%c%c%c: ", buf[base],
buf[base + 1], buf[base + 2], buf[base + 3]);
#endif
if ((unsigned int) base +1 >= use)
break;
if (buf[base + 1] == ']') {
/* conditional crap, skip both ']' ! */
base++;
continue;
}
for (i = 1; (unsigned int) base + i < use; i++) {
if (buf[base + i] == '>') {
#if 0
fprintf(stderr, "found\n");
#endif
goto found_end_int_subset;
}
if (!IS_BLANK_CH(buf[base + i])) {
#if 0
fprintf(stderr, "not found\n");
#endif
goto not_end_of_int_subset;
}
}
#if 0
fprintf(stderr, "end of stream\n");
#endif
break;
}
not_end_of_int_subset:
continue; /* for */
}
/*
* We didn't found the end of the Internal subset
*/
if (quote == 0)
ctxt->checkIndex = base;
else
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
"PP: lookup of int subset end filed\n");
#endif
goto done;
found_end_int_subset:
ctxt->checkIndex = 0;
if ((!terminate) && (!xmlParseLookupInternalSubset(ctxt)))
goto done;
xmlParseInternalSubset(ctxt);
if (ctxt->instate == XML_PARSER_EOF)
goto done;
@ -12157,7 +12066,6 @@ found_end_int_subset:
if (ctxt->instate == XML_PARSER_EOF)
goto done;
ctxt->instate = XML_PARSER_PROLOG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"PP: entering PROLOG\n");
@ -12257,55 +12165,6 @@ encoding_error:
return(0);
}
/**
* xmlParseCheckTransition:
* @ctxt: an XML parser context
* @chunk: a char array
* @size: the size in byte of the chunk
*
* Check depending on the current parser state if the chunk given must be
* processed immediately or one need more data to advance on parsing.
*
* Returns -1 in case of error, 0 if the push is not needed and 1 if needed
*/
static int
xmlParseCheckTransition(xmlParserCtxtPtr ctxt, const char *chunk, int size) {
if ((ctxt == NULL) || (chunk == NULL) || (size < 0))
return(-1);
if (ctxt->instate == XML_PARSER_START_TAG) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
if (ctxt->progressive == XML_PARSER_COMMENT) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
if (ctxt->instate == XML_PARSER_CDATA_SECTION) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
if (ctxt->progressive == XML_PARSER_PI) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
if (ctxt->instate == XML_PARSER_END_TAG) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
if ((ctxt->progressive == XML_PARSER_DTD) ||
(ctxt->instate == XML_PARSER_DTD)) {
if (memchr(chunk, '>', size) != NULL)
return(1);
return(0);
}
return(1);
}
/**
* xmlParseChunk:
* @ctxt: an XML parser context
@ -12322,8 +12181,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
int terminate) {
int end_in_lf = 0;
int remain = 0;
size_t old_avail = 0;
size_t avail = 0;
if (ctxt == NULL)
return(XML_ERR_INTERNAL_ERROR);
@ -12331,6 +12188,10 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
return(ctxt->errNo);
if (ctxt->instate == XML_PARSER_EOF)
return(-1);
if (ctxt->input == NULL)
return(-1);
ctxt->progressive = 1;
if (ctxt->instate == XML_PARSER_START)
xmlDetectSAX2(ctxt);
if ((size > 0) && (chunk != NULL) && (!terminate) &&
@ -12347,7 +12208,6 @@ xmldecl_done:
size_t cur = ctxt->input->cur - ctxt->input->base;
int res;
old_avail = xmlBufUse(ctxt->input->buf->buffer);
/*
* Specific handling if we autodetected an encoding, we should not
* push more than the first line ... which depend on the encoding
@ -12415,23 +12275,11 @@ xmldecl_done:
}
}
}
if (remain != 0) {
xmlParseTryOrFinish(ctxt, 0);
} else {
if ((ctxt->input != NULL) && (ctxt->input->buf != NULL))
avail = xmlBufUse(ctxt->input->buf->buffer);
/*
* Depending on the current state it may not be such
* a good idea to try parsing if there is nothing in the chunk
* which would be worth doing a parser state transition and we
* need to wait for more data
*/
if ((terminate) || (avail > XML_MAX_TEXT_LENGTH) ||
(old_avail == 0) || (avail == 0) ||
(xmlParseCheckTransition(ctxt,
(const char *)&ctxt->input->base[old_avail],
avail - old_avail)))
xmlParseTryOrFinish(ctxt, terminate);
xmlParseTryOrFinish(ctxt, terminate);
}
if (ctxt->instate == XML_PARSER_EOF)
return(ctxt->errNo);
@ -14895,6 +14743,7 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt)
#endif
ctxt->record_info = 0;
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;

View File

@ -1,4 +1,15 @@
./test/errors/754946.xml:1: parser error : Extra content at the end of the document
<!DOCTYPE A [
^
./test/errors/754946.xml:3: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
%SYSTEM;
^
Entity: line 1:
A<lbbbbbbbbbbbbbbbbbbb_
^
./test/errors/754946.xml:4: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
<![
^
./test/errors/754946.xml:4: parser error : DOCTYPE improperly terminated
<![
^
./test/errors/754946.xml : failed to parse

View File

@ -1,4 +1,33 @@
./test/errors/759573-2.xml:2: parser error : Extra content at the end of the document
<!DOCTYPE test [
^
Entity: line 1: parser error : Space required after '<!ENTITY'
%zz;
^
Entity: line 1:
<!ENTITY<?xDOCTYPEm~?>
^
Entity: line 1: parser error : xmlParseEntityDecl: no name
%zz;
^
Entity: line 1:
<!ENTITY<?xDOCTYPEm~?>
^
Entity: line 1: parser error : ParsePI: PI xDOCTYPEm space expected
%zz;
^
Entity: line 1:
<!ENTITY<?xDOCTYPEm~?>
^
./test/errors/759573-2.xml:6: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
%xx;ÿggKENSMYNT&#35;MENTD&#372zz;'>
^
Entity: line 2:
<![INCLUDE[
^
./test/errors/759573-2.xml:6: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
%xx;ÿggKENSMYNT&#35;MENTD&#372zz;'>
^
./test/errors/759573-2.xml:6: parser error : DOCTYPE improperly terminated
%xx;ÿggKENSMYNT&#35;MENTD&#372zz;'>
^
./test/errors/759573-2.xml : failed to parse

View File

@ -1,4 +1,30 @@
./test/errors/759573.xml:1: parser error : Extra content at the end of the document
<?h?><!DOCTYPEt[<!ELEMENT t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;00
^
./test/errors/759573.xml:1: parser error : Space required after '<!ENTITY'
ELEMENT t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITY
^
./test/errors/759573.xml:1: parser error : Space required after the entity name
LEMENT t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz
^
./test/errors/759573.xml:1: parser error : Entity value required
LEMENT t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz
^
./test/errors/759573.xml:1: parser error : PEReference: no name
T t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz>%xx;
^
Entity: line 1:
%<![INCLUDE[000%ஸ000%z;
^
./test/errors/759573.xml:1: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
T t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz>%xx;
^
Entity: line 1:
%<![INCLUDE[000%ஸ000%z;
^
./test/errors/759573.xml:1: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration
T t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz>%xx;
^
./test/errors/759573.xml:1: parser error : DOCTYPE improperly terminated
T t (A)><!ENTITY % xx '&#37;<![INCLUDE[000&#37;&#3000;000&#37;z;'><!ENTITYz>%xx;
^
./test/errors/759573.xml : failed to parse