html: Rework htmlLookupSequence

Rename to htmlLookupString and use strstr for increased performance.
This commit is contained in:
Nick Wellnhofer 2023-05-04 17:47:38 +02:00
parent 637215a4de
commit dc2d498318

View File

@ -5265,67 +5265,54 @@ htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
} }
/** /**
* htmlParseLookupSequence: * htmlParseLookupString:
* @ctxt: an HTML parser context * @ctxt: an XML parser context
* @first: the first char to lookup * @startDelta: delta to apply at the start
* @next: the next char to lookup or zero * @str: string
* @third: the next char to lookup or zero * @strLen: length of string
* *
* Try to find if a sequence (first, next, third) or just (first next) or * Check whether the input buffer contains a string.
* (first) is available in the input stream.
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally.
* This is basically similar to xmlParseLookupSequence()
*
* Returns the index to the current parsing point if the full sequence
* is available, -1 otherwise.
*/ */
static int static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
xmlChar next, xmlChar third) { const char *str, size_t strLen) {
size_t base, len; const xmlChar *cur, *term;
htmlParserInputPtr in; int ret;
const xmlChar *buf;
int quote;
in = ctxt->input; if (ctxt->checkIndex == 0) {
if (in == NULL) cur = ctxt->input->cur + startDelta;
return (-1); } else {
cur = ctxt->input->cur + ctxt->checkIndex;
base = ctxt->checkIndex;
quote = ctxt->endCheckState;
buf = in->cur;
len = in->end - in->cur;
/* take into account the sequence length */
if (third)
len -= 2;
else if (next)
len--;
for (; base < len; base++) {
if (base >= INT_MAX / 2) {
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
return (base - 2);
}
if (buf[base] == first) {
if (third != 0) {
if ((buf[base + 1] != next) || (buf[base + 2] != third))
continue;
} else if (next != 0) {
if (buf[base + 1] != next)
continue;
}
ctxt->checkIndex = 0;
ctxt->endCheckState = 0;
return (base);
}
} }
ctxt->checkIndex = base;
ctxt->endCheckState = quote; term = BAD_CAST strstr((const char *) cur, str);
return (-1); if (term == NULL) {
const xmlChar *end = ctxt->input->end;
size_t index;
/* Rescan (strLen - 1) characters. */
if ((size_t) (end - cur) < strLen)
end = cur;
else
end -= strLen - 1;
index = end - ctxt->input->cur;
if (index > INT_MAX / 2) {
ctxt->checkIndex = 0;
ret = INT_MAX / 2;
} else {
ctxt->checkIndex = index;
ret = -1;
}
} else {
ctxt->checkIndex = 0;
if (term - ctxt->input->cur > INT_MAX / 2)
ret = INT_MAX / 2;
else
ret = term - ctxt->input->cur;
}
return(ret);
} }
/** /**
@ -5338,7 +5325,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
* This function has a side effect of (possibly) incrementing ctxt->checkIndex * This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the * to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally. * parser, do not use liberally.
* This wraps to htmlParseLookupSequence()
* *
* Returns the index to the current parsing point if the full sequence is available, -1 otherwise. * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
*/ */
@ -5349,7 +5335,7 @@ htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
int offset; int offset;
while (1) { while (1) {
mark = htmlParseLookupSequence(ctxt, '-', '-', 0); mark = htmlParseLookupString(ctxt, 2, "--", 2);
if (mark < 0) if (mark < 0)
break; break;
if ((NXT(mark+2) == '>') || if ((NXT(mark+2) == '>') ||
@ -5457,7 +5443,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 9, ">", 1) < 0))
goto done; goto done;
htmlParseDocTypeDecl(ctxt); htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG; ctxt->instate = XML_PARSER_PROLOG;
@ -5493,7 +5479,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_MISC; ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1) < 0))
goto done; goto done;
htmlParsePI(ctxt); htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_MISC; ctxt->instate = XML_PARSER_MISC;
@ -5503,7 +5489,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 9, ">", 1) < 0))
goto done; goto done;
htmlParseDocTypeDecl(ctxt); htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG; ctxt->instate = XML_PARSER_PROLOG;
@ -5529,7 +5515,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_PROLOG; ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1) < 0))
goto done; goto done;
htmlParsePI(ctxt); htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_PROLOG; ctxt->instate = XML_PARSER_PROLOG;
@ -5560,7 +5546,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_EPILOG; ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1) < 0))
goto done; goto done;
htmlParsePI(ctxt); htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_EPILOG; ctxt->instate = XML_PARSER_EPILOG;
@ -5732,7 +5718,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int idx; int idx;
xmlChar val; xmlChar val;
idx = htmlParseLookupSequence(ctxt, '<', '/', 0); idx = htmlParseLookupString(ctxt, 0, "</", 2);
if (idx < 0) if (idx < 0)
goto done; goto done;
val = in->cur[idx + 2]; val = in->cur[idx + 2];
@ -5762,7 +5748,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 9, ">", 1) < 0))
goto done; goto done;
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n", "Misplaced DOCTYPE declaration\n",
@ -5776,13 +5762,13 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
} else { } else {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1) < 0))
goto done; goto done;
htmlSkipBogusComment(ctxt); htmlSkipBogusComment(ctxt);
} }
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1) < 0))
goto done; goto done;
htmlParsePI(ctxt); htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
@ -5810,7 +5796,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
* data detection. * data detection.
*/ */
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '<', 0, 0) < 0)) (htmlParseLookupString(ctxt, 0, "<", 1) < 0))
goto done; goto done;
ctxt->checkIndex = 0; ctxt->checkIndex = 0;
while ((PARSER_STOPPED(ctxt) == 0) && while ((PARSER_STOPPED(ctxt) == 0) &&