mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00
html: Fix DOCTYPE parsing
This commit is contained in:
parent
9678163f54
commit
6edf1a645e
385
HTMLparser.c
385
HTMLparser.c
@ -80,23 +80,6 @@ htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
|
||||
str1, str2, NULL, 0, msg, str1, str2);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseErrInt:
|
||||
* @ctxt: an HTML parser context
|
||||
* @error: the error number
|
||||
* @msg: the error message
|
||||
* @val: integer info
|
||||
*
|
||||
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
|
||||
*/
|
||||
static void LIBXML_ATTR_FORMAT(3,0)
|
||||
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
|
||||
const char *msg, int val)
|
||||
{
|
||||
xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
|
||||
NULL, NULL, NULL, val, msg, val);
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* Parser stacks related functions and macros *
|
||||
@ -2996,125 +2979,6 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseSystemLiteral:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* parse an HTML Literal
|
||||
*
|
||||
* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
|
||||
*
|
||||
* Returns the SystemLiteral parsed or NULL
|
||||
*/
|
||||
|
||||
static xmlChar *
|
||||
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
|
||||
size_t len = 0, startPosition = 0;
|
||||
int err = 0;
|
||||
int quote;
|
||||
xmlChar *ret = NULL;
|
||||
|
||||
if ((CUR != '"') && (CUR != '\'')) {
|
||||
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
|
||||
"SystemLiteral \" or ' expected\n", NULL, NULL);
|
||||
return(NULL);
|
||||
}
|
||||
quote = CUR;
|
||||
NEXT;
|
||||
|
||||
if (CUR_PTR < BASE_PTR)
|
||||
return(ret);
|
||||
startPosition = CUR_PTR - BASE_PTR;
|
||||
|
||||
while ((PARSER_STOPPED(ctxt) == 0) &&
|
||||
(CUR != 0) && (CUR != quote) && (CUR != '>')) {
|
||||
/* TODO: Handle UTF-8 */
|
||||
if (!IS_CHAR_CH(CUR)) {
|
||||
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Invalid char in SystemLiteral 0x%X\n", CUR);
|
||||
err = 1;
|
||||
}
|
||||
NEXT;
|
||||
len++;
|
||||
}
|
||||
if (CUR != quote) {
|
||||
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
|
||||
"Unfinished SystemLiteral\n", NULL, NULL);
|
||||
} else {
|
||||
if (err == 0) {
|
||||
ret = xmlStrndup((BASE_PTR+startPosition), len);
|
||||
if (ret == NULL) {
|
||||
htmlErrMemory(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
NEXT;
|
||||
}
|
||||
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParsePubidLiteral:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* parse an HTML public literal
|
||||
*
|
||||
* [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
|
||||
*
|
||||
* Returns the PubidLiteral parsed or NULL.
|
||||
*/
|
||||
|
||||
static xmlChar *
|
||||
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
|
||||
size_t len = 0, startPosition = 0;
|
||||
int err = 0;
|
||||
int quote;
|
||||
xmlChar *ret = NULL;
|
||||
|
||||
if ((CUR != '"') && (CUR != '\'')) {
|
||||
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
|
||||
"PubidLiteral \" or ' expected\n", NULL, NULL);
|
||||
return(NULL);
|
||||
}
|
||||
quote = CUR;
|
||||
NEXT;
|
||||
|
||||
/*
|
||||
* Name ::= (Letter | '_') (NameChar)*
|
||||
*/
|
||||
if (CUR_PTR < BASE_PTR)
|
||||
return(ret);
|
||||
startPosition = CUR_PTR - BASE_PTR;
|
||||
|
||||
while ((PARSER_STOPPED(ctxt) == 0) &&
|
||||
(CUR != 0) && (CUR != quote) && (CUR != '>')) {
|
||||
if (!IS_PUBIDCHAR_CH(CUR)) {
|
||||
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Invalid char in PubidLiteral 0x%X\n", CUR);
|
||||
err = 1;
|
||||
}
|
||||
len++;
|
||||
NEXT;
|
||||
}
|
||||
|
||||
if (CUR != quote) {
|
||||
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
|
||||
"Unfinished PubidLiteral\n", NULL, NULL);
|
||||
} else {
|
||||
if (err == 0) {
|
||||
ret = xmlStrndup((BASE_PTR + startPosition), len);
|
||||
if (ret == NULL) {
|
||||
htmlErrMemory(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
NEXT;
|
||||
}
|
||||
|
||||
return(ret);
|
||||
}
|
||||
|
||||
static void
|
||||
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
|
||||
int size, int mode) {
|
||||
@ -3281,64 +3145,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
return(res);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseExternalID:
|
||||
* @ctxt: an HTML parser context
|
||||
* @publicID: a xmlChar** receiving PubidLiteral
|
||||
*
|
||||
* Parse an External ID or a Public ID
|
||||
*
|
||||
* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
|
||||
* | 'PUBLIC' S PubidLiteral S SystemLiteral
|
||||
*
|
||||
* [83] PublicID ::= 'PUBLIC' S PubidLiteral
|
||||
*
|
||||
* Returns the function returns SystemLiteral and in the second
|
||||
* case publicID receives PubidLiteral, is strict is off
|
||||
* it is possible to return NULL and have publicID set.
|
||||
*/
|
||||
|
||||
static xmlChar *
|
||||
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
|
||||
xmlChar *URI = NULL;
|
||||
|
||||
if ((UPPER == 'S') && (UPP(1) == 'Y') &&
|
||||
(UPP(2) == 'S') && (UPP(3) == 'T') &&
|
||||
(UPP(4) == 'E') && (UPP(5) == 'M')) {
|
||||
SKIP(6);
|
||||
if (!IS_BLANK_CH(CUR)) {
|
||||
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
|
||||
"Space required after 'SYSTEM'\n", NULL, NULL);
|
||||
}
|
||||
SKIP_BLANKS;
|
||||
URI = htmlParseSystemLiteral(ctxt);
|
||||
if (URI == NULL) {
|
||||
htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
|
||||
"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
|
||||
}
|
||||
} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
|
||||
(UPP(2) == 'B') && (UPP(3) == 'L') &&
|
||||
(UPP(4) == 'I') && (UPP(5) == 'C')) {
|
||||
SKIP(6);
|
||||
if (!IS_BLANK_CH(CUR)) {
|
||||
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
|
||||
"Space required after 'PUBLIC'\n", NULL, NULL);
|
||||
}
|
||||
SKIP_BLANKS;
|
||||
*publicID = htmlParsePubidLiteral(ctxt);
|
||||
if (*publicID == NULL) {
|
||||
htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
|
||||
"htmlParseExternalID: PUBLIC, no Public Identifier\n",
|
||||
NULL, NULL);
|
||||
}
|
||||
SKIP_BLANKS;
|
||||
if ((CUR == '"') || (CUR == '\'')) {
|
||||
URI = htmlParseSystemLiteral(ctxt);
|
||||
}
|
||||
}
|
||||
return(URI);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseComment:
|
||||
* @ctxt: an HTML parser context
|
||||
@ -3515,21 +3321,92 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* htmlParseDoctypeLiteral:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Parse a DOCTYPE SYTSTEM or PUBLIC literal.
|
||||
*
|
||||
* Returns the literal or NULL in case of error.
|
||||
*/
|
||||
|
||||
static xmlChar *
|
||||
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
|
||||
xmlChar *buf = NULL;
|
||||
int len;
|
||||
int size = HTML_PARSER_BUFFER_SIZE;
|
||||
int quote, cur, l;
|
||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
||||
XML_MAX_TEXT_LENGTH :
|
||||
XML_MAX_NAME_LENGTH;
|
||||
|
||||
if ((CUR != '"') && (CUR != '\''))
|
||||
return(NULL);
|
||||
quote = CUR;
|
||||
NEXT;
|
||||
|
||||
buf = xmlMalloc(size);
|
||||
if (buf == NULL) {
|
||||
htmlErrMemory(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
len = 0;
|
||||
|
||||
while (ctxt->input->cur < ctxt->input->end) {
|
||||
cur = CUR_CHAR(l);
|
||||
|
||||
if (cur == '>')
|
||||
break;
|
||||
|
||||
if (cur == quote) {
|
||||
SKIP(1);
|
||||
break;
|
||||
}
|
||||
|
||||
if (len + 5 >= size) {
|
||||
xmlChar *tmp;
|
||||
|
||||
size *= 2;
|
||||
tmp = (xmlChar *) xmlRealloc(buf, size);
|
||||
if (tmp == NULL) {
|
||||
xmlFree(buf);
|
||||
htmlErrMemory(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
buf = tmp;
|
||||
}
|
||||
|
||||
COPY_BUF(buf,len,cur);
|
||||
if (len > maxLength) {
|
||||
htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
|
||||
"identifier too long", NULL, NULL);
|
||||
xmlFree(buf);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
NEXTL(l);
|
||||
}
|
||||
|
||||
buf[len] = 0;
|
||||
return(buf);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseDocTypeDecl:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* parse a DOCTYPE declaration
|
||||
*
|
||||
* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
|
||||
* ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
|
||||
* Parse a DOCTYPE declaration.
|
||||
*/
|
||||
|
||||
static void
|
||||
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
|
||||
const xmlChar *name;
|
||||
xmlChar *ExternalID = NULL;
|
||||
xmlChar *name = NULL;
|
||||
xmlChar *publicId = NULL;
|
||||
xmlChar *URI = NULL;
|
||||
int nameCap, nameSize;
|
||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
||||
XML_MAX_TEXT_LENGTH :
|
||||
XML_MAX_NAME_LENGTH;
|
||||
|
||||
/*
|
||||
* We know that '<!DOCTYPE' has been detected.
|
||||
@ -3538,15 +3415,54 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
|
||||
|
||||
SKIP_BLANKS;
|
||||
|
||||
/*
|
||||
* Parse the DOCTYPE name.
|
||||
*/
|
||||
name = htmlParseName(ctxt);
|
||||
if (name == NULL) {
|
||||
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
||||
"htmlParseDocTypeDecl : no DOCTYPE name !\n",
|
||||
NULL, NULL);
|
||||
nameCap = 0;
|
||||
nameSize = 0;
|
||||
while (ctxt->input->cur < ctxt->input->end) {
|
||||
int l;
|
||||
int c = CUR_CHAR(l);
|
||||
|
||||
if (c == '>')
|
||||
break;
|
||||
|
||||
if (nameSize + 5 > nameCap) {
|
||||
size_t newCap = nameCap ? nameCap * 2 : 32;
|
||||
xmlChar *tmp = xmlRealloc(name, newCap);
|
||||
|
||||
if (tmp == NULL) {
|
||||
htmlErrMemory(ctxt);
|
||||
xmlFree(name);
|
||||
return;
|
||||
}
|
||||
|
||||
name = tmp;
|
||||
nameCap = newCap;
|
||||
}
|
||||
|
||||
if (c < 0x80) {
|
||||
if (IS_WS_HTML(c))
|
||||
break;
|
||||
|
||||
if ((ctxt->options & HTML_PARSE_HTML5) &&
|
||||
(c >= 'A') && (c <= 'Z'))
|
||||
c += 32;
|
||||
|
||||
name[nameSize++] = c;
|
||||
} else {
|
||||
COPY_BUF(name, nameSize, c);
|
||||
}
|
||||
|
||||
if (nameSize > maxLength) {
|
||||
htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
|
||||
"identifier too long", NULL, NULL);
|
||||
goto bogus;
|
||||
}
|
||||
|
||||
NEXTL(l);
|
||||
}
|
||||
|
||||
if (name != NULL)
|
||||
name[nameSize] = 0;
|
||||
|
||||
/*
|
||||
* Check that upper(name) == "HTML" !!!!!!!!!!!!!
|
||||
*/
|
||||
@ -3554,37 +3470,46 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
|
||||
SKIP_BLANKS;
|
||||
|
||||
/*
|
||||
* Check for SystemID and ExternalID
|
||||
* Check for SystemID and publicId
|
||||
*/
|
||||
URI = htmlParseExternalID(ctxt, &ExternalID);
|
||||
SKIP_BLANKS;
|
||||
|
||||
/*
|
||||
* We should be at the end of the DOCTYPE declaration.
|
||||
*/
|
||||
if (CUR != '>') {
|
||||
htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
|
||||
"DOCTYPE improperly terminated\n", NULL, NULL);
|
||||
/* Ignore bogus content */
|
||||
while ((CUR != 0) && (CUR != '>') &&
|
||||
(PARSER_STOPPED(ctxt) == 0))
|
||||
NEXT;
|
||||
if ((UPPER == 'P') && (UPP(1) == 'U') &&
|
||||
(UPP(2) == 'B') && (UPP(3) == 'L') &&
|
||||
(UPP(4) == 'I') && (UPP(5) == 'C')) {
|
||||
SKIP(6);
|
||||
SKIP_BLANKS;
|
||||
publicId = htmlParseDoctypeLiteral(ctxt);
|
||||
if (publicId == NULL)
|
||||
goto bogus;
|
||||
SKIP_BLANKS;
|
||||
URI = htmlParseDoctypeLiteral(ctxt);
|
||||
} else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
|
||||
(UPP(2) == 'S') && (UPP(3) == 'T') &&
|
||||
(UPP(4) == 'E') && (UPP(5) == 'M')) {
|
||||
SKIP(6);
|
||||
SKIP_BLANKS;
|
||||
URI = htmlParseDoctypeLiteral(ctxt);
|
||||
}
|
||||
|
||||
bogus:
|
||||
/* Ignore bogus content */
|
||||
while (ctxt->input->cur < ctxt->input->end) {
|
||||
int c = CUR;
|
||||
|
||||
NEXT;
|
||||
if (c == '>')
|
||||
break;
|
||||
}
|
||||
if (CUR == '>')
|
||||
SKIP(1);
|
||||
|
||||
/*
|
||||
* Create or update the document accordingly to the DOCTYPE
|
||||
*/
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
|
||||
(!ctxt->disableSAX))
|
||||
ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
|
||||
ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
|
||||
|
||||
/*
|
||||
* Cleanup, since we don't use all those identifiers
|
||||
*/
|
||||
if (URI != NULL) xmlFree(URI);
|
||||
if (ExternalID != NULL) xmlFree(ExternalID);
|
||||
xmlFree(name);
|
||||
xmlFree(URI);
|
||||
xmlFree(publicId);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
x
Reference in New Issue
Block a user