From 01411e7c5ea0fff181271e092f46a2138c3720ec Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 8 Feb 2021 20:58:32 +0100 Subject: [PATCH] Check for invalid redeclarations of predefined entities Implement section "4.6 Predefined Entities" of the XML 1.0 spec and check whether redeclarations of predefined entities match the original definitions. Note that some test cases declared But the XML spec clearly states that this is illegal: > If the entities lt or amp are declared, they MUST be declared as > internal entities whose replacement text is a character reference to > the respective character (less-than sign or ampersand) being escaped; > the double escaping is REQUIRED for these entities so that references > to them produce a well-formed result. Also fixes #217 but the connection is only tangential. The integer overflow discovered by fuzzing was more related to the fact that various parts of the parser disagreed on whether to prefer predefined entities over their redeclarations. The whole situation is a mess and even depends on legacy parser options. But now that redeclarations are validated, it shouldn't make a difference. As noted in the added comment, this is also one of the cases where overly defensive checks can hide interesting logic bugs from fuzzers. --- entities.c | 39 ++++++++++++++++++++++++++++++- result/ent6hex | 9 +++++++ result/ent6hex.rde | 2 ++ result/ent6hex.rdr | 2 ++ result/ent6hex.sax | 17 ++++++++++++++ result/ent6hex.sax2 | 17 ++++++++++++++ result/errors/759398.xml.str | 2 +- result/noent/ent6hex | 9 +++++++ result/noent/ent6hex.sax2 | 17 ++++++++++++++ result/valid/REC-xml-19980210.xml | 2 +- test/ent6hex | 8 +++++++ test/errors/759398.xml | 2 +- test/relaxng/tutor11_1_3.xml | 2 +- test/valid/REC-xml-19980210.xml | 2 +- tree.c | 13 +++++++++++ 15 files changed, 137 insertions(+), 6 deletions(-) create mode 100644 result/ent6hex create mode 100644 result/ent6hex.rde create mode 100644 result/ent6hex.rdr create mode 100644 result/ent6hex.sax create mode 100644 result/ent6hex.sax2 create mode 100644 result/noent/ent6hex create mode 100644 result/noent/ent6hex.sax2 create mode 100644 test/ent6hex diff --git a/entities.c b/entities.c index 08ef1428..caad0b5d 100644 --- a/entities.c +++ b/entities.c @@ -210,7 +210,7 @@ xmlAddEntity(xmlDtdPtr dtd, const xmlChar *name, int type, const xmlChar *content) { xmlDictPtr dict = NULL; xmlEntitiesTablePtr table = NULL; - xmlEntityPtr ret; + xmlEntityPtr ret, predef; if (name == NULL) return(NULL); @@ -223,6 +223,43 @@ xmlAddEntity(xmlDtdPtr dtd, const xmlChar *name, int type, case XML_INTERNAL_GENERAL_ENTITY: case XML_EXTERNAL_GENERAL_PARSED_ENTITY: case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY: + predef = xmlGetPredefinedEntity(name); + if (predef != NULL) { + int valid = 0; + + /* 4.6 Predefined Entities */ + if (type == XML_INTERNAL_GENERAL_ENTITY) { + int c = predef->content[0]; + + if (((content[0] == c) && (content[1] == 0)) && + ((c == '>') || (c == '\'') || (c == '"'))) { + valid = 1; + } else if ((content[0] == '&') && (content[1] == '#')) { + if (content[2] == 'x') { + xmlChar *hex = BAD_CAST "0123456789ABCDEF"; + xmlChar ref[] = "00;"; + + ref[0] = hex[c / 16 % 16]; + ref[1] = hex[c % 16]; + if (xmlStrcasecmp(&content[3], ref) == 0) + valid = 1; + } else { + xmlChar ref[] = "00;"; + + ref[0] = '0' + c / 10 % 10; + ref[1] = '0' + c % 10; + if (xmlStrEqual(&content[2], ref)) + valid = 1; + } + } + } + if (!valid) { + xmlEntitiesErr(XML_ERR_ENTITY_PROCESSING, + "xmlAddEntity: invalid redeclaration of predefined" + " entity"); + return(NULL); + } + } if (dtd->entities == NULL) dtd->entities = xmlHashCreateDict(0, dict); table = dtd->entities; diff --git a/result/ent6hex b/result/ent6hex new file mode 100644 index 00000000..6d59df9c --- /dev/null +++ b/result/ent6hex @@ -0,0 +1,9 @@ + + + + + + +]> + diff --git a/result/ent6hex.rde b/result/ent6hex.rde new file mode 100644 index 00000000..9b0a34db --- /dev/null +++ b/result/ent6hex.rde @@ -0,0 +1,2 @@ +0 10 doc 0 0 +0 1 doc 1 0 diff --git a/result/ent6hex.rdr b/result/ent6hex.rdr new file mode 100644 index 00000000..9b0a34db --- /dev/null +++ b/result/ent6hex.rdr @@ -0,0 +1,2 @@ +0 10 doc 0 0 +0 1 doc 1 0 diff --git a/result/ent6hex.sax b/result/ent6hex.sax new file mode 100644 index 00000000..df844cdb --- /dev/null +++ b/result/ent6hex.sax @@ -0,0 +1,17 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.internalSubset(doc, , ) +SAX.entityDecl(lt, 1, (null), (null), <) +SAX.getEntity(lt) +SAX.entityDecl(gt, 1, (null), (null), >) +SAX.getEntity(gt) +SAX.entityDecl(amp, 1, (null), (null), &) +SAX.getEntity(amp) +SAX.entityDecl(apos, 1, (null), (null), ') +SAX.getEntity(apos) +SAX.entityDecl(quot, 1, (null), (null), ") +SAX.getEntity(quot) +SAX.externalSubset(doc, , ) +SAX.startElement(doc) +SAX.endElement(doc) +SAX.endDocument() diff --git a/result/ent6hex.sax2 b/result/ent6hex.sax2 new file mode 100644 index 00000000..6c9df2eb --- /dev/null +++ b/result/ent6hex.sax2 @@ -0,0 +1,17 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.internalSubset(doc, , ) +SAX.entityDecl(lt, 1, (null), (null), <) +SAX.getEntity(lt) +SAX.entityDecl(gt, 1, (null), (null), >) +SAX.getEntity(gt) +SAX.entityDecl(amp, 1, (null), (null), &) +SAX.getEntity(amp) +SAX.entityDecl(apos, 1, (null), (null), ') +SAX.getEntity(apos) +SAX.entityDecl(quot, 1, (null), (null), ") +SAX.getEntity(quot) +SAX.externalSubset(doc, , ) +SAX.startElementNs(doc, NULL, NULL, 0, 0, 0) +SAX.endElementNs(doc, NULL, NULL) +SAX.endDocument() diff --git a/result/errors/759398.xml.str b/result/errors/759398.xml.str index de9a28c2..6809c06f 100644 --- a/result/errors/759398.xml.str +++ b/result/errors/759398.xml.str @@ -1,5 +1,5 @@ ./test/errors/759398.xml:210: parser error : internal error: detected an error in element content -need to worry about parsers whi + + + + + +]> + diff --git a/result/noent/ent6hex.sax2 b/result/noent/ent6hex.sax2 new file mode 100644 index 00000000..6c9df2eb --- /dev/null +++ b/result/noent/ent6hex.sax2 @@ -0,0 +1,17 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.internalSubset(doc, , ) +SAX.entityDecl(lt, 1, (null), (null), <) +SAX.getEntity(lt) +SAX.entityDecl(gt, 1, (null), (null), >) +SAX.getEntity(gt) +SAX.entityDecl(amp, 1, (null), (null), &) +SAX.getEntity(amp) +SAX.entityDecl(apos, 1, (null), (null), ') +SAX.getEntity(apos) +SAX.entityDecl(quot, 1, (null), (null), ") +SAX.getEntity(quot) +SAX.externalSubset(doc, , ) +SAX.startElementNs(doc, NULL, NULL, 0, 0, 0) +SAX.endElementNs(doc, NULL, NULL) +SAX.endDocument() diff --git a/result/valid/REC-xml-19980210.xml b/result/valid/REC-xml-19980210.xml index 4460b3eb..9fb91039 100644 --- a/result/valid/REC-xml-19980210.xml +++ b/result/valid/REC-xml-19980210.xml @@ -10,7 +10,7 @@ publication. --> - + "> '"> diff --git a/test/ent6hex b/test/ent6hex new file mode 100644 index 00000000..b86028b3 --- /dev/null +++ b/test/ent6hex @@ -0,0 +1,8 @@ + + + + + +]> + diff --git a/test/errors/759398.xml b/test/errors/759398.xml index 132e0297..1fcaa3f8 100644 --- a/test/errors/759398.xml +++ b/test/errors/759398.xml @@ -18,7 +18,7 @@ publication. --> - + "> '"> diff --git a/test/relaxng/tutor11_1_3.xml b/test/relaxng/tutor11_1_3.xml index d146032d..495550b1 100644 --- a/test/relaxng/tutor11_1_3.xml +++ b/test/relaxng/tutor11_1_3.xml @@ -18,7 +18,7 @@ publication. --> - + "> '"> diff --git a/test/valid/REC-xml-19980210.xml b/test/valid/REC-xml-19980210.xml index bec8e904..6591b694 100644 --- a/test/valid/REC-xml-19980210.xml +++ b/test/valid/REC-xml-19980210.xml @@ -18,7 +18,7 @@ publication. --> - + "> '"> diff --git a/tree.c b/tree.c index d2347dfd..d6ea7049 100644 --- a/tree.c +++ b/tree.c @@ -1310,6 +1310,16 @@ xmlStringLenGetNodeList(const xmlDoc *doc, const xmlChar *value, int len) { else tmp = 0; while (tmp != ';') { /* Non input consuming loop */ + /* + * If you find an integer overflow here when fuzzing, + * the bug is probably elsewhere. This function should + * only receive entities that were already validated by + * the parser, typically by xmlParseAttValueComplex + * calling xmlStringDecodeEntities. + * + * So it's better *not* to check for overflow to + * potentially discover new bugs. + */ if ((tmp >= '0') && (tmp <= '9')) charval = charval * 16 + (tmp - '0'); else if ((tmp >= 'a') && (tmp <= 'f')) @@ -1338,6 +1348,7 @@ xmlStringLenGetNodeList(const xmlDoc *doc, const xmlChar *value, int len) { else tmp = 0; while (tmp != ';') { /* Non input consuming loops */ + /* Don't check for integer overflow, see above. */ if ((tmp >= '0') && (tmp <= '9')) charval = charval * 10 + (tmp - '0'); else { @@ -1517,6 +1528,7 @@ xmlStringGetNodeList(const xmlDoc *doc, const xmlChar *value) { cur += 3; tmp = *cur; while (tmp != ';') { /* Non input consuming loop */ + /* Don't check for integer overflow, see above. */ if ((tmp >= '0') && (tmp <= '9')) charval = charval * 16 + (tmp - '0'); else if ((tmp >= 'a') && (tmp <= 'f')) @@ -1539,6 +1551,7 @@ xmlStringGetNodeList(const xmlDoc *doc, const xmlChar *value) { cur += 2; tmp = *cur; while (tmp != ';') { /* Non input consuming loops */ + /* Don't check for integer overflow, see above. */ if ((tmp >= '0') && (tmp <= '9')) charval = charval * 10 + (tmp - '0'); else {