libxml2/fuzz/genSeed.c
Nick Wellnhofer b349225952 include: Change some return types from int to enum
This also affects some new functions from 2.13.
2025-03-14 02:31:01 +01:00

685 lines
18 KiB
C

/*
* xmlSeed.c: Generate the XML seed corpus for fuzzing.
*
* See Copyright for the status of this software.
*/
#include <stdio.h>
#include <string.h>
#include <glob.h>
#include <libgen.h>
#include <sys/stat.h>
#ifdef _WIN32
#include <direct.h>
#else
#include <unistd.h>
#endif
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/HTMLparser.h>
#include <libxml/xinclude.h>
#include <libxml/xmlschemas.h>
#include "fuzz.h"
#define PATH_SIZE 500
#define SEED_BUF_SIZE 16384
#define EXPR_SIZE 4500
#define FLAG_READER (1 << 0)
#define FLAG_LINT (1 << 1)
#define FLAG_PUSH_CHUNK_SIZE (1 << 2)
typedef int
(*fileFunc)(const char *base, FILE *out);
typedef int
(*mainFunc)(const char *arg);
static struct {
FILE *out;
xmlHashTablePtr entities; /* Maps URLs to xmlFuzzEntityInfos */
xmlExternalEntityLoader oldLoader;
fileFunc processFile;
const char *fuzzer;
int counter;
char cwd[PATH_SIZE];
int flags;
} globalData;
#if defined(HAVE_SCHEMA_FUZZER) || \
defined(HAVE_XML_FUZZER)
/*
* A custom resource loader that writes all external DTDs or entities to a
* single file in the format expected by xmlFuzzResourceLoader.
*/
static xmlParserErrors
fuzzResourceRecorder(void *data ATTRIBUTE_UNUSED, const char *URL,
const char *ID ATTRIBUTE_UNUSED,
xmlResourceType type ATTRIBUTE_UNUSED,
xmlParserInputFlags flags,
xmlParserInputPtr *out) {
xmlParserInputPtr in;
static const int chunkSize = 16384;
int code, len;
*out = NULL;
code = xmlNewInputFromUrl(URL, flags, &in);
if (code != XML_ERR_OK)
return(code);
if (globalData.entities == NULL) {
globalData.entities = xmlHashCreate(4);
} else if (xmlHashLookup(globalData.entities,
(const xmlChar *) URL) != NULL) {
*out = in;
return(XML_ERR_OK);
}
do {
len = xmlParserInputGrow(in, chunkSize);
if (len < 0) {
fprintf(stderr, "Error reading %s\n", URL);
xmlFreeInputStream(in);
return(in->buf->error);
}
} while (len > 0);
data = xmlStrdup(xmlBufContent(in->buf->buffer));
if (data == NULL) {
fprintf(stderr, "Error allocating entity data\n");
xmlFreeInputStream(in);
return(XML_ERR_NO_MEMORY);
}
xmlFreeInputStream(in);
xmlHashAddEntry(globalData.entities, (const xmlChar *) URL, data);
return(xmlNewInputFromUrl(URL, flags, out));
}
static void
fuzzRecorderInit(FILE *out) {
globalData.out = out;
globalData.entities = xmlHashCreate(8);
globalData.oldLoader = xmlGetExternalEntityLoader();
}
static void
fuzzRecorderWriteAndFree(void *entry, const xmlChar *file) {
char *data = entry;
xmlFuzzWriteString(globalData.out, (const char *) file);
xmlFuzzWriteString(globalData.out, data);
xmlFree(data);
}
static void
fuzzRecorderWrite(const char *file) {
xmlHashRemoveEntry(globalData.entities, (const xmlChar *) file,
fuzzRecorderWriteAndFree);
}
static void
fuzzRecorderCleanup(void) {
/* Write remaining entities (in random order). */
xmlHashFree(globalData.entities, fuzzRecorderWriteAndFree);
globalData.out = NULL;
globalData.entities = NULL;
globalData.oldLoader = NULL;
}
#endif
#ifdef HAVE_XML_FUZZER
static int
processXml(const char *docFile, FILE *out) {
int opts = XML_PARSE_NOENT | XML_PARSE_DTDLOAD;
xmlParserCtxtPtr ctxt;
xmlDocPtr doc;
if (globalData.flags & FLAG_LINT) {
/* Switches */
xmlFuzzWriteInt(out, 0, 4);
xmlFuzzWriteInt(out, 0, 4);
/* maxmem */
xmlFuzzWriteInt(out, 0, 4);
/* max-ampl */
xmlFuzzWriteInt(out, 0, 1);
/* pretty */
xmlFuzzWriteInt(out, 0, 1);
/* encode */
xmlFuzzWriteString(out, "");
/* pattern */
xmlFuzzWriteString(out, "");
/* xpath */
xmlFuzzWriteString(out, "");
} else {
/* Parser options. */
xmlFuzzWriteInt(out, opts, 4);
/* Max allocations. */
xmlFuzzWriteInt(out, 0, 4);
if (globalData.flags & FLAG_PUSH_CHUNK_SIZE) {
/* Chunk size for push parser */
xmlFuzzWriteInt(out, 256, 4);
}
if (globalData.flags & FLAG_READER) {
/* Initial reader program with a couple of OP_READs */
xmlFuzzWriteString(out, "\x01\x01\x01\x01\x01\x01\x01\x01");
}
}
fuzzRecorderInit(out);
ctxt = xmlNewParserCtxt();
xmlCtxtSetErrorHandler(ctxt, xmlFuzzSErrorFunc, NULL);
xmlCtxtSetResourceLoader(ctxt, fuzzResourceRecorder, NULL);
doc = xmlCtxtReadFile(ctxt, docFile, NULL, opts);
#ifdef LIBXML_XINCLUDE_ENABLED
{
xmlXIncludeCtxtPtr xinc = xmlXIncludeNewContext(doc);
xmlXIncludeSetErrorHandler(xinc, xmlFuzzSErrorFunc, NULL);
xmlXIncludeSetResourceLoader(xinc, fuzzResourceRecorder, NULL);
xmlXIncludeSetFlags(xinc, opts);
xmlXIncludeProcessNode(xinc, (xmlNodePtr) doc);
xmlXIncludeFreeContext(xinc);
}
#endif
xmlFreeDoc(doc);
xmlFreeParserCtxt(ctxt);
fuzzRecorderWrite(docFile);
fuzzRecorderCleanup();
return(0);
}
#endif
#ifdef HAVE_HTML_FUZZER
static int
processHtml(const char *docFile, FILE *out) {
char buf[SEED_BUF_SIZE];
FILE *file;
size_t size;
/* Parser options. */
xmlFuzzWriteInt(out, 0, 4);
/* Max allocations. */
xmlFuzzWriteInt(out, 0, 4);
/* Copy file */
file = fopen(docFile, "rb");
if (file == NULL) {
fprintf(stderr, "couldn't open %s\n", docFile);
return(0);
}
do {
size = fread(buf, 1, SEED_BUF_SIZE, file);
if (size > 0)
fwrite(buf, 1, size, out);
} while (size == SEED_BUF_SIZE);
fclose(file);
return(0);
}
#endif
#if defined(HAVE_HTML_FUZZER) || \
defined(HAVE_XML_FUZZER)
static int
processPattern(const char *pattern) {
glob_t globbuf;
int ret = 0;
int res;
size_t i;
res = glob(pattern, 0, NULL, &globbuf);
if (res == GLOB_NOMATCH)
return(0);
if (res != 0) {
fprintf(stderr, "couldn't match pattern %s\n", pattern);
return(-1);
}
for (i = 0; i < globbuf.gl_pathc; i++) {
struct stat statbuf;
char outPath[PATH_SIZE];
char *dirBuf = NULL;
char *baseBuf = NULL;
const char *path, *dir, *base;
FILE *out = NULL;
int dirChanged = 0;
size_t size;
path = globbuf.gl_pathv[i];
if ((stat(path, &statbuf) != 0) || (!S_ISREG(statbuf.st_mode)))
continue;
dirBuf = (char *) xmlCharStrdup(path);
baseBuf = (char *) xmlCharStrdup(path);
if ((dirBuf == NULL) || (baseBuf == NULL)) {
fprintf(stderr, "memory allocation failed\n");
ret = -1;
goto error;
}
dir = dirname(dirBuf);
base = basename(baseBuf);
size = snprintf(outPath, sizeof(outPath), "seed/%s/%s",
globalData.fuzzer, base);
if (size >= PATH_SIZE) {
fprintf(stderr, "creating path failed\n");
ret = -1;
goto error;
}
out = fopen(outPath, "wb");
if (out == NULL) {
fprintf(stderr, "couldn't open %s for writing\n", outPath);
ret = -1;
goto error;
}
if (chdir(dir) != 0) {
fprintf(stderr, "couldn't chdir to %s\n", dir);
ret = -1;
goto error;
}
dirChanged = 1;
if (globalData.processFile(base, out) != 0)
ret = -1;
error:
if (out != NULL)
fclose(out);
xmlFree(dirBuf);
xmlFree(baseBuf);
if ((dirChanged) && (chdir(globalData.cwd) != 0)) {
fprintf(stderr, "couldn't chdir to %s\n", globalData.cwd);
ret = -1;
break;
}
}
globfree(&globbuf);
return(ret);
}
#endif
#if defined(HAVE_SCHEMA_FUZZER)
static int
processSchema(const char *xsdFile, const char *xmlFile, FILE *out) {
xmlSchemaPtr schema;
xmlSchemaParserCtxtPtr pctxt;
/* Max allocations. */
xmlFuzzWriteInt(out, 0, 4);
fuzzRecorderInit(out);
pctxt = xmlSchemaNewParserCtxt(xsdFile);
xmlSchemaSetParserStructuredErrors(pctxt, xmlFuzzSErrorFunc, NULL);
xmlSchemaSetResourceLoader(pctxt, fuzzResourceRecorder, NULL);
schema = xmlSchemaParse(pctxt);
xmlSchemaFreeParserCtxt(pctxt);
if (schema != NULL) {
xmlSchemaValidCtxtPtr vctxt;
xmlParserCtxtPtr ctxt;
xmlDocPtr doc;
ctxt = xmlNewParserCtxt();
xmlCtxtSetErrorHandler(ctxt, xmlFuzzSErrorFunc, NULL);
xmlCtxtSetResourceLoader(ctxt, fuzzResourceRecorder, NULL);
doc = xmlCtxtReadFile(ctxt, xmlFile, NULL, XML_PARSE_NOENT);
xmlFreeParserCtxt(ctxt);
vctxt = xmlSchemaNewValidCtxt(schema);
xmlSchemaSetValidStructuredErrors(vctxt, xmlFuzzSErrorFunc, NULL);
xmlSchemaValidateDoc(vctxt, doc);
xmlSchemaFreeValidCtxt(vctxt);
xmlFreeDoc(doc);
xmlSchemaFree(schema);
}
fuzzRecorderWrite(xsdFile);
fuzzRecorderWrite(xmlFile);
fuzzRecorderCleanup();
return(0);
}
static int
processSchemaPattern(const char *pattern) {
glob_t globbuf;
int ret = 0;
int res;
size_t i;
res = glob(pattern, 0, NULL, &globbuf);
if (res == GLOB_NOMATCH)
return(0);
if (res != 0) {
fprintf(stderr, "couldn't match pattern %s\n", pattern);
return(-1);
}
for (i = 0; i < globbuf.gl_pathc; i++) {
glob_t globbuf2;
struct stat statbuf;
char xmlPattern[PATH_SIZE];
char *dirBuf = NULL;
char *baseBuf = NULL;
const char *path, *dir, *base;
size_t size, dirLen, baseLen, len, j;
path = globbuf.gl_pathv[i];
if ((stat(path, &statbuf) != 0) || (!S_ISREG(statbuf.st_mode)))
continue;
dirBuf = (char *) xmlCharStrdup(path);
baseBuf = (char *) xmlCharStrdup(path);
if ((dirBuf == NULL) || (baseBuf == NULL)) {
fprintf(stderr, "memory allocation failed\n");
ret = -1;
goto error;
}
dir = dirname(dirBuf);
dirLen = strlen(dir);
base = basename(baseBuf);
baseLen = strlen(base);
len = strlen(path);
if (len <= 5)
continue;
/* Strip .xsl or _0.xsd suffix */
if (len > 6 && path[len - 6] == '_')
len -= 6;
else
len -= 4;
size = snprintf(xmlPattern, sizeof(xmlPattern), "%.*s_*.xml",
(int) len, path);
if (size >= PATH_SIZE) {
fprintf(stderr, "creating path failed\n");
ret = -1;
goto error;
}
res = glob(xmlPattern, 0, NULL, &globbuf2);
if (res == GLOB_NOMATCH)
goto error;
if (res != 0) {
fprintf(stderr, "couldn't match pattern %s\n", xmlPattern);
ret = -1;
goto error;
}
for (j = 0; j < globbuf2.gl_pathc; j++) {
char outPath[PATH_SIZE];
const char *xmlFile;
FILE *out = NULL;
xmlFile = globbuf2.gl_pathv[j];
len = strlen(xmlFile);
if (len < dirLen + 7)
continue;
if (len >= 6 && xmlFile[len - 6] == '_')
size = snprintf(outPath, sizeof(outPath), "seed/%s/%.*s_%c",
globalData.fuzzer, (int) baseLen - 4, base,
xmlFile[len - 5]);
else
size = snprintf(outPath, sizeof(outPath), "seed/%s/%.*s",
globalData.fuzzer, (int) baseLen - 4, base);
if (size >= PATH_SIZE) {
fprintf(stderr, "creating path failed\n");
ret = -1;
continue;
}
out = fopen(outPath, "wb");
if (out == NULL) {
fprintf(stderr, "couldn't open %s for writing\n", outPath);
ret = -1;
continue;
}
if (chdir(dir) != 0) {
fprintf(stderr, "couldn't chdir to %s\n", dir);
ret = -1;
} else {
if (processSchema(base, xmlFile + dirLen + 1, out) != 0)
ret = -1;
}
fclose(out);
if (chdir(globalData.cwd) != 0) {
fprintf(stderr, "couldn't chdir to %s\n", globalData.cwd);
ret = -1;
break;
}
}
globfree(&globbuf2);
error:
xmlFree(dirBuf);
xmlFree(baseBuf);
}
globfree(&globbuf);
return(ret);
}
#endif
#ifdef HAVE_XPATH_FUZZER
static int
processXPath(const char *testDir, const char *prefix, const char *name,
const char *data, const char *subdir, int xptr) {
char pattern[PATH_SIZE];
glob_t globbuf;
size_t i, size;
int ret = 0, res;
size = snprintf(pattern, sizeof(pattern), "%s/%s/%s*",
testDir, subdir, prefix);
if (size >= PATH_SIZE)
return(-1);
res = glob(pattern, 0, NULL, &globbuf);
if (res == GLOB_NOMATCH)
return(0);
if (res != 0) {
fprintf(stderr, "couldn't match pattern %s\n", pattern);
return(-1);
}
for (i = 0; i < globbuf.gl_pathc; i++) {
char *path = globbuf.gl_pathv[i];
struct stat statbuf;
FILE *in;
char expr[EXPR_SIZE];
if ((stat(path, &statbuf) != 0) || (!S_ISREG(statbuf.st_mode)))
continue;
in = fopen(path, "rb");
if (in == NULL) {
ret = -1;
continue;
}
while (fgets(expr, EXPR_SIZE, in) != NULL) {
char outPath[PATH_SIZE];
FILE *out;
int j;
for (j = 0; expr[j] != 0; j++)
if (expr[j] == '\r' || expr[j] == '\n')
break;
expr[j] = 0;
size = snprintf(outPath, sizeof(outPath), "seed/xpath/%s-%d",
name, globalData.counter);
if (size >= PATH_SIZE) {
ret = -1;
continue;
}
out = fopen(outPath, "wb");
if (out == NULL) {
ret = -1;
continue;
}
/* Max allocations. */
xmlFuzzWriteInt(out, 0, 4);
if (xptr) {
xmlFuzzWriteString(out, expr);
} else {
char xptrExpr[EXPR_SIZE+100];
/* Wrap XPath expressions as XPointer */
snprintf(xptrExpr, sizeof(xptrExpr), "xpointer(%s)", expr);
xmlFuzzWriteString(out, xptrExpr);
}
xmlFuzzWriteString(out, data);
fclose(out);
globalData.counter++;
}
fclose(in);
}
globfree(&globbuf);
return(ret);
}
static int
processXPathDir(const char *testDir) {
char pattern[PATH_SIZE];
glob_t globbuf;
size_t i, size;
int ret = 0;
globalData.counter = 1;
if (processXPath(testDir, "", "expr", "<d></d>", "expr", 0) != 0)
ret = -1;
size = snprintf(pattern, sizeof(pattern), "%s/docs/*", testDir);
if (size >= PATH_SIZE)
return(1);
if (glob(pattern, 0, NULL, &globbuf) != 0)
return(1);
for (i = 0; i < globbuf.gl_pathc; i++) {
char *path = globbuf.gl_pathv[i];
char *data;
const char *docFile;
data = xmlSlurpFile(path, NULL);
if (data == NULL) {
ret = -1;
continue;
}
docFile = basename(path);
globalData.counter = 1;
if (processXPath(testDir, docFile, docFile, data, "tests", 0) != 0)
ret = -1;
if (processXPath(testDir, docFile, docFile, data, "xptr", 1) != 0)
ret = -1;
if (processXPath(testDir, docFile, docFile, data, "xptr-xp1", 1) != 0)
ret = -1;
xmlFree(data);
}
globfree(&globbuf);
return(ret);
}
#endif
int
main(int argc, const char **argv) {
mainFunc processArg = NULL;
const char *fuzzer;
int ret = 0;
int i;
if (argc < 3) {
fprintf(stderr, "usage: seed [FUZZER] [PATTERN...]\n");
return(1);
}
fuzzer = argv[1];
if (strcmp(fuzzer, "html") == 0) {
#ifdef HAVE_HTML_FUZZER
processArg = processPattern;
globalData.flags |= FLAG_PUSH_CHUNK_SIZE;
globalData.processFile = processHtml;
#endif
} else if (strcmp(fuzzer, "lint") == 0) {
#ifdef HAVE_LINT_FUZZER
processArg = processPattern;
globalData.flags |= FLAG_LINT;
globalData.processFile = processXml;
#endif
} else if (strcmp(fuzzer, "reader") == 0) {
#ifdef HAVE_READER_FUZZER
processArg = processPattern;
globalData.flags |= FLAG_READER;
globalData.processFile = processXml;
#endif
} else if (strcmp(fuzzer, "schema") == 0) {
#ifdef HAVE_SCHEMA_FUZZER
processArg = processSchemaPattern;
#endif
} else if (strcmp(fuzzer, "valid") == 0) {
#ifdef HAVE_VALID_FUZZER
processArg = processPattern;
globalData.processFile = processXml;
#endif
} else if (strcmp(fuzzer, "xinclude") == 0) {
#ifdef HAVE_XINCLUDE_FUZZER
processArg = processPattern;
globalData.processFile = processXml;
#endif
} else if (strcmp(fuzzer, "xml") == 0) {
#ifdef HAVE_XML_FUZZER
processArg = processPattern;
globalData.flags |= FLAG_PUSH_CHUNK_SIZE;
globalData.processFile = processXml;
#endif
} else if (strcmp(fuzzer, "xpath") == 0) {
#ifdef HAVE_XPATH_FUZZER
processArg = processXPathDir;
#endif
} else {
fprintf(stderr, "unknown fuzzer %s\n", fuzzer);
return(1);
}
globalData.fuzzer = fuzzer;
if (getcwd(globalData.cwd, PATH_SIZE) == NULL) {
fprintf(stderr, "couldn't get current directory\n");
return(1);
}
if (processArg != NULL)
for (i = 2; i < argc; i++)
processArg(argv[i]);
return(ret);
}