added support for HTML PIs #156087 added specific tests Daniel

* HTMLparser.c: added support for HTML PIs #156087
* test/HTML/python.html result/HTML/python.html*: added specific tests
Daniel
This commit is contained in:
Daniel Veillard 2004-10-22 14:34:23 +00:00
parent 01c3bd53d8
commit fc484dd0a0
9 changed files with 223 additions and 11 deletions

View File

@ -1,3 +1,8 @@
Fri Oct 22 16:36:50 CEST 2004 Daniel Veillard <daniel@veillard.com>
* HTMLparser.c: added support for HTML PIs #156087
* test/HTML/python.html result/HTML/python.html*: added specific tests
Fri Oct 22 15:20:23 CEST 2004 Daniel Veillard <daniel@veillard.com>
* threads.c: fixed nasty bug #156087

View File

@ -2808,6 +2808,117 @@ htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
return(URI);
}
/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * Parse a Processing Instruction found in HTML input, starting at "<?".
 * Nothing is done when the input does not start with "<?".
 *
 * The target name is read with htmlParseName(). Unlike the XML production
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 *
 * the instruction is closed here by the first '>', so a '?' placed right
 * before it is kept as part of the data. When a SAX processingInstruction
 * callback is set and SAX is not disabled, it receives the target and the
 * data (NULL when '>' directly follows the target).
 *
 * ctxt->instate is set to XML_PARSER_PI while parsing and restored to its
 * previous value before returning.
 */
static void
htmlParsePI(htmlParserCtxtPtr ctxt) {
    xmlChar *data = NULL;
    int used = 0;
    int capacity = HTML_PARSER_BUFFER_SIZE;
    int cur, l;
    const xmlChar *target;
    xmlParserInputState savedState;
    int sinceGrow = 0;

    /* Not positioned on "<?": this is not a PI, leave the input alone. */
    if ((RAW != '<') || (NXT(1) != '?'))
        return;

    savedState = ctxt->instate;
    ctxt->instate = XML_PARSER_PI;

    /* Step over the "<?" opener. */
    SKIP(2);
    SHRINK;

    /*
     * Parse the target name.
     */
    target = htmlParseName(ctxt);
    if (target == NULL) {
        htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
                     "PI is not started correctly", NULL, NULL);
        goto done;
    }

    /* "<?target>": a PI without any data. */
    if (RAW == '>') {
        SKIP(1);

        /*
         * SAX: PI detected.
         */
        if ((ctxt->sax) && (!ctxt->disableSAX) &&
            (ctxt->sax->processingInstruction != NULL)) {
            ctxt->sax->processingInstruction(ctxt->userData, target, NULL);
        }
        goto done;
    }

    data = (xmlChar *) xmlMallocAtomic(capacity * sizeof(xmlChar));
    if (data == NULL) {
        htmlErrMemory(ctxt, NULL);
        goto done;
    }

    /* Target and data are expected to be separated by blanks. */
    cur = CUR;
    if (!IS_BLANK(cur)) {
        htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                     "ParsePI: PI %s space expected\n", target, NULL);
    }
    SKIP_BLANKS;

    /* Accumulate the data up to the closing '>' or the first invalid char. */
    cur = CUR_CHAR(l);
    while (IS_CHAR(cur) && (cur != '>')) {
        /* Keep room for one more encoded char plus the terminating 0. */
        if (used + 5 >= capacity) {
            xmlChar *bigger;

            capacity *= 2;
            bigger = (xmlChar *) xmlRealloc(data, capacity * sizeof(xmlChar));
            if (bigger == NULL) {
                htmlErrMemory(ctxt, NULL);
                goto release;
            }
            data = bigger;
        }

        /* Give the input a chance to grow every 50 copied chars. */
        if (++sinceGrow > 50) {
            GROW;
            sinceGrow = 0;
        }

        COPY_BUF(l, data, used, cur);
        NEXTL(l);
        cur = CUR_CHAR(l);
        if (cur == 0) {
            /* Possibly ran out of buffered input: refill and retry. */
            SHRINK;
            GROW;
            cur = CUR_CHAR(l);
        }
    }
    data[used] = 0;

    if (cur == '>') {
        SKIP(1);

        /*
         * SAX: PI detected.
         */
        if ((ctxt->sax) && (!ctxt->disableSAX) &&
            (ctxt->sax->processingInstruction != NULL)) {
            ctxt->sax->processingInstruction(ctxt->userData, target, data);
        }
    } else {
        htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
                     "ParsePI: PI %s never end ...\n", target, NULL);
    }

release:
    xmlFree(data);
done:
    ctxt->instate = savedState;
}
/**
* htmlParseComment:
* @ctxt: an HTML parser context
@ -3643,14 +3754,21 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
}
/*
* Second case : a sub-element.
* Second case : a Processing Instruction.
*/
else if ((CUR == '<') && (NXT(1) == '?')) {
htmlParsePI(ctxt);
}
/*
* Third case : a sub-element.
*/
else if (CUR == '<') {
htmlParseElement(ctxt);
}
/*
* Third case : a reference. If it has not been resolved,
* Fourth case : a reference. If it has not been resolved,
* parsing returns its Name, create the node
*/
else if (CUR == '&') {
@ -3658,7 +3776,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
}
/*
* Fourth : end of the resource
* Fifth case : end of the resource
*/
else if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
@ -3852,11 +3970,13 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
/*
* Parse possible comments before any content
* Parse possible comments and PIs before any content
*/
while ((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) {
while (((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) ||
((CUR == '<') && (NXT(1) == '?'))) {
htmlParseComment(ctxt);
htmlParsePI(ctxt);
SKIP_BLANKS;
}
@ -3875,11 +3995,13 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
SKIP_BLANKS;
/*
* Parse possible comments before any content
* Parse possible comments and PIs before any content
*/
while ((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) {
while (((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) ||
((CUR == '<') && (NXT(1) == '?'))) {
htmlParseComment(ctxt);
htmlParsePI(ctxt);
SKIP_BLANKS;
}
@ -4444,6 +4566,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing PI\n");
#endif
htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
@ -4494,6 +4626,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing PI\n");
#endif
htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
@ -4531,6 +4673,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing PI\n");
#endif
htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
@ -4737,6 +4889,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing PI\n");
#endif
htmlParsePI(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '!') && (avail < 4)) {
goto done;
} else if ((cur == '<') && (next == '/')) {

1
SAX.c
View File

@ -115,6 +115,7 @@ inithtmlDefaultSAXHandler(xmlSAXHandlerV1 *hdlr)
hdlr->characters = xmlSAX2Characters;
hdlr->cdataBlock = xmlSAX2CDataBlock;
hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
hdlr->processingInstruction = NULL;
hdlr->comment = xmlSAX2Comment;
hdlr->warning = xmlParserWarning;

2
SAX2.c
View File

@ -2635,7 +2635,7 @@ xmlSAX2InitHtmlDefaultSAXHandler(xmlSAXHandler *hdlr)
hdlr->characters = xmlSAX2Characters;
hdlr->cdataBlock = xmlSAX2CDataBlock;
hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
hdlr->processingInstruction = NULL;
hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
hdlr->comment = xmlSAX2Comment;
hdlr->warning = xmlParserWarning;
hdlr->error = xmlParserError;

View File

@ -430,7 +430,7 @@ xmlSAXHandlerV1 htmlDefaultSAXHandler = {
NULL,
xmlSAX2Characters,
xmlSAX2IgnorableWhitespace,
NULL,
xmlSAX2ProcessingInstruction,
xmlSAX2Comment,
xmlParserWarning,
xmlParserError,

5
result/HTML/python.html Normal file
View File

@ -0,0 +1,5 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<?xml-stylesheet href="./css/ht2html.css" type="text/css"?><html>
<!-- THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. --><head><title>Python Programming Language</title></head>
<body></body>
</html>

View File

View File

@ -0,0 +1,29 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.internalSubset(html, -//W3C//DTD HTML 4.01 Transitional//EN, http://www.w3.org/TR/html4/loose.dtd)
SAX.processingInstruction(xml-stylesheet, href="./css/ht2html.css" type="text/css"?)
SAX.startElement(html)
SAX.ignorableWhitespace(
, 1)
SAX.comment( THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. )
SAX.ignorableWhitespace(
, 1)
SAX.startElement(head)
SAX.ignorableWhitespace(
, 1)
SAX.startElement(title)
SAX.characters(Python Programming Language, 27)
SAX.endElement(title)
SAX.ignorableWhitespace(
, 1)
SAX.endElement(head)
SAX.ignorableWhitespace(
, 1)
SAX.startElement(body)
SAX.ignorableWhitespace(
, 1)
SAX.endElement(body)
SAX.endElement(html)
SAX.ignorableWhitespace(
, 1)
SAX.endDocument()

10
test/HTML/python.html Normal file
View File

@ -0,0 +1,10 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd" >
<?xml-stylesheet href="./css/ht2html.css" type="text/css"?>
<html>
<!-- THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. -->
<head>
<title>Python Programming Language</title>
</head>
<body>
</body></html>