mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00
Add HTML parser support for HTML5 meta charset encoding declaration
For https://bugzilla.gnome.org/show_bug.cgi?id=655218 — see http://www.w3.org/TR/2011/WD-html5-20110525/semantics.html#the-meta-element: """ The charset attribute specifies the character encoding used by the document. This is a character encoding declaration. If the attribute is present in an XML document, its value must be an ASCII case-insensitive match for the string "UTF-8" (and the document is therefore forced to use UTF-8 as its encoding). """ However, while <meta http-equiv="Content-Type" content="text/html; charset=utf8"> works, <meta charset="utf8"> does not. While the libxml2 HTML parser is not tuned for HTML5, this is a simple addition. Also added a test case.
This commit is contained in:
parent
1eabc31401
commit
868d92da89
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,9 +1,6 @@
|
||||
*.o
|
||||
*.lo
|
||||
*.xml
|
||||
*.log
|
||||
*.rng
|
||||
*.html
|
||||
*.patch
|
||||
.deps
|
||||
.libs
|
||||
|
53
HTMLparser.c
53
HTMLparser.c
@ -727,7 +727,7 @@ static const char* const map_contents[] = { BLOCK, "area", NULL } ;
|
||||
static const char* const name_attr[] = { "name", NULL } ;
|
||||
static const char* const action_attr[] = { "action", NULL } ;
|
||||
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
|
||||
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
|
||||
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
|
||||
static const char* const content_attr[] = { "content", NULL } ;
|
||||
static const char* const type_attr[] = { "type", NULL } ;
|
||||
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
|
||||
@ -3435,20 +3435,19 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlCheckEncoding:
|
||||
* htmlCheckEncodingDirect:
|
||||
* @ctxt: an HTML parser context
|
||||
* @attvalue: the attribute value
|
||||
*
|
||||
* Checks an http-equiv attribute from a Meta tag to detect
|
||||
* Checks an attribute value to detect
|
||||
* the encoding
|
||||
* If a new encoding is detected the parser is switched to decode
|
||||
* it and pass UTF8
|
||||
*/
|
||||
static void
|
||||
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
||||
const xmlChar *encoding;
|
||||
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
|
||||
|
||||
if ((ctxt == NULL) || (attvalue == NULL) ||
|
||||
if ((ctxt == NULL) || (encoding == NULL) ||
|
||||
(ctxt->options & HTML_PARSE_IGNORE_ENC))
|
||||
return;
|
||||
|
||||
@ -3456,14 +3455,6 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
||||
if (ctxt->input->encoding != NULL)
|
||||
return;
|
||||
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
|
||||
if (encoding != NULL) {
|
||||
encoding += 8;
|
||||
} else {
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
|
||||
if (encoding != NULL)
|
||||
encoding += 9;
|
||||
}
|
||||
if (encoding != NULL) {
|
||||
xmlCharEncoding enc;
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
@ -3535,6 +3526,38 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlCheckEncoding:
|
||||
* @ctxt: an HTML parser context
|
||||
* @attvalue: the attribute value
|
||||
*
|
||||
* Checks an http-equiv attribute from a Meta tag to detect
|
||||
* the encoding
|
||||
* If a new encoding is detected the parser is switched to decode
|
||||
* it and pass UTF8
|
||||
*/
|
||||
static void
|
||||
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
||||
const xmlChar *encoding;
|
||||
|
||||
if (!attvalue)
|
||||
return;
|
||||
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
|
||||
if (encoding != NULL) {
|
||||
encoding += 7;
|
||||
}
|
||||
/*
|
||||
* skip blank
|
||||
*/
|
||||
if (encoding && IS_BLANK_CH(*encoding))
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
|
||||
if (encoding && *encoding == '=') {
|
||||
encoding ++;
|
||||
htmlCheckEncodingDirect(ctxt, encoding);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlCheckMeta:
|
||||
* @ctxt: an HTML parser context
|
||||
@ -3559,6 +3582,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
||||
if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
|
||||
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
|
||||
http = 1;
|
||||
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
|
||||
htmlCheckEncodingDirect(ctxt, value);
|
||||
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
|
||||
content = value;
|
||||
att = atts[i++];
|
||||
|
7
result/HTML/html5_enc.html
Normal file
7
result/HTML/html5_enc.html
Normal file
@ -0,0 +1,7 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head><meta charset="iso-8859-1"></head>
|
||||
<body>
|
||||
<p>très</p>
|
||||
</body>
|
||||
</html>
|
0
result/HTML/html5_enc.html.err
Normal file
0
result/HTML/html5_enc.html.err
Normal file
30
result/HTML/html5_enc.html.sax
Normal file
30
result/HTML/html5_enc.html.sax
Normal file
@ -0,0 +1,30 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.startElement(html)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.startElement(head)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.startElement(meta, charset='iso-8859-1')
|
||||
SAX.endElement(meta)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.endElement(head)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.startElement(body)
|
||||
SAX.characters(
|
||||
, 3)
|
||||
SAX.startElement(p)
|
||||
SAX.characters(très, 5)
|
||||
SAX.endElement(p)
|
||||
SAX.characters(
|
||||
, 1)
|
||||
SAX.endElement(body)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.endElement(html)
|
||||
SAX.ignorableWhitespace(
|
||||
, 1)
|
||||
SAX.endDocument()
|
8
test/HTML/html5_enc.html
Normal file
8
test/HTML/html5_enc.html
Normal file
@ -0,0 +1,8 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="iso-8859-1"/>
|
||||
</head>
|
||||
<body>
|
||||
<p>très</p>
|
||||
</body>
|
||||
</html>
|
Loading…
x
Reference in New Issue
Block a user