/*
* HTMLtree.c : implementation of access function for an HTML tree.
*
* See Copyright for the status of this software.
*
* daniel@veillard.com
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED
#include /* for memset() only ! */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "private/buf.h"
#include "private/error.h"
#include "private/io.h"
#include "private/save.h"
/************************************************************************
* *
* Getting/Setting encoding meta tags *
* *
************************************************************************/
/**
 * htmlGetMetaEncoding:
 * @doc:  the document
 *
 * Encoding definition lookup in the Meta tags.
 *
 * Walks the top-level nodes of @doc looking for an "html", "head" or
 * "meta" element, descends html -> head as needed, and then scans the
 * sibling list for the first "meta" element carrying both an
 * http-equiv="Content-Type" attribute and a "content" attribute.  The
 * "charset" parameter of that content value is located without regard
 * to letter case, with at most one blank allowed before the '='.
 *
 * Returns the current encoding as flagged in the HTML source, or NULL if
 * @doc is NULL or no such declaration is found.  The result points into
 * the text of the "content" attribute: it must not be freed by the caller
 * and it runs to the end of that attribute value.
 */
const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) {
    htmlNodePtr cur;
    const xmlChar *content;
    const xmlChar *encoding;

    if (doc == NULL)
        return(NULL);
    cur = doc->children;

    /*
     * Search the html
     */
    while (cur != NULL) {
        if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
            if (xmlStrEqual(cur->name, BAD_CAST"html"))
                break;
            if (xmlStrEqual(cur->name, BAD_CAST"head"))
                goto found_head;
            if (xmlStrEqual(cur->name, BAD_CAST"meta"))
                goto found_meta;
        }
        cur = cur->next;
    }
    if (cur == NULL)
        return(NULL);
    cur = cur->children;

    /*
     * Search the head
     */
    while (cur != NULL) {
        if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
            if (xmlStrEqual(cur->name, BAD_CAST"head"))
                break;
            if (xmlStrEqual(cur->name, BAD_CAST"meta"))
                goto found_meta;
        }
        cur = cur->next;
    }
    if (cur == NULL)
        return(NULL);
found_head:
    cur = cur->children;

    /*
     * Search the meta elements, starting at cur and following the
     * sibling links
     */
found_meta:
    while (cur != NULL) {
        if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
            if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
                xmlAttrPtr attr = cur->properties;
                int http;
                const xmlChar *value;

                content = NULL;
                http = 0;
                while (attr != NULL) {
                    /* only attributes made of a single text node count */
                    if ((attr->children != NULL) &&
                        (attr->children->type == XML_TEXT_NODE) &&
                        (attr->children->next == NULL)) {
                        value = attr->children->content;
                        if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
                         && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
                            http = 1;
                        else if ((value != NULL)
                         && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
                            content = value;
                        if ((http != 0) && (content != NULL))
                            goto found_content;
                    }
                    attr = attr->next;
                }
            }
        }
        cur = cur->next;
    }
    return(NULL);

found_content:
    /*
     * Parameter names are matched case-insensitively so that spellings
     * such as "charSet=" are recognized as well.
     */
    encoding = xmlStrcasestr(content, BAD_CAST"charset=");
    if (encoding != NULL) {
        encoding += 8;
    } else {
        encoding = xmlStrcasestr(content, BAD_CAST"charset =");
        if (encoding != NULL)
            encoding += 9;
    }
    if (encoding != NULL) {
        /* skip blanks between the '=' and the encoding name */
        while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
    }
    return(encoding);
}
/**
* htmlSetMetaEncoding:
* @doc: the document
* @encoding: the encoding string
*
* Sets the current encoding in the Meta tags
* NOTE: this will not change the document content encoding, just
* the META flag associated.
*
* Returns 0 in case of success and -1 in case of error
*/
int
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
htmlNodePtr cur, meta = NULL, head = NULL;
const xmlChar *content = NULL;
char newcontent[100];
newcontent[0] = 0;
if (doc == NULL)
return(-1);
/* html isn't a real encoding it's just libxml2 way to get entities */
if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
return(-1);
if (encoding != NULL) {
snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
(char *)encoding);
newcontent[sizeof(newcontent) - 1] = 0;
}
cur = doc->children;
/*
* Search the html
*/
while (cur != NULL) {
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
break;
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
goto found_head;
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
goto found_meta;
}
cur = cur->next;
}
if (cur == NULL)
return(-1);
cur = cur->children;
/*
* Search the head
*/
while (cur != NULL) {
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
break;
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
head = cur->parent;
goto found_meta;
}
}
cur = cur->next;
}
if (cur == NULL)
return(-1);
found_head:
head = cur;
if (cur->children == NULL)
goto create;
cur = cur->children;
found_meta:
/*
* Search and update all the remaining the meta elements carrying
* encoding information
*/
while (cur != NULL) {
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
xmlAttrPtr attr = cur->properties;
int http;
const xmlChar *value;
content = NULL;
http = 0;
while (attr != NULL) {
if ((attr->children != NULL) &&
(attr->children->type == XML_TEXT_NODE) &&
(attr->children->next == NULL)) {
value = attr->children->content;
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1;
else
{
if ((value != NULL) &&
(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
content = value;
}
if ((http != 0) && (content != NULL))
break;
}
attr = attr->next;
}
if ((http != 0) && (content != NULL)) {
meta = cur;
break;
}
}
}
cur = cur->next;
}
create:
if (meta == NULL) {
if ((encoding != NULL) && (head != NULL)) {
/*
* Create a new Meta element with the right attributes
*/
meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
if (head->children == NULL)
xmlAddChild(head, meta);
else
xmlAddPrevSibling(head->children, meta);
xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
}
} else {
/* remove the meta tag if NULL is passed */
if (encoding == NULL) {
xmlUnlinkNode(meta);
xmlFreeNode(meta);
}
/* change the document only if there is a real encoding change */
else if (xmlStrcasestr(content, encoding) == NULL) {
xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
}
}
return(0);
}
/**
* booleanHTMLAttrs:
*
* These are the HTML attributes which will be output
* in minimized form, i.e.