mirror of
https://gitlab.gnome.org/GNOME/libxml2
synced 2025-03-28 21:33:13 +00:00
- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
string oriented functions and started cleaning the related areas in xpath.c which needed fixing in this respect Daniel
This commit is contained in:
parent
2d70372ce3
commit
97ac13197c
@ -1,3 +1,9 @@
|
||||
Wed May 30 21:12:45 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
|
||||
|
||||
* xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
|
||||
string oriented functions and started cleaning the related areas
|
||||
in xpath.c which needed fixing in this respect
|
||||
|
||||
Wed May 30 20:30:47 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
|
||||
|
||||
* HTMLtree.c: applied patch from Jaroslaw Kolakowski to close bug
|
||||
|
234
encoding.c
234
encoding.c
@ -13,11 +13,14 @@
|
||||
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
|
||||
* Information Interchange, ANSI X3.4-1986.
|
||||
*
|
||||
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
|
||||
*
|
||||
* See Copyright for the status of this software.
|
||||
*
|
||||
* Daniel.Veillard@w3.org
|
||||
*
|
||||
* UTF8 string routines from:
|
||||
* "William M. Brack" <wbrack@mmm.com.hk>
|
||||
*
|
||||
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
|
||||
*/
|
||||
|
||||
#include "libxml.h"
|
||||
@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;
|
||||
|
||||
static int xmlLittleEndian = 1;
|
||||
|
||||
/*
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
* 0000 0000-0000 007F 0xxxxxxx
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* I hope we won't use values > 0xFFFF anytime soon !
|
||||
*/
|
||||
/************************************************************************
|
||||
* *
|
||||
* Generic UTF8 handling routines *
|
||||
* *
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8: *
|
||||
* *
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
|
||||
* 0000 0000-0000 007F 0xxxxxxx *
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
|
||||
* *
|
||||
* I hope we won't use values > 0xFFFF anytime soon ! *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
/**
|
||||
* xmlUTF8Strlen:
|
||||
@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
|
||||
* Returns the number of characters in the string or -1 in case of error
|
||||
*/
|
||||
int
|
||||
xmlUTF8Strlen(const unsigned char *utf) {
|
||||
xmlUTF8Strlen(const xmlChar *utf) {
|
||||
int ret = 0;
|
||||
|
||||
if (utf == NULL)
|
||||
@ -227,6 +234,178 @@ xmlCheckUTF8(const unsigned char *utf)
|
||||
return(1);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlUTF8Strsize:
|
||||
* @utf: a sequence of UTF-8 encoded bytes
|
||||
* @len: the number of characters in the array
|
||||
*
|
||||
* storage size of an UTF8 string
|
||||
*
|
||||
* Returns the storage size of
|
||||
* the first 'len' characters of ARRAY
|
||||
*
|
||||
*/
|
||||
|
||||
int
|
||||
xmlUTF8Strsize(const xmlChar *utf, int len) {
|
||||
const xmlChar *ptr=utf;
|
||||
xmlChar ch;
|
||||
|
||||
if (len <= 0)
|
||||
return(0);
|
||||
|
||||
while ( len-- > 0) {
|
||||
if ( !*ptr )
|
||||
break;
|
||||
if ( (ch = *ptr++) & 0x80)
|
||||
while ( (ch<<=1) & 0x80 )
|
||||
ptr++;
|
||||
}
|
||||
return (ptr - utf);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* xmlUTF8Strndup:
|
||||
* @utf: the input UTF8 *
|
||||
* @len: the len of @utf (in chars)
|
||||
*
|
||||
* a strndup for array of UTF8's
|
||||
*
|
||||
* Returns a new UTF8 * or NULL
|
||||
*/
|
||||
xmlChar *
|
||||
xmlUTF8Strndup(const xmlChar *utf, int len) {
|
||||
xmlChar *ret;
|
||||
int i;
|
||||
|
||||
if ((utf == NULL) || (len < 0)) return(NULL);
|
||||
i = xmlUTF8Strsize(utf, len);
|
||||
ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
|
||||
if (ret == NULL) {
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"malloc of %ld byte failed\n",
|
||||
(len + 1) * (long)sizeof(xmlChar));
|
||||
return(NULL);
|
||||
}
|
||||
memcpy(ret, utf, i * sizeof(xmlChar));
|
||||
ret[i] = 0;
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlUTF8Strpos:
|
||||
* @utf: the input UTF8 *
|
||||
* @pos: the position of the desired UTF8 char (in chars)
|
||||
*
|
||||
* a function to provide the equivalent of fetching a
|
||||
* character from a string array
|
||||
*
|
||||
* Returns a pointer to the UTF8 character or NULL
|
||||
*/
|
||||
xmlChar *
|
||||
xmlUTF8Strpos(const xmlChar *utf, int pos) {
|
||||
xmlChar ch;
|
||||
|
||||
if (utf == NULL) return(NULL);
|
||||
if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
|
||||
return(NULL);
|
||||
while (pos--) {
|
||||
if ((ch=*utf++) == 0) return(NULL);
|
||||
if ( ch & 0x80 ) {
|
||||
/* if not simple ascii, verify proper format */
|
||||
if ( (ch & 0xc0) != 0xc0 )
|
||||
return(NULL);
|
||||
/* then skip over remaining bytes for this char */
|
||||
while ( (ch <<= 1) & 0x80 )
|
||||
if ( (*utf++ & 0xc0) != 0x80 )
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
return((xmlChar *)utf);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlUTF8Strloc:
|
||||
* @utf: the input UTF8 *
|
||||
* @utfchar: the UTF8 character to be found
|
||||
*
|
||||
* a function to provide relative location of a UTF8 char
|
||||
*
|
||||
* Returns the relative character position of the desired char
|
||||
* or -1 if not found
|
||||
*/
|
||||
int
|
||||
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
|
||||
int i, size;
|
||||
xmlChar ch;
|
||||
|
||||
if (utf==NULL || utfchar==NULL) return -1;
|
||||
size = xmlUTF8Strsize(utfchar, 1);
|
||||
for(i=0; (ch=*utf) != 0; i++) {
|
||||
if (xmlStrncmp(utf, utfchar, size)==0)
|
||||
return(i);
|
||||
utf++;
|
||||
if ( ch & 0x80 ) {
|
||||
/* if not simple ascii, verify proper format */
|
||||
if ( (ch & 0xc0) != 0xc0 )
|
||||
return(-1);
|
||||
/* then skip over remaining bytes for this char */
|
||||
while ( (ch <<= 1) & 0x80 )
|
||||
if ( (*utf++ & 0xc0) != 0x80 )
|
||||
return(-1);
|
||||
}
|
||||
}
|
||||
|
||||
return(-1);
|
||||
}
|
||||
/**
|
||||
* xmlUTF8Strsub:
|
||||
* @utf: a sequence of UTF-8 encoded bytes
|
||||
*
|
||||
* @start: relative pos of first char
|
||||
* @len: total number to copy
|
||||
*
|
||||
* Note: positions are given in units of UTF-8 chars
|
||||
*
|
||||
* Returns a pointer to a newly created string
|
||||
* or NULL if any problem
|
||||
*/
|
||||
|
||||
xmlChar *
|
||||
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
|
||||
int i;
|
||||
xmlChar ch;
|
||||
|
||||
if (utf == NULL) return(NULL);
|
||||
if (start < 0) return(NULL);
|
||||
if (len < 0) return(NULL);
|
||||
|
||||
/*
|
||||
* Skip over any leading chars
|
||||
*/
|
||||
for (i = 0;i < start;i++) {
|
||||
if ((ch=*utf++) == 0) return(NULL);
|
||||
if ( ch & 0x80 ) {
|
||||
/* if not simple ascii, verify proper format */
|
||||
if ( (ch & 0xc0) != 0xc0 )
|
||||
return(NULL);
|
||||
/* then skip over remaining bytes for this char */
|
||||
while ( (ch <<= 1) & 0x80 )
|
||||
if ( (*utf++ & 0xc0) != 0x80 )
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
|
||||
return(xmlUTF8Strndup(utf, len));
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* Conversions To/From UTF8 encoding *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
/**
|
||||
* asciiToUTF8:
|
||||
* @out: a pointer to an array of bytes to store the result
|
||||
@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
|
||||
return(0);
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* Generic encoding handling routines *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
/**
|
||||
* xmlDetectCharEncoding:
|
||||
* @in: a pointer to the first bytes of the XML entity, must be at least
|
||||
@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
/****************************************************************
|
||||
* *
|
||||
* Char encoding handlers *
|
||||
* *
|
||||
****************************************************************/
|
||||
/************************************************************************
|
||||
* *
|
||||
* Char encoding handlers *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
|
||||
/* the size should be growable, but it's not a big deal ... */
|
||||
#define MAX_ENCODING_HANDLERS 50
|
||||
@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* ICONV based generic conversion functions *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
/**
|
||||
* xmlIconvWrapper:
|
||||
@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* The real API used by libxml for on-the-fly conversion *
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
/**
|
||||
* xmlCharEncFirstLine:
|
||||
* @handler: char encoding transformation data structure
|
||||
|
19
encoding.h
19
encoding.h
@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out,
|
||||
int *outlen,
|
||||
const unsigned char* in,
|
||||
int *inlen);
|
||||
/*
|
||||
* Exports additional "UTF-8 aware" string routines.
|
||||
*/
|
||||
|
||||
int xmlCheckUTF8 (const unsigned char *utf);
|
||||
int xmlUTF8Strlen (const unsigned char *utf);
|
||||
|
||||
int xmlUTF8Strsize (const xmlChar *utf,
|
||||
int len);
|
||||
xmlChar * xmlUTF8Strndup (const xmlChar *utf,
|
||||
int len);
|
||||
xmlChar * xmlUTF8Strpos (const xmlChar *utf,
|
||||
int pos);
|
||||
int xmlUTF8Strloc (const xmlChar *utf,
|
||||
const xmlChar *utfchar);
|
||||
xmlChar * xmlUTF8Strsub (const xmlChar *utf,
|
||||
int start,
|
||||
int len);
|
||||
|
||||
int xmlUTF8Strlen (const xmlChar *utf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out,
|
||||
int *outlen,
|
||||
const unsigned char* in,
|
||||
int *inlen);
|
||||
/*
|
||||
* Exports additional "UTF-8 aware" string routines.
|
||||
*/
|
||||
|
||||
int xmlCheckUTF8 (const unsigned char *utf);
|
||||
int xmlUTF8Strlen (const unsigned char *utf);
|
||||
|
||||
int xmlUTF8Strsize (const xmlChar *utf,
|
||||
int len);
|
||||
xmlChar * xmlUTF8Strndup (const xmlChar *utf,
|
||||
int len);
|
||||
xmlChar * xmlUTF8Strpos (const xmlChar *utf,
|
||||
int pos);
|
||||
int xmlUTF8Strloc (const xmlChar *utf,
|
||||
const xmlChar *utfchar);
|
||||
xmlChar * xmlUTF8Strsub (const xmlChar *utf,
|
||||
int start,
|
||||
int len);
|
||||
|
||||
int xmlUTF8Strlen (const xmlChar *utf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
104
xpath.c
104
xpath.c
@ -4840,28 +4840,27 @@ xmlXPathStartsWithFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
void
|
||||
xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
xmlXPathObjectPtr str, start, len;
|
||||
double le, in;
|
||||
int i, l;
|
||||
double le=0, in;
|
||||
int i, l, m;
|
||||
xmlChar *ret;
|
||||
|
||||
/*
|
||||
* TODO: need to be converted to UTF8 strings
|
||||
*/
|
||||
if (nargs < 2) {
|
||||
CHECK_ARITY(2);
|
||||
}
|
||||
if (nargs > 3) {
|
||||
CHECK_ARITY(3);
|
||||
}
|
||||
/*
|
||||
* take care of possible last (position) argument
|
||||
*/
|
||||
if (nargs == 3) {
|
||||
CAST_TO_NUMBER;
|
||||
CHECK_TYPE(XPATH_NUMBER);
|
||||
len = valuePop(ctxt);
|
||||
le = len->floatval;
|
||||
xmlXPathFreeObject(len);
|
||||
} else {
|
||||
le = 2000000000;
|
||||
}
|
||||
|
||||
CAST_TO_NUMBER;
|
||||
CHECK_TYPE(XPATH_NUMBER);
|
||||
start = valuePop(ctxt);
|
||||
@ -4870,38 +4869,49 @@ xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
CAST_TO_STRING;
|
||||
CHECK_TYPE(XPATH_STRING);
|
||||
str = valuePop(ctxt);
|
||||
le += in;
|
||||
m = xmlUTF8Strlen((const unsigned char *)str->stringval);
|
||||
|
||||
/* integer index of the first char */
|
||||
/*
|
||||
* If last pos not present, calculate last position
|
||||
*/
|
||||
if (nargs != 3)
|
||||
le = m;
|
||||
|
||||
/*
|
||||
* To meet our requirements, initial index calculations
|
||||
* must be done before we convert to integer format
|
||||
*
|
||||
* First we normalize indices
|
||||
*/
|
||||
in -= 1.0;
|
||||
le += in;
|
||||
if (in < 0.0)
|
||||
in = 0.0;
|
||||
if (le > (double)m)
|
||||
le = (double)m;
|
||||
|
||||
/*
|
||||
* Now we go to integer form, rounding up
|
||||
*/
|
||||
i = (int) in;
|
||||
if (((double)i) != in) i++;
|
||||
|
||||
/* integer index of the last char */
|
||||
l = (int) le;
|
||||
if (((double)l) != le) l++;
|
||||
|
||||
/* back to a zero based len */
|
||||
i--;
|
||||
l--;
|
||||
|
||||
/* check against the string len */
|
||||
if (l > 1024) {
|
||||
l = xmlStrlen(str->stringval);
|
||||
}
|
||||
if (i < 0) {
|
||||
i = 0;
|
||||
}
|
||||
if (l > m) l=m;
|
||||
|
||||
/* number of chars to copy */
|
||||
l -= i;
|
||||
|
||||
ret = xmlStrsub(str->stringval, i, l);
|
||||
ret = xmlUTF8Strsub(str->stringval, i, l);
|
||||
if (ret == NULL)
|
||||
valuePush(ctxt, xmlXPathNewCString(""));
|
||||
else {
|
||||
valuePush(ctxt, xmlXPathNewString(ret));
|
||||
xmlFree(ret);
|
||||
}
|
||||
|
||||
xmlXPathFreeObject(str);
|
||||
}
|
||||
|
||||
@ -5037,7 +5047,7 @@ xmlXPathNormalizeFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
blank = 0;
|
||||
while (*source) {
|
||||
if (IS_BLANK(*source)) {
|
||||
blank = *source;
|
||||
blank = 0x20;
|
||||
} else {
|
||||
if (blank) {
|
||||
xmlBufferAdd(target, &blank, 1);
|
||||
@ -5081,13 +5091,11 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
xmlXPathObjectPtr from;
|
||||
xmlXPathObjectPtr to;
|
||||
xmlBufferPtr target;
|
||||
int i, offset, max;
|
||||
int offset, max;
|
||||
xmlChar ch;
|
||||
const xmlChar *point;
|
||||
xmlChar *point;
|
||||
xmlChar *cptr;
|
||||
|
||||
/*
|
||||
* TODO: need to be converted to UTF8 strings
|
||||
*/
|
||||
CHECK_ARITY(3);
|
||||
|
||||
CAST_TO_STRING;
|
||||
@ -5099,15 +5107,37 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {
|
||||
|
||||
target = xmlBufferCreate();
|
||||
if (target) {
|
||||
max = xmlStrlen(to->stringval);
|
||||
for (i = 0; (ch = str->stringval[i]); i++) {
|
||||
point = xmlStrchr(from->stringval, ch);
|
||||
if (point) {
|
||||
offset = (int)(point - from->stringval);
|
||||
if (offset < max)
|
||||
xmlBufferAdd(target, &to->stringval[offset], 1);
|
||||
} else
|
||||
xmlBufferAdd(target, &ch, 1);
|
||||
max = xmlUTF8Strlen(to->stringval);
|
||||
for (cptr = str->stringval; (ch=*cptr); ) {
|
||||
offset = xmlUTF8Strloc(from->stringval, cptr);
|
||||
if (offset >= 0) {
|
||||
if (offset < max) {
|
||||
point = xmlUTF8Strpos(to->stringval, offset);
|
||||
if (point)
|
||||
xmlBufferAdd(target, point, xmlUTF8Strsize(point, 1));
|
||||
}
|
||||
} else
|
||||
xmlBufferAdd(target, cptr, xmlUTF8Strsize(cptr, 1));
|
||||
|
||||
/* Step to next character in input */
|
||||
cptr++;
|
||||
if ( ch & 0x80 ) {
|
||||
/* if not simple ascii, verify proper format */
|
||||
if ( (ch & 0xc0) != 0xc0 ) {
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"xmlXPathTranslateFunction: Invalid UTF8 string\n");
|
||||
break;
|
||||
}
|
||||
/* then skip over remaining bytes for this char */
|
||||
while ( (ch <<= 1) & 0x80 )
|
||||
if ( (*cptr++ & 0xc0) != 0x80 ) {
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"xmlXPathTranslateFunction: Invalid UTF8 string\n");
|
||||
break;
|
||||
}
|
||||
if (ch & 0x80) /* must have had error encountered */
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
valuePush(ctxt, xmlXPathNewString(xmlBufferContent(target)));
|
||||
|
Loading…
x
Reference in New Issue
Block a user