- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8

string oriented functions and started cleaning the related areas
  in xpath.c which needed fixing in this respect
Daniel
This commit is contained in:
Daniel Veillard 2001-05-30 19:14:17 +00:00
parent 2d70372ce3
commit 97ac13197c
5 changed files with 325 additions and 57 deletions

View File

@ -1,3 +1,9 @@
Wed May 30 21:12:45 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
* xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
string oriented functions and started cleaning the related areas
in xpath.c which needed fixing in this respect
Wed May 30 20:30:47 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
* HTMLtree.c: applied patch from Jaroslaw Kolakowski to close bug

View File

@ -13,11 +13,14 @@
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
* Information Interchange, ANSI X3.4-1986.
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*
* See Copyright for the status of this software.
*
* Daniel.Veillard@w3.org
*
* UTF8 string routines from:
* "William M. Brack" <wbrack@mmm.com.hk>
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*/
#include "libxml.h"
@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;
static int xmlLittleEndian = 1;
/*
* From rfc2044: encoding of the Unicode values on UTF-8:
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*
* I hope we won't use values > 0xFFFF anytime soon !
*/
/************************************************************************
* *
* Generic UTF8 handling routines *
* *
* From rfc2044: encoding of the Unicode values on UTF-8: *
* *
* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
* 0000 0000-0000 007F 0xxxxxxx *
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
* *
* I hope we won't use values > 0xFFFF anytime soon ! *
* *
************************************************************************/
/**
* xmlUTF8Strlen:
@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
* Returns the number of characters in the string or -1 in case of error
*/
int
xmlUTF8Strlen(const unsigned char *utf) {
xmlUTF8Strlen(const xmlChar *utf) {
int ret = 0;
if (utf == NULL)
@ -227,6 +234,178 @@ xmlCheckUTF8(const unsigned char *utf)
return(1);
}
/**
 * xmlUTF8Strsize:
 * @utf:  a NUL terminated sequence of UTF-8 encoded bytes, may be NULL
 * @len:  the number of characters to measure
 *
 * Computes the storage size, in bytes, of the first @len characters
 * of @utf.  The scan stops at the NUL terminator, so the result never
 * extends past the end of the string, even when the final multi-byte
 * sequence is truncated.  Trailing bytes are skipped but not validated.
 *
 * Returns the number of bytes occupied by the first @len characters
 *         (or by the whole string when it holds fewer than @len
 *         characters), 0 if @utf is NULL or @len is not positive
 */
int
xmlUTF8Strsize(const xmlChar *utf, int len) {
    const xmlChar *ptr = utf;
    xmlChar ch;

    if (utf == NULL)
        return(0);
    if (len <= 0)
        return(0);

    while (len-- > 0) {
        if (*ptr == 0)
            break;
        ch = *ptr++;
        if (ch & 0x80) {
            /*
             * Each additional leading 1 bit in the first byte stands
             * for one trailing byte.  Skip them, but never step over
             * the terminator: a truncated sequence must not push the
             * reported size beyond the end of the string.
             */
            while ((ch <<= 1) & 0x80) {
                if (*ptr == 0)
                    break;
                ptr++;
            }
        }
    }
    return((int) (ptr - utf));
}
/**
 * xmlUTF8Strndup:
 * @utf:  the input UTF-8 string
 * @len:  the number of characters (not bytes) of @utf to copy
 *
 * A strndup for UTF-8 strings: duplicates the first @len characters
 * of @utf into a freshly allocated, NUL terminated buffer.  The byte
 * count to copy is obtained from xmlUTF8Strsize().
 *
 * Returns a new string obtained from xmlMalloc(), or NULL if @utf is
 *         NULL, @len is negative or the allocation failed
 */
xmlChar *
xmlUTF8Strndup(const xmlChar *utf, int len) {
    xmlChar *ret;
    int i;

    if ((utf == NULL) || (len < 0))
        return(NULL);

    /* i is the size in bytes of the first len characters */
    i = xmlUTF8Strsize(utf, len);
    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
    if (ret == NULL) {
        /* report the byte count actually requested, not the char count */
        xmlGenericError(xmlGenericErrorContext,
                "malloc of %ld byte failed\n",
                (i + 1) * (long)sizeof(xmlChar));
        return(NULL);
    }
    memcpy(ret, utf, i * sizeof(xmlChar));
    ret[i] = 0;
    return(ret);
}
/**
 * xmlUTF8Strpos:
 * @utf:  the input UTF-8 string
 * @pos:  the zero based index of the wanted character (in characters)
 *
 * The UTF-8 counterpart of indexing into a character array: walks
 * over @pos characters and yields the address of the next one.
 *
 * Returns a pointer to the first byte of the character at index @pos,
 *         or NULL if @utf is NULL, @pos is negative or not smaller
 *         than xmlUTF8Strlen(@utf), or a malformed sequence is met
 */
xmlChar *
xmlUTF8Strpos(const xmlChar *utf, int pos) {
    const xmlChar *cur;
    xmlChar lead;
    int remaining;

    if (utf == NULL)
        return(NULL);
    if (pos < 0)
        return(NULL);
    if (pos >= xmlUTF8Strlen(utf))
        return(NULL);

    cur = utf;
    for (remaining = pos; remaining != 0; remaining--) {
        lead = *cur++;
        if (lead == 0)
            return(NULL);
        if ((lead & 0x80) == 0)
            continue;               /* plain ASCII, a single byte */

        /* a multi-byte character has to start with the bits 11 */
        if ((lead & 0xc0) != 0xc0)
            return(NULL);

        /* one trailing byte of the form 10xxxxxx per extra leading 1 bit */
        for (lead <<= 1; lead & 0x80; lead <<= 1) {
            if ((*cur & 0xc0) != 0x80)
                return(NULL);
            cur++;
        }
    }
    return((xmlChar *) cur);
}
/**
 * xmlUTF8Strloc:
 * @utf:  the UTF-8 string to search
 * @utfchar:  pointer to the UTF-8 character to look for
 *
 * Finds the first occurrence of the character starting at @utfchar
 * inside @utf.  Candidates are compared byte-wise over the size
 * reported by xmlUTF8Strsize(@utfchar, 1).
 *
 * Returns the position of the match counted in characters, or -1 if
 *         an argument is NULL, the character is not found, or a
 *         malformed sequence is met in @utf before a match
 */
int
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
    const xmlChar *cur;
    xmlChar lead;
    int charpos = 0;
    int nbytes;

    if ((utf == NULL) || (utfchar == NULL))
        return(-1);

    /* byte length of the single character being searched for */
    nbytes = xmlUTF8Strsize(utfchar, 1);

    cur = utf;
    while ((lead = *cur) != 0) {
        if (xmlStrncmp(cur, utfchar, nbytes) == 0)
            return(charpos);

        /* no match here: move on to the start of the next character */
        cur++;
        if (lead & 0x80) {
            /* a multi-byte character has to start with the bits 11 */
            if ((lead & 0xc0) != 0xc0)
                return(-1);
            /* every trailing byte must look like 10xxxxxx */
            for (lead <<= 1; lead & 0x80; lead <<= 1) {
                if ((*cur & 0xc0) != 0x80)
                    return(-1);
                cur++;
            }
        }
        charpos++;
    }
    return(-1);
}
/**
 * xmlUTF8Strsub:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @start:  zero based index of the first character to copy
 * @len:  the number of characters to copy
 *
 * Extracts a substring of @utf.  Both @start and @len are expressed
 * in UTF-8 characters, not in bytes.  The leading @start characters
 * are stepped over, then the copy is delegated to xmlUTF8Strndup().
 *
 * Returns a newly created string, or NULL if @utf is NULL, @start or
 *         @len is negative, the string ends or is malformed within
 *         the first @start characters, or the duplication failed
 */
xmlChar *
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
    const xmlChar *cur;
    xmlChar lead;
    int skipped;

    if ((utf == NULL) || (start < 0) || (len < 0))
        return(NULL);

    /*
     * Step over the characters located before the substring
     */
    cur = utf;
    skipped = 0;
    while (skipped < start) {
        lead = *cur++;
        if (lead == 0)
            return(NULL);
        if (lead & 0x80) {
            /* a multi-byte character has to start with the bits 11 */
            if ((lead & 0xc0) != 0xc0)
                return(NULL);
            /* every trailing byte must look like 10xxxxxx */
            for (lead <<= 1; lead & 0x80; lead <<= 1) {
                if ((*cur & 0xc0) != 0x80)
                    return(NULL);
                cur++;
            }
        }
        skipped++;
    }

    return(xmlUTF8Strndup(cur, len));
}
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
* *
************************************************************************/
/**
* asciiToUTF8:
* @out: a pointer to an array of bytes to store the result
@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
return(0);
}
/************************************************************************
* *
* Generic encoding handling routines *
* *
************************************************************************/
/**
* xmlDetectCharEncoding:
* @in: a pointer to the first bytes of the XML entity, must be at least
@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
return(NULL);
}
/****************************************************************
* *
* Char encoding handlers *
* *
****************************************************************/
/************************************************************************
* *
* Char encoding handlers *
* *
************************************************************************/
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
return(NULL);
}
/************************************************************************
* *
* ICONV based generic conversion functions *
* *
************************************************************************/
#ifdef LIBXML_ICONV_ENABLED
/**
* xmlIconvWrapper:
@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
}
#endif /* LIBXML_ICONV_ENABLED */
/************************************************************************
* *
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
/**
* xmlCharEncFirstLine:
* @handler: char encoding transformation data structure

View File

@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out,
int *outlen,
const unsigned char* in,
int *inlen);
/*
* Export additional "UTF-8 aware" string routines.
*/
int xmlCheckUTF8 (const unsigned char *utf);
int xmlUTF8Strlen (const unsigned char *utf);
int xmlUTF8Strsize (const xmlChar *utf,
int len);
xmlChar * xmlUTF8Strndup (const xmlChar *utf,
int len);
xmlChar * xmlUTF8Strpos (const xmlChar *utf,
int pos);
int xmlUTF8Strloc (const xmlChar *utf,
const xmlChar *utfchar);
xmlChar * xmlUTF8Strsub (const xmlChar *utf,
int start,
int len);
int xmlUTF8Strlen (const xmlChar *utf);
#ifdef __cplusplus
}

View File

@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out,
int *outlen,
const unsigned char* in,
int *inlen);
/*
* Export additional "UTF-8 aware" string routines.
*/
int xmlCheckUTF8 (const unsigned char *utf);
int xmlUTF8Strlen (const unsigned char *utf);
int xmlUTF8Strsize (const xmlChar *utf,
int len);
xmlChar * xmlUTF8Strndup (const xmlChar *utf,
int len);
xmlChar * xmlUTF8Strpos (const xmlChar *utf,
int pos);
int xmlUTF8Strloc (const xmlChar *utf,
const xmlChar *utfchar);
xmlChar * xmlUTF8Strsub (const xmlChar *utf,
int start,
int len);
int xmlUTF8Strlen (const xmlChar *utf);
#ifdef __cplusplus
}

104
xpath.c
View File

@ -4840,28 +4840,27 @@ xmlXPathStartsWithFunction(xmlXPathParserContextPtr ctxt, int nargs) {
void
xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
xmlXPathObjectPtr str, start, len;
double le, in;
int i, l;
double le=0, in;
int i, l, m;
xmlChar *ret;
/*
* TODO: need to be converted to UTF8 strings
*/
if (nargs < 2) {
CHECK_ARITY(2);
}
if (nargs > 3) {
CHECK_ARITY(3);
}
/*
* take care of possible last (position) argument
*/
if (nargs == 3) {
CAST_TO_NUMBER;
CHECK_TYPE(XPATH_NUMBER);
len = valuePop(ctxt);
le = len->floatval;
xmlXPathFreeObject(len);
} else {
le = 2000000000;
}
CAST_TO_NUMBER;
CHECK_TYPE(XPATH_NUMBER);
start = valuePop(ctxt);
@ -4870,38 +4869,49 @@ xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
CAST_TO_STRING;
CHECK_TYPE(XPATH_STRING);
str = valuePop(ctxt);
le += in;
m = xmlUTF8Strlen((const unsigned char *)str->stringval);
/* integer index of the first char */
/*
* If last pos not present, calculate last position
*/
if (nargs != 3)
le = m;
/*
* To meet our requirements, initial index calculations
* must be done before we convert to integer format
*
* First we normalize indices
*/
in -= 1.0;
le += in;
if (in < 0.0)
in = 0.0;
if (le > (double)m)
le = (double)m;
/*
* Now we go to integer form, rounding up
*/
i = (int) in;
if (((double)i) != in) i++;
/* integer index of the last char */
l = (int) le;
if (((double)l) != le) l++;
/* back to a zero based len */
i--;
l--;
/* check against the string len */
if (l > 1024) {
l = xmlStrlen(str->stringval);
}
if (i < 0) {
i = 0;
}
if (l > m) l=m;
/* number of chars to copy */
l -= i;
ret = xmlStrsub(str->stringval, i, l);
ret = xmlUTF8Strsub(str->stringval, i, l);
if (ret == NULL)
valuePush(ctxt, xmlXPathNewCString(""));
else {
valuePush(ctxt, xmlXPathNewString(ret));
xmlFree(ret);
}
xmlXPathFreeObject(str);
}
@ -5037,7 +5047,7 @@ xmlXPathNormalizeFunction(xmlXPathParserContextPtr ctxt, int nargs) {
blank = 0;
while (*source) {
if (IS_BLANK(*source)) {
blank = *source;
blank = 0x20;
} else {
if (blank) {
xmlBufferAdd(target, &blank, 1);
@ -5081,13 +5091,11 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {
xmlXPathObjectPtr from;
xmlXPathObjectPtr to;
xmlBufferPtr target;
int i, offset, max;
int offset, max;
xmlChar ch;
const xmlChar *point;
xmlChar *point;
xmlChar *cptr;
/*
* TODO: need to be converted to UTF8 strings
*/
CHECK_ARITY(3);
CAST_TO_STRING;
@ -5099,15 +5107,37 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {
target = xmlBufferCreate();
if (target) {
max = xmlStrlen(to->stringval);
for (i = 0; (ch = str->stringval[i]); i++) {
point = xmlStrchr(from->stringval, ch);
if (point) {
offset = (int)(point - from->stringval);
if (offset < max)
xmlBufferAdd(target, &to->stringval[offset], 1);
} else
xmlBufferAdd(target, &ch, 1);
max = xmlUTF8Strlen(to->stringval);
for (cptr = str->stringval; (ch=*cptr); ) {
offset = xmlUTF8Strloc(from->stringval, cptr);
if (offset >= 0) {
if (offset < max) {
point = xmlUTF8Strpos(to->stringval, offset);
if (point)
xmlBufferAdd(target, point, xmlUTF8Strsize(point, 1));
}
} else
xmlBufferAdd(target, cptr, xmlUTF8Strsize(cptr, 1));
/* Step to next character in input */
cptr++;
if ( ch & 0x80 ) {
/* if not simple ascii, verify proper format */
if ( (ch & 0xc0) != 0xc0 ) {
xmlGenericError(xmlGenericErrorContext,
"xmlXPathTranslateFunction: Invalid UTF8 string\n");
break;
}
/* then skip over remaining bytes for this char */
while ( (ch <<= 1) & 0x80 )
if ( (*cptr++ & 0xc0) != 0x80 ) {
xmlGenericError(xmlGenericErrorContext,
"xmlXPathTranslateFunction: Invalid UTF8 string\n");
break;
}
if (ch & 0x80) /* must have had error encountered */
break;
}
}
}
valuePush(ctxt, xmlXPathNewString(xmlBufferContent(target)));