- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8

string oriented functions and started cleaning the related areas in xpath.c which needed fixing in this respect Daniel
2025-03-28 21:33:13 +00:00 · 2001-05-30 19:14:17 +00:00 · 2001-05-30 19:14:17 +00:00 · 97ac13197c
commit 97ac13197c
parent 2d70372ce3
5 changed files with 325 additions and 57 deletions
--- a/6
+++ b/6
@ -1,3 +1,9 @@
+Wed May 30 21:12:45 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
+
+	* xpath.c encoding.[ch]: William M. Brack provided a set of UTF8
+	  string oriented functions and started cleaning the related areas
+	  in xpath.c which needed fixing in this respect
+
 Wed May 30 20:30:47 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>

 	* HTMLtree.c: applied patch from Jaroslaw Kolakowski to close bug
--- a/encoding.c
+++ b/encoding.c
@ -13,11 +13,14 @@
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
- * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
- *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
+ *
+ * UTF8 string routines from:
+ * "William M. Brack" <wbrack@mmm.com.hk>
+ *
+ * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */

 #include "libxml.h"
@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;

 static int xmlLittleEndian = 1;

-/*
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F   0xxxxxxx
- * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
- *
- * I hope we won't use values > 0xFFFF anytime soon !
- */
+/************************************************************************
+ *									*
+ *			Generic UTF8 handling routines			*
+ *									*
+ * From rfc2044: encoding of the Unicode values on UTF-8:		*
+ *									*
+ * UCS-4 range (hex.)           UTF-8 octet sequence (binary)		*
+ * 0000 0000-0000 007F   0xxxxxxx					*
+ * 0000 0080-0000 07FF   110xxxxx 10xxxxxx				*
+ * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 			*
+ *									*
+ * I hope we won't use values > 0xFFFF anytime soon !			*
+ *									*
+ ************************************************************************/

 /**
 * xmlUTF8Strlen:
@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
 * Returns the number of characters in the string or -1 in case of error
 */
 int
-xmlUTF8Strlen(const unsigned char *utf) {
+xmlUTF8Strlen(const xmlChar *utf) {
    int ret = 0;

    if (utf == NULL)
@ -227,6 +234,178 @@ xmlCheckUTF8(const unsigned char *utf)
      return(1);
 }

+/**
+ * xmlUTF8Strsize:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @len:  the number of characters in the array
+ *
+ * storage size of an UTF8 string
+ *
+ * The scan stops at the terminating 0 byte, including when that byte
+ * shows up in the middle of a truncated multi-byte sequence, so the
+ * value returned never reaches beyond the end of the string.
+ *
+ * Returns the storage size (in bytes) of the first @len characters
+ * of @utf, or 0 if @utf is NULL or @len is not positive
+ *
+ */
+
+int
+xmlUTF8Strsize(const xmlChar *utf, int len) {
+    const xmlChar	*ptr=utf;
+    xmlChar	ch;
+
+    if (utf == NULL)
+	return(0);
+    if (len <= 0)
+	return(0);
+
+    while ( len-- > 0) {
+	if ( !*ptr )
+	    break;
+	if ( (ch = *ptr++) & 0x80) {
+	    /*
+	     * one trailing byte per extra leading 1 bit of the first
+	     * byte, but never step over the terminating 0
+	     */
+	    while ( ((ch<<=1) & 0x80) && (*ptr != 0) )
+		ptr++;
+	}
+    }
+    return (ptr - utf);
+}
+
+
+/**
+ * xmlUTF8Strndup:
+ * @utf:  the input UTF8 *
+ * @len:  the len of @utf (in chars)
+ *
+ * a strndup for array of UTF8's: the first @len characters (not bytes)
+ * of @utf are copied into a newly allocated, 0 terminated buffer
+ *
+ * Returns a new UTF8 * or NULL if @utf is NULL, @len is negative or
+ * the allocation failed
+ */
+xmlChar *
+xmlUTF8Strndup(const xmlChar *utf, int len) {
+    xmlChar *ret;
+    int i;
+    
+    if ((utf == NULL) || (len < 0)) return(NULL);
+    /* i is the size in bytes of the first len characters */
+    i = xmlUTF8Strsize(utf, len);
+    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
+    if (ret == NULL) {
+	/* report the byte count that was requested, not the char count */
+        xmlGenericError(xmlGenericErrorContext,
+		"malloc of %ld byte failed\n",
+	        (i + 1) * (long)sizeof(xmlChar));
+        return(NULL);
+    }
+    memcpy(ret, utf, i * sizeof(xmlChar));
+    ret[i] = 0;
+    return(ret);
+}
+
+/**
+ * xmlUTF8Strpos:
+ * @utf:  the input UTF8 *
+ * @pos:  the position of the desired UTF8 char (in chars)
+ *
+ * UTF-8 aware equivalent of indexing into a string: walks over @pos
+ * characters, checking the byte layout of each one on the way
+ *
+ * Returns a pointer to the UTF8 character or NULL if @utf is NULL,
+ * @pos is out of range or a malformed sequence is met
+ */
+xmlChar *
+xmlUTF8Strpos(const xmlChar *utf, int pos) {
+    const xmlChar *cur = utf;
+    xmlChar lead;
+    int remaining;
+
+    if (cur == NULL)
+	return(NULL);
+    if (pos < 0)
+	return(NULL);
+    if (pos >= xmlUTF8Strlen(cur))
+	return(NULL);
+
+    for (remaining = pos; remaining > 0; remaining--) {
+	lead = *cur++;
+	if (lead == 0)
+	    return(NULL);
+	/* plain ascii, nothing more to skip */
+	if ((lead & 0x80) == 0)
+	    continue;
+	/* a leading byte must start with the bits 11 */
+	if ((lead & 0xc0) != 0xc0)
+	    return(NULL);
+	/* each further leading 1 bit announces one 10xxxxxx byte */
+	for (lead <<= 1; lead & 0x80; lead <<= 1) {
+	    if ((*cur++ & 0xc0) != 0x80)
+		return(NULL);
+	}
+    }
+    return((xmlChar *) cur);
+}
+
+/**
+ * xmlUTF8Strloc:
+ * @utf:  the input UTF8 *
+ * @utfchar:  the UTF8 character to be found
+ *
+ * Looks for the first character of @utfchar in @utf, comparing at each
+ * character boundary of @utf
+ *
+ * Returns the relative character position of the desired char
+ * or -1 if not found, if an argument is NULL or if @utf is malformed
+ */
+int
+xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
+    const xmlChar *cur;
+    xmlChar lead;
+    int index = 0;
+    int nbytes;
+
+    if ((utf == NULL) || (utfchar == NULL))
+	return(-1);
+
+    /* number of bytes used by the character searched for */
+    nbytes = xmlUTF8Strsize(utfchar, 1);
+    cur = utf;
+    while ((lead = *cur) != 0) {
+	if (xmlStrncmp(cur, utfchar, nbytes) == 0)
+	    return(index);
+	cur++;
+	if (lead & 0x80) {
+	    /* a leading byte must start with the bits 11 */
+	    if ((lead & 0xc0) != 0xc0)
+		return(-1);
+	    /* each further leading 1 bit announces one 10xxxxxx byte */
+	    for (lead <<= 1; lead & 0x80; lead <<= 1) {
+		if ((*cur++ & 0xc0) != 0x80)
+		    return(-1);
+	    }
+	}
+	index++;
+    }
+
+    return(-1);
+}
+/**
+ * xmlUTF8Strsub:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @start: relative pos of first char
+ * @len:   total number to copy
+ *
+ * Extracts a substring of @utf; @start and @len are both counted in
+ * UTF-8 characters, not in bytes
+ *
+ * Returns a pointer to a newly created string
+ * or NULL if any problem
+ */
+
+xmlChar *
+xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
+    const xmlChar *cur = utf;
+    xmlChar lead;
+    int skipped;
+
+    if ((cur == NULL) || (start < 0) || (len < 0))
+	return(NULL);
+
+    /*
+     * Step over the @start characters in front of the substring
+     */
+    skipped = 0;
+    while (skipped < start) {
+	lead = *cur++;
+	if (lead == 0)
+	    return(NULL);
+	if (lead & 0x80) {
+	    /* a leading byte must start with the bits 11 */
+	    if ((lead & 0xc0) != 0xc0)
+		return(NULL);
+	    /* each further leading 1 bit announces one 10xxxxxx byte */
+	    for (lead <<= 1; lead & 0x80; lead <<= 1) {
+		if ((*cur++ & 0xc0) != 0x80)
+		    return(NULL);
+	    }
+	}
+	skipped++;
+    }
+
+    return(xmlUTF8Strndup(cur, len));
+}
+
+/************************************************************************
+ *									*
+ *		Conversions To/From UTF8 encoding			*
+ *									*
+ ************************************************************************/
+
 /**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
    return(0);
 }

+/************************************************************************
+ *									*
+ *		Generic encoding handling routines			*
+ *									*
+ ************************************************************************/
+
 /**
 * xmlDetectCharEncoding:
 * @in:  a pointer to the first bytes of the XML entity, must be at least
@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
    return(NULL);
 }

-/****************************************************************
- *								*
- *		Char encoding handlers				*
- *								*
- ****************************************************************/
+/************************************************************************
+ *									*
+ *			Char encoding handlers				*
+ *									*
+ ************************************************************************/
+

 /* the size should be growable, but it's not a big deal ... */
 #define MAX_ENCODING_HANDLERS 50
@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
    return(NULL);
 }

+/************************************************************************
+ *									*
+ *		ICONV based generic conversion functions		*
+ *									*
+ ************************************************************************/
+
 #ifdef LIBXML_ICONV_ENABLED
 /**
 * xmlIconvWrapper:
@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
 }
 #endif /* LIBXML_ICONV_ENABLED */

+/************************************************************************
+ *									*
+ *		The real API used by libxml for on-the-fly conversion	*
+ *									*
+ ************************************************************************/
+
 /**
 * xmlCharEncFirstLine:
 * @handler:	char enconding transformation data structure
--- a/encoding.h
+++ b/encoding.h
@ -191,8 +191,25 @@ int	isolat1ToUTF8			(unsigned char* out,
 					 int *outlen,
 					 const unsigned char* in,
 					 int *inlen);
+/*
+ * Additional "UTF-8 aware" string routines exported by this module.
+ */
+
 int	xmlCheckUTF8			(const unsigned char *utf);
-int	xmlUTF8Strlen			(const unsigned char *utf);
+
+int	xmlUTF8Strsize			(const xmlChar *utf,
+					 int len);
+xmlChar * xmlUTF8Strndup		(const xmlChar *utf,
+					 int len);
+xmlChar * xmlUTF8Strpos			(const xmlChar *utf,
+					 int pos);
+int	xmlUTF8Strloc			(const xmlChar *utf,
+					 const xmlChar *utfchar);
+xmlChar * xmlUTF8Strsub			(const xmlChar *utf,
+					 int start,
+					 int len);
+
+int	xmlUTF8Strlen			(const xmlChar *utf);

 #ifdef __cplusplus
 }
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@ -191,8 +191,25 @@ int	isolat1ToUTF8			(unsigned char* out,
 					 int *outlen,
 					 const unsigned char* in,
 					 int *inlen);
+/*
+ * Additional "UTF-8 aware" string routines exported by this module.
+ */
+
 int	xmlCheckUTF8			(const unsigned char *utf);
-int	xmlUTF8Strlen			(const unsigned char *utf);
+
+int	xmlUTF8Strsize			(const xmlChar *utf,
+					 int len);
+xmlChar * xmlUTF8Strndup		(const xmlChar *utf,
+					 int len);
+xmlChar * xmlUTF8Strpos			(const xmlChar *utf,
+					 int pos);
+int	xmlUTF8Strloc			(const xmlChar *utf,
+					 const xmlChar *utfchar);
+xmlChar * xmlUTF8Strsub			(const xmlChar *utf,
+					 int start,
+					 int len);
+
+int	xmlUTF8Strlen			(const xmlChar *utf);

 #ifdef __cplusplus
 }
--- a/xpath.c
+++ b/xpath.c
@ -4840,28 +4840,27 @@ xmlXPathStartsWithFunction(xmlXPathParserContextPtr ctxt, int nargs) {
 void
 xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
    xmlXPathObjectPtr str, start, len;
-    double le, in;
-    int i, l;
+    double le=0, in;
+    int i, l, m;
    xmlChar *ret;

-    /* 
-     * TODO: need to be converted to UTF8 strings
-     */
    if (nargs < 2) {
 	CHECK_ARITY(2);
    }
    if (nargs > 3) {
 	CHECK_ARITY(3);
    }
+    /*
+     * take care of possible last (position) argument
+    */
    if (nargs == 3) {
 	CAST_TO_NUMBER;
 	CHECK_TYPE(XPATH_NUMBER);
 	len = valuePop(ctxt);
 	le = len->floatval;
        xmlXPathFreeObject(len);
-    } else {
-	le = 2000000000;
    }
+
    CAST_TO_NUMBER;
    CHECK_TYPE(XPATH_NUMBER);
    start = valuePop(ctxt);
@ -4870,38 +4869,49 @@ xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) {
    CAST_TO_STRING;
    CHECK_TYPE(XPATH_STRING);
    str = valuePop(ctxt);
-    le += in;
+    m = xmlUTF8Strlen((const unsigned char *)str->stringval);

-    /* integer index of the first char */
+    /*
+     * If last pos not present, calculate last position
+    */
+    if (nargs != 3)
+	le = m;
+
+    /*
+     * To meet our requirements, initial index calculations
+     * must be done before we convert to integer format
+     *
+     * First we normalize indices
+     */
+    in -= 1.0;
+    le += in;
+    if (in < 0.0)
+	in = 0.0;
+    if (le > (double)m)
+	le = (double)m;
+
+    /*
+     * Now we go to integer form, rounding up
+     */
    i = (int) in;
    if (((double)i) != in) i++;
    
-    /* integer index of the last char */
    l = (int) le;
    if (((double)l) != le) l++;

-    /* back to a zero based len */
-    i--;
-    l--;
-
-    /* check against the string len */
-    if (l > 1024) {
-        l = xmlStrlen(str->stringval);
-    }
-    if (i < 0) {
-        i = 0;
-    }
+    if (l > m) l=m;

    /* number of chars to copy */
    l -= i;

-    ret = xmlStrsub(str->stringval, i, l);
+    ret = xmlUTF8Strsub(str->stringval, i, l);
    if (ret == NULL)
 	valuePush(ctxt, xmlXPathNewCString(""));
    else {
 	valuePush(ctxt, xmlXPathNewString(ret));
 	xmlFree(ret);
    }
+
    xmlXPathFreeObject(str);
 }

@ -5037,7 +5047,7 @@ xmlXPathNormalizeFunction(xmlXPathParserContextPtr ctxt, int nargs) {
    blank = 0;
    while (*source) {
      if (IS_BLANK(*source)) {
-	blank = *source;
+	blank = 0x20;
      } else {
 	if (blank) {
 	  xmlBufferAdd(target, &blank, 1);
@ -5081,13 +5091,11 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {
    xmlXPathObjectPtr from;
    xmlXPathObjectPtr to;
    xmlBufferPtr target;
-    int i, offset, max;
+    int offset, max;
    xmlChar ch;
-    const xmlChar *point;
+    xmlChar *point;
+    xmlChar *cptr;

-    /* 
-     * TODO: need to be converted to UTF8 strings
-     */
    CHECK_ARITY(3);

    CAST_TO_STRING;
@ -5099,15 +5107,37 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) {

    target = xmlBufferCreate();
    if (target) {
-	max = xmlStrlen(to->stringval);
-	for (i = 0; (ch = str->stringval[i]); i++) {
-	    point = xmlStrchr(from->stringval, ch);
-	    if (point) {
-		offset = (int)(point - from->stringval);
-		if (offset < max)
-		    xmlBufferAdd(target, &to->stringval[offset], 1);
-		} else
-		    xmlBufferAdd(target, &ch, 1);
+	max = xmlUTF8Strlen(to->stringval);
+	for (cptr = str->stringval; (ch=*cptr); ) {
+	    offset = xmlUTF8Strloc(from->stringval, cptr);
+	    if (offset >= 0) {
+		if (offset < max) {
+		    point = xmlUTF8Strpos(to->stringval, offset);
+		    if (point)
+			xmlBufferAdd(target, point, xmlUTF8Strsize(point, 1));
+		}
+	    } else
+		xmlBufferAdd(target, cptr, xmlUTF8Strsize(cptr, 1));
+
+	    /* Step to next character in input */
+	    cptr++;
+	    if ( ch & 0x80 ) {
+		/* if not simple ascii, verify proper format */
+		if ( (ch & 0xc0) != 0xc0 ) {
+		    xmlGenericError(xmlGenericErrorContext,
+			"xmlXPathTranslateFunction: Invalid UTF8 string\n");
+		    break;
+		}
+		/* then skip over remaining bytes for this char */
+		while ( (ch <<= 1) & 0x80 )
+		    if ( (*cptr++ & 0xc0) != 0x80 ) {
+			xmlGenericError(xmlGenericErrorContext,
+			    "xmlXPathTranslateFunction: Invalid UTF8 string\n");
+			break;
+		    }
+		if (ch & 0x80) /* must have had error encountered */
+		    break;
+	    }
 	}
    }
    valuePush(ctxt, xmlXPathNewString(xmlBufferContent(target)));