encoding

encoding - interface for the encoding conversion functions

Interface for the encoding conversion functions needed for XML basic encoding and iconv() support. Related specs are: RFC 2044 (UTF-8 and UTF-16), F. Yergeau, Alis Technologies; [ISO-10646] UTF-8 and UTF-16 in Annexes; [ISO-8859-1] ISO Latin-1 character codes; [UNICODE] The Unicode Consortium, "The Unicode Standard -- Worldwide Character Encoding -- Version 1.0", Addison-Wesley, Volume 1, 1991, Volume 2, 1992 (UTF-8 is described in Unicode Technical Report #4); [US-ASCII] Coded Character Set -- 7-bit American Standard Code for Information Interchange, ANSI X3.4-1986.

Author(s): Daniel Veillard

Synopsis

#define UTF8Toisolat1;
#define isolat1ToUTF8;
typedef enum xmlCharEncError;
typedef enum xmlCharEncFlags;
typedef enum xmlCharEncoding;
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
typedef xmlCharEncodingHandler * xmlCharEncodingHandlerPtr;
int	xmlAddEncodingAlias		(const char * name, 
const char * alias); int xmlCharEncCloseFunc (xmlCharEncodingHandler * handler); typedef void xmlCharEncConvCtxtDtor (void * vctxt); typedef xmlCharEncError xmlCharEncConvFunc (void * vctxt,
unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int flush); typedef xmlParserErrors xmlCharEncConvImpl (void * vctxt,
const char * name,
xmlCharEncFlags flags,
xmlCharEncodingHandler ** out); int xmlCharEncFirstLine (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); int xmlCharEncInFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); xmlParserErrors xmlCharEncNewCustomHandler (const char * name,
xmlCharEncConvFunc input,
xmlCharEncConvFunc output,
xmlCharEncConvCtxtDtor ctxtDtor,
void * inputCtxt,
void * outputCtxt,
xmlCharEncodingHandler ** out); int xmlCharEncOutFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); typedef int xmlCharEncodingInputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); typedef int xmlCharEncodingOutputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); void xmlCleanupCharEncodingHandlers (void); void xmlCleanupEncodingAliases (void); xmlParserErrors xmlCreateCharEncodingHandler (const char * name,
xmlCharEncFlags flags,
xmlCharEncConvImpl impl,
void * implCtxt,
xmlCharEncodingHandler ** out); int xmlDelEncodingAlias (const char * alias); xmlCharEncoding xmlDetectCharEncoding (const unsigned char * in,
int len); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler (const char * name); xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler (xmlCharEncoding enc); const char * xmlGetCharEncodingName (xmlCharEncoding enc); const char * xmlGetEncodingAlias (const char * alias); void xmlInitCharEncodingHandlers (void); int xmlIsolat1ToUTF8 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); xmlParserErrors xmlLookupCharEncodingHandler (xmlCharEncoding enc,
xmlCharEncodingHandler ** out); xmlCharEncodingHandlerPtr xmlNewCharEncodingHandler (const char * name,
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output); xmlParserErrors xmlOpenCharEncodingHandler (const char * name,
int output,
xmlCharEncodingHandler ** out); xmlCharEncoding xmlParseCharEncoding (const char * name); void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); int xmlUTF8ToIsolat1 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen);

Description

Details

Macro UTF8Toisolat1

#define UTF8Toisolat1;


Macro isolat1ToUTF8

#define isolat1ToUTF8;



Enum xmlCharEncFlags

enum xmlCharEncFlags {
    XML_ENC_INPUT = 1
    XML_ENC_OUTPUT = 2
};


Enum xmlCharEncoding

enum xmlCharEncoding {
    XML_CHAR_ENCODING_ERROR = -1 /* No char encoding detected */
    XML_CHAR_ENCODING_NONE = 0 /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8 = 1 /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE = 2 /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE = 3 /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE = 4 /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE = 5 /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC = 6 /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 = 7 /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 = 8 /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2 = 9 /* UCS-2 */
    XML_CHAR_ENCODING_8859_1 = 10 /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2 = 11 /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3 = 12 /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4 = 13 /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5 = 14 /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6 = 15 /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7 = 16 /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8 = 17 /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9 = 18 /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP = 19 /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20 /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP = 21 /* EUC-JP */
    XML_CHAR_ENCODING_ASCII = 22 /* pure ASCII Available since 2.14.0 */
    XML_CHAR_ENCODING_UTF16 = 23 /* UTF-16 native */
    XML_CHAR_ENCODING_HTML = 24 /* HTML (output only) */
    XML_CHAR_ENCODING_8859_10 = 25 /* ISO-8859-10 */
    XML_CHAR_ENCODING_8859_11 = 26 /* ISO-8859-11 */
    XML_CHAR_ENCODING_8859_13 = 27 /* ISO-8859-13 */
    XML_CHAR_ENCODING_8859_14 = 28 /* ISO-8859-14 */
    XML_CHAR_ENCODING_8859_15 = 29 /* ISO-8859-15 */
    XML_CHAR_ENCODING_8859_16 = 30 /*  ISO-8859-16 */
};


Structure xmlCharEncodingHandler

struct _xmlCharEncodingHandler {
    char *	name
    void *	inputCtxt
    void *	outputCtxt
    xmlCharEncConvCtxtDtor	ctxtDtor
    int	flags
} xmlCharEncodingHandler;


Typedef xmlCharEncodingHandlerPtr

xmlCharEncodingHandler * xmlCharEncodingHandlerPtr;


Function type xmlCharEncConvCtxtDtor

void	xmlCharEncConvCtxtDtor		(void * vctxt)

Free a conversion context.

vctxt: conversion context

Function type xmlCharEncConvFunc

xmlCharEncError	xmlCharEncConvFunc	(void * vctxt, 
unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int flush)

Convert between character encodings. The value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced. If the converter can consume partial multi-byte sequences, the @flush flag can be used to detect truncated sequences at EOF. Otherwise, the flag can be ignored.

vctxt: conversion context
out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of input bytes
inlen: the length of @in
flush: end of input
Returns: an XML_ENC_ERR code.

Function type xmlCharEncConvImpl

xmlParserErrors	xmlCharEncConvImpl	(void * vctxt, 
const char * name,
xmlCharEncFlags flags,
xmlCharEncodingHandler ** out)

If this function returns XML_ERR_OK, it must fill the @out pointer with an encoding handler. The handler can be obtained from xmlCharEncNewCustomHandler. @flags can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both.

vctxt: user data
name: encoding name
flags: bit mask of flags
out: pointer to resulting handler
Returns: an xmlParserErrors code.

Function type xmlCharEncodingInputFunc

int	xmlCharEncodingInputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Convert characters to UTF-8. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.

out: a pointer to an array of bytes to store the UTF-8 result
outlen: the length of @out
in: a pointer to an array of chars in the original encoding
inlen: the length of @in
Returns: the number of bytes written or an XML_ENC_ERR code.

Function type xmlCharEncodingOutputFunc

int	xmlCharEncodingOutputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Convert characters from UTF-8. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.

out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of UTF-8 chars
inlen: the length of @in
Returns: the number of bytes written or an XML_ENC_ERR code.

xmlAddEncodingAlias ()

int	xmlAddEncodingAlias		(const char * name, 
const char * alias)

DEPRECATED: This function modifies global state and is not thread-safe. Registers an alias @alias for an encoding named @name. An existing alias will be overwritten.

name: the encoding name as parsed, in UTF-8 format (ASCII actually)
alias: the alias name as parsed, in UTF-8 format (ASCII actually)
Returns: 0 in case of success, -1 in case of error

xmlCharEncCloseFunc ()

int	xmlCharEncCloseFunc		(xmlCharEncodingHandler * handler)

Releases an xmlCharEncodingHandler. Must be called after a handler is no longer in use.

handler: char encoding transformation data structure
Returns: 0.

xmlCharEncFirstLine ()

int	xmlCharEncFirstLine		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

DEPRECATED: Don't use.

handler: char encoding transformation data structure
out: an xmlBuffer for the output.
in: an xmlBuffer for the input
Returns: the number of bytes written or an XML_ENC_ERR code.

xmlCharEncInFunc ()

int	xmlCharEncInFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

Generic front-end for the encoding handler input function

handler: char encoding transformation data structure
out: an xmlBuffer for the output.
in: an xmlBuffer for the input
Returns: the number of bytes written or an XML_ENC_ERR code.

xmlCharEncNewCustomHandler ()

xmlParserErrors	xmlCharEncNewCustomHandler	(const char * name, 
xmlCharEncConvFunc input,
xmlCharEncConvFunc output,
xmlCharEncConvCtxtDtor ctxtDtor,
void * inputCtxt,
void * outputCtxt,
xmlCharEncodingHandler ** out)

Create a custom xmlCharEncodingHandler.

name: the encoding name
input: input callback which converts to UTF-8
output: output callback which converts from UTF-8
ctxtDtor: context destructor
inputCtxt: context for input callback
outputCtxt: context for output callback
out: pointer to resulting handler
Returns: an xmlParserErrors code.

xmlCharEncOutFunc ()

int	xmlCharEncOutFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

Generic front-end for the encoding handler output function. A first call with @in == NULL has to be made first to initialize the output, in case of non-stateless encodings that need to initialize their state or the output (like the BOM in UTF-16). In case of UTF-8 sequence conversion errors for the given encoder, the content will be automatically remapped to a CharRef sequence.

handler: char encoding transformation data structure
out: an xmlBuffer for the output.
in: an xmlBuffer for the input
Returns: the number of bytes written or an XML_ENC_ERR code.

xmlCleanupCharEncodingHandlers ()

void	xmlCleanupCharEncodingHandlers	(void)

DEPRECATED: This function will be made private. Call xmlCleanupParser to free global state but see the warnings there. xmlCleanupParser should be only called once at program exit. In most cases, you don't have to call cleanup functions at all. Cleans up the memory allocated for the char encoding support; it unregisters all the encoding handlers and the aliases.


xmlCleanupEncodingAliases ()

void	xmlCleanupEncodingAliases	(void)

DEPRECATED: This function modifies global state and is not thread-safe. Unregisters all aliases


xmlCreateCharEncodingHandler ()

xmlParserErrors	xmlCreateCharEncodingHandler	(const char * name, 
xmlCharEncFlags flags,
xmlCharEncConvImpl impl,
void * implCtxt,
xmlCharEncodingHandler ** out)

Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - Custom implementation if provided - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. @flags can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both. Available since 2.14.0.

name: a string describing the char encoding.
flags: bit mask of flags
impl: a conversion implementation (optional)
implCtxt: user data for conversion implementation (optional)
out: pointer to result
Returns: XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code.

xmlDelEncodingAlias ()

int	xmlDelEncodingAlias		(const char * alias)

DEPRECATED: This function modifies global state and is not thread-safe. Unregisters an encoding alias @alias

alias: the alias name as parsed, in UTF-8 format (ASCII actually)
Returns: 0 in case of success, -1 in case of error

xmlDetectCharEncoding ()

xmlCharEncoding	xmlDetectCharEncoding	(const unsigned char * in, 
int len)

Guess the encoding of the entity using the first bytes of the entity content according to the non-normative appendix F of the XML-1.0 recommendation.

in: a pointer to the first bytes of the XML entity, must be at least 2 bytes long (at least 4 if encoding is a UCS-4 variant).
len: the length of the buffer
Returns: one of the XML_CHAR_ENCODING_... values.

xmlFindCharEncodingHandler ()

xmlCharEncodingHandlerPtr	xmlFindCharEncodingHandler	(const char * name)

DEPRECATED: Use xmlOpenCharEncodingHandler which has better error reporting. If the encoding is UTF-8, this will return a no-op handler that shouldn't be used.

name: a string describing the char encoding.
Returns: the handler or NULL if no handler was found or an error occurred.

xmlGetCharEncodingHandler ()

xmlCharEncodingHandlerPtr	xmlGetCharEncodingHandler	(xmlCharEncoding enc)

DEPRECATED: Use xmlLookupCharEncodingHandler which has better error reporting.

enc: an xmlCharEncoding value.
Returns: the handler or NULL if no handler was found or an error occurred.

xmlGetCharEncodingName ()

const char *	xmlGetCharEncodingName	(xmlCharEncoding enc)

The "canonical" name for XML encoding. Cf. http://www.w3.org/TR/REC-xml#charencoding, Section 4.3.3 Character Encoding in Entities.

enc: the encoding
Returns: the canonical name for the given encoding

xmlGetEncodingAlias ()

const char *	xmlGetEncodingAlias	(const char * alias)

DEPRECATED: This function is not thread-safe. Lookup an encoding name for the given alias.

alias: the alias name as parsed, in UTF-8 format (ASCII actually)
Returns: NULL if not found, otherwise the original name

xmlInitCharEncodingHandlers ()

void	xmlInitCharEncodingHandlers	(void)

DEPRECATED: Alias for xmlInitParser.


xmlIsolat1ToUTF8 ()

int	xmlIsolat1ToUTF8		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of ISO Latin 1 chars in and try to convert it to a UTF-8 block of chars out.

out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of ISO Latin 1 chars
inlen: the length of @in
Returns: the number of bytes written or an XML_ENC_ERR code. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.

xmlLookupCharEncodingHandler ()

xmlParserErrors	xmlLookupCharEncodingHandler	(xmlCharEncoding enc, 
xmlCharEncodingHandler ** out)

Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. Available since 2.13.0.

enc: an xmlCharEncoding value.
out: pointer to result
Returns: XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code.

xmlNewCharEncodingHandler ()

xmlCharEncodingHandlerPtr	xmlNewCharEncodingHandler	(const char * name, 
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output)

DEPRECATED: This function modifies global state and is not thread-safe. Creates and registers an xmlCharEncodingHandler.

name: the encoding name, in UTF-8 format (ASCII actually)
input: the xmlCharEncodingInputFunc to read that encoding
output: the xmlCharEncodingOutputFunc to write that encoding
Returns: the xmlCharEncodingHandlerPtr created (or NULL in case of error).

xmlOpenCharEncodingHandler ()

xmlParserErrors	xmlOpenCharEncodingHandler	(const char * name, 
int output,
xmlCharEncodingHandler ** out)

Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. Available since 2.13.0.

name: a string describing the char encoding.
output: boolean, use handler for output
out: pointer to result
Returns: XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code.

xmlParseCharEncoding ()

xmlCharEncoding	xmlParseCharEncoding	(const char * name)

Compare the string to the encoding schemes already known. Note that the comparison is case-insensitive, in accordance with section [XML] 4.3.3 Character Encoding in Entities.

name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Returns: one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE if not recognized.

xmlRegisterCharEncodingHandler ()

void	xmlRegisterCharEncodingHandler	(xmlCharEncodingHandlerPtr handler)

DEPRECATED: This function modifies global state and is not thread-safe. Register the char encoding handler.

handler: the xmlCharEncodingHandlerPtr handler block

xmlUTF8ToIsolat1 ()

int	xmlUTF8ToIsolat1		(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 block of chars out.

out: a pointer to an array of bytes to store the result
outlen: the length of @out
in: a pointer to an array of UTF-8 chars
inlen: the length of @in
Returns: the number of bytes written or an XML_ENC_ERR code. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.