![]() |
![]() |
![]() |
![]() |
libxml2 Reference Manual |
---|
encoding - interface for the encoding conversion functions
Interface for the encoding conversion functions needed for XML basic encoding and iconv() support. Related specs are: RFC 2044 (UTF-8 and UTF-16), F. Yergeau, Alis Technologies; [ISO-10646] UTF-8 and UTF-16 in Annexes; [ISO-8859-1] ISO Latin-1 character codes; [UNICODE] The Unicode Consortium, "The Unicode Standard -- Worldwide Character Encoding -- Version 1.0", Addison-Wesley, Volume 1, 1991, Volume 2, 1992 (UTF-8 is described in Unicode Technical Report #4); [US-ASCII] Coded Character Set--7-bit American Standard Code for Information Interchange, ANSI X3.4-1986.
Author(s): Daniel Veillard
#define UTF8Toisolat1; #define isolat1ToUTF8; typedef enum xmlCharEncError; typedef enum xmlCharEncFlags; typedef enum xmlCharEncoding; typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; typedef xmlCharEncodingHandler * xmlCharEncodingHandlerPtr; int xmlAddEncodingAlias (const char * name,
const char * alias); int xmlCharEncCloseFunc (xmlCharEncodingHandler * handler); typedef void xmlCharEncConvCtxtDtor (void * vctxt); typedef xmlCharEncError xmlCharEncConvFunc (void * vctxt,
unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int flush); typedef xmlParserErrors xmlCharEncConvImpl (void * vctxt,
const char * name,
xmlCharEncFlags flags,
xmlCharEncodingHandler ** out); int xmlCharEncFirstLine (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); int xmlCharEncInFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); xmlParserErrors xmlCharEncNewCustomHandler (const char * name,
xmlCharEncConvFunc input,
xmlCharEncConvFunc output,
xmlCharEncConvCtxtDtor ctxtDtor,
void * inputCtxt,
void * outputCtxt,
xmlCharEncodingHandler ** out); int xmlCharEncOutFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in); typedef int xmlCharEncodingInputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); typedef int xmlCharEncodingOutputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); void xmlCleanupCharEncodingHandlers (void); void xmlCleanupEncodingAliases (void); xmlParserErrors xmlCreateCharEncodingHandler (const char * name,
xmlCharEncFlags flags,
xmlCharEncConvImpl impl,
void * implCtxt,
xmlCharEncodingHandler ** out); int xmlDelEncodingAlias (const char * alias); xmlCharEncoding xmlDetectCharEncoding (const unsigned char * in,
int len); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler (const char * name); xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler (xmlCharEncoding enc); const char * xmlGetCharEncodingName (xmlCharEncoding enc); const char * xmlGetEncodingAlias (const char * alias); void xmlInitCharEncodingHandlers (void); int xmlIsolat1ToUTF8 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); xmlParserErrors xmlLookupCharEncodingHandler (xmlCharEncoding enc,
xmlCharEncodingHandler ** out); xmlCharEncodingHandlerPtr xmlNewCharEncodingHandler (const char * name,
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output); xmlParserErrors xmlOpenCharEncodingHandler (const char * name,
int output,
xmlCharEncodingHandler ** out); xmlCharEncoding xmlParseCharEncoding (const char * name); void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); int xmlUTF8ToIsolat1 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen);
#define UTF8Toisolat1;
#define isolat1ToUTF8;
enum xmlCharEncError { XML_ENC_ERR_MEMORY = -4 XML_ENC_ERR_SPACE = -3 XML_ENC_ERR_INPUT = -2 XML_ENC_ERR_INTERNAL = -1 XML_ENC_ERR_SUCCESS = 0 };
enum xmlCharEncFlags { XML_ENC_INPUT = 1 XML_ENC_OUTPUT = 2 };
enum xmlCharEncoding { XML_CHAR_ENCODING_ERROR = -1 /* No char encoding detected */ XML_CHAR_ENCODING_NONE = 0 /* No char encoding detected */ XML_CHAR_ENCODING_UTF8 = 1 /* UTF-8 */ XML_CHAR_ENCODING_UTF16LE = 2 /* UTF-16 little endian */ XML_CHAR_ENCODING_UTF16BE = 3 /* UTF-16 big endian */ XML_CHAR_ENCODING_UCS4LE = 4 /* UCS-4 little endian */ XML_CHAR_ENCODING_UCS4BE = 5 /* UCS-4 big endian */ XML_CHAR_ENCODING_EBCDIC = 6 /* EBCDIC uh! */ XML_CHAR_ENCODING_UCS4_2143 = 7 /* UCS-4 unusual ordering */ XML_CHAR_ENCODING_UCS4_3412 = 8 /* UCS-4 unusual ordering */ XML_CHAR_ENCODING_UCS2 = 9 /* UCS-2 */ XML_CHAR_ENCODING_8859_1 = 10 /* ISO-8859-1 ISO Latin 1 */ XML_CHAR_ENCODING_8859_2 = 11 /* ISO-8859-2 ISO Latin 2 */ XML_CHAR_ENCODING_8859_3 = 12 /* ISO-8859-3 */ XML_CHAR_ENCODING_8859_4 = 13 /* ISO-8859-4 */ XML_CHAR_ENCODING_8859_5 = 14 /* ISO-8859-5 */ XML_CHAR_ENCODING_8859_6 = 15 /* ISO-8859-6 */ XML_CHAR_ENCODING_8859_7 = 16 /* ISO-8859-7 */ XML_CHAR_ENCODING_8859_8 = 17 /* ISO-8859-8 */ XML_CHAR_ENCODING_8859_9 = 18 /* ISO-8859-9 */ XML_CHAR_ENCODING_2022_JP = 19 /* ISO-2022-JP */ XML_CHAR_ENCODING_SHIFT_JIS = 20 /* Shift_JIS */ XML_CHAR_ENCODING_EUC_JP = 21 /* EUC-JP */ XML_CHAR_ENCODING_ASCII = 22 /* pure ASCII Available since 2.14.0 */ XML_CHAR_ENCODING_UTF16 = 23 /* UTF-16 native */ XML_CHAR_ENCODING_HTML = 24 /* HTML (output only) */ XML_CHAR_ENCODING_8859_10 = 25 /* ISO-8859-10 */ XML_CHAR_ENCODING_8859_11 = 26 /* ISO-8859-11 */ XML_CHAR_ENCODING_8859_13 = 27 /* ISO-8859-13 */ XML_CHAR_ENCODING_8859_14 = 28 /* ISO-8859-14 */ XML_CHAR_ENCODING_8859_15 = 29 /* ISO-8859-15 */ XML_CHAR_ENCODING_8859_16 = 30 /* ISO-8859-16 */ };
struct _xmlCharEncodingHandler { char * name void * inputCtxt void * outputCtxt xmlCharEncConvCtxtDtor ctxtDtor int flags } xmlCharEncodingHandler;
xmlCharEncodingHandler * xmlCharEncodingHandlerPtr;
void xmlCharEncConvCtxtDtor (void * vctxt)
Free a conversion context.
vctxt: | conversion context |
xmlCharEncError xmlCharEncConvFunc (void * vctxt,
unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int flush)
Convert between character encodings. The value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced. If the converter can consume partial multi-byte sequences, the @flush flag can be used to detect truncated sequences at EOF. Otherwise, the flag can be ignored.
vctxt: | conversion context |
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of input bytes |
inlen: | the length of @in |
flush: | end of input |
Returns: | an XML_ENC_ERR code. |
xmlParserErrors xmlCharEncConvImpl (void * vctxt,
const char * name,
xmlCharEncFlags flags,
xmlCharEncodingHandler ** out)
If this function returns XML_ERR_OK, it must fill the @out pointer with an encoding handler. The handler can be obtained from xmlCharEncNewCustomHandler. @flags can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both.
vctxt: | user data |
name: | encoding name |
flags: | bit mask of flags |
out: | pointer to resulting handler |
Returns: | an xmlParserErrors code. |
int xmlCharEncodingInputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Convert characters to UTF-8. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.
out: | a pointer to an array of bytes to store the UTF-8 result |
outlen: | the length of @out |
in: | a pointer to an array of chars in the original encoding |
inlen: | the length of @in |
Returns: | the number of bytes written or an XML_ENC_ERR code. |
int xmlCharEncodingOutputFunc (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Convert characters from UTF-8. On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of UTF-8 chars |
inlen: | the length of @in |
Returns: | the number of bytes written or an XML_ENC_ERR code. |
int xmlAddEncodingAlias (const char * name,
const char * alias)
DEPRECATED: This function modifies global state and is not thread-safe. Registers an alias @alias for an encoding named @name. Existing alias will be overwritten.
name: | the encoding name as parsed, in UTF-8 format (ASCII actually) |
alias: | the alias name as parsed, in UTF-8 format (ASCII actually) |
Returns: | 0 in case of success, -1 in case of error |
int xmlCharEncCloseFunc (xmlCharEncodingHandler * handler)
Releases an xmlCharEncodingHandler. Must be called after a handler is no longer in use.
handler: | char encoding transformation data structure |
Returns: | 0. |
int xmlCharEncFirstLine (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in)
DEPRECATED: Don't use.
int xmlCharEncInFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in)
Generic front-end for the encoding handler input function
xmlParserErrors xmlCharEncNewCustomHandler (const char * name,
xmlCharEncConvFunc input,
xmlCharEncConvFunc output,
xmlCharEncConvCtxtDtor ctxtDtor,
void * inputCtxt,
void * outputCtxt,
xmlCharEncodingHandler ** out)
Create a custom xmlCharEncodingHandler.
name: | the encoding name |
input: | input callback which converts to UTF-8 |
output: | output callback which converts from UTF-8 |
ctxtDtor: | context destructor |
inputCtxt: | context for input callback |
outputCtxt: | context for output callback |
out: | pointer to resulting handler |
Returns: | an xmlParserErrors code. |
int xmlCharEncOutFunc (xmlCharEncodingHandler * handler,
xmlBufferPtr out,
xmlBufferPtr in)
Generic front-end for the encoding handler output function. A call with @in == NULL has to be made first to initiate the output, in case of non-stateless encodings needing to initiate their state or the output (like the BOM in UTF16). In case of UTF8 sequence conversion errors for the given encoder, the content will be automatically remapped to a CharRef sequence.
void xmlCleanupCharEncodingHandlers (void)
DEPRECATED: This function will be made private. Call xmlCleanupParser to free global state but see the warnings there. xmlCleanupParser should be only called once at program exit. In most cases, you don't have to call cleanup functions at all. Cleans up the memory allocated for the char encoding support; it unregisters all the encoding handlers and the aliases.
void xmlCleanupEncodingAliases (void)
DEPRECATED: This function modifies global state and is not thread-safe. Unregisters all aliases
xmlParserErrors xmlCreateCharEncodingHandler (const char * name,
xmlCharEncFlags flags,
xmlCharEncConvImpl impl,
void * implCtxt,
xmlCharEncodingHandler ** out)
Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - Custom implementation if provided - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. @flags can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both. Available since 2.14.0.
name: | a string describing the char encoding. |
flags: | bit mask of flags |
impl: | a conversion implementation (optional) |
implCtxt: | user data for conversion implementation (optional) |
out: | pointer to result |
Returns: | XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code. |
int xmlDelEncodingAlias (const char * alias)
DEPRECATED: This function modifies global state and is not thread-safe. Unregisters an encoding alias @alias
alias: | the alias name as parsed, in UTF-8 format (ASCII actually) |
Returns: | 0 in case of success, -1 in case of error |
xmlCharEncoding xmlDetectCharEncoding (const unsigned char * in,
int len)
Guess the encoding of the entity using the first bytes of the entity content according to the non-normative appendix F of the XML-1.0 recommendation.
in: | a pointer to the first bytes of the XML entity, must be at least 2 bytes long (at least 4 if the encoding is a UCS-4 variant). |
len: | the length of the buffer |
Returns: | one of the XML_CHAR_ENCODING_... values. |
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler (const char * name)
DEPRECATED: Use xmlOpenCharEncodingHandler which has better error reporting. If the encoding is UTF-8, this will return a no-op handler that shouldn't be used.
name: | a string describing the char encoding. |
Returns: | the handler or NULL if no handler was found or an error occurred. |
xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler (xmlCharEncoding enc)
DEPRECATED: Use xmlLookupCharEncodingHandler which has better error reporting.
enc: | an xmlCharEncoding value. |
Returns: | the handler or NULL if no handler was found or an error occurred. |
const char * xmlGetCharEncodingName (xmlCharEncoding enc)
The "canonical" name for XML encoding. C.f. http://www.w3.org/TR/REC-xml#charencoding Section 4.3.3 Character Encoding in Entities
enc: | the encoding |
Returns: | the canonical name for the given encoding |
const char * xmlGetEncodingAlias (const char * alias)
DEPRECATED: This function is not thread-safe. Lookup an encoding name for the given alias.
alias: | the alias name as parsed, in UTF-8 format (ASCII actually) |
Returns: | NULL if not found, otherwise the original name |
void xmlInitCharEncodingHandlers (void)
DEPRECATED: Alias for xmlInitParser.
int xmlIsolat1ToUTF8 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Take a block of ISO Latin 1 chars in and try to convert it to a UTF-8 block of chars out.
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of ISO Latin 1 chars |
inlen: | the length of @in |
Returns: | the number of bytes written or an XML_ENC_ERR code. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |
xmlParserErrors xmlLookupCharEncodingHandler (xmlCharEncoding enc,
xmlCharEncodingHandler ** out)
Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. Available since 2.13.0.
enc: | an xmlCharEncoding value. |
out: | pointer to result |
Returns: | XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code. |
xmlCharEncodingHandlerPtr xmlNewCharEncodingHandler (const char * name,
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output)
DEPRECATED: This function modifies global state and is not thread-safe. Creates and registers an xmlCharEncodingHandler.
name: | the encoding name, in UTF-8 format (ASCII actually) |
input: | the xmlCharEncodingInputFunc to read that encoding |
output: | the xmlCharEncodingOutputFunc to write that encoding |
Returns: | the xmlCharEncodingHandlerPtr created (or NULL in case of error). |
xmlParserErrors xmlOpenCharEncodingHandler (const char * name,
int output,
xmlCharEncodingHandler ** out)
Find or create a handler matching the encoding. The following converters are looked up in order: - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) - User-registered global handler (deprecated) - iconv if enabled - ICU if enabled The handler must be closed with xmlCharEncCloseFunc. If the encoding is UTF-8, a NULL handler and no error code will be returned. Available since 2.13.0.
name: | a string describing the char encoding. |
output: | boolean, use handler for output |
out: | pointer to result |
Returns: | XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code. |
xmlCharEncoding xmlParseCharEncoding (const char * name)
Compare the string to the encoding schemes already known. Note that the comparison is case-insensitive, in accordance with section [XML] 4.3.3 Character Encoding in Entities.
name: | the encoding name as parsed, in UTF-8 format (ASCII actually) |
Returns: | one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE if not recognized. |
void xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler)
DEPRECATED: This function modifies global state and is not thread-safe. Register the char encoding handler.
handler: | the xmlCharEncodingHandlerPtr handler block |
int xmlUTF8ToIsolat1 (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 block of chars out.
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of UTF-8 chars |
inlen: | the length of @in |
Returns: | the number of bytes written or an XML_ENC_ERR code. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |