From 28186e81d947a830d9895cecc2d8e836a3cbccd0 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Tue, 24 Mar 2009 10:13:27 +0000 Subject: * libc/ctype/iswalpha.c: Handle all wchar_t as unicode on _MB_CAPABLE systems. * libc/ctype/iswblank.c: Ditto. * libc/ctype/iswcntrl.c: Ditto. * libc/ctype/iswprint.c: Ditto. * libc/ctype/iswpunct.c: Ditto. * libc/ctype/iswspace.c: Ditto. * libc/ctype/jp2uc.c (__jp2uc): On Cygwin, just return c. Explain why. * libc/ctype/towlower.c: Ditto. * libc/ctype/towupper.c: Ditto. * libc/include/sys/config.h: Define _MB_EXTENDED_CHARSETS_ISO and _MB_EXTENDED_CHARSETS_WINDOWS if _MB_EXTENDED_CHARSETS_ALL is defined. Define _MB_EXTENDED_CHARSETS_ALL on Cygwin only for now. * libc/include/sys/reent.h (struct _reent): Mark _current_category and _current_locale as unused. * libc/locale/locale.c: Add new charset support to documentation. Include ../stdlib/local.h from here. (lc_ctype_charset): Set to "ASCII" by default. (lc_message_charset): Ditto. (_setlocale_r): Don't set _current_category and _current_locale. (loadlocale): Add Cygwin codepage support. On _MB_CAPABLE systems, set __mbtowc and __wctomb function pointers to function corresponding with current charset. Don't allow non-existent ISO-8859-12 charset. Add support for Windows singlebyte codepages. On Cygwin, add support for GBK, CP949, and BIG5. On Cygwin, call __set_ctype() in case the category is LC_CTYPE. Don't set _current_category and _current_locale. * libc/stdlib/Makefile.am (GENERAL_SOURCES): Add sb_charsets.c. * libc/stdlib/Makefile.in: Regenerate. * libc/stdlib/local.h: Add prototype for __locale_charset. Add prototypes for __mbtowc and __wctomb pointers. Add prototypes for charset-specific _wctomb_r and _mbtowc_r functions. Declare tables and functions from sb_charsets.c. * libc/stdlib/mbtowc_r.c (__mbtowc): Define. Set to __ascii_mbtowc by default. (_mbtowc_r): Just call __mbtowc from here. (__ascii_mbtowc): New function. (__iso_mbtowc): New function. 
(__cp_mbtowc): New function. (__utf8_mbtowc): New function. (__sjis_mbtowc): New function. Disable on Cygwin. (__eucjp_mbtowc): New function. Disable on Cygwin. (__jis_mbtowc): New function. Disable on Cygwin. * libc/stdlib/sb_charsets.c: New file, adding singlebyte to UTF conversion tables for all ISO and CP charsets. (__iso_8859_index): New function. (__cp_index): New function. * libc/stdlib/wctomb_r.c (__wctomb): Define. Set to __ascii_wctomb by default. (_wctomb_r): Just call __wctomb from here. (__ascii_wctomb): New function. (__utf8_wctomb): New function. (__sjis_wctomb): New function. Disable on Cygwin. (__eucjp_wctomb): New function. Disable on Cygwin. (__jis_wctomb): New function. Disable on Cygwin. (__iso_wctomb): New function. (__cp_wctomb): New function. --- newlib/libc/locale/locale.c | 169 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 152 insertions(+), 17 deletions(-) (limited to 'newlib/libc/locale/locale.c') diff --git a/newlib/libc/locale/locale.c b/newlib/libc/locale/locale.c index a4cd30eb9..d3644eb8a 100644 --- a/newlib/libc/locale/locale.c +++ b/newlib/libc/locale/locale.c @@ -47,11 +47,18 @@ and <<"C">> values for <[locale]>; strings representing other locales are not honored unless _MB_CAPABLE is defined in which case POSIX locale strings are allowed, plus five extensions supported for backward compatibility with older implementations using newlib: <<"C-UTF-8">>, <<"C-JIS">>, <<"C-EUCJP">>, -<<"C-SJIS">>, or <<"C-ISO-8859-x">> with 1 <= x <= 15. Even when using -POSIX locale strings, the only charsets allowed are <<"UTF-8">>, <<"JIS">>, -<<"EUCJP">>, <<"SJIS">>, or <<"ISO-8859-x">> with 1 <= x <= 15. (<<"">> is -also accepted; if given, the settings are read from the corresponding -LC_* environment variables and $LANG according to POSIX rules. 
+<<"C-SJIS">>, <<"C-ISO-8859-x">> with 1 <= x <= 15, or <<"C-CPxxx">> with +xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, 874, 1125, 1250, +1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258]. Even when using POSIX +locale strings, the only charsets allowed are <<"UTF-8">>, <<"JIS">>, +<<"EUCJP">>, <<"SJIS">>, <<"ISO-8859-x">> with 1 <= x <= 15, or +<<"CPxxx">> with xxx in [437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866, +874, 1125, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258]. +(<<"">> is also accepted; if given, the settings are read from the +corresponding LC_* environment variables and $LANG according to POSIX rules. + +Under Cygwin, this implementation additionally supports the charsets <<"GBK">>, +<<"CP949">>, and <<"BIG5">>. If you use <<NULL>> as the <[locale]> argument, <<setlocale>> returns a pointer to the string representing the current locale (always @@ -85,6 +92,9 @@ PORTABILITY ANSI C requires <<setlocale>>, but the only locale required across all implementations is the C locale. +NOTES +There is no ISO-8859-12 codepage. It's also refused by this implementation. + No supporting OS subroutines are required. */ @@ -129,6 +139,11 @@ No supporting OS subroutines are required. 
#include #include #include +#include +#include "../stdlib/local.h" +#ifdef __CYGWIN__ +#include +#endif #define _LC_LAST 7 #define ENCODING_LEN 31 @@ -190,8 +205,8 @@ static const char *__get_locale_env(struct _reent *, int); #endif -static char lc_ctype_charset[ENCODING_LEN + 1] = "ISO-8859-1"; -static char lc_message_charset[ENCODING_LEN + 1] = "ISO-8859-1"; +static char lc_ctype_charset[ENCODING_LEN + 1] = "ASCII"; +static char lc_message_charset[ENCODING_LEN + 1] = "ASCII"; char * _DEFUN(_setlocale_r, (p, category, locale), @@ -205,8 +220,6 @@ _DEFUN(_setlocale_r, (p, category, locale), if (strcmp (locale, "POSIX") && strcmp (locale, "C") && strcmp (locale, "")) return NULL; - p->_current_category = category; - p->_current_locale = locale; } return "C"; #else @@ -361,6 +374,11 @@ currentlocale() #endif #ifdef _MB_CAPABLE +#ifdef __CYGWIN__ +extern void *__set_charset_from_codepage (unsigned int, char *charset); +extern void __set_ctype (const char *charset); +#endif /* __CYGWIN__ */ + static char * loadlocale(struct _reent *p, int category) { @@ -382,7 +400,7 @@ loadlocale(struct _reent *p, int category) if (!strcmp (locale, "POSIX")) strcpy (locale, "C"); if (!strcmp (locale, "C")) /* Default "C" locale */ - strcpy (charset, "ISO-8859-1"); + strcpy (charset, "ASCII"); else if (locale[0] == 'C' && locale[1] == '-') /* Old newlib style */ strcpy (charset, locale + 2); else /* POSIX style */ @@ -414,7 +432,11 @@ loadlocale(struct _reent *p, int category) } else if (c[0] == '\0' || c[0] == '@') /* End of string or just a modifier */ +#ifdef __CYGWIN__ + __set_charset_from_codepage (GetACP (), charset); +#else strcpy (charset, "ISO-8859-1"); +#endif else /* Invalid string */ return NULL; @@ -426,42 +448,155 @@ loadlocale(struct _reent *p, int category) if (strcmp (charset, "UTF-8")) return NULL; mbc_max = 6; +#ifdef _MB_CAPABLE + __wctomb = __utf8_wctomb; + __mbtowc = __utf8_mbtowc; +#endif break; case 'J': if (strcmp (charset, "JIS")) return NULL; mbc_max = 8; 
+#ifdef _MB_CAPABLE + __wctomb = __jis_wctomb; + __mbtowc = __jis_mbtowc; +#endif break; case 'E': - if (strcmp (charset, "EUCJP")) + if (strcmp (charset, "EUCJP") && strcmp (charset, "eucJP")) return NULL; + strcpy (charset, "EUCJP"); mbc_max = 2; +#ifdef _MB_CAPABLE + __wctomb = __eucjp_wctomb; + __mbtowc = __eucjp_mbtowc; +#endif break; case 'S': if (strcmp (charset, "SJIS")) return NULL; mbc_max = 2; +#ifdef _MB_CAPABLE + __wctomb = __sjis_wctomb; + __mbtowc = __sjis_mbtowc; +#endif break; case 'I': - default: - /* Must be exactly one of ISO-8859-1, [...] ISO-8859-15. */ + /* Must be exactly one of ISO-8859-1, [...] ISO-8859-16, except for + ISO-8859-12. */ if (strncmp (charset, "ISO-8859-", 9)) return NULL; - val = strtol (charset + 9, &end, 10); - if (val < 1 || val > 15 || *end) + val = _strtol_r (p, charset + 9, &end, 10); + if (val < 1 || val > 16 || val == 12 || *end) return NULL; mbc_max = 1; +#ifdef _MB_CAPABLE +#ifdef _MB_EXTENDED_CHARSETS_ISO + __wctomb = __iso_wctomb; + __mbtowc = __iso_mbtowc; +#else /* !_MB_EXTENDED_CHARSETS_ISO */ + __wctomb = __ascii_wctomb; + __mbtowc = __ascii_mbtowc; +#endif /* _MB_EXTENDED_CHARSETS_ISO */ +#endif + break; + case 'C': + if (charset[1] != 'P') + return NULL; + val = _strtol_r (p, charset + 2, &end, 10); + if (*end) + return NULL; + switch (val) + { + case 437: + case 720: + case 737: + case 775: + case 850: + case 852: + case 855: + case 857: + case 858: + case 862: + case 866: + case 874: + case 1125: + case 1250: + case 1251: + case 1252: + case 1253: + case 1254: + case 1255: + case 1256: + case 1257: + case 1258: + mbc_max = 1; +#ifdef _MB_CAPABLE +#ifdef _MB_EXTENDED_CHARSETS_WINDOWS + __wctomb = __cp_wctomb; + __mbtowc = __cp_mbtowc; +#else /* !_MB_EXTENDED_CHARSETS_WINDOWS */ + __wctomb = __ascii_wctomb; + __mbtowc = __ascii_mbtowc; +#endif /* _MB_EXTENDED_CHARSETS_WINDOWS */ +#endif + break; +#ifdef __CYGWIN__ + case 949: + mbc_max = 2; +#ifdef _MB_CAPABLE + __wctomb = __kr_wctomb; + __mbtowc = 
__kr_mbtowc; +#endif + break; +#endif + default: + return NULL; + } + break; + case 'A': + if (strcmp (charset, "ASCII")) + return NULL; + mbc_max = 1; +#ifdef _MB_CAPABLE + __wctomb = __ascii_wctomb; + __mbtowc = __ascii_mbtowc; +#endif break; +#ifdef __CYGWIN__ + case 'G': + if (strcmp (charset, "GBK")) + return NULL; + mbc_max = 2; +#ifdef _MB_CAPABLE + __wctomb = __gbk_wctomb; + __mbtowc = __gbk_mbtowc; +#endif + break; + case 'B': + if (strcmp (charset, "BIG5") && strcmp (charset, "Big5")) + return NULL; + strcpy (charset, "BIG5"); + mbc_max = 2; +#ifdef _MB_CAPABLE + __wctomb = __big5_wctomb; + __mbtowc = __big5_mbtowc; +#endif + break; +#endif /* __CYGWIN__ */ + default: + return NULL; } if (category == LC_CTYPE) { strcpy (lc_ctype_charset, charset); __mb_cur_max = mbc_max; +#ifdef __CYGWIN__ + __set_ctype (charset); +#endif } else if (category == LC_MESSAGES) strcpy (lc_message_charset, charset); - p->_current_category = category; - p->_current_locale = locale; return strcpy(current_categories[category], new_categories[category]); } -- cgit v1.2.3