VLC  3.0.15
vlc_charset.h
Go to the documentation of this file.
1 /*****************************************************************************
2  * vlc_charset.h: Unicode UTF-8 wrappers function
3  *****************************************************************************
4  * Copyright (C) 2003-2005 VLC authors and VideoLAN
5  * Copyright © 2005-2010 Rémi Denis-Courmont
6  * $Id: ca626b30b16b46112487d3089b3afcf9b3b4f248 $
7  *
8  * Author: Rémi Denis-Courmont
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24 
25 #ifndef VLC_CHARSET_H
26 #define VLC_CHARSET_H 1
27 
28 /**
29  * \file
30  * Characters sets handling
31  *
32  * \ingroup strings
33  * @{
34  */
35 
36 /**
37  * Decodes a code point from UTF-8.
38  *
39  * Converts the first character in a UTF-8 sequence into a Unicode code point.
40  *
41  * \param str an UTF-8 bytes sequence [IN]
42  * \param pwc address of a location to store the code point [OUT]
43  *
44  * \return the number of bytes occupied by the decoded code point
45  *
46  * \retval (size_t)-1 not a valid UTF-8 sequence
47  * \retval 0 null character (i.e. str points to an empty string)
48  * \retval 1 (non-null) ASCII character
49  * \retval 2-4 non-ASCII character
50  */
51 VLC_API size_t vlc_towc(const char *str, uint32_t *restrict pwc);
52 
53 /**
54  * Checks UTF-8 validity.
55  *
56  * Checks whether a null-terminated string is a valid UTF-8 bytes sequence.
57  *
58  * \param str string to check
59  *
60  * \retval str the string is a valid null-terminated UTF-8 sequence
61  * \retval NULL the string is not an UTF-8 sequence
62  */
63 VLC_USED static inline const char *IsUTF8(const char *str)
64 {
65  size_t n;
66  uint32_t cp;
67 
68  while ((n = vlc_towc(str, &cp)) != 0)
69  if (likely(n != (size_t)-1))
70  str += n;
71  else
72  return NULL;
73  return str;
74 }
75 
/**
 * Removes non-UTF-8 sequences.
 *
 * Scans a null-terminated string in place and overwrites each byte at which
 * vlc_towc() reports an invalid (or <i>over-long</i>) UTF-8 sequence with a
 * question mark, then resumes decoding at the following byte. This is so that
 * the string can be printed at least partially.
 *
 * \warning Do not use this where correctness is critical. Use IsUTF8() and
 * handle the error case instead. This function is mainly for display or debug.
 *
 * \note Converting from Latin-1 to UTF-8 in place is not possible (the string
 * size would be increased). So it is not attempted even if it would otherwise
 * be less disruptive.
 *
 * \param str string to sanitize (modified in place)
 *
 * \retval str the string was already a valid null-terminated UTF-8 sequence
 * (i.e. no changes were made)
 * \retval NULL at least one byte had to be replaced with a question mark
 */
static inline char *EnsureUTF8(char *str)
{
    char *result = str;
    uint32_t cp;

    for (;;)
    {
        size_t len = vlc_towc(str, &cp);

        if (len == 0)
            break; /* reached the null terminator */

        if (likely(len != (size_t)-1))
        {
            str += len;
            continue;
        }

        /* Undecodable byte: patch it, step over it, remember the failure. */
        *str = '?';
        str++;
        result = NULL;
    }
    return result;
}
110 
111 /* iconv wrappers (defined in src/extras/libc.c) */
/* All-ones size_t value; presumably the failure return of vlc_iconv(),
 * mirroring iconv(3) -- confirm against the implementation. */
112 #define VLC_ICONV_ERR ((size_t) -1)
/* Opaque conversion handle (an untyped pointer). */
113 typedef void *vlc_iconv_t;
/* NOTE(review): the two string arguments presumably follow iconv_open(3)
 * ordering (destination charset, then source charset) -- confirm. */
114 VLC_API vlc_iconv_t vlc_iconv_open( const char *, const char * ) VLC_USED;
/* NOTE(review): the argument list has the shape of iconv(3): handle,
 * input pointer/remaining size, output pointer/remaining size -- confirm. */
115 VLC_API size_t vlc_iconv( vlc_iconv_t, const char **, size_t *, char **, size_t * ) VLC_USED;
117 
118 #include <stdarg.h>
119 
/* Formats an UTF-8 string as vfprintf(), then prints it, with appropriate
 * conversion to local encoding. */
120 VLC_API int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap );
/* Formats an UTF-8 string as fprintf(), then prints it, with appropriate
 * conversion to local encoding. Argument 2 is the printf-style format. */
121 VLC_API int utf8_fprintf( FILE *, const char *, ... ) VLC_FORMAT( 2, 3 );
/* Looks for an UTF-8 string within another one in a case-insensitive
 * fashion. */
122 VLC_API char * vlc_strcasestr(const char *, const char *) VLC_USED;
123 
/* Converts a string from the given character encoding to UTF-8. */
124 VLC_API char * FromCharset( const char *charset, const void *data, size_t data_size ) VLC_USED;
/* Converts a nul-terminated UTF-8 string to a given character encoding.
 * NOTE(review): outsize presumably receives the converted size in bytes --
 * confirm against the implementation. */
125 VLC_API void * ToCharset( const char *charset, const char *in, size_t *outsize ) VLC_USED;
126 
127 #ifdef _WIN32
128 VLC_USED
129 static inline char *FromWide (const wchar_t *wide)
130 {
131  size_t len = WideCharToMultiByte (CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
132  if (len == 0)
133  return NULL;
134 
135  char *out = (char *)malloc (len);
136 
137  if (likely(out))
138  WideCharToMultiByte (CP_UTF8, 0, wide, -1, out, len, NULL, NULL);
139  return out;
140 }
141 
142 VLC_USED
143 static inline wchar_t *ToWide (const char *utf8)
144 {
145  int len = MultiByteToWideChar (CP_UTF8, 0, utf8, -1, NULL, 0);
146  if (len == 0)
147  return NULL;
148 
149  wchar_t *out = (wchar_t *)malloc (len * sizeof (wchar_t));
150 
151  if (likely(out))
152  MultiByteToWideChar (CP_UTF8, 0, utf8, -1, out, len);
153  return out;
154 }
155 
/**
 * Converts a nul-terminated UTF-8 string to the given Windows code page.
 *
 * The conversion goes through an intermediate wide string (see ToWide()),
 * which is always released before returning.
 *
 * \param cp   target code page identifier
 * \param utf8 nul-terminated UTF-8 string
 *
 * \return a string allocated with malloc() (to be released with free()),
 * or NULL if any conversion step or allocation failed
 */
static inline char *ToCodePage (unsigned cp, const char *utf8)
{
    char *converted = NULL;
    wchar_t *wide = ToWide (utf8);

    if (wide == NULL)
        return NULL;

    /* Size query first, then the real conversion into a fresh buffer. */
    size_t size = WideCharToMultiByte (cp, 0, wide, -1, NULL, 0, NULL, NULL);
    if (size != 0)
    {
        converted = (char *)malloc (size);
        if (likely(converted != NULL))
            WideCharToMultiByte (cp, 0, wide, -1, converted, size, NULL, NULL);
    }

    /* Single exit point for the intermediate buffer. */
    free (wide);
    return converted;
}
175 
/**
 * Converts a nul-terminated string from the given Windows code page to UTF-8.
 *
 * The input is first widened with MultiByteToWideChar(), then handed to
 * FromWide(); the intermediate wide buffer is released before returning.
 *
 * \param cp source code page identifier
 * \param mb nul-terminated string encoded in that code page
 *
 * \return a UTF-8 string allocated with malloc() (to be released with
 * free()), or NULL if any conversion step or allocation failed
 */
static inline char *FromCodePage (unsigned cp, const char *mb)
{
    wchar_t *wide;
    char *result;
    int count;

    /* Size query: number of wide characters the input expands to. */
    count = MultiByteToWideChar (cp, 0, mb, -1, NULL, 0);
    if (count == 0)
        return NULL;

    wide = (wchar_t *)malloc (count * sizeof (*wide));
    if (unlikely(wide == NULL))
        return NULL;

    MultiByteToWideChar (cp, 0, mb, -1, wide, count);
    result = FromWide (wide);

    free (wide);
    return result;
}
192 
/**
 * Converts a nul-terminated string from the code page reported by GetACP()
 * to UTF-8.
 *
 * \param ansi nul-terminated string in that code page
 * \return whatever FromCodePage() returns for that code page
 */
static inline char *FromANSI (const char *ansi)
{
    const unsigned acp = GetACP ();

    return FromCodePage (acp, ansi);
}
198 
/**
 * Converts a nul-terminated UTF-8 string to the code page reported by
 * GetACP().
 *
 * \param utf8 nul-terminated UTF-8 string
 * \return whatever ToCodePage() returns for that code page
 */
static inline char *ToANSI (const char *utf8)
{
    const unsigned acp = GetACP ();

    return ToCodePage (acp, utf8);
}
204 
/* FromT/ToT pick the wide-character converters when UNICODE is defined and
 * the ANSI code page converters otherwise. */
205 # ifdef UNICODE
206 # define FromT FromWide
207 # define ToT ToWide
208 # else
209 # define FromT FromANSI
210 # define ToT ToANSI
211 # endif
/* In this branch the locale helpers are plain aliases of the ANSI code page
 * converters, so both the plain and the "Dup" variants return malloc()ed
 * strings, and LocaleFree() releases its argument with free(). */
212 # define FromLocale FromANSI
213 # define ToLocale ToANSI
214 # define LocaleFree(s) free((char *)(s))
215 # define FromLocaleDup FromANSI
216 # define ToLocaleDup ToANSI
217 
218 #elif defined(__OS2__)
219 
220 VLC_USED static inline char *FromLocale (const char *locale)
221 {
222  return locale ? FromCharset ((char *)"", locale, strlen(locale)) : NULL;
223 }
224 
225 VLC_USED static inline char *ToLocale (const char *utf8)
226 {
227  size_t outsize;
228  return utf8 ? (char *)ToCharset ("", utf8, &outsize) : NULL;
229 }
230 
/**
 * Releases a string with free() (presumably one obtained from FromLocale()
 * or ToLocale() -- confirm against callers).
 *
 * The const qualifier is cast away only so that callers holding a
 * const char * can release it. Passing NULL is harmless, as with free().
 *
 * No "result must be used" attribute here: the function returns nothing, so
 * such an attribute would have no effect beyond a compiler diagnostic.
 *
 * \param str string to release, or NULL
 */
static inline void LocaleFree (const char *str)
{
    free ((char *)str);
}
235 
236 VLC_USED static inline char *FromLocaleDup (const char *locale)
237 {
238  return FromCharset ("", locale, strlen(locale));
239 }
240 
241 VLC_USED static inline char *ToLocaleDup (const char *utf8)
242 {
243  size_t outsize;
244  return (char *)ToCharset ("", utf8, &outsize);
245 }
246 
247 #else
248 
/* Fallback branch: FromLocale()/ToLocale() expand to their argument
 * unchanged (no copy is made), LocaleFree() only evaluates and discards its
 * argument, and the "Dup" variants are plain strdup().
 * NOTE(review): presumably the locale encoding is taken to be UTF-8 on
 * these platforms -- confirm. */
249 # define FromLocale(l) (l)
250 # define ToLocale(u) (u)
251 # define LocaleFree(s) ((void)(s))
252 # define FromLocaleDup strdup
253 # define ToLocaleDup strdup
254 #endif
255 
/**
 * Converts a nul-terminated string from ISO-8859-1 to UTF-8.
 *
 * Bytes below 0x80 are copied as they are; every other byte is expanded to
 * a two-byte UTF-8 sequence.
 *
 * \param latin nul-terminated ISO-8859-1 string (must not be NULL)
 *
 * \return a UTF-8 string allocated with malloc() (to be released with
 * free()), or NULL if the allocation failed
 */
static inline char *FromLatin1 (const char *latin)
{
    /* Worst case: each input byte turns into two output bytes, plus the
     * terminator. */
    char *buf = (char *)malloc (2 * strlen (latin) + 1);
    if (buf == NULL)
        return NULL;

    char *out = buf;
    for (const unsigned char *in = (const unsigned char *)latin;
         *in != '\0'; in++)
    {
        const unsigned char c = *in;

        if (c < 0x80)
        {
            *out++ = c;
            continue;
        }

        /* Two-byte sequence: lead byte carries the top 2 bits, the
         * continuation byte the low 6 bits. */
        *out++ = 0xC0 | (c >> 6);
        *out++ = 0x80 | (c & 0x3F);
    }
    *out++ = '\0';

    /* Give back the unused tail; if shrinking fails the original (larger)
     * buffer is still valid and is returned instead. */
    char *shrunk = (char *)realloc (buf, out - buf);
    return (shrunk != NULL) ? shrunk : buf;
}
282 
283 /** @} */
284 
/* Same prototype as ANSI C strtod(), but uses the POSIX/C decimal format. */
285 VLC_API double us_strtod( const char *, char ** ) VLC_USED;
/* Same prototype as ANSI C strtof(), but uses the POSIX/C decimal format. */
286 VLC_API float us_strtof( const char *, char ** ) VLC_USED;
/* Same prototype as ANSI C atof(), but expects a dot as decimal separator. */
287 VLC_API double us_atof( const char * ) VLC_USED;
/* Same prototype as vasprintf(), but doesn't use the system locale. */
288 VLC_API int us_vasprintf( char **, const char *, va_list );
/* Same prototype as asprintf(), but doesn't use the system locale. */
289 VLC_API int us_asprintf( char **, const char *, ... ) VLC_USED;
290 
291 #endif
LocaleFree
#define LocaleFree(s)
Definition: vlc_charset.h:251
VLC_FORMAT
#define VLC_FORMAT(x, y)
Definition: vlc_common.h:100
VLC_API
#define VLC_API
Definition: fourcc_gen.c:30
vlc_iconv
size_t vlc_iconv(vlc_iconv_t, const char **, size_t *, char **, size_t *)
us_atof
double us_atof(const char *)
us_atof() has the same prototype as ANSI C atof() but it expects a dot as decimal separator,...
Definition: charset.c:87
vlc_common.h
FromLocaleDup
#define FromLocaleDup
Definition: vlc_charset.h:252
vlc_strcasestr
char * vlc_strcasestr(const char *, const char *)
Look for an UTF-8 string within another one in a case-insensitive fashion.
Definition: unicode.c:196
us_vasprintf
int us_vasprintf(char **, const char *, va_list)
us_vasprintf() has the same prototype as vasprintf(), but doesn't use the system locale.
Definition: charset.c:97
FromLocale
#define FromLocale(l)
Definition: vlc_charset.h:249
us_asprintf
int us_asprintf(char **, const char *,...)
us_asprintf() has the same prototype as asprintf(), but doesn't use the system locale.
Definition: charset.c:118
FromCharset
char * FromCharset(const char *charset, const void *data, size_t data_size)
Converts a string from the given character encoding to utf-8.
Definition: unicode.c:235
vlc_iconv_t
void * vlc_iconv_t
Definition: vlc_charset.h:113
FromLatin1
static char * FromLatin1(const char *latin)
Converts a nul-terminated string from ISO-8859-1 to UTF-8.
Definition: vlc_charset.h:259
us_strtof
float us_strtof(const char *, char **)
us_strtof() has the same prototype as ANSI C strtof() but it uses the POSIX/C decimal format,...
Definition: charset.c:68
ToCharset
void * ToCharset(const char *charset, const char *in, size_t *outsize)
Converts a nul-terminated UTF-8 string to a given character encoding.
Definition: unicode.c:277
ToLocaleDup
#define ToLocaleDup
Definition: vlc_charset.h:253
vlc_iconv_open
vlc_iconv_t vlc_iconv_open(const char *, const char *)
utf8_vfprintf
int utf8_vfprintf(FILE *stream, const char *fmt, va_list ap)
Formats an UTF-8 string as vfprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:50
utf8_fprintf
int utf8_fprintf(FILE *, const char *,...)
Formats an UTF-8 string as fprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:100
us_strtod
double us_strtod(const char *, char **)
us_strtod() has the same prototype as ANSI C strtod() but it uses the POSIX/C decimal format,...
Definition: charset.c:49
vlc_towc
size_t vlc_towc(const char *str, uint32_t *restrict pwc)
Decodes a code point from UTF-8.
Definition: unicode.c:111
likely
#define likely(p)
Definition: vlc_common.h:113
EnsureUTF8
static char * EnsureUTF8(char *str)
Removes non-UTF-8 sequences.
Definition: vlc_charset.h:94
VLC_USED
#define VLC_USED
Definition: fourcc_gen.c:31
vlc_iconv_close
int vlc_iconv_close(vlc_iconv_t)
VLC_MALLOC
#define VLC_MALLOC
Definition: vlc_common.h:102
unlikely
#define unlikely(p)
Definition: vlc_common.h:114
IsUTF8
static const char * IsUTF8(const char *str)
Checks UTF-8 validity.
Definition: vlc_charset.h:63
ToLocale
#define ToLocale(u)
Definition: vlc_charset.h:250