8 * @ch: A Latin-1 character.
10 * You can convert Latin-1 characters between upper and lower case with two Glk
11 * utility functions, glk_char_to_lower() and glk_char_to_upper(). These have a
12 * few advantages over the standard ANSI <function>tolower()</function> and
13 * <function>toupper()</function> macros. They work for the entire Latin-1
14 * character set, including accented letters; they behave consistently on all
15 * platforms, since they're part of the Glk library; and they are safe for all
16 * characters. That is, if you call glk_char_to_lower() on a lower-case
17 * character, or a character which is not a letter, you'll get the argument back unchanged.
20 * The case-sensitive characters in Latin-1 are the ranges 0x41..0x5A,
21 * 0xC0..0xD6, 0xD8..0xDE (upper case) and the ranges 0x61..0x7A, 0xE0..0xF6,
22 * 0xF8..0xFE (lower case). These are arranged in parallel; so
23 * glk_char_to_lower() will add 0x20 to values in the upper-case ranges, and
24 * glk_char_to_upper() will subtract 0x20 from values in the lower-case ranges.
26 * Returns: A lowercase or non-letter Latin-1 character.
29 glk_char_to_lower(unsigned char ch)
/* True only for the three Latin-1 upper-case ranges: 0x41..0x5A, 0xC0..0xD6
   and 0xD8..0xDE. 0xD7 sits between the last two ranges and is not matched,
   and neither is 0xDF. */
31 if( (ch >= 0x41 && ch <= 0x5A) || (ch >= 0xC0 && ch <= 0xD6) || (ch >= 0xD8 && ch <= 0xDE) )
38 * @ch: A Latin-1 character.
40 * If @ch is a lowercase character in the Latin-1 character set, converts it to
41 * uppercase. Otherwise, leaves it unchanged. See glk_char_to_lower().
43 * Returns: An uppercase or non-letter Latin-1 character.
46 glk_char_to_upper(unsigned char ch)
/* True only for the three Latin-1 lower-case ranges: 0x61..0x7A, 0xE0..0xF6
   and 0xF8..0xFE. 0xF7 sits between the last two ranges and is not matched,
   and neither is 0xFF. */
48 if( (ch >= 0x61 && ch <= 0x7A) || (ch >= 0xE0 && ch <= 0xF6) || (ch >= 0xF8 && ch <= 0xFE) )
54 * glk_buffer_to_lower_case_uni:
55 * @buf: A character array in UCS-4.
56 * @len: Available length of @buf.
57 * @numchars: Number of characters in @buf.
59 * Unicode character conversion is trickier, and must be applied to character
60 * arrays, not single characters. These functions
61 * (glk_buffer_to_lower_case_uni(), glk_buffer_to_upper_case_uni(), and
62 * glk_buffer_to_title_case_uni()) provide two length arguments because a
63 * string of Unicode characters may expand when its case changes. The @len
64 * argument is the available length of the buffer; @numchars is the number of
65 * characters in the buffer initially. (So @numchars must be less than or equal
66 * to @len. The contents of the buffer after @numchars do not affect the operation.)
69 * The functions return the number of characters after conversion. If this is
70 * greater than @len, the characters in the array will be safely truncated at
71 * @len, but the true count will be returned. (The contents of the buffer after
72 * the returned count are undefined.)
74 * The <code>lower_case</code> and <code>upper_case</code> functions do what
75 * you'd expect: they convert every character in the buffer (the first @numchars
76 * of them) to its upper or lower-case equivalent, if there is such a thing.
78 * See the Unicode spec (chapter 3.13, chapter 4.2, etc) for the exact
79 * definitions of upper, lower, and title-case mapping.
82 * Unicode has some strange case cases. For example, a combined character
83 * that looks like <quote>ss</quote> might properly be upper-cased into
84 * <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
85 * stranger; <quote>ss</quote> (at the beginning of a word) might be
86 * title-cased into a different combined character that looks like
87 * <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
88 * title-casing the first character of the buffer. If it makes a difference.
91 * Returns: The number of characters after conversion.
94 glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Argument checks; when one fails, g_return_val_if_fail() logs a critical
   warning and the function returns 0. The first check rejects a NULL @buf and
   also the case where @len and @numchars are both zero; the second enforces
   numchars <= len. */
96 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
97 g_return_val_if_fail(numchars <= len, 0);
101 /* Lowercase the string */
/* The case mapping is done on a UTF-8 copy: UCS-4 -> UTF-8, then
   g_utf8_strdown(), then back to UCS-4. Passing -1 as the length tells
   g_utf8_strdown() the input is NUL-terminated.
   NOTE(review): the convert_* helpers are defined elsewhere; presumably they
   return newly allocated, NUL-terminated buffers and store the resulting
   character count in outchars -- confirm against their definitions. */
102 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
105 char *lowered = g_utf8_strdown(utf8, -1);
107 gunichar *outbuf = convert_utf8_to_ucs4(lowered, &outchars);
112 /* Copy the output buffer to the original buffer */
/* MIN(outchars, len) caps the copy at the caller's capacity, so a result
   longer than @len is truncated. The literal 4 is the byte size of one
   element. NOTE(review): sizeof(*buf) would express this without assuming
   that glui32 is exactly 4 bytes wide. */
113 memcpy(buf, outbuf, MIN(outchars, len) * 4);
120 * glk_buffer_to_upper_case_uni:
121 * @buf: A character array in UCS-4.
122 * @len: Available length of @buf.
123 * @numchars: Number of characters in @buf.
125 * Converts the first @numchars characters of @buf to their uppercase
126 * equivalents, if there is such a thing. See glk_buffer_to_lower_case_uni().
128 * Returns: The number of characters after conversion.
131 glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Argument checks; when one fails, g_return_val_if_fail() logs a critical
   warning and the function returns 0. The first check rejects a NULL @buf and
   also the case where @len and @numchars are both zero; the second enforces
   numchars <= len. */
133 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
134 g_return_val_if_fail(numchars <= len, 0);
138 /* Uppercase the string */
/* The case mapping is done on a UTF-8 copy: UCS-4 -> UTF-8, then
   g_utf8_strup(), then back to UCS-4. Passing -1 as the length tells
   g_utf8_strup() the input is NUL-terminated.
   NOTE(review): the convert_* helpers are defined elsewhere; presumably they
   return newly allocated, NUL-terminated buffers and store the resulting
   character count in outchars -- confirm against their definitions. */
139 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
142 char *uppered = g_utf8_strup(utf8, -1);
144 gunichar *outbuf = convert_utf8_to_ucs4(uppered, &outchars);
149 /* Copy the output buffer to the original buffer */
/* MIN(outchars, len) caps the copy at the caller's capacity, so a result
   longer than @len (upper-casing can expand a string) is truncated. The
   literal 4 is the byte size of one element. NOTE(review): sizeof(*buf) would
   express this without assuming that glui32 is exactly 4 bytes wide. */
150 memcpy(buf, outbuf, MIN(outchars, len) * 4);
157 * glk_buffer_to_title_case_uni:
158 * @buf: A character array in UCS-4.
159 * @len: Available length of @buf.
160 * @numchars: Number of characters in @buf.
161 * @lowerrest: %TRUE if the rest of @buf should be lowercased, %FALSE otherwise.
164 * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
165 * an additional (boolean) flag. If the flag is zero, the function changes the
166 * first character of the buffer to upper-case, and leaves the rest of the
167 * buffer unchanged. If the flag is nonzero, it changes the first character to
168 * upper-case and the rest to lower-case.
171 * Earlier drafts of this spec had a separate function which title-cased the
172 * first character of every <emphasis>word</emphasis> in the buffer. I took
173 * this out after reading Unicode Standard Annex #29, which explains how
174 * to divide a string into words. If you want it, feel free to implement it.
177 * Returns: The number of characters after conversion.
180 glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lowerrest)
/* Argument checks; when one fails, g_return_val_if_fail() logs a critical
   warning and the function returns 0. */
182 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
183 g_return_val_if_fail(numchars <= len, 0);
185 /* FIXME: This is wrong. g_unichar_totitle() returns the titlecase of
186 one Unicode code point, but that is only the correct result if the
187 titlecase character is also one code point.
188 For example, the one-character 'ffi' ligature should be title-cased to the
189 three-character string 'Ffi'. This code leaves it as the 'ffi' ligature, which is incorrect.
191 However, nothing much can be done about it unless GLib gets a
192 g_utf8_strtitle() function, or we roll our own. */
/* NOTE(review): the guards above accept numchars == 0 as long as len > 0. In
   that case the assignment below still rewrites buf[0], and the recursive call
   receives numchars - 1, which wraps around if glui32 is unsigned (presumably
   it is -- confirm). The callee's numchars <= len guard would then fail and
   return 0, so this function would report 1 character for an empty string. */
193 *buf = g_unichar_totitle(*buf);
194 /* Call lowercase on the rest of the string */
/* The remaining characters start at buf + 1, with one less slot of capacity
   and one less character; the "+ 1" adds the title-cased first character back
   into the returned count. */
196 return glk_buffer_to_lower_case_uni(buf + 1, len - 1, numchars - 1) + 1;
201 * glk_buffer_canon_decompose_uni:
202 * @buf: A character array in UCS-4.
203 * @len: Available length of @buf.
204 * @numchars: Number of characters in @buf.
206 * This transforms a string into its canonical decomposition
207 * (<quote>Normalization Form D</quote>). Effectively, this takes apart
208 * multipart characters into their individual parts. For example, it would
209 * convert <quote>è</quote> (character 0xE8, an accented
210 * <quote>e</quote>) into the two-character string containing <quote>e</quote>
211 * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
212 * character has multiple accent marks, they are also rearranged into a standard order.
215 * Returns: The number of characters in @buf after decomposition.
218 glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Argument checks; when one fails, g_return_val_if_fail() logs a critical
   warning and the function returns 0. The first check rejects a NULL @buf and
   also the case where @len and @numchars are both zero; the second enforces
   numchars <= len. */
220 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
221 g_return_val_if_fail(numchars <= len, 0);
225 /* Normalize the string */
/* Normalization is done on a UTF-8 copy: UCS-4 -> UTF-8, then
   g_utf8_normalize() with G_NORMALIZE_NFD (canonical decomposition), then
   back to UCS-4. Passing -1 as the length means the input is NUL-terminated.
   g_utf8_normalize() returns NULL when its input is not valid UTF-8.
   NOTE(review): confirm that the lines not shown here handle that NULL before
   it reaches convert_utf8_to_ucs4(). */
226 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
229 char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD);
231 gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
236 /* Copy the output buffer to the original buffer */
/* MIN(outchars, len) caps the copy at the caller's capacity, so a
   decomposition longer than @len is truncated. The literal 4 is the byte size
   of one element. NOTE(review): sizeof(*buf) would express this without
   assuming that glui32 is exactly 4 bytes wide. */
237 memcpy(buf, outbuf, MIN(outchars, len) * 4);
244 * glk_buffer_canon_normalize_uni:
245 * @buf: A character array in UCS-4.
246 * @len: Available length of @buf.
247 * @numchars: Number of characters in @buf.
249 * This transforms a string into its canonical decomposition and recomposition
250 * (<quote>Normalization Form C</quote>). Effectively, this takes apart
251 * multipart characters, and then puts them back together in a standard way. For
252 * example, this would convert the two-character string containing
253 * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
254 * ACCENT) into the one-character string <quote>è</quote> (character
255 * 0xE8, an accented <quote>e</quote>).
257 * The <code>canon_normalize</code> function includes decomposition as part of
258 * its implementation. You never have to call both functions on the same string.
260 * Both of these functions are idempotent.
262 * These functions provide two length arguments because a string of Unicode
263 * characters may expand when it is transformed. The @len argument is the
264 * available length of the buffer; @numchars is the number of characters in the
265 * buffer initially. (So @numchars must be less than or equal to @len. The
266 * contents of the buffer after @numchars do not affect the operation.)
268 * The functions return the number of characters after transformation. If this
269 * is greater than @len, the characters in the array will be safely truncated at
270 * @len, but the true count will be returned. (The contents of the buffer after
271 * the returned count are undefined.)
274 * The Unicode spec also defines stronger forms of these functions, called
275 * <quote>compatibility decomposition and recomposition</quote>
276 * (<quote>Normalization Form KD</quote> and <quote>Normalization Form
277 * KC</quote>.) These do all of the accent-mangling described above, but they
278 * also transform many other obscure Unicode characters into more familiar
279 * forms. For example, they split ligatures apart into separate letters. They
280 * also convert Unicode display variations such as script letters, circled
281 * letters, and half-width letters into their common forms.
285 * The Glk spec does not currently provide these stronger transformations.
286 * Glk's expected use of Unicode normalization is for line input, and an OS
287 * facility for line input will generally not produce these alternate
288 * character forms (unless the user goes out of his way to type them).
289 * Therefore, the need for these transformations does not seem to be worth the
290 * extra data table space.
293 * Returns: The number of characters in @buf after normalization.
296 glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Argument checks; when one fails, g_return_val_if_fail() logs a critical
   warning and the function returns 0. The first check rejects a NULL @buf and
   also the case where @len and @numchars are both zero; the second enforces
   numchars <= len. */
298 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
299 g_return_val_if_fail(numchars <= len, 0);
303 /* Normalize the string */
/* Normalization is done on a UTF-8 copy: UCS-4 -> UTF-8, then
   g_utf8_normalize() with G_NORMALIZE_NFC, then back to UCS-4. Despite its
   name, the local "decomposed" holds the NFC (decomposed-then-recomposed)
   form here. g_utf8_normalize() returns NULL when its input is not valid
   UTF-8. NOTE(review): confirm that the lines not shown here handle that NULL
   before it reaches convert_utf8_to_ucs4(). */
304 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
307 char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC);
309 gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
314 /* Copy the output buffer to the original buffer */
/* MIN(outchars, len) caps the copy at the caller's capacity, so a result
   longer than @len is truncated. The literal 4 is the byte size of one
   element. NOTE(review): sizeof(*buf) would express this without assuming
   that glui32 is exactly 4 bytes wide. */
315 memcpy(buf, outbuf, MIN(outchars, len) * 4);