8 * @ch: A Latin-1 character.
10 * You can convert Latin-1 characters between upper and lower case with two Glk
11 * utility functions, glk_char_to_lower() and glk_char_to_upper(). These have a
12 * few advantages over the standard ANSI <function>tolower()</function> and
13 * <function>toupper()</function> macros. They work for the entire Latin-1
14 * character set, including accented letters; they behave consistently on all
15 * platforms, since they're part of the Glk library; and they are safe for all
16 * characters. That is, if you call glk_char_to_lower() on a lower-case
17 * character, or a character which is not a letter, you'll get the argument
18 * back unchanged.
20 * The case-sensitive characters in Latin-1 are the ranges 0x41..0x5A,
21 * 0xC0..0xD6, 0xD8..0xDE (upper case) and the ranges 0x61..0x7A, 0xE0..0xF6,
22 * 0xF8..0xFE (lower case). These are arranged in parallel; so
23 * glk_char_to_lower() will add 0x20 to values in the upper-case ranges, and
24 * glk_char_to_upper() will subtract 0x20 from values in the lower-case ranges.
26 * Returns: A lowercase or non-letter Latin-1 character.
/* Lower-cases one Latin-1 character. The upper- and lower-case ranges are
   laid out in parallel, exactly 0x20 apart. */
unsigned char
glk_char_to_lower(unsigned char ch)
{
	/* Upper-case ranges: 0x41..0x5A, 0xC0..0xD6 and 0xD8..0xDE. 0xD7 lies
	   between the last two ranges and is deliberately left out. */
	if( (ch >= 0x41 && ch <= 0x5A) || (ch >= 0xC0 && ch <= 0xD6) || (ch >= 0xD8 && ch <= 0xDE) )
		return (unsigned char)(ch + 0x20);

	/* Already lower case, or not a case-sensitive letter: return as is. */
	return ch;
}
38 * @ch: A Latin-1 character.
40 * If @ch is a lowercase character in the Latin-1 character set, converts it to
41 * uppercase. Otherwise, leaves it unchanged. See glk_char_to_lower().
43 * Returns: An uppercase or non-letter Latin-1 character.
/* Upper-cases one Latin-1 character. The lower- and upper-case ranges are
   laid out in parallel, exactly 0x20 apart. */
unsigned char
glk_char_to_upper(unsigned char ch)
{
	/* Lower-case ranges: 0x61..0x7A, 0xE0..0xF6 and 0xF8..0xFE. 0xF7 lies
	   between the last two ranges and is deliberately left out. */
	if( (ch >= 0x61 && ch <= 0x7A) || (ch >= 0xE0 && ch <= 0xF6) || (ch >= 0xF8 && ch <= 0xFE) )
		return (unsigned char)(ch - 0x20);

	/* Already upper case, or not a case-sensitive letter: return as is. */
	return ch;
}
54 * glk_buffer_to_lower_case_uni:
55 * @buf: A character array in UCS-4.
56 * @len: Available length of @buf.
57 * @numchars: Number of characters in @buf.
59 * Unicode character conversion is trickier, and must be applied to character
60 * arrays, not single characters. These functions
61 * (glk_buffer_to_lower_case_uni(), glk_buffer_to_upper_case_uni(), and
62 * glk_buffer_to_title_case_uni()) provide two length arguments because a
63 * string of Unicode characters may expand when its case changes. The @len
64 * argument is the available length of the buffer; @numchars is the number of
65 * characters in the buffer initially. (So @numchars must be less than or equal
66 * to @len. The contents of the buffer after @numchars do not affect the
67 * operation.)
69 * The functions return the number of characters after conversion. If this is
70 * greater than @len, the characters in the array will be safely truncated at
71 * @len, but the true count will be returned. (The contents of the buffer after
72 * the returned count are undefined.)
74 * The <code>lower_case</code> and <code>upper_case</code> functions do what
75 * you'd expect: they convert every character in the buffer (the first @numchars
76 * of them) to its upper or lower-case equivalent, if there is such a thing.
78 * See the Unicode spec (chapter 3.13, chapter 4.2, etc) for the exact
79 * definitions of upper, lower, and title-case mapping.
82 * Unicode has some strange case cases. For example, a combined character
83 * that looks like <quote>ss</quote> might properly be upper-cased into
84 * <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
85 * stranger; <quote>ss</quote> (at the beginning of a word) might be
86 * title-cased into a different combined character that looks like
87 * <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
88 * title-casing the first character of the buffer. If it makes a difference.
91 * Returns: The number of characters after conversion.
94 glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
96 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
97 g_return_val_if_fail(numchars <= len, 0);
99 /* GLib has a function that converts _one_ UCS-4 character to _one_
100 lowercase UCS-4 character; so apparently we don't have to worry about the
101 string length changing... */
103 for(ptr = buf; ptr < buf + numchars; ptr++)
104 *ptr = g_unichar_tolower(*ptr);
110 * glk_buffer_to_upper_case_uni:
111 * @buf: A character array in UCS-4.
112 * @len: Available length of @buf.
113 * @numchars: Number of characters in @buf.
115 * Converts the first @numchars characters of @buf to their uppercase
116 * equivalents, if there is such a thing. See glk_buffer_to_lower_case_uni().
118 * Returns: The number of characters after conversion.
121 glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
123 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
124 g_return_val_if_fail(numchars <= len, 0);
126 /* GLib has a function that converts _one_ UCS-4 character to _one_
127 uppercase UCS-4 character; so apparently we don't have to worry about the
128 string length changing... */
130 for(ptr = buf; ptr < buf + numchars; ptr++)
131 *ptr = g_unichar_toupper(*ptr);
137 * glk_buffer_to_title_case_uni:
138 * @buf: A character array in UCS-4.
139 * @len: Available length of @buf.
140 * @numchars: Number of characters in @buf.
141 * @lowerrest: %TRUE if the rest of @buf should be lowercased, %FALSE
142 * otherwise.
144 * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
145 * an additional (boolean) flag. If the flag is zero, the function changes the
146 * first character of the buffer to upper-case, and leaves the rest of the
147 * buffer unchanged. If the flag is nonzero, it changes the first character to
148 * upper-case and the rest to lower-case.
151 * Earlier drafts of this spec had a separate function which title-cased the
152 * first character of every <emphasis>word</emphasis> in the buffer. I took
153 * this out after reading Unicode Standard Annex #29, which explains how
154 * to divide a string into words. If you want it, feel free to implement it.
157 * Returns: The number of characters after conversion.
160 glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lowerrest)
162 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
163 g_return_val_if_fail(numchars <= len, 0);
165 /* GLib has a function that converts _one_ UCS-4 character to _one_
166 titlecase UCS-4 character; so apparently we don't have to worry about the
167 string length changing... */
168 *buf = g_unichar_totitle(*buf);
169 /* Call lowercase on the rest of the string */
171 return glk_buffer_to_lower_case_uni(buf + 1, len - 1, numchars - 1) + 1;
176 * glk_buffer_canon_decompose_uni:
177 * @buf: A character array in UCS-4.
178 * @len: Available length of @buf.
179 * @numchars: Number of characters in @buf.
181 * This transforms a string into its canonical decomposition
182 * (<quote>Normalization Form D</quote>). Effectively, this takes apart
183 * multipart characters into their individual parts. For example, it would
184 * convert <quote>è</quote> (character 0xE8, an accented
185 * <quote>e</quote>) into the two-character string containing <quote>e</quote>
186 * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
187 * character has multiple accent marks, they are also rearranged into a standard
188 * order.
190 * Returns: The number of characters in @buf after decomposition.
/* Rewrites the first numchars characters of buf in Normalization Form D. The
   text is round-tripped through UTF-8 because g_utf8_normalize() operates on
   UTF-8 strings.
   NOTE(review): several short lines of this definition are not visible in
   this view (return type, braces, the declaration of outchars, the return
   statement and any cleanup of the temporary buffers) - check them in the
   full file before changing anything here. */
193 glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Precondition checks; each one bails out with 0 when its condition is false. */
195 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
196 g_return_val_if_fail(numchars <= len, 0);
200 /* Normalize the string */
/* UCS-4 -> UTF-8 through a helper defined elsewhere in the project. */
201 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
/* A length of -1 tells GLib that utf8 is NUL-terminated.
   NOTE(review): g_utf8_normalize() is documented to return NULL when its
   input is not valid UTF-8 - confirm the result is checked before use. */
204 char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD);
/* UTF-8 -> UCS-4; the helper presumably stores the converted character count
   in outchars, which bounds the copy below - confirm against the helper. */
206 gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
211 /* Copy the output buffer to the original buffer */
/* MIN() caps the copy at len elements so a longer result is truncated rather
   than overflowing buf. The literal 4 is presumably the byte size of one
   UCS-4 element. NOTE(review): sizeof *buf would state that explicitly. */
212 memcpy(buf, outbuf, MIN(outchars, len) * 4);
219 * glk_buffer_canon_normalize_uni:
220 * @buf: A character array in UCS-4.
221 * @len: Available length of @buf.
222 * @numchars: Number of characters in @buf.
224 * This transforms a string into its canonical decomposition and recomposition
225 * (<quote>Normalization Form C</quote>). Effectively, this takes apart
226 * multipart characters, and then puts them back together in a standard way. For
227 * example, this would convert the two-character string containing
228 * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
229 * ACCENT) into the one-character string <quote>è</quote> (character
230 * 0xE8, an accented <quote>e</quote>).
232 * The <code>canon_normalize</code> function includes decomposition as part of
233 * its implementation. You never have to call both functions on the same string.
235 * Both of these functions are idempotent.
237 * These functions provide two length arguments because a string of Unicode
238 * characters may expand when it is transformed. The @len argument is the
239 * available length of the buffer; @numchars is the number of characters in the
240 * buffer initially. (So @numchars must be less than or equal to @len. The
241 * contents of the buffer after @numchars do not affect the operation.)
243 * The functions return the number of characters after transformation. If this
244 * is greater than @len, the characters in the array will be safely truncated at
245 * @len, but the true count will be returned. (The contents of the buffer after
246 * the returned count are undefined.)
249 * The Unicode spec also defines stronger forms of these functions, called
250 * <quote>compatibility decomposition and recomposition</quote>
251 * (<quote>Normalization Form KD</quote> and <quote>Normalization Form
252 * KC</quote>.) These do all of the accent-mangling described above, but they
253 * also transform many other obscure Unicode characters into more familiar
254 * forms. For example, they split ligatures apart into separate letters. They
255 * also convert Unicode display variations such as script letters, circled
256 * letters, and half-width letters into their common forms.
260 * The Glk spec does not currently provide these stronger transformations.
261 * Glk's expected use of Unicode normalization is for line input, and an OS
262 * facility for line input will generally not produce these alternate
263 * character forms (unless the user goes out of his way to type them).
264 * Therefore, the need for these transformations does not seem to be worth the
265 * extra data table space.
268 * Returns: the number of characters in @buf after normalization.
/* Rewrites the first numchars characters of buf in Normalization Form C. The
   text is round-tripped through UTF-8 because g_utf8_normalize() operates on
   UTF-8 strings.
   NOTE(review): several short lines of this definition are not visible in
   this view (return type, braces, the declaration of outchars, the return
   statement and any cleanup of the temporary buffers) - check them in the
   full file before changing anything here. */
271 glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
/* Precondition checks; each one bails out with 0 when its condition is false. */
273 g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
274 g_return_val_if_fail(numchars <= len, 0);
278 /* Normalize the string */
/* UCS-4 -> UTF-8 through a helper defined elsewhere in the project. */
279 char *utf8 = convert_ucs4_to_utf8(buf, numchars);
/* A length of -1 tells GLib that utf8 is NUL-terminated. Despite its name,
   decomposed holds the NFC (decomposed, then recomposed) form here.
   NOTE(review): g_utf8_normalize() is documented to return NULL when its
   input is not valid UTF-8 - confirm the result is checked before use. */
282 char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC);
/* UTF-8 -> UCS-4; the helper presumably stores the converted character count
   in outchars, which bounds the copy below - confirm against the helper. */
284 gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
289 /* Copy the output buffer to the original buffer */
/* MIN() caps the copy at len elements so a longer result is truncated rather
   than overflowing buf. The literal 4 is presumably the byte size of one
   UCS-4 element. NOTE(review): sizeof *buf would state that explicitly. */
290 memcpy(buf, outbuf, MIN(outchars, len) * 4);