libchimara/case.c

   1 #include <string.h>
   2 #include <glib.h>
   3 #include "glk.h"
   4 #include "charset.h"
   5
   6 /**
   7  * glk_char_to_lower:
   8  * @ch: A Latin-1 character.
   9  *
  10  * You can convert Latin-1 characters between upper and lower case with two Glk
  11  * utility functions, glk_char_to_lower() and glk_char_to_upper(). These have a
  12  * few advantages over the standard ANSI <function>tolower()</function> and
  13  * <function>toupper()</function> macros. They work for the entire Latin-1
  14  * character set, including accented letters; they behave consistently on all
  15  * platforms, since they're part of the Glk library; and they are safe for all
  16  * characters. That is, if you call glk_char_to_lower() on a lower-case
  17  * character, or a character which is not a letter, you'll get the argument
  18  * back unchanged.
  19  *
  20  * The case-sensitive characters in Latin-1 are the ranges 0x41..0x5A,
  21  * 0xC0..0xD6, 0xD8..0xDE (upper case) and the ranges 0x61..0x7A, 0xE0..0xF6,
  22  * 0xF8..0xFE (lower case). These are arranged in parallel; so
  23  * glk_char_to_lower() will add 0x20 to values in the upper-case ranges, and
  24  * glk_char_to_upper() will subtract 0x20 from values in the lower-case ranges.
  25  *
  26  * Returns: A lowercase or non-letter Latin-1 character.
  27  */
  28 unsigned char
  29 glk_char_to_lower(unsigned char ch)
  30 {
  31         if( (ch >= 0x41 && ch <= 0x5A) || (ch >= 0xC0 && ch <= 0xD6) || (ch >= 0xD8 && ch <= 0xDE) )
  32                 return ch + 0x20;
  33         return ch;
  34 }
  35
  36 /**
  37  * glk_char_to_upper:
  38  * @ch: A Latin-1 character.
  39  *
  40  * If @ch is a lowercase character in the Latin-1 character set, converts it to
  41  * uppercase. Otherwise, leaves it unchanged. See glk_char_to_lower().
  42  *
  43  * Returns: An uppercase or non-letter Latin-1 character.
  44  */
  45 unsigned char
  46 glk_char_to_upper(unsigned char ch)
  47 {
  48         if( (ch >= 0x61 && ch <= 0x7A) || (ch >= 0xE0 && ch <= 0xF6) || (ch >= 0xF8 && ch <= 0xFE) )
  49                 return ch - 0x20;
  50         return ch;
  51 }
  52
  53 /**
  54  * glk_buffer_to_lower_case_uni:
  55  * @buf: A character array in UCS-4.
  56  * @len: Available length of @buf.
  57  * @numchars: Number of characters in @buf.
  58  *
  59  * Unicode character conversion is trickier, and must be applied to character
  60  * arrays, not single characters. These functions
  61  * (glk_buffer_to_lower_case_uni(), glk_buffer_to_upper_case_uni(), and
  62  * glk_buffer_to_title_case_uni()) provide two length arguments because a
  63  * string of Unicode characters may expand when its case changes. The @len
  64  * argument is the available length of the buffer; @numchars is the number of
  65  * characters in the buffer initially. (So @numchars must be less than or equal
  66  * to @len. The contents of the buffer after @numchars do not affect the
  67  * operation.)
  68  *
  69  * The functions return the number of characters after conversion. If this is
  70  * greater than @len, the characters in the array will be safely truncated at
  71  * @len, but the true count will be returned. (The contents of the buffer after
  72  * the returned count are undefined.)
  73  *
  74  * The <code>lower_case</code> and <code>upper_case</code> functions do what
  75  * you'd expect: they convert every character in the buffer (the first @numchars
  76  * of them) to its upper or lower-case equivalent, if there is such a thing.
  77  *
  78  * See the Unicode spec (chapter 3.13, chapter 4.2, etc) for the exact
  79  * definitions of upper, lower, and title-case mapping.
  80  *
  81  * <note><para>
  82  *   Unicode has some strange case cases. For example, a combined character
  83  *   that looks like <quote>ss</quote> might properly be upper-cased into
  84  *   <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
  85  *   stranger; <quote>ss</quote> (at the beginning of a word) might be
  86  *   title-cased into a different combined character that looks like
  87  *   <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
  88  *   title-casing the first character of the buffer. If it makes a difference.
  89  * </para></note>
  90  *
  91  * Returns: The number of characters after conversion.
  92  */
  93 glui32
  94 glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
  95 {
  96         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
  97         g_return_val_if_fail(numchars <= len, 0);
  98
  99         long outchars;
 100
 101         /* Lowercase the string */
 102         char *utf8 = convert_ucs4_to_utf8(buf, numchars);
 103         if(!utf8)
 104                 return numchars;
 105         char *lowered = g_utf8_strdown(utf8, -1);
 106         g_free(utf8);
 107         gunichar *outbuf = convert_utf8_to_ucs4(lowered, &outchars);
 108         g_free(lowered);
 109         if(!outbuf)
 110                 return numchars;
 111
 112         /* Copy the output buffer to the original buffer */
 113         memcpy(buf, outbuf, MIN(outchars, len) * 4);
 114         g_free(outbuf);
 115
 116         return outchars;
 117 }
 118
 119 /**
 120  * glk_buffer_to_upper_case_uni:
 121  * @buf: A character array in UCS-4.
 122  * @len: Available length of @buf.
 123  * @numchars: Number of characters in @buf.
 124  *
 125  * Converts the first @numchars characters of @buf to their uppercase
 126  * equivalents, if there is such a thing. See glk_buffer_to_lower_case_uni().
 127  *
 128  * Returns: The number of characters after conversion.
 129  */
 130 glui32
 131 glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
 132 {
 133         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 134         g_return_val_if_fail(numchars <= len, 0);
 135
 136         long outchars;
 137
 138         /* Uppercase the string */
 139         char *utf8 = convert_ucs4_to_utf8(buf, numchars);
 140         if(!utf8)
 141                 return numchars;
 142         char *uppered = g_utf8_strup(utf8, -1);
 143         g_free(utf8);
 144         gunichar *outbuf = convert_utf8_to_ucs4(uppered, &outchars);
 145         g_free(uppered);
 146         if(!outbuf)
 147                 return numchars;
 148
 149         /* Copy the output buffer to the original buffer */
 150         memcpy(buf, outbuf, MIN(outchars, len) * 4);
 151         g_free(outbuf);
 152
 153         return outchars;
 154 }
 155
 156 /**
 157  * glk_buffer_to_title_case_uni:
 158  * @buf: A character array in UCS-4.
 159  * @len: Available length of @buf.
 160  * @numchars: Number of characters in @buf.
 161  * @lowerrest: %TRUE if the rest of @buf should be lowercased, %FALSE
 162  * otherwise.
 163  *
 164  * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
 165  * an additional (boolean) flag. If the flag is zero, the function changes the
 166  * first character of the buffer to upper-case, and leaves the rest of the
 167  * buffer unchanged. If the flag is nonzero, it changes the first character to
 168  * upper-case and the rest to lower-case.
 169  *
 170  * <note><para>
 171  *   Earlier drafts of this spec had a separate function which title-cased the
 172  *   first character of every <emphasis>word</emphasis> in the buffer. I took
 173  *   this out after reading Unicode Standard Annex &num;29, which explains how
 174  *   to divide a string into words. If you want it, feel free to implement it.
 175  * </para></note>
 176  *
 177  * Returns: The number of characters after conversion.
 178  */
 179 glui32
 180 glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lowerrest)
 181 {
 182         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 183         g_return_val_if_fail(numchars <= len, 0);
 184
 185         /* FIXME: This is wrong. g_unichar_totitle() which returns the titlecase of
 186          one Unicode code point, but that only returns the correct result if the
 187          titlecase character is also one code point.
 188          For example, the one-character 'ffi' ligature should be title-cased to the
 189          three-character string 'Ffi'. This code leaves it as the 'ffi' ligature,
 190          which is incorrect.
 191          However, nothing much can be done about it unless GLib gets a
 192          g_utf8_strtitle() function, or we roll our own. */
 193         *buf = g_unichar_totitle(*buf);
 194         /* Call lowercase on the rest of the string */
 195         if(lowerrest)
 196                 return glk_buffer_to_lower_case_uni(buf + 1, len - 1, numchars - 1) + 1;
 197         return numchars;
 198 }
 199
 200 /**
 201  * glk_buffer_canon_decompose_uni:
 202  * @buf: A character array in UCS-4.
 203  * @len: Available length of @buf.
 204  * @numchars: Number of characters in @buf.
 205  *
 206  * This transforms a string into its canonical decomposition
 207  * (<quote>Normalization Form D</quote>). Effectively, this takes apart
 208  * multipart characters into their individual parts. For example, it would
 209  * convert <quote>&egrave;</quote> (character 0xE8, an accented
 210  * <quote>e</quote>) into the two-character string containing <quote>e</quote>
 211  * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
 212  * character has multiple accent marks, they are also rearranged into a standard
 213  * order.
 214  *
 215  * Returns: The number of characters in @buf after decomposition.
 216  */
 217 glui32
 218 glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
 219 {
 220         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 221         g_return_val_if_fail(numchars <= len, 0);
 222
 223         long outchars;
 224
 225         /* Normalize the string */
 226         char *utf8 = convert_ucs4_to_utf8(buf, numchars);
 227         if(!utf8)
 228                 return numchars;
 229         char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD);
 230         g_free(utf8);
 231         gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
 232         g_free(decomposed);
 233         if(!outbuf)
 234                 return numchars;
 235
 236         /* Copy the output buffer to the original buffer */
 237         memcpy(buf, outbuf, MIN(outchars, len) * 4);
 238         g_free(outbuf);
 239
 240         return outchars;
 241 }
 242
 243 /**
 244  * glk_buffer_canon_normalize_uni:
 245  * @buf: A character array in UCS-4.
 246  * @len: Available length of @buf.
 247  * @numchars: Number of characters in @buf.
 248  *
 249  * This transforms a string into its canonical decomposition and recomposition
 250  * (<quote>Normalization Form C</quote>). Effectively, this takes apart
 251  * multipart characters, and then puts them back together in a standard way. For
 252  * example, this would convert the two-character string containing
 253  * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
 254  * ACCENT) into the one-character string <quote>&egrave;</quote> (character
 255  * 0xE8, an accented <quote>e</quote>).
 256  *
 257  * The <code>canon_normalize</code> function includes decomposition as part of
 258  * its implementation. You never have to call both functions on the same string.
 259  *
 260  * Both of these functions are idempotent.
 261  *
 262  * These functions provide two length arguments because a string of Unicode
 263  * characters may expand when it is transformed. The @len argument is the
 264  * available length of the buffer; @numchars is the number of characters in the
 265  * buffer initially. (So @numchars must be less than or equal to @len. The
 266  * contents of the buffer after @numchars do not affect the operation.)
 267  *
 268  * The functions return the number of characters after transformation. If this
 269  * is greater than @len, the characters in the array will be safely truncated at
 270  * @len, but the true count will be returned. (The contents of the buffer after
 271  * the returned count are undefined.)
 272  *
 273  * <note><para>
 274  *   The Unicode spec also defines stronger forms of these functions, called
 275  *   <quote>compatibility decomposition and recomposition</quote>
 276  *   (<quote>Normalization Form KD</quote> and <quote>Normalization Form
 277  *   KC</quote>.) These do all of the accent-mangling described above, but they
 278  *   also transform many other obscure Unicode characters into more familiar
 279  *   forms. For example, they split ligatures apart into separate letters. They
 280  *   also convert Unicode display variations such as script letters, circled
 281  *   letters, and half-width letters into their common forms.
 282  * </para></note>
 283  *
 284  * <note><para>
 285  *   The Glk spec does not currently provide these stronger transformations.
 286  *   Glk's expected use of Unicode normalization is for line input, and an OS
 287  *   facility for line input will generally not produce these alternate
 288  *   character forms (unless the user goes out of his way to type them).
 289  *   Therefore, the need for these transformations does not seem to be worth the
 290  *   extra data table space.
 291  * </para></note>
 292  *
 293  * Returns: the number of characters in @buf after normalization.
 294  */
 295 glui32
 296 glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
 297 {
 298         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 299         g_return_val_if_fail(numchars <= len, 0);
 300
 301         long outchars;
 302
 303         /* Normalize the string */
 304         char *utf8 = convert_ucs4_to_utf8(buf, numchars);
 305         if(!utf8)
 306                 return numchars;
 307         char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC);
 308         g_free(utf8);
 309         gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
 310         g_free(decomposed);
 311         if(!outbuf)
 312                 return numchars;
 313
 314         /* Copy the output buffer to the original buffer */
 315         memcpy(buf, outbuf, MIN(outchars, len) * 4);
 316         g_free(outbuf);
 317
 318         return outchars;
 319 }