libchimara/case.c

   1 #include <gtk/gtk.h>
   2 #include "glk.h"
   3
   4 /**
   5  * glk_char_to_lower:
   6  * @ch: A Latin-1 character.
   7  *
   8  * You can convert Latin-1 characters between upper and lower case with two Glk
   9  * utility functions, glk_char_to_lower() and glk_char_to_upper(). These have a
  10  * few advantages over the standard ANSI <function>tolower()</function> and
  11  * <function>toupper()</function> macros. They work for the entire Latin-1
  12  * character set, including accented letters; they behave consistently on all
  13  * platforms, since they're part of the Glk library; and they are safe for all
  14  * characters. That is, if you call glk_char_to_lower() on a lower-case
  15  * character, or a character which is not a letter, you'll get the argument
  16  * back unchanged.
  17  *
  18  * The case-sensitive characters in Latin-1 are the ranges 0x41..0x5A,
  19  * 0xC0..0xD6, 0xD8..0xDE (upper case) and the ranges 0x61..0x7A, 0xE0..0xF6,
  20  * 0xF8..0xFE (lower case). These are arranged in parallel; so
  21  * glk_char_to_lower() will add 0x20 to values in the upper-case ranges, and
  22  * glk_char_to_upper() will subtract 0x20 from values in the lower-case ranges.
  23  *
  24  * Returns: A lowercase or non-letter Latin-1 character.
  25  */
  26 unsigned char
  27 glk_char_to_lower(unsigned char ch)
  28 {
  29         if( (ch >= 0x41 && ch <= 0x5A) || (ch >= 0xC0 && ch <= 0xD6) || (ch >= 0xD8 && ch <= 0xDE) )
  30                 return ch + 0x20;
  31         return ch;
  32 }
  33
  34 /**
  35  * glk_char_to_upper:
  36  * @ch: A Latin-1 character.
  37  *
  38  * If @ch is a lowercase character in the Latin-1 character set, converts it to
  39  * uppercase. Otherwise, leaves it unchanged. See glk_char_to_lower().
  40  *
  41  * Returns: An uppercase or non-letter Latin-1 character.
  42  */
  43 unsigned char
  44 glk_char_to_upper(unsigned char ch)
  45 {
  46         if( (ch >= 0x61 && ch <= 0x7A) || (ch >= 0xE0 && ch <= 0xF6) || (ch >= 0xF8 && ch <= 0xFE) )
  47                 return ch - 0x20;
  48         return ch;
  49 }
  50
  51 /**
  52  * glk_buffer_to_lower_case_uni:
  53  * @buf: A character array in UCS-4.
  54  * @len: Available length of @buf.
  55  * @numchars: Number of characters in @buf.
  56  *
  57  * Unicode character conversion is trickier, and must be applied to character
  58  * arrays, not single characters. These functions
  59  * (glk_buffer_to_lower_case_uni(), glk_buffer_to_upper_case_uni(), and
  60  * glk_buffer_to_title_case_uni()) provide two length arguments because a
  61  * string of Unicode characters may expand when its case changes. The @len
  62  * argument is the available length of the buffer; @numchars is the number of
  63  * characters in the buffer initially. (So @numchars must be less than or equal
  64  * to @len. The contents of the buffer after @numchars do not affect the
  65  * operation.)
  66  *
  67  * The functions return the number of characters after conversion. If this is
  68  * greater than @len, the characters in the array will be safely truncated at
  69  * @len, but the true count will be returned. (The contents of the buffer after
  70  * the returned count are undefined.)
  71  *
  72  * The <code>lower_case</code> and <code>upper_case</code> functions do what
  73  * you'd expect: they convert every character in the buffer (the first @numchars
  74  * of them) to its upper or lower-case equivalent, if there is such a thing.
  75  *
  76  * See the Unicode spec (chapter 3.13, chapter 4.2, etc) for the exact
  77  * definitions of upper, lower, and title-case mapping.
  78  *
  79  * <note><para>
  80  *   Unicode has some strange case cases. For example, a combined character
  81  *   that looks like <quote>ss</quote> might properly be upper-cased into
  82  *   <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
  83  *   stranger; <quote>ss</quote> (at the beginning of a word) might be
  84  *   title-cased into a different combined character that looks like
  85  *   <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
  86  *   title-casing the first character of the buffer. If it makes a difference.
  87  * </para></note>
  88  *
  89  * Returns: The number of characters after conversion.
  90  */
  91 glui32
  92 glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
  93 {
  94         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
  95         g_return_val_if_fail(numchars <= len, 0);
  96
  97         /* GLib has a function that converts _one_ UCS-4 character to _one_
  98         lowercase UCS-4 character; so apparently we don't have to worry about the
  99         string length changing... */
 100         glui32 *ptr;
 101         for(ptr = buf; ptr < buf + numchars; ptr++)
 102                 *ptr = g_unichar_tolower(*ptr);
 103
 104         return numchars;
 105 }
 106
 107 /**
 108  * glk_buffer_to_upper_case_uni:
 109  * @buf: A character array in UCS-4.
 110  * @len: Available length of @buf.
 111  * @numchars: Number of characters in @buf.
 112  *
 113  * Converts the first @numchars characters of @buf to their uppercase
 114  * equivalents, if there is such a thing. See glk_buffer_to_lower_case_uni().
 115  *
 116  * Returns: The number of characters after conversion.
 117  */
 118 glui32
 119 glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
 120 {
 121         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 122         g_return_val_if_fail(numchars <= len, 0);
 123
 124         /* GLib has a function that converts _one_ UCS-4 character to _one_
 125         uppercase UCS-4 character; so apparently we don't have to worry about the
 126         string length changing... */
 127         glui32 *ptr;
 128         for(ptr = buf; ptr < buf + numchars; ptr++)
 129                 *ptr = g_unichar_toupper(*ptr);
 130
 131         return numchars;
 132 }
 133
 134 /**
 135  * glk_buffer_to_title_case_uni:
 136  * @buf: A character array in UCS-4.
 137  * @len: Available length of @buf.
 138  * @numchars: Number of characters in @buf.
 139  * @lowerrest: %TRUE if the rest of @buf should be lowercased, %FALSE
 140  * otherwise.
 141  *
 142  * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
 143  * an additional (boolean) flag. If the flag is zero, the function changes the
 144  * first character of the buffer to upper-case, and leaves the rest of the
 145  * buffer unchanged. If the flag is nonzero, it changes the first character to
 146  * upper-case and the rest to lower-case.
 147  *
 148  * <note><para>
 149  *   Earlier drafts of this spec had a separate function which title-cased the
 150  *   first character of every <emphasis>word</emphasis> in the buffer. I took
 151  *   this out after reading Unicode Standard Annex &num;29, which explains how
 152  *   to divide a string into words. If you want it, feel free to implement it.
 153  * </para></note>
 154  *
 155  * Returns: The number of characters after conversion.
 156  */
 157 glui32
 158 glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lowerrest)
 159 {
 160         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 161         g_return_val_if_fail(numchars <= len, 0);
 162
 163         /* GLib has a function that converts _one_ UCS-4 character to _one_
 164         titlecase UCS-4 character; so apparently we don't have to worry about the
 165         string length changing... */
 166         *buf = g_unichar_totitle(*buf);
 167         /* Call lowercase on the rest of the string */
 168         if(lowerrest)
 169                 return glk_buffer_to_lower_case_uni(buf + 1, len - 1, numchars - 1) + 1;
 170         return numchars;
 171 }
 172
 173 /**
 174  * glk_buffer_canon_decompose_uni:
 175  * @buf: A character array in UCS-4.
 176  * @len: Available length of @buf.
 177  * @numchars: Number of characters in @buf.
 178  *
 179  * This transforms a string into its canonical decomposition
 180  * (<quote>Normalization Form D</quote>). Effectively, this takes apart
 181  * multipart characters into their individual parts. For example, it would
 182  * convert <quote>&egrave;</quote> (character 0xE8, an accented
 183  * <quote>e</quote>) into the two-character string containing <quote>e</quote>
 184  * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
 185  * character has multiple accent marks, they are also rearranged into a standard
 186  * order.
 187  *
 188  * Returns: The number of characters in @buf after decomposition.
 189  */
 190 glui32
 191 glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
 192 {
 193         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 194         g_return_val_if_fail(numchars <= len, 0);
 195
 196         /* TODO: Implement this */
 197         return numchars;
 198 }
 199
 200 /**
 201  * glk_buffer_canon_normalize_uni:
 202  * @buf: A character array in UCS-4.
 203  * @len: Available length of @buf.
 204  * @numchars: Number of characters in @buf.
 205  *
 206  * This transforms a string into its canonical decomposition and recomposition
 207  * (<quote>Normalization Form C</quote>). Effectively, this takes apart
 208  * multipart characters, and then puts them back together in a standard way. For
 209  * example, this would convert the two-character string containing
 210  * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
 211  * ACCENT) into the one-character string <quote>&egrave;</quote> (character
 212  * 0xE8, an accented <quote>e</quote>).
 213  *
 214  * The <code>canon_normalize</code> function includes decomposition as part of
 215  * its implementation. You never have to call both functions on the same string.
 216  *
 217  * Both of these functions are idempotent.
 218  *
 219  * These functions provide two length arguments because a string of Unicode
 220  * characters may expand when it is transformed. The @len argument is the
 221  * available length of the buffer; @numchars is the number of characters in the
 222  * buffer initially. (So @numchars must be less than or equal to @len. The
 223  * contents of the buffer after @numchars do not affect the operation.)
 224  *
 225  * The functions return the number of characters after transformation. If this
 226  * is greater than @len, the characters in the array will be safely truncated at
 227  * @len, but the true count will be returned. (The contents of the buffer after
 228  * the returned count are undefined.)
 229  *
 230  * <note><para>
 231  *   The Unicode spec also defines stronger forms of these functions, called
 232  *   <quote>compatibility decomposition and recomposition</quote>
 233  *   (<quote>Normalization Form KD</quote> and <quote>Normalization Form
 234  *   KC</quote>.) These do all of the accent-mangling described above, but they
 235  *   also transform many other obscure Unicode characters into more familiar
 236  *   forms. For example, they split ligatures apart into separate letters. They
 237  *   also convert Unicode display variations such as script letters, circled
 238  *   letters, and half-width letters into their common forms.
 239  * </para></note>
 240  *
 241  * <note><para>
 242  *   The Glk spec does not currently provide these stronger transformations.
 243  *   Glk's expected use of Unicode normalization is for line input, and an OS
 244  *   facility for line input will generally not produce these alternate
 245  *   character forms (unless the user goes out of his way to type them).
 246  *   Therefore, the need for these transformations does not seem to be worth the
 247  *   extra data table space.
 248  * </para></note>
 249  *
 250  * Returns: the number of characters in @buf after normalization.
 251  */
 252 glui32
 253 glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
 254 {
 255         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 256         g_return_val_if_fail(numchars <= len, 0);
 257
 258         /* TODO: Implement this */
 259         return numchars;
 260 }