X-Git-Url: https://git.stderr.nl/gitweb?a=blobdiff_plain;f=libchimara%2Fcase.c;h=1b6eed72ba68fddcc6d7c2aa279b048827e34bad;hb=f19a194bd066ea4320c1c6b6b8c5a375f7787af5;hp=47086e15ba9e92d19a386ef575f74ef26dd97bd9;hpb=2da183ac47f1157560dcd2c417f7f6a24ae72352;p=projects%2Fchimara%2Fchimara.git diff --git a/libchimara/case.c b/libchimara/case.c index 47086e1..1b6eed7 100644 --- a/libchimara/case.c +++ b/libchimara/case.c @@ -79,7 +79,7 @@ glk_char_to_upper(unsigned char ch) * * Unicode has some strange case cases. For example, a combined character * that looks like ss might properly be upper-cased into - * two characters S. Title-casing is even + * two S characters. Title-casing is even * stranger; ss (at the beginning of a word) might be * title-cased into a different combined character that looks like * Ss. The glk_buffer_to_title_case_uni() function is actually @@ -140,10 +140,10 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars) * otherwise. * * See glk_buffer_to_lower_case_uni(). The title_case function has - * an additional (boolean) flag. Its basic function is to change the first - * character of the buffer to upper-case, and leave the rest of the buffer - * unchanged. If @lowerrest is true, it changes all the non-first characters to - * lower-case (instead of leaving them alone.) + * an additional (boolean) flag. If the flag is zero, the function changes the + * first character of the buffer to upper-case, and leaves the rest of the + * buffer unchanged. If the flag is nonzero, it changes the first character to + * upper-case and the rest to lower-case. * * * Earlier drafts of this spec had a separate function which title-cased the @@ -170,3 +170,91 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo return numchars; } +/** + * glk_buffer_canon_decompose_uni: + * @buf: A character array in UCS-4. + * @len: Available length of @buf. + * @numchars: Number of characters in @buf. + * + * This transforms a string into its canonical decomposition + * (Normalization Form D). Effectively, this takes apart + * multipart characters into their individual parts. For example, it would + * convert è (character 0xE8, an accented + * e) into the two-character string containing e + * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single + * character has multiple accent marks, they are also rearranged into a standard + * order. + * + * Returns: The number of characters in @buf after decomposition. + */ +glui32 +glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars) +{ + g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); + g_return_val_if_fail(numchars <= len, 0); + + /* TODO: Implement this */ + return numchars; +} + +/** + * glk_buffer_canon_normalize_uni: + * @buf: A character array in UCS-4. + * @len: Available length of @buf. + * @numchars: Number of characters in @buf. + * + * This transforms a string into its canonical decomposition and recomposition + * (Normalization Form C). Effectively, this takes apart + * multipart characters, and then puts them back together in a standard way. For + * example, this would convert the two-character string containing + * e followed by Unicode character 0x0300 (COMBINING GRAVE + * ACCENT) into the one-character string è (character + * 0xE8, an accented e). + * + * The canon_normalize function includes decomposition as part of + * its implementation. You never have to call both functions on the same string. + * + * Both of these functions are idempotent. + * + * These functions provide two length arguments because a string of Unicode + * characters may expand when it is transformed. The @len argument is the + * available length of the buffer; @numchars is the number of characters in the + * buffer initially. (So @numchars must be less than or equal to @len. The + * contents of the buffer after @numchars do not affect the operation.) + * + * The functions return the number of characters after transformation. If this + * is greater than @len, the characters in the array will be safely truncated at + * @len, but the true count will be returned. (The contents of the buffer after + * the returned count are undefined.) + * + * + * The Unicode spec also defines stronger forms of these functions, called + * compatibility decomposition and recomposition + * (Normalization Form KD and Normalization Form + * KC.) These do all of the accent-mangling described above, but they + * also transform many other obscure Unicode characters into more familiar + * forms. For example, they split ligatures apart into separate letters. They + * also convert Unicode display variations such as script letters, circled + * letters, and half-width letters into their common forms. + * + * + * + * The Glk spec does not currently provide these stronger transformations. + * Glk's expected use of Unicode normalization is for line input, and an OS + * facility for line input will generally not produce these alternate + * character forms (unless the user goes out of his way to type them). + * Therefore, the need for these transformations does not seem to be worth the + * extra data table space. + * + * + * Returns: the number of characters in @buf after normalization. + */ +glui32 +glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars) +{ + g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); + g_return_val_if_fail(numchars <= len, 0); + + /* TODO: Implement this */ + return numchars; +}