X-Git-Url: https://git.stderr.nl/gitweb?a=blobdiff_plain;f=libchimara%2Fcase.c;h=3176a9a1adf2d60dd8aa7a204175db92772d1bce;hb=HEAD;hp=47086e15ba9e92d19a386ef575f74ef26dd97bd9;hpb=78652af29a2f39e626febd5f4213da57d3a13901;p=projects%2Fchimara%2Fchimara.git diff --git a/libchimara/case.c b/libchimara/case.c index 47086e1..3176a9a 100644 --- a/libchimara/case.c +++ b/libchimara/case.c @@ -1,5 +1,7 @@ -#include +#include +#include #include "glk.h" +#include "charset.h" /** * glk_char_to_lower: @@ -79,7 +81,7 @@ glk_char_to_upper(unsigned char ch) * * Unicode has some strange case cases. For example, a combined character * that looks like ss might properly be upper-cased into - * two characters S. Title-casing is even + * two S characters. Title-casing is even * stranger; ss (at the beginning of a word) might be * title-cased into a different combined character that looks like * Ss. The glk_buffer_to_title_case_uni() function is actually @@ -93,15 +95,25 @@ glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars) { g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); g_return_val_if_fail(numchars <= len, 0); + + long outchars; + + /* Lowercase the string */ + char *utf8 = convert_ucs4_to_utf8(buf, numchars); + if(!utf8) + return numchars; + char *lowered = g_utf8_strdown(utf8, -1); + g_free(utf8); + gunichar *outbuf = convert_utf8_to_ucs4(lowered, &outchars); + g_free(lowered); + if(!outbuf) + return numchars; - /* GLib has a function that converts _one_ UCS-4 character to _one_ - lowercase UCS-4 character; so apparently we don't have to worry about the - string length changing... 
*/ - glui32 *ptr; - for(ptr = buf; ptr < buf + numchars; ptr++) - *ptr = g_unichar_tolower(*ptr); + /* Copy the output buffer to the original buffer */ + memcpy(buf, outbuf, MIN(outchars, len) * 4); + g_free(outbuf); - return numchars; + return outchars; } /** @@ -121,14 +133,24 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars) g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); g_return_val_if_fail(numchars <= len, 0); - /* GLib has a function that converts _one_ UCS-4 character to _one_ - uppercase UCS-4 character; so apparently we don't have to worry about the - string length changing... */ - glui32 *ptr; - for(ptr = buf; ptr < buf + numchars; ptr++) - *ptr = g_unichar_toupper(*ptr); + long outchars; - return numchars; + /* Uppercase the string */ + char *utf8 = convert_ucs4_to_utf8(buf, numchars); + if(!utf8) + return numchars; + char *uppered = g_utf8_strup(utf8, -1); + g_free(utf8); + gunichar *outbuf = convert_utf8_to_ucs4(uppered, &outchars); + g_free(uppered); + if(!outbuf) + return numchars; + + /* Copy the output buffer to the original buffer */ + memcpy(buf, outbuf, MIN(outchars, len) * 4); + g_free(outbuf); + + return outchars; } /** @@ -140,10 +162,10 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars) * otherwise. * * See glk_buffer_to_lower_case_uni(). The title_case function has - * an additional (boolean) flag. Its basic function is to change the first - * character of the buffer to upper-case, and leave the rest of the buffer - * unchanged. If @lowerrest is true, it changes all the non-first characters to - * lower-case (instead of leaving them alone.) + * an additional (boolean) flag. If the flag is zero, the function changes the + * first character of the buffer to upper-case, and leaves the rest of the + * buffer unchanged. If the flag is nonzero, it changes the first character to + * upper-case and the rest to lower-case. 
* * * Earlier drafts of this spec had a separate function which title-cased the @@ -160,9 +182,14 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); g_return_val_if_fail(numchars <= len, 0); - /* GLib has a function that converts _one_ UCS-4 character to _one_ - titlecase UCS-4 character; so apparently we don't have to worry about the - string length changing... */ + /* FIXME: This is wrong. g_unichar_totitle() returns the titlecase of + one Unicode code point, but that is only the correct result if the + titlecase character is also one code point. + For example, the one-character 'ffi' ligature should be title-cased to the + three-character string 'Ffi'. This code leaves it as the 'ffi' ligature, + which is incorrect. + However, nothing much can be done about it unless GLib gets a + g_utf8_strtitle() function, or we roll our own. */ *buf = g_unichar_totitle(*buf); /* Call lowercase on the rest of the string */ if(lowerrest) @@ -170,3 +197,123 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo return numchars; } +/** + * glk_buffer_canon_decompose_uni: + * @buf: A character array in UCS-4. + * @len: Available length of @buf. + * @numchars: Number of characters in @buf. + * + * This transforms a string into its canonical decomposition + * (Normalization Form D). Effectively, this takes apart + * multipart characters into their individual parts. For example, it would + * convert è (character 0xE8, an accented + * e) into the two-character string containing e + * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single + * character has multiple accent marks, they are also rearranged into a standard + * order. + * + * Returns: The number of characters in @buf after decomposition.
+ */ +glui32 +glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars) +{ + g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); + g_return_val_if_fail(numchars <= len, 0); + + long outchars; + + /* Normalize the string */ + char *utf8 = convert_ucs4_to_utf8(buf, numchars); + if(!utf8) + return numchars; + char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD); + g_free(utf8); + gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars); + g_free(decomposed); + if(!outbuf) + return numchars; + + /* Copy the output buffer to the original buffer */ + memcpy(buf, outbuf, MIN(outchars, len) * 4); + g_free(outbuf); + + return outchars; +} + +/** + * glk_buffer_canon_normalize_uni: + * @buf: A character array in UCS-4. + * @len: Available length of @buf. + * @numchars: Number of characters in @buf. + * + * This transforms a string into its canonical decomposition and recomposition + * (Normalization Form C). Effectively, this takes apart + * multipart characters, and then puts them back together in a standard way. For + * example, this would convert the two-character string containing + * e followed by Unicode character 0x0300 (COMBINING GRAVE + * ACCENT) into the one-character string è (character + * 0xE8, an accented e). + * + * The canon_normalize function includes decomposition as part of + * its implementation. You never have to call both functions on the same string. + * + * Both of these functions are idempotent. + * + * These functions provide two length arguments because a string of Unicode + * characters may expand when it is transformed. The @len argument is the + * available length of the buffer; @numchars is the number of characters in the + * buffer initially. (So @numchars must be less than or equal to @len. The + * contents of the buffer after @numchars do not affect the operation.) + * + * The functions return the number of characters after transformation. 
If this + * is greater than @len, the characters in the array will be safely truncated at + * @len, but the true count will be returned. (The contents of the buffer after + * the returned count are undefined.) + * + * + * The Unicode spec also defines stronger forms of these functions, called + * compatibility decomposition and recomposition + * (Normalization Form KD and Normalization Form + * KC.) These do all of the accent-mangling described above, but they + * also transform many other obscure Unicode characters into more familiar + * forms. For example, they split ligatures apart into separate letters. They + * also convert Unicode display variations such as script letters, circled + * letters, and half-width letters into their common forms. + * + * + * + * The Glk spec does not currently provide these stronger transformations. + * Glk's expected use of Unicode normalization is for line input, and an OS + * facility for line input will generally not produce these alternate + * character forms (unless the user goes out of his way to type them). + * Therefore, the need for these transformations does not seem to be worth the + * extra data table space. + * + * + * Returns: the number of characters in @buf after normalization. + */ +glui32 +glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars) +{ + g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0); + g_return_val_if_fail(numchars <= len, 0); + + long outchars; + + /* Normalize the string */ + char *utf8 = convert_ucs4_to_utf8(buf, numchars); + if(!utf8) + return numchars; + char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC); + g_free(utf8); + gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars); + g_free(decomposed); + if(!outbuf) + return numchars; + + /* Copy the output buffer to the original buffer */ + memcpy(buf, outbuf, MIN(outchars, len) * 4); + g_free(outbuf); + + return outchars; +}