+/**
+ * glk_buffer_canon_decompose_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition
+ * (<quote>Normalization Form D</quote>). Effectively, this takes apart
+ * multipart characters into their individual parts. For example, it would
+ * convert <quote>è</quote> (character 0xE8, an accented
+ * <quote>e</quote>) into the two-character string containing <quote>e</quote>
+ * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
+ * character has multiple accent marks, they are also rearranged into a standard
+ * order.
+ *
+ * If the decomposed string is longer than @len, only the first @len characters
+ * are written to @buf, but the full count is still returned.
+ *
+ * If any conversion or normalization step fails, @buf is left unchanged and
+ * @numchars is returned.
+ *
+ * Returns: The number of characters in @buf after decomposition.
+ */
+glui32
+glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+	g_return_val_if_fail(numchars <= len, 0);
+
+	long outchars = 0;
+
+	/* Normalize the string */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars;
+	char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD);
+	g_free(utf8);
+	/* g_utf8_normalize() returns NULL if its input is not valid UTF-8; do not
+	 pass that on to the conversion routine */
+	if(!decomposed)
+		return numchars;
+	gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
+	g_free(decomposed);
+	if(!outbuf)
+		return numchars;
+
+	/* Copy the output buffer to the original buffer, truncating at len
+	 characters if the result does not fit */
+	memcpy(buf, outbuf, MIN(outchars, len) * sizeof *buf);
+	g_free(outbuf);
+
+	return outchars;
+}
+
+/**
+ * glk_buffer_canon_normalize_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition and recomposition
+ * (<quote>Normalization Form C</quote>). Effectively, this takes apart
+ * multipart characters, and then puts them back together in a standard way. For
+ * example, this would convert the two-character string containing
+ * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
+ * ACCENT) into the one-character string <quote>è</quote> (character
+ * 0xE8, an accented <quote>e</quote>).
+ *
+ * The <code>canon_normalize</code> function includes decomposition as part of
+ * its implementation. You never have to call both functions on the same string.
+ *
+ * Both of these functions are idempotent.
+ *
+ * These functions provide two length arguments because a string of Unicode
+ * characters may expand when it is transformed. The @len argument is the
+ * available length of the buffer; @numchars is the number of characters in the
+ * buffer initially. (So @numchars must be less than or equal to @len. The
+ * contents of the buffer after @numchars do not affect the operation.)
+ *
+ * The functions return the number of characters after transformation. If this
+ * is greater than @len, the characters in the array will be safely truncated at
+ * @len, but the true count will be returned. (The contents of the buffer after
+ * the returned count are undefined.)
+ *
+ * If any conversion or normalization step fails, @buf is left unchanged and
+ * @numchars is returned.
+ *
+ * <note><para>
+ * The Unicode spec also defines stronger forms of these functions, called
+ * <quote>compatibility decomposition and recomposition</quote>
+ * (<quote>Normalization Form KD</quote> and <quote>Normalization Form
+ * KC</quote>.) These do all of the accent-mangling described above, but they
+ * also transform many other obscure Unicode characters into more familiar
+ * forms. For example, they split ligatures apart into separate letters. They
+ * also convert Unicode display variations such as script letters, circled
+ * letters, and half-width letters into their common forms.
+ * </para></note>
+ *
+ * <note><para>
+ * The Glk spec does not currently provide these stronger transformations.
+ * Glk's expected use of Unicode normalization is for line input, and an OS
+ * facility for line input will generally not produce these alternate
+ * character forms (unless the user goes out of his way to type them).
+ * Therefore, the need for these transformations does not seem to be worth the
+ * extra data table space.
+ * </para></note>
+ *
+ * Returns: the number of characters in @buf after normalization.
+ */
+glui32
+glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+	g_return_val_if_fail(numchars <= len, 0);
+
+	long outchars = 0;
+
+	/* Normalize the string */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars;
+	char *normalized = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC);
+	g_free(utf8);
+	/* g_utf8_normalize() returns NULL if its input is not valid UTF-8; do not
+	 pass that on to the conversion routine */
+	if(!normalized)
+		return numchars;
+	gunichar *outbuf = convert_utf8_to_ucs4(normalized, &outchars);
+	g_free(normalized);
+	if(!outbuf)
+		return numchars;
+
+	/* Copy the output buffer to the original buffer, truncating at len
+	 characters if the result does not fit */
+	memcpy(buf, outbuf, MIN(outchars, len) * sizeof *buf);
+	g_free(outbuf);
+
+	return outchars;
+}