X-Git-Url: https://git.stderr.nl/gitweb?a=blobdiff_plain;f=libchimara%2Fcase.c;h=3176a9a1adf2d60dd8aa7a204175db92772d1bce;hb=HEAD;hp=47086e15ba9e92d19a386ef575f74ef26dd97bd9;hpb=78652af29a2f39e626febd5f4213da57d3a13901;p=projects%2Fchimara%2Fchimara.git

diff --git a/libchimara/case.c b/libchimara/case.c
index 47086e1..3176a9a 100644
--- a/libchimara/case.c
+++ b/libchimara/case.c
@@ -1,5 +1,7 @@
-#include <gtk/gtk.h>
+#include <string.h>
+#include <glib.h>
 #include "glk.h"
+#include "charset.h"
 
 /**
  * glk_char_to_lower:
@@ -79,7 +81,7 @@ glk_char_to_upper(unsigned char ch)
  * <note><para>
  *   Unicode has some strange case cases. For example, a combined character
  *   that looks like <quote>ss</quote> might properly be upper-cased into 
- *   <emphasis>two</emphasis> characters <quote>S</quote>. Title-casing is even
+ *   <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
  *   stranger; <quote>ss</quote> (at the beginning of a word) might be 
  *   title-cased into a different combined character that looks like 
  *   <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
@@ -93,15 +95,25 @@ glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
 {
 	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 	g_return_val_if_fail(numchars <= len, 0);
+
+	long outchars;
+
+	/* Lowercase the string */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars;
+	char *lowered = g_utf8_strdown(utf8, -1);
+	g_free(utf8);
+	gunichar *outbuf = convert_utf8_to_ucs4(lowered, &outchars);
+	g_free(lowered);
+	if(!outbuf)
+		return numchars;
 	
-	/* GLib has a function that converts _one_ UCS-4 character to _one_
-	lowercase UCS-4 character; so apparently we don't have to worry about the
-	string length changing... */
-	glui32 *ptr;
-	for(ptr = buf; ptr < buf + numchars; ptr++)
-		*ptr = g_unichar_tolower(*ptr);
+	/* Copy the output buffer to the original buffer */
+	memcpy(buf, outbuf, MIN(outchars, len) * 4);
+	g_free(outbuf);
 	
-	return numchars;
+	return outchars;
 }
 
 /**
@@ -121,14 +133,24 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
 	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 	g_return_val_if_fail(numchars <= len, 0);
 	
-	/* GLib has a function that converts _one_ UCS-4 character to _one_
-	uppercase UCS-4 character; so apparently we don't have to worry about the
-	string length changing... */
-	glui32 *ptr;
-	for(ptr = buf; ptr < buf + numchars; ptr++)
-		*ptr = g_unichar_toupper(*ptr);
+	long outchars;
 	
-	return numchars;
+	/* Uppercase the string */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars;
+	char *uppered = g_utf8_strup(utf8, -1);
+	g_free(utf8);
+	gunichar *outbuf = convert_utf8_to_ucs4(uppered, &outchars);
+	g_free(uppered);
+	if(!outbuf)
+		return numchars;
+
+	/* Copy the output buffer to the original buffer */
+	memcpy(buf, outbuf, MIN(outchars, len) * 4);
+	g_free(outbuf);
+
+	return outchars;
 }
 
 /**
@@ -140,10 +162,10 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
  * otherwise.
  *
  * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
- * an additional (boolean) flag. Its basic function is to change the first
- * character of the buffer to upper-case, and leave the rest of the buffer
- * unchanged. If @lowerrest is true, it changes all the non-first characters to
- * lower-case (instead of leaving them alone.) 
+ * an additional (boolean) flag. If the flag is zero, the function changes the
+ * first character of the buffer to upper-case, and leaves the rest of the
+ * buffer unchanged. If the flag is nonzero, it changes the first character to
+ * upper-case and the rest to lower-case.
  *
  * <note><para>
  *   Earlier drafts of this spec had a separate function which title-cased the
@@ -160,9 +182,14 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo
 	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
 	g_return_val_if_fail(numchars <= len, 0);
 	
-	/* GLib has a function that converts _one_ UCS-4 character to _one_
-	titlecase UCS-4 character; so apparently we don't have to worry about the
-	string length changing... */
+	/* FIXME: This is wrong. g_unichar_totitle() returns the titlecase of one
+	 Unicode code point, so the result is only correct if the titlecase form
+	 of the character is also a single code point.
+	 For example, the one-character 'ffi' ligature should be title-cased to the
+	 three-character string 'Ffi'. This code leaves it as the 'ffi' ligature,
+	 which is incorrect.
+	 However, nothing much can be done about it unless GLib gets a
+	 g_utf8_strtitle() function, or we roll our own. */
 	*buf = g_unichar_totitle(*buf);
 	/* Call lowercase on the rest of the string */
 	if(lowerrest)
@@ -170,3 +197,123 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo
 	return numchars;
 }
 
+/**
+ * glk_buffer_canon_decompose_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition
+ * (<quote>Normalization Form D</quote>). Effectively, this takes apart
+ * multipart characters into their individual parts. For example, it would
+ * convert <quote>&egrave;</quote> (character 0xE8, an accented
+ * <quote>e</quote>) into the two-character string containing <quote>e</quote>
+ * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
+ * character has multiple accent marks, they are also rearranged into a standard
+ * order.
+ *
+ * Returns: The number of characters in @buf after decomposition.
+ */
+glui32
+glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+	g_return_val_if_fail(numchars <= len, 0);
+
+	long outchars; /* filled in through &outchars by convert_utf8_to_ucs4() */
+
+	/* Round-trip through UTF-8 so that g_utf8_normalize() can apply NFD */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars; /* buf has not been written to at this point */
+	char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD);
+	g_free(utf8);
+	gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
+	g_free(decomposed);
+	if(!outbuf)
+		return numchars; /* buf has not been written to at this point */
+
+	/* Copy back at most len characters; the 4 is presumably sizeof(glui32) */
+	memcpy(buf, outbuf, MIN(outchars, len) * 4);
+	g_free(outbuf);
+
+	return outchars; /* unclamped, even when the copy above stopped at len */
+}
+
+/**
+ * glk_buffer_canon_normalize_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition and recomposition
+ * (<quote>Normalization Form C</quote>). Effectively, this takes apart
+ * multipart characters, and then puts them back together in a standard way. For
+ * example, this would convert the two-character string containing
+ * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
+ * ACCENT) into the one-character string <quote>&egrave;</quote> (character
+ * 0xE8, an accented <quote>e</quote>).
+ *
+ * The <code>canon_normalize</code> function includes decomposition as part of
+ * its implementation. You never have to call both functions on the same string.
+ *
+ * Both of these functions are idempotent.
+ *
+ * These functions provide two length arguments because a string of Unicode
+ * characters may expand when it is transformed. The @len argument is the
+ * available length of the buffer; @numchars is the number of characters in the
+ * buffer initially. (So @numchars must be less than or equal to @len. The
+ * contents of the buffer after @numchars do not affect the operation.)
+ *
+ * The functions return the number of characters after transformation. If this
+ * is greater than @len, the characters in the array will be safely truncated at
+ * @len, but the true count will be returned. (The contents of the buffer after
+ * the returned count are undefined.)
+ *
+ * <note><para>
+ *   The Unicode spec also defines stronger forms of these functions, called
+ *   <quote>compatibility decomposition and recomposition</quote>
+ *   (<quote>Normalization Form KD</quote> and <quote>Normalization Form
+ *   KC</quote>.) These do all of the accent-mangling described above, but they
+ *   also transform many other obscure Unicode characters into more familiar
+ *   forms. For example, they split ligatures apart into separate letters. They
+ *   also convert Unicode display variations such as script letters, circled
+ *   letters, and half-width letters into their common forms.
+ * </para></note>
+ *
+ * <note><para>
+ *   The Glk spec does not currently provide these stronger transformations.
+ *   Glk's expected use of Unicode normalization is for line input, and an OS
+ *   facility for line input will generally not produce these alternate
+ *   character forms (unless the user goes out of his way to type them).
+ *   Therefore, the need for these transformations does not seem to be worth the
+ *   extra data table space.
+ * </para></note>
+ *
+ * Returns: the number of characters in @buf after normalization.
+ */
+glui32
+glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+	g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+	g_return_val_if_fail(numchars <= len, 0);
+
+	long outchars; /* filled in through &outchars by convert_utf8_to_ucs4() */
+
+	/* Round-trip through UTF-8 for NFC; "decomposed" holds the NFC output */
+	char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+	if(!utf8)
+		return numchars; /* buf has not been written to at this point */
+	char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC);
+	g_free(utf8);
+	gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
+	g_free(decomposed);
+	if(!outbuf)
+		return numchars; /* buf has not been written to at this point */
+
+	/* Copy back at most len characters; the 4 is presumably sizeof(glui32) */
+	memcpy(buf, outbuf, MIN(outchars, len) * 4);
+	g_free(outbuf);
+
+	return outchars; /* unclamped, even when the copy above stopped at len */
+}