PEP8 cleanup in player.py

[projects/chimara/chimara.git] / libchimara / case.c
diff --git a/libchimara/case.c b/libchimara/case.c

index 47086e15ba9e92d19a386ef575f74ef26dd97bd9..3176a9a1adf2d60dd8aa7a204175db92772d1bce 100644 (file)
--- a/libchimara/case.c
+++ b/libchimara/case.c
@@ -1,5 +1,7 @@
-#include <gtk/gtk.h>
+#include <string.h>
+#include <glib.h>
  #include "glk.h"
  #include "glk.h"
+#include "charset.h"
  
  /**
   * glk_char_to_lower:
  
  /**
   * glk_char_to_lower:
@@ -79,7 +81,7 @@ glk_char_to_upper(unsigned char ch)
   * <note><para>
   *   Unicode has some strange case cases. For example, a combined character
   *   that looks like <quote>ss</quote> might properly be upper-cased into 
   * <note><para>
   *   Unicode has some strange case cases. For example, a combined character
   *   that looks like <quote>ss</quote> might properly be upper-cased into 
- *   <emphasis>two</emphasis> characters <quote>S</quote>. Title-casing is even
+ *   <emphasis>two</emphasis> <quote>S</quote> characters. Title-casing is even
   *   stranger; <quote>ss</quote> (at the beginning of a word) might be 
   *   title-cased into a different combined character that looks like 
   *   <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
   *   stranger; <quote>ss</quote> (at the beginning of a word) might be 
   *   title-cased into a different combined character that looks like 
   *   <quote>Ss</quote>. The glk_buffer_to_title_case_uni() function is actually
@@ -93,15 +95,25 @@ glk_buffer_to_lower_case_uni(glui32 *buf, glui32 len, glui32 numchars)
  {
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
  {
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
+
+       long outchars;
+
+       /* Lowercase the string */
+       char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+       if(!utf8)
+               return numchars;
+       char *lowered = g_utf8_strdown(utf8, -1);
+       g_free(utf8);
+       gunichar *outbuf = convert_utf8_to_ucs4(lowered, &outchars);
+       g_free(lowered);
+       if(!outbuf)
+               return numchars;
         
         
-       /* GLib has a function that converts _one_ UCS-4 character to _one_
-       lowercase UCS-4 character; so apparently we don't have to worry about the
-       string length changing... */
-       glui32 *ptr;
-       for(ptr = buf; ptr < buf + numchars; ptr++)
-               *ptr = g_unichar_tolower(*ptr);
+       /* Copy the output buffer to the original buffer */
+       memcpy(buf, outbuf, MIN(outchars, len) * 4);
+       g_free(outbuf);
         
         
-       return numchars;
+       return outchars;
  }
  
  /**
  }
  
  /**
@@ -121,14 +133,24 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
         
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
         
-       /* GLib has a function that converts _one_ UCS-4 character to _one_
-       uppercase UCS-4 character; so apparently we don't have to worry about the
-       string length changing... */
-       glui32 *ptr;
-       for(ptr = buf; ptr < buf + numchars; ptr++)
-               *ptr = g_unichar_toupper(*ptr);
+       long outchars;
         
         
-       return numchars;
+       /* Uppercase the string */
+       char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+       if(!utf8)
+               return numchars;
+       char *uppered = g_utf8_strup(utf8, -1);
+       g_free(utf8);
+       gunichar *outbuf = convert_utf8_to_ucs4(uppered, &outchars);
+       g_free(uppered);
+       if(!outbuf)
+               return numchars;
+
+       /* Copy the output buffer to the original buffer */
+       memcpy(buf, outbuf, MIN(outchars, len) * 4);
+       g_free(outbuf);
+
+       return outchars;
  }
  
  /**
  }
  
  /**
@@ -140,10 +162,10 @@ glk_buffer_to_upper_case_uni(glui32 *buf, glui32 len, glui32 numchars)
   * otherwise.
   *
   * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
   * otherwise.
   *
   * See glk_buffer_to_lower_case_uni(). The <code>title_case</code> function has
- * an additional (boolean) flag. Its basic function is to change the first
- * character of the buffer to upper-case, and leave the rest of the buffer
- * unchanged. If @lowerrest is true, it changes all the non-first characters to
- * lower-case (instead of leaving them alone.) 
+ * an additional (boolean) flag. If the flag is zero, the function changes the
+ * first character of the buffer to upper-case, and leaves the rest of the
+ * buffer unchanged. If the flag is nonzero, it changes the first character to
+ * upper-case and the rest to lower-case.
   *
   * <note><para>
   *   Earlier drafts of this spec had a separate function which title-cased the
   *
   * <note><para>
   *   Earlier drafts of this spec had a separate function which title-cased the
@@ -160,9 +182,14 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
         
         g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
         g_return_val_if_fail(numchars <= len, 0);
         
-       /* GLib has a function that converts _one_ UCS-4 character to _one_
-       titlecase UCS-4 character; so apparently we don't have to worry about the
-       string length changing... */
+       /* FIXME: This is wrong. g_unichar_totitle() returns the titlecase of
+        one Unicode code point, but the result is only correct if the
+        titlecase form is also a single code point.
+        For example, the one-character 'ffi' ligature should be title-cased to the
+        three-character string 'Ffi'. This code leaves it as the 'ffi' ligature,
+        which is incorrect.
+        However, nothing much can be done about it unless GLib gets a
+        g_utf8_strtitle() function, or we roll our own. */
         *buf = g_unichar_totitle(*buf);
         /* Call lowercase on the rest of the string */
         if(lowerrest)
         *buf = g_unichar_totitle(*buf);
         /* Call lowercase on the rest of the string */
         if(lowerrest)
@@ -170,3 +197,123 @@ glk_buffer_to_title_case_uni(glui32 *buf, glui32 len, glui32 numchars, glui32 lo
         return numchars;
  }
  
         return numchars;
  }
  
+/**
+ * glk_buffer_canon_decompose_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition
+ * (<quote>Normalization Form D</quote>). Effectively, this takes apart
+ * multipart characters into their individual parts. For example, it would
+ * convert <quote>&egrave;</quote> (character 0xE8, an accented
+ * <quote>e</quote>) into the two-character string containing <quote>e</quote>
+ * followed by Unicode character 0x0300 (COMBINING GRAVE ACCENT). If a single
+ * character has multiple accent marks, they are also rearranged into a standard
+ * order.
+ *
+ * Returns: The number of characters in @buf after decomposition.
+ */
+glui32
+glk_buffer_canon_decompose_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+       g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+       g_return_val_if_fail(numchars <= len, 0);
+
+       long outchars; /* filled in by convert_utf8_to_ucs4() below */
+
+       /* Round-trip through UTF-8 so that g_utf8_normalize() can apply NFD */
+       char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+       if(!utf8)
+               return numchars; /* conversion failed; buf has not been modified */
+       char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFD); /* -1: input is NUL-terminated */
+       g_free(utf8);
+       gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
+       g_free(decomposed);
+       if(!outbuf)
+               return numchars; /* conversion failed; buf has not been modified */
+
+       /* Copy back at most len characters (4 is presumably the size of a glui32 -- confirm) */
+       memcpy(buf, outbuf, MIN(outchars, len) * 4);
+       g_free(outbuf);
+
+       return outchars; /* full count, even when the copy above was truncated to len */
+}
+
+/**
+ * glk_buffer_canon_normalize_uni:
+ * @buf: A character array in UCS-4.
+ * @len: Available length of @buf.
+ * @numchars: Number of characters in @buf.
+ *
+ * This transforms a string into its canonical decomposition and recomposition
+ * (<quote>Normalization Form C</quote>). Effectively, this takes apart
+ * multipart characters, and then puts them back together in a standard way. For
+ * example, this would convert the two-character string containing
+ * <quote>e</quote> followed by Unicode character 0x0300 (COMBINING GRAVE
+ * ACCENT) into the one-character string <quote>&egrave;</quote> (character
+ * 0xE8, an accented <quote>e</quote>).
+ *
+ * The <code>canon_normalize</code> function includes decomposition as part of
+ * its implementation. You never have to call both functions on the same string.
+ *
+ * Both of these functions are idempotent.
+ *
+ * These functions provide two length arguments because a string of Unicode
+ * characters may expand when it is transformed. The @len argument is the
+ * available length of the buffer; @numchars is the number of characters in the
+ * buffer initially. (So @numchars must be less than or equal to @len. The
+ * contents of the buffer after @numchars do not affect the operation.)
+ *
+ * The functions return the number of characters after transformation. If this
+ * is greater than @len, the characters in the array will be safely truncated at
+ * @len, but the true count will be returned. (The contents of the buffer after
+ * the returned count are undefined.)
+ *
+ * <note><para>
+ *   The Unicode spec also defines stronger forms of these functions, called
+ *   <quote>compatibility decomposition and recomposition</quote>
+ *   (<quote>Normalization Form KD</quote> and <quote>Normalization Form
+ *   KC</quote>.) These do all of the accent-mangling described above, but they
+ *   also transform many other obscure Unicode characters into more familiar
+ *   forms. For example, they split ligatures apart into separate letters. They
+ *   also convert Unicode display variations such as script letters, circled
+ *   letters, and half-width letters into their common forms.
+ * </para></note>
+ *
+ * <note><para>
+ *   The Glk spec does not currently provide these stronger transformations.
+ *   Glk's expected use of Unicode normalization is for line input, and an OS
+ *   facility for line input will generally not produce these alternate
+ *   character forms (unless the user goes out of his way to type them).
+ *   Therefore, the need for these transformations does not seem to be worth the
+ *   extra data table space.
+ * </para></note>
+ *
+ * Returns: the number of characters in @buf after normalization.
+ */
+glui32
+glk_buffer_canon_normalize_uni(glui32 *buf, glui32 len, glui32 numchars)
+{
+       g_return_val_if_fail(buf != NULL && (len > 0 || numchars > 0), 0);
+       g_return_val_if_fail(numchars <= len, 0);
+
+       long outchars; /* filled in by convert_utf8_to_ucs4() below */
+
+       /* Round-trip through UTF-8 so that g_utf8_normalize() can apply NFC */
+       char *utf8 = convert_ucs4_to_utf8(buf, numchars);
+       if(!utf8)
+               return numchars; /* conversion failed; buf has not been modified */
+       char *decomposed = g_utf8_normalize(utf8, -1, G_NORMALIZE_NFC); /* despite the name, this holds the NFC result */
+       g_free(utf8);
+       gunichar *outbuf = convert_utf8_to_ucs4(decomposed, &outchars);
+       g_free(decomposed);
+       if(!outbuf)
+               return numchars; /* conversion failed; buf has not been modified */
+
+       /* Copy back at most len characters (4 is presumably the size of a glui32 -- confirm) */
+       memcpy(buf, outbuf, MIN(outchars, len) * 4);
+       g_free(outbuf);
+
+       return outchars; /* full count, even when the copy above was truncated to len */
+}