libchimara/charset.c

   1 #include "charset.h"
   2 #include "magic.h"
   3 #include <glib.h>
   4
   5 /* Internal function: change illegal (control) characters in a string to a
   6 placeholder character. Must free returned string afterwards. */
   7 static gchar *
   8 remove_latin1_control_characters(const unsigned char *s, const gsize len)
   9 {
  10         /* If len == 0, then return an empty string, not NULL */
  11         if(len == 0)
  12                 return g_strdup("");
  13
  14         gchar *retval = g_new0(gchar, len);
  15         int i;
  16         for(i = 0; i < len; i++)
  17                 if( (s[i] < 32 && s[i] != 10) || (s[i] >= 127 && s[i] <= 159) )
  18                         retval[i] = PLACEHOLDER;
  19                 else
  20                         retval[i] = s[i];
  21         return retval;
  22 }
  23
  24 /* Internal function: convert a Latin-1 string to a UTF-8 string, replacing
  25 Latin-1 control characters by a placeholder first. The UTF-8 string must be
  26 freed afterwards. Returns NULL on error. */
  27 gchar *
  28 convert_latin1_to_utf8(const gchar *s, const gsize len)
  29 {
  30         GError *error = NULL;
  31         gchar *canonical = remove_latin1_control_characters( (unsigned char *)s, len);
  32         gchar *retval = g_convert(canonical, len, "UTF-8", "ISO-8859-1", NULL, NULL, &error);
  33         g_free(canonical);
  34
  35         if(retval == NULL)
  36                 IO_WARNING("Error during latin1->utf8 conversion of string", s, error->message);
  37
  38         return retval;
  39 }
  40
  41 /* Internal function: convert a Latin-1 string to a four-byte-per-character
  42 big-endian string of gchars. The string must be freed afterwards. */
  43 gchar *
  44 convert_latin1_to_ucs4be_string(const gchar *s, const gsize len)
  45 {
  46         /* "UCS-4BE" is also a conversion type in g_convert()... but this may be more efficient */
  47         gchar *retval = g_new0(gchar, len * 4);
  48         int i;
  49         for(i = 0; i < len; i++)
  50                 retval[i * 4 + 3] = s[i];
  51         return retval;
  52 }
  53
  54 /* Internal function: convert a null-terminated UTF-8 string to a
  55 null-terminated Latin-1 string, replacing characters that cannot be represented
  56 in Latin-1 by a placeholder. If bytes_written is not NULL it will be filled with
  57 the number of bytes returned, not counting the NULL terminator. The returned
  58 string must be freed afterwards. Returns NULL on error. */
  59 gchar *
  60 convert_utf8_to_latin1(const gchar *s, gsize *bytes_written)
  61 {
  62         GError *error = NULL;
  63         gchar *retval = g_convert_with_fallback(s, -1, "ISO-8859-1", "UTF-8", PLACEHOLDER_STRING, NULL, bytes_written, &error);
  64
  65         if(retval == NULL)
  66                 IO_WARNING("Error during utf8->latin1 conversion of string", s, error->message);
  67
  68         return retval;
  69 }
  70
  71 /* Internal function: convert a null-terminated UTF-8 string to a
  72 null-terminated UCS4 string. If items_written is not NULL it will be filled with
  73 the number of code points returned, not counting the terminator. The returned
  74 string must be freed afterwards. Returns NULL on error. */
  75 gunichar *
  76 convert_utf8_to_ucs4(const gchar *s, glong *items_written)
  77 {
  78         gunichar *retval = g_utf8_to_ucs4_fast(s, -1, items_written);
  79
  80         if(retval == NULL)
  81                 WARNING_S("Error during utf8->unicode conversion of string", s);
  82
  83         return retval;
  84 }
  85
  86 /* Internal function: Convert a Unicode buffer to a null-terminated UTF-8
  87 string. The returned string must be freed afterwards. Returns NULL on error. */
  88 gchar *
  89 convert_ucs4_to_utf8(const gunichar *buf, const glong len)
  90 {
  91         GError *error = NULL;
  92         gchar *retval = g_ucs4_to_utf8(buf, len, NULL, NULL, &error);
  93
  94         if(retval == NULL)
  95                 WARNING_S("Error during unicode->utf8 conversion", error->message);
  96
  97         return retval;
  98 }
  99
 100 /* Internal function: Convert a Unicode buffer to a Latin-1 string. Do not do
 101 any character processing, just return values > 255 as the placeholder character.
 102 The returned string must be freed afterwards.*/
 103 gchar *
 104 convert_ucs4_to_latin1_binary(const gunichar *buf, const glong len)
 105 {
 106         gchar *retval = g_new0(gchar, len);
 107         int foo;
 108         for(foo = 0; foo < len; foo++)
 109                 retval[foo] = (buf[foo] > 255)? PLACEHOLDER : buf[foo];
 110         return retval;
 111 }
 112
 113 /* Internal function: convert a Unicode buffer to a four-byte-per-character
 114 big-endian string of gchars. The string must be freed afterwards. */
 115 gchar *
 116 convert_ucs4_to_ucs4be_string(const gunichar *buf, const glong len)
 117 {
 118         gchar *retval = g_new0(gchar, len * 4);
 119         int i;
 120         for(i = 0; i < len; i++)
 121         {
 122                 retval[i * 4]     = buf[i] >> 24       ;
 123                 retval[i * 4 + 1] = buf[i] >> 16 & 0xFF;
 124                 retval[i * 4 + 2] = buf[i] >> 8  & 0xFF;
 125                 retval[i * 4 + 3] = buf[i]       & 0xFF;
 126         }
 127         return retval;
 128 }