interpreters/bocfel/dict.c

   1 /*-
   2  * Copyright 2009-2012 Chris Spiegel.
   3  *
   4  * This file is part of Bocfel.
   5  *
   6  * Bocfel is free software: you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License, version
   8  * 2 or 3, as published by the Free Software Foundation.
   9  *
  10  * Bocfel is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with Bocfel.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18
  19 #include <stdlib.h>
  20 #include <stdio.h>
  21 #include <stdint.h>
  22 #include <string.h>
  23
  24 #include "dict.h"
  25 #include "memory.h"
  26 #include "process.h"
  27 #include "unicode.h"
  28 #include "util.h"
  29 #include "zterp.h"
  30
  31 static uint16_t separators;
  32 static uint8_t num_separators;
  33
  34 static uint16_t GET_WORD(uint8_t *base)
  35 {
  36   return (base[0] << 8) | base[1];
  37 }
  38 static void MAKE_WORD(uint8_t *base, uint16_t val)
  39 {
  40   base[0] = val >> 8;
  41   base[1] = val & 0xff;
  42 }
  43
  44 /* Add the character c to the nth position of the encoded text.  c is a
  45  * 5-bit value (either a shift character, which selects an alphabet, or
  46  * the index into the current alphabet).
  47  */
  48 static void add_zchar(int c, int n, uint8_t *encoded)
  49 {
  50   uint16_t w = GET_WORD(&encoded[2 * (n / 3)]);
  51
  52   /* From §3.2:
  53    * --first byte-------   --second byte---
  54    * 7    6 5 4 3 2  1 0   7 6 5  4 3 2 1 0
  55    * bit  --first--  --second---  --third--
  56    *
  57    * So to figure out which third of the word to store to:
  58    * If n is 0, 3, 6, ... then store to the first (left shift 10).
  59    * If n is 1, 4, 7, ... then store to the second (left shift 5).
  60    * If n is 2, 5, 8, ... then store to the third (left shift 0).
  61    * “Or” into the previous value because, unless this is the first
  62    * character, there are already values we’ve stored there.
  63    */
  64   w |= (c & 0x1f) << (5 * (2 - (n % 3)));
  65
  66   MAKE_WORD(&encoded[2 * (n / 3)], w);
  67 }
  68
  69 /* Encode the text at “s”, of length “len” (there is not necessarily a
  70  * terminating null character) into the buffer “encoded”.
  71  *
  72  * For V3 the encoded text is 6 Z-characters (4 bytes); for V4 and above
  73  * it’s 9 characters (6 bytes).  Due to the nature of the loop here,
  74  * it’s possible to encode too many bytes.  For example, if the string
  75  * given is "aaa<" in a V3 game, the three 'a' characters will take up a
  76  * word (all three being packed into one), but the single '<' character
  77  * will take up two words (one full word and a third of the next) due to
  78  * the fact that '<' is not in the alphabet table.  Thus the encoded
  79  * text will be 7 characters. This is OK because routines that use the
  80  * encoded string are smart enough to only pay attention to the first 6
  81  * or 9 Z-characters; and partial Z-characters are OK per §3.6.1.
  82  *
  83  * 1.1 of the standard revises the encoding for V1 and V2 games.  I am
  84  * not implementing the new rules for two basic reasons:
  85  * 1) It apparently only affects three (unnecessary) dictionary words in
  86  *    the known V1-2 games.
  87  * 2) Because of 1, it is not worth the effort to peek ahead and see
  88  *    what the next character is to determine whether to shift once or
  89  *    to lock.
  90  *
  91  * Z-character 0 is a space (§3.5.1), so theoretically a space should be
  92  * encoded simply with a zero.  However, Inform 6.32 encodes space
  93  * (which has the value 32) as a 10-bit ZSCII code, which is the
  94  * Z-characters 5, 6, 1, 0.  Assume this is correct.
  95  */
  96 static void encode_string(const uint8_t *s, size_t len, uint8_t encoded[8])
  97 {
  98   int n = 0;
  99   const int res = zversion <= 3 ? 6 : 9;
 100   const int shiftbase = zversion <= 2 ? 1 : 3;
 101
 102   memset(encoded, 0, 8);
 103
 104   for(size_t i = 0; i < len && n < res; i++)
 105   {
 106     int pos;
 107
 108     pos = atable_pos[s[i]];
 109     if(pos >= 0)
 110     {
 111       int shift = pos / 26;
 112       int c = pos % 26;
 113
 114       if(shift) add_zchar(shiftbase + shift, n++, encoded);
 115       add_zchar(c + 6, n++, encoded);
 116     }
 117     else
 118     {
 119       add_zchar(shiftbase + 2, n++, encoded);
 120       add_zchar(6, n++, encoded);
 121       add_zchar(s[i] >> 5, n++, encoded);
 122       add_zchar(s[i] & 0x1f, n++, encoded);
 123     }
 124   }
 125
 126   while(n < res)
 127   {
 128     add_zchar(5, n++, encoded);
 129   }
 130
 131   /* §3.2: the MSB of the last encoded word must be set. */
 132   if(zversion <= 3) encoded[2] |= 0x80;
 133   else              encoded[4] |= 0x80;
 134 }
 135
 136 static int dict_compar(const void *a, const void *b)
 137 {
 138   return memcmp(a, b, zversion <= 3 ? 4 : 6);
 139 }
 140 static uint16_t dict_find(const uint8_t *token, size_t len, uint16_t dictionary)
 141 {
 142   uint8_t elength;
 143   uint16_t base;
 144   long nentries;
 145   uint8_t *ret = NULL;
 146   uint8_t encoded[8];
 147
 148   encode_string(token, len, encoded);
 149
 150   elength = user_byte(dictionary + num_separators + 1);
 151   nentries = (int16_t)user_word(dictionary + num_separators + 2);
 152   base = dictionary + num_separators + 2 + 2;
 153
 154   ZASSERT(elength >= (zversion <= 3 ? 4 : 6), "dictionary entry length (%d) too small", elength);
 155   ZASSERT(base + (labs(nentries) * elength) < memory_size, "reported dictionary length extends beyond memory size");
 156
 157   if(nentries > 0)
 158   {
 159     ret = bsearch(encoded, &memory[base], nentries, elength, dict_compar);
 160   }
 161   else
 162   {
 163     for(long i = 0; i < -nentries; i++)
 164     {
 165       uint8_t *entry = &memory[base + (i * elength)];
 166
 167       if(dict_compar(encoded, entry) == 0)
 168       {
 169         ret = entry;
 170         break;
 171       }
 172     }
 173   }
 174
 175   if(ret == NULL) return 0;
 176
 177   return base + (ret - &memory[base]);
 178 }
 179
 180 static int is_sep(uint8_t c)
 181 {
 182   if(c == ZSCII_SPACE) return 1;
 183
 184   for(uint16_t i = 0; i < num_separators; i++) if(user_byte(separators + i) == c) return 1;
 185
 186   return 0;
 187 }
 188
 189 static void handle_token(const uint8_t *base, const uint8_t *token, int len, uint16_t parse, uint16_t dictionary, int found, int flag)
 190 {
 191   uint16_t d;
 192   const uint8_t examine[] = { 'e', 'x', 'a', 'm', 'i', 'n', 'e' };
 193   const uint8_t again[] = { 'a', 'g', 'a', 'i', 'n' };
 194   const uint8_t wait[] = { 'w', 'a', 'i', 't' };
 195
 196   d = dict_find(token, len, dictionary);
 197
 198   if(!options.disable_abbreviations && base == token && len == 1)
 199   {
 200     if     (*token == 'x') d = dict_find(examine, sizeof examine, dictionary);
 201     else if(*token == 'g') d = dict_find(again, sizeof again, dictionary);
 202     else if(*token == 'z') d = dict_find(wait, sizeof wait, dictionary);
 203   }
 204
 205   if(flag && d == 0) return;
 206
 207   parse = parse + 2 + (found * 4);
 208
 209   user_store_word(parse, d);
 210
 211   user_store_byte(parse + 2, len);
 212
 213   if(zversion <= 4) user_store_byte(parse + 3, token - base + 1);
 214   else              user_store_byte(parse + 3, token - base + 2);
 215 }
 216
 217 /* The behavior of tokenize is described in §15 (under the read opcode)
 218  * and §13.
 219  *
 220  * For the text buffer, byte 0 is ignored in both V3/4 and V5+.
 221  * Byte 1 of V3/4 is the start of the string, while in V5+ it is the
 222  * length of the string.
 223  * Byte 2 of V5+ is the start of the string.  V3/4 strings have a null
 224  * terminator, while V5+ do not.
 225  *
 226  * For the parse buffer, byte 0 contains the maximum number of tokens
 227  * that can be read.
 228  * The number of tokens found is stored in byte 1.
 229  * Each token is then represented by a 4-byte chunk with the following
 230  * information:
 231  * • The first two bytes are the byte address of the dictionary entry
 232  *   for the token, or 0 if the token was not found in the dictionary.
 233  * • The next byte is the length of the token.
 234  * • The final byte is the offset in the string of the token.
 235  */
 236 void tokenize(uint16_t text, uint16_t parse, uint16_t dictionary, int flag)
 237 {
 238   const uint8_t *p, *lastp;
 239   uint8_t *string;
 240   uint32_t text_len = 0;
 241   const int maxwords = user_byte(parse);
 242   int in_word = 0;
 243   int found = 0;
 244
 245   if(dictionary == 0) dictionary = header.dictionary;
 246
 247   ZASSERT(dictionary != 0, "attempt to tokenize without a valid dictionary");
 248
 249   num_separators = user_byte(dictionary);
 250   separators = dictionary + 1;
 251
 252   if(zversion >= 5) text_len = user_byte(text + 1);
 253   else              while(user_byte(text + 1 + text_len) != 0) text_len++;
 254
 255   ZASSERT(text + 1 + (zversion >= 5) + text_len < memory_size, "attempt to tokenize out-of-bounds string");
 256
 257   string = &memory[text + 1 + (zversion >= 5)];
 258
 259   for(p = string; p - string < text_len && *p == ZSCII_SPACE; p++);
 260   lastp = p;
 261
 262   text_len -= (p - string);
 263
 264   do
 265   {
 266     if(!in_word && text_len != 0 && !is_sep(*p))
 267     {
 268       in_word = 1;
 269       lastp = p;
 270     }
 271
 272     if(text_len == 0 || is_sep(*p))
 273     {
 274       if(in_word)
 275       {
 276         handle_token(string, lastp, p - lastp, parse, dictionary, found++, flag);
 277       }
 278
 279       /* §13.6.1: Separators (apart from a space) are tokens too. */
 280       if(text_len != 0 && *p != ZSCII_SPACE)
 281       {
 282         handle_token(string, p, 1, parse, dictionary, found++, flag);
 283       }
 284
 285       if(found == maxwords) break;
 286
 287       in_word = 0;
 288     }
 289
 290     p++;
 291
 292   } while(text_len--);
 293
 294   user_store_byte(parse + 1, found);
 295 }
 296
 297 static void encode_text(uint32_t text, uint16_t len, uint16_t coded)
 298 {
 299   uint8_t encoded[8];
 300
 301   ZASSERT(text + len < memory_size, "reported text length extends beyond memory size");
 302
 303   encode_string(&memory[text], len, encoded);
 304
 305   for(int i = 0; i < 6; i++) user_store_byte(coded + i, encoded[i]);
 306 }
 307
 308 void ztokenise(void)
 309 {
 310   if(znargs < 3) zargs[2] = 0;
 311   if(znargs < 4) zargs[3] = 0;
 312
 313   tokenize(zargs[0], zargs[1], zargs[2], zargs[3]);
 314 }
 315
 316 void zencode_text(void)
 317 {
 318   encode_text(zargs[0] + zargs[2], zargs[1], zargs[3]);
 319 }