interpreters/nitfol/tokenise.c

   1 /*  Nitfol - z-machine interpreter using Glk for output.
   2     Copyright (C) 1999  Evin Robertson
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
  17
  18     The author can be reached at nitfol@deja.com
  19 */
  20 #include "nitfol.h"
  21
  22 static int tokenise(zword dictionarytable, const char *text, int length,
  23                     zword *parse_dest, int maxwords,
  24                     zword sepratortable, int numseparators,
  25                     BOOL write_unrecognized);
  26
  27 static void addparsed(zword *parse_dest, int word_num, int length, int off)
  28 {
  29   if(zversion <= 4)
  30     off+=1;
  31   else
  32     off+=2;
  33
  34   LOWORDwrite(*parse_dest, word_num);
  35   *parse_dest+=ZWORD_SIZE;
  36   LOBYTEwrite(*parse_dest, length);
  37   *parse_dest+=1;
  38   LOBYTEwrite(*parse_dest, off);
  39   *parse_dest+=1;
  40 }
  41
  42 static int dictentry_len;
  43
  44 static int cmpdictentry(const void *a, const void *b)
  45 {
  46   return n_memcmp(a, b, dictentry_len);
  47 }
  48
  49
  50 static zword find_word(zword dictionarytable, const char *word, int length)
  51 {
  52   zbyte zsciibuffer[12];
  53   int entry_length, word_length;
  54   int num_entries;
  55   void *p;
  56
  57   entry_length = LOBYTE(dictionarytable);
  58   dictionarytable++;
  59   num_entries = LOWORD(dictionarytable);
  60   dictionarytable+=ZWORD_SIZE;
  61
  62   if(zversion <= 3)
  63     word_length = 4;
  64   else
  65     word_length = 6;
  66
  67   encodezscii(zsciibuffer, word_length, word_length, word, length);
  68
  69   dictentry_len = word_length;
  70
  71   if(is_neg(num_entries)) {  /* Unordered dictionary */
  72     num_entries = neg(num_entries);
  73     p = n_lfind(zsciibuffer, z_memory + dictionarytable, &num_entries,
  74                 entry_length, cmpdictentry);
  75   } else {                   /* Ordered dictionary */
  76     p = n_bsearch(zsciibuffer, z_memory + dictionarytable, num_entries,
  77                   entry_length, cmpdictentry);
  78   }
  79   if(p)
  80     return ((zbyte *) p) - z_memory;
  81
  82   return 0;
  83 }
  84
  85
  86 #ifdef SMART_TOKENISER
  87
  88 struct Typocorrection {
  89   struct Typocorrection *next;
  90   struct Typocorrection *prev;
  91   char original[13];
  92   char changedto[13];
  93 };
  94
  95 struct Typocorrection *recent_corrections;  /* Inform requests two parses of
  96                                                each input; remember what
  97                                                corrections we've made so
  98                                                we don't print twice */
  99 void forget_corrections(void)
 100 {
 101   LEdestroy(recent_corrections);
 102 }
 103
 104 static zword smart_tokeniser(zword dictionarytable,
 105                              const char *text, unsigned length, BOOL is_begin)
 106 {
 107   zword word_num = 0;
 108   unsigned tlength = (length < 12) ? length : 12;
 109   char tbuffer[13];
 110
 111   /* Letter replacements are tried in this order - */
 112   const char fixmeletters[] = "abcdefghijklmnopqrstuvwxyz";
 113   /* char fixmeletters[] = "etaonrishdlfcmugpywbvkxjqz"; */
 114
 115
 116   word_num = find_word(dictionarytable, text, length);
 117
 118   /* Some game files don't contain abbreviations for common commands */
 119   if(!word_num && do_expand && length == 1 && is_begin) {
 120     const char * const abbrevs[26] = {
 121       "a",              "b",           "close",          "down",
 122       "east",           "f",           "again",          "h",
 123       "inventory",      "j",           "attack",         "look",
 124       "m",              "north",       "oops",           "open",
 125       "quit",           "drop",        "south",          "take",
 126       "up",             "v",           "west",           "examine",
 127       "yes",            "wait"
 128     };
 129     if('a' <= text[0] && text[0] <= 'z') {
 130       strcpy(tbuffer, abbrevs[text[0] - 'a']);
 131       tlength = strlen(tbuffer);
 132       word_num = find_word(dictionarytable, tbuffer, tlength);
 133     }
 134   }
 135
 136   /* Check for various typing errors */
 137
 138   /* Don't attempt typo correction in very short words */
 139   if(do_spell_correct && length >= 3) {
 140
 141     if(!word_num) {  /* Check for transposes */
 142       /* To fix, try all possible transposes */
 143       unsigned position;
 144       for(position = 1; position < tlength; position++) {
 145         unsigned s;
 146         for(s = 0; s < tlength; s++)
 147           tbuffer[s] = text[s];
 148
 149         tbuffer[position - 1] = text[position];
 150         tbuffer[position]     = text[position - 1];
 151
 152         word_num = find_word(dictionarytable, tbuffer, tlength);
 153         if(word_num)
 154           break;
 155       }
 156     }
 157
 158     if(!word_num) {  /* Check for deletions */
 159       /* To fix, try all possible insertions */
 160       unsigned position;
 161       for(position = 0; position <= tlength; position++) {
 162         unsigned s;
 163         for(s = 0; s < position; s++)    /* letters before the insertion */
 164           tbuffer[s] = text[s];
 165
 166         for(s = position; s < tlength; s++)       /* after the insertion */
 167           tbuffer[s + 1] = text[s];
 168
 169         /* try each letter */
 170         for(s = 0; s < sizeof(fixmeletters); s++) {
 171           tbuffer[position] = fixmeletters[s];
 172           word_num = find_word(dictionarytable, tbuffer, tlength + 1);
 173           if(word_num)
 174             break;
 175         }
 176
 177         if(word_num) {
 178           tlength++;
 179           break;
 180         }
 181       }
 182     }
 183
 184     if(!word_num) {  /* Check for insertions */
 185       /* To fix, try all possible deletions */
 186       unsigned position;
 187       for(position = 0; position < tlength; position++) {
 188         unsigned s;
 189         for(s = 0; s < position; s++)    /* letters before the deletion */
 190           tbuffer[s] = text[s];
 191
 192         for(s = position + 1; s < tlength; s++)   /* after the deletion */
 193           tbuffer[s - 1] = text[s];
 194
 195         word_num = find_word(dictionarytable, tbuffer, tlength - 1);
 196
 197         if(word_num) {
 198           tlength--;
 199           break;
 200         }
 201       }
 202     }
 203
 204     if(!word_num) {  /* Check for substitutions */
 205       /* To fix, try all possible substitutions */
 206       unsigned position;
 207       for(position = 0; position < tlength; position++) {
 208       unsigned s;
 209       for(s = 0; s < tlength; s++)
 210         tbuffer[s] = text[s];
 211
 212       /* try each letter */
 213       for(s = 0; s < sizeof(fixmeletters); s++) {
 214         tbuffer[position] = fixmeletters[s];
 215         word_num = find_word(dictionarytable, tbuffer, tlength);
 216         if(word_num)
 217           break;
 218       }
 219
 220       if(word_num)
 221         break;
 222       }
 223     }
 224   }
 225
 226   /* Report any corrections made */
 227   if(word_num) {
 228     struct Typocorrection *p;
 229     char original[13], changedto[13];
 230     n_strncpy(original, text, 13);
 231     n_strncpy(changedto, tbuffer, 13);
 232     if(length < 13)
 233       original[length] = 0;
 234     if(tlength < 13)
 235       changedto[tlength] = 0;
 236
 237     LEsearch(recent_corrections, p, ((n_strncmp(p->original, original, 13) == 0) &&
 238                                      (n_strncmp(p->changedto, changedto, 13) == 0)));
 239
 240     /* Only print a correction if it hasn't yet been reported this turn */
 241     if(!p) {
 242       struct Typocorrection newcorrection;
 243       n_strncpy(newcorrection.original, original, 13);
 244       n_strncpy(newcorrection.changedto, changedto, 13);
 245       LEadd(recent_corrections, newcorrection);
 246
 247       set_glk_stream_current();
 248
 249       if(allow_output) {
 250         glk_put_char('[');
 251         w_glk_put_buffer(text, length);
 252         w_glk_put_string(" -> ");
 253         w_glk_put_buffer(tbuffer, tlength);
 254         glk_put_char(']');
 255         glk_put_char(10);
 256       }
 257     }
 258   }
 259
 260   return word_num;
 261 }
 262
 263 #endif
 264
 265 static void handle_word(zword dictionarytable, const char *text,
 266                         zword word_start, int length,
 267                         zword *parse_dest,
 268                         BOOL write_unrecognized, int *parsed_words)
 269 {
 270
 271   zword word_num;
 272
 273   word_num = find_word(dictionarytable, text + word_start, length);
 274
 275 #ifdef SMART_TOKENISER
 276   if(!word_num)
 277     word_num = smart_tokeniser(dictionarytable, text + word_start, length,
 278                                *parsed_words == 0);
 279 #endif
 280
 281   if(!word_num && !write_unrecognized)
 282     *parse_dest += ZWORD_SIZE + 2;
 283   else
 284     addparsed(parse_dest, word_num, length, word_start);
 285
 286   (*parsed_words)++;
 287 }
 288
 289
 290 static int tokenise(zword dictionarytable, const char *text, int length,
 291                     zword *parse_dest, int maxwords,
 292                     zword separatortable, int numseparators,
 293                     BOOL write_unrecognized)
 294 {
 295   int i;
 296   int parsed_words = 0;
 297   int word_start = 0;
 298   for(i = 0; i <= length && parsed_words < maxwords; i++) {
 299     BOOL do_tokenise = FALSE;
 300     BOOL do_add_separator = FALSE;
 301     if((i == length) || text[i] == ' ') {  /* A space or at the end */
 302       do_tokenise = TRUE;
 303     } else {
 304       int j;
 305       for(j = 0; j < numseparators; j++) {
 306         if(text[i] == (char) LOBYTE(separatortable + j)) {
 307           do_tokenise = TRUE;
 308           do_add_separator = TRUE;
 309           break;
 310         }
 311       }
 312     }
 313
 314     if(do_tokenise) {
 315       int wordlength = i - word_start;
 316       if(wordlength > 0) {
 317         handle_word(dictionarytable, text, word_start, wordlength,
 318                     parse_dest, write_unrecognized, &parsed_words);
 319       }
 320       word_start = i + 1;
 321     }
 322     if(do_add_separator && parsed_words < maxwords) {
 323       handle_word(dictionarytable, text, i, 1,
 324                   parse_dest, write_unrecognized, &parsed_words);
 325
 326     }
 327   }
 328   return parsed_words;
 329 }
 330
 331
 332 void z_tokenise(const char *text, int length, zword parse_dest,
 333                 zword dictionarytable, BOOL write_unrecognized)
 334 {
 335   zword separatortable;
 336   zword numparsedloc;
 337   int numseparators;
 338   int maxwords, parsed_words;
 339
 340   if(parse_dest > dynamic_size || parse_dest < 64) {
 341     n_show_error(E_OUTPUT, "parse table in invalid location", parse_dest);
 342     return;
 343   }
 344
 345   numseparators = LOBYTE(dictionarytable);
 346   separatortable = dictionarytable + 1;
 347   dictionarytable += numseparators + 1;
 348
 349   maxwords = LOBYTE(parse_dest);
 350   numparsedloc = parse_dest + 1;
 351   parse_dest+=2;
 352
 353   if(maxwords == 0)
 354     n_show_warn(E_OUTPUT, "small parse size", maxwords);
 355
 356   parsed_words = tokenise(dictionarytable, text, length,
 357                           &parse_dest, maxwords, separatortable, numseparators,
 358                           write_unrecognized);
 359
 360   LOBYTEwrite(numparsedloc, parsed_words);
 361 }
 362
 363
 364 void op_tokenise(void)
 365 {
 366   if(numoperands < 3 || operand[2] == 0)
 367     operand[2] = z_dictionary;
 368   if(numoperands < 4)
 369     operand[3] = 0;
 370   z_tokenise((char *) z_memory + operand[0] + 2, LOBYTE(operand[0] + 1),
 371              operand[1], operand[2], operand[3]==0);
 372 }
 373