interpreters/frotz/text.c

   1 /* text.c - Text manipulation functions
   2  *      Copyright (c) 1995-1997 Stefan Jokisch
   3  *
   4  * This file is part of Frotz.
   5  *
   6  * Frotz is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * Frotz is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  19  */
  20
  21 #include "frotz.h"
  22
  23 enum string_type {
  24     LOW_STRING, ABBREVIATION, HIGH_STRING, EMBEDDED_STRING, VOCABULARY
  25 };
  26
  27 extern zword object_name (zword);
  28 extern zword get_window_font (zword);
  29
  30 zchar* decoded;
  31 zchar* encoded;
  32 static int resolution;
  33
  34 /*
  35  * According to Matteo De Luigi <matteo.de.luigi@libero.it>,
  36  * 0xab and 0xbb were in each other's proper positions.
  37  *   Sat Apr 21, 2001
  38  */
  39 static zchar zscii_to_latin1[] = {
  40     0x0e4, 0x0f6, 0x0fc, 0x0c4, 0x0d6, 0x0dc, 0x0df, 0x0bb,
  41     0x0ab, 0x0eb, 0x0ef, 0x0ff, 0x0cb, 0x0cf, 0x0e1, 0x0e9,
  42     0x0ed, 0x0f3, 0x0fa, 0x0fd, 0x0c1, 0x0c9, 0x0cd, 0x0d3,
  43     0x0da, 0x0dd, 0x0e0, 0x0e8, 0x0ec, 0x0f2, 0x0f9, 0x0c0,
  44     0x0c8, 0x0cc, 0x0d2, 0x0d9, 0x0e2, 0x0ea, 0x0ee, 0x0f4,
  45     0x0fb, 0x0c2, 0x0ca, 0x0ce, 0x0d4, 0x0db, 0x0e5, 0x0c5,
  46     0x0f8, 0x0d8, 0x0e3, 0x0f1, 0x0f5, 0x0c3, 0x0d1, 0x0d5,
  47     0x0e6, 0x0c6, 0x0e7, 0x0c7, 0x0fe, 0x0f0, 0x0de, 0x0d0,
  48     0x0a3, 0x153, 0x152, 0x0a1, 0x0bf
  49 };
  50
  51 /*
  52  * init_text
  53  *
  54  * Initialize text variables.
  55  *
  56  */
  57
  58 void init_text (void)
  59 {
  60     decoded = NULL;
  61     encoded = NULL;
  62
  63     resolution = 0;
  64 }
  65
  66 /*
  67  * translate_from_zscii
  68  *
  69  * Map a ZSCII character into Unicode.
  70  *
  71  */
  72
  73 zchar translate_from_zscii (zbyte c)
  74 {
  75
  76     if (c == 0xfc)
  77         return ZC_MENU_CLICK;
  78     if (c == 0xfd)
  79         return ZC_DOUBLE_CLICK;
  80     if (c == 0xfe)
  81         return ZC_SINGLE_CLICK;
  82
  83     if (c >= 0x9b && story_id != BEYOND_ZORK) {
  84
  85         if (hx_unicode_table != 0) {    /* game has its own Unicode table */
  86
  87             zbyte N;
  88
  89             LOW_BYTE (hx_unicode_table, N)
  90
  91             if (c - 0x9b < N) {
  92
  93                 zword addr = hx_unicode_table + 1 + 2 * (c - 0x9b);
  94                 zword unicode;
  95
  96                 LOW_WORD (addr, unicode)
  97
  98                 if (unicode < 0x20)
  99                         return '?';
 100
 101                 return unicode;
 102
 103             } else return '?';
 104
 105         } else                          /* game uses standard set */
 106
 107             if (c <= 0xdf) {
 108
 109                 return zscii_to_latin1[c - 0x9b];
 110
 111             } else return '?';
 112     }
 113
 114     return (zchar) c;
 115
 116 }/* translate_from_zscii */
 117
 118 /*
 119  * unicode_to_zscii
 120  *
 121  * Convert a Unicode character to ZSCII, returning 0 on failure.
 122  *
 123  */
 124
 125 zbyte unicode_to_zscii (zchar c)
 126 {
 127     int i;
 128
 129     if (c >= ZC_LATIN1_MIN) {
 130
 131         if (hx_unicode_table != 0) {    /* game has its own Unicode table */
 132
 133             zbyte N;
 134             int i;
 135
 136             LOW_BYTE (hx_unicode_table, N)
 137
 138             for (i = 0x9b; i < 0x9b + N; i++) {
 139
 140                 zword addr = hx_unicode_table + 1 + 2 * (i - 0x9b);
 141                 zword unicode;
 142
 143                 LOW_WORD (addr, unicode)
 144
 145                 if (c == unicode)
 146                     return (zbyte) i;
 147
 148             }
 149
 150             return 0;
 151
 152         } else {                        /* game uses standard set */
 153
 154             for (i = 0x9b; i <= 0xdf; i++)
 155                 if (c == zscii_to_latin1[i - 0x9b])
 156                     return (zbyte) i;
 157
 158             return 0;
 159
 160         }
 161     }
 162
 163     return (zbyte) c;
 164
 165 }/* unicode_to_zscii */
 166
 167 /*
 168  * translate_to_zscii
 169  *
 170  * Map a Unicode character onto the ZSCII alphabet.
 171  *
 172  */
 173
 174 zbyte translate_to_zscii (zchar c)
 175 {
 176
 177     if (c == ZC_SINGLE_CLICK)
 178         return 0xfe;
 179     if (c == ZC_DOUBLE_CLICK)
 180         return 0xfd;
 181     if (c == ZC_MENU_CLICK)
 182         return 0xfc;
 183     if (c == 0)
 184         return 0;
 185
 186     c = unicode_to_zscii (c);
 187     if (c == 0)
 188         c = '?';
 189
 190     return (zbyte) c;
 191
 192 }/* translate_to_zscii */
 193
 194 /*
 195  * alphabet
 196  *
 197  * Return a character from one of the three character sets.
 198  *
 199  */
 200
 201 static zchar alphabet (int set, int index)
 202 {
 203     if (h_version > V1 && set == 2 && index == 1)
 204         return 0x0D;            /* always newline */
 205
 206     if (h_alphabet != 0) {      /* game uses its own alphabet */
 207
 208         zbyte c;
 209
 210         zword addr = h_alphabet + 26 * set + index;
 211         LOW_BYTE (addr, c)
 212
 213         return translate_from_zscii (c);
 214
 215     } else                      /* game uses default alphabet */
 216
 217         if (set == 0)
 218             return 'a' + index;
 219         else if (set == 1)
 220             return 'A' + index;
 221         else if (h_version == V1)
 222             return " 0123456789.,!?_#'\"/\\<-:()"[index];
 223         else
 224             return " ^0123456789.,!?_#'\"/\\-:()"[index];
 225
 226 }/* alphabet */
 227
 228 /*
 229  * find_resolution
 230  *
 231  * Find the number of bytes used for dictionary resolution.
 232  *
 233  */
 234
 235 static void find_resolution (void)
 236 {
 237     zword dct = h_dictionary;
 238     zword entry_count;
 239     zbyte sep_count;
 240     zbyte entry_len;
 241
 242     LOW_BYTE (dct, sep_count)
 243     dct += 1 + sep_count;  /* skip word separators */
 244     LOW_BYTE (dct, entry_len)
 245     dct += 1;              /* skip entry length */
 246     LOW_WORD (dct, entry_count)
 247     dct += 2;              /* get number of entries */
 248
 249     if (h_version < V9) {
 250
 251         resolution = (h_version <= V3) ? 2 : 3;
 252
 253     } else {
 254
 255         zword addr = dct;
 256         zword code;
 257
 258         if (entry_count == 0) {
 259
 260             runtime_error (ERR_DICT_LEN);
 261
 262         }
 263
 264         /* check the first word in the dictionary */
 265
 266         do {
 267
 268             LOW_WORD (addr, code)
 269             addr += 2;
 270
 271         } while (!(code & 0x8000) && (addr - dct < entry_len + 1));
 272
 273         resolution = (addr - dct) / 2;
 274
 275     }
 276
 277     if (2 * resolution > entry_len) {
 278
 279         runtime_error (ERR_DICT_LEN);
 280
 281     }
 282
 283     decoded = (zchar *)malloc (sizeof (zchar) * (3 * resolution) + 1);
 284     encoded = (zchar *)malloc (sizeof (zchar) * resolution);
 285
 286 }/* find_resolution */
 287
 288 /*
 289  * load_string
 290  *
 291  * Copy a ZSCII string from the memory to the global "decoded" string.
 292  *
 293  */
 294
 295 static void load_string (zword addr, zword length)
 296 {
 297     int i = 0;
 298
 299     if (resolution == 0) find_resolution();
 300
 301     while (i < 3 * resolution)
 302
 303         if (i < length) {
 304
 305             zbyte c;
 306
 307             LOW_BYTE (addr, c)
 308             addr++;
 309
 310             decoded[i++] = translate_from_zscii (c);
 311
 312         } else decoded[i++] = 0;
 313
 314 }/* load_string */
 315
 316 /*
 317  * encode_text
 318  *
 319  * Encode the Unicode text in the global "decoded" string then write
 320  * the result to the global "encoded" array. (This is used to look up
 321  * words in the dictionary.) Up to V3 the vocabulary resolution is
 322  * two, from V4 it is three, and from V9 it is any number of words.
 323  * Because each word contains three Z-characters, that makes six or
 324  * nine Z-characters respectively. Longer words are chopped to the
 325  * proper size, shorter words are are padded out with 5's. For word
 326  * completion we pad with 0s and 31s, the minimum and maximum
 327  * Z-characters.
 328  *
 329  */
 330
 331 static void encode_text (int padding)
 332 {
 333     static zchar again[] = { 'a', 'g', 'a', 'i', 'n', 0, 0, 0, 0 };
 334     static zchar examine[] = { 'e', 'x', 'a', 'm', 'i', 'n', 'e', 0, 0 };
 335     static zchar wait[] = { 'w', 'a', 'i', 't', 0, 0, 0, 0, 0 };
 336
 337     zbyte *zchars;
 338     const zchar *ptr;
 339     zchar c;
 340     int i = 0;
 341
 342     if (resolution == 0) find_resolution();
 343
 344     zchars = (zbyte *)malloc (sizeof (zbyte) * 3 * (resolution + 1));
 345     ptr = decoded;
 346
 347     /* Expand abbreviations that some old Infocom games lack */
 348
 349     if (option_expand_abbreviations && (h_version <= V8))
 350
 351         if (padding == 0x05 && decoded[1] == 0)
 352
 353             switch (decoded[0]) {
 354                 case 'g': ptr = again; break;
 355                 case 'x': ptr = examine; break;
 356                 case 'z': ptr = wait; break;
 357             }
 358
 359     /* Translate string to a sequence of Z-characters */
 360
 361     while (i < 3 * resolution)
 362
 363         if ((c = *ptr++) != 0) {
 364
 365             int index, set;
 366             zbyte c2;
 367
 368             if (c == 32) {
 369
 370                 zchars[i++] = 0;
 371
 372                 continue;
 373
 374             }
 375
 376             /* Search character in the alphabet */
 377
 378             for (set = 0; set < 3; set++)
 379                 for (index = 0; index < 26; index++)
 380                     if (c == alphabet (set, index))
 381                         goto letter_found;
 382
 383             /* Character not found, store its ZSCII value */
 384
 385             c2 = translate_to_zscii (c);
 386
 387             zchars[i++] = 5;
 388             zchars[i++] = 6;
 389             zchars[i++] = c2 >> 5;
 390             zchars[i++] = c2 & 0x1f;
 391
 392             continue;
 393
 394         letter_found:
 395
 396             /* Character found, store its index */
 397
 398             if (set != 0)
 399                 zchars[i++] = ((h_version <= V2) ? 1 : 3) + set;
 400
 401             zchars[i++] = index + 6;
 402
 403         } else zchars[i++] = padding;
 404
 405     /* Three Z-characters make a 16bit word */
 406
 407     for (i = 0; i < resolution; i++)
 408
 409         encoded[i] =
 410             (zchars[3 * i + 0] << 10) |
 411             (zchars[3 * i + 1] << 5) |
 412             (zchars[3 * i + 2]);
 413
 414     encoded[resolution - 1] |= 0x8000;
 415
 416     free (zchars);
 417
 418 }/* encode_text */
 419
 420 /*
 421  * z_check_unicode, test if a unicode character can be printed (bit 0) and read (bit 1).
 422  *
 423  *      zargs[0] = Unicode
 424  *
 425  */
 426
 427 void z_check_unicode (void)
 428 {
 429     zword c = zargs[0];
 430     zword result = 0;
 431
 432     if (c <= 0x1f)
 433     {
 434         if ((c == 0x08) || (c == 0x0d) || (c == 0x1b))
 435             result = 2;
 436     }
 437     else if (c <= 0x7e)
 438         result = 3;
 439     else
 440         result = 1; // we support unicode
 441
 442     store (result);
 443
 444 }/* z_check_unicode */
 445
 446 /*
 447  * z_encode_text, encode a ZSCII string for use in a dictionary.
 448  *
 449  *      zargs[0] = address of text buffer
 450  *      zargs[1] = length of ASCII string
 451  *      zargs[2] = offset of ASCII string within the text buffer
 452  *      zargs[3] = address to store encoded text in
 453  *
 454  * This is a V5+ opcode and therefore the dictionary resolution must be
 455  * three 16bit words.
 456  *
 457  */
 458
 459 void z_encode_text (void)
 460 {
 461     int i;
 462
 463     load_string ((zword) (zargs[0] + zargs[2]), zargs[1]);
 464
 465     encode_text (0x05);
 466
 467     for (i = 0; i < resolution; i++)
 468         storew ((zword) (zargs[3] + 2 * i), encoded[i]);
 469
 470 }/* z_encode_text */
 471
 472 /*
 473  * decode_text
 474  *
 475  * Convert encoded text to Unicode. The encoded text consists of 16bit
 476  * words. Every word holds 3 Z-characters (5 bits each) plus a spare
 477  * bit to mark the last word. The Z-characters translate to ZSCII by
 478  * looking at the current current character set. Some select another
 479  * character set, others refer to abbreviations.
 480  *
 481  * There are several different string types:
 482  *
 483  *    LOW_STRING - from the lower 64KB (byte address)
 484  *    ABBREVIATION - from the abbreviations table (word address)
 485  *    HIGH_STRING - from the end of the memory map (packed address)
 486  *    EMBEDDED_STRING - from the instruction stream (at PC)
 487  *    VOCABULARY - from the dictionary (byte address)
 488  *
 489  * The last type is only used for word completion.
 490  *
 491  */
 492
 493 #define outchar(c)      if (st==VOCABULARY) *ptr++=c; else print_char(c)
 494
 495 static void decode_text (enum string_type st, zword addr)
 496 {
 497     zchar *ptr;
 498     long byte_addr;
 499     zchar c2;
 500     zword code;
 501     zbyte c, prev_c = 0;
 502     int shift_state = 0;
 503     int shift_lock = 0;
 504     int status = 0;
 505
 506     ptr = NULL;         /* makes compilers shut up */
 507     byte_addr = 0;
 508
 509     if (resolution == 0) find_resolution();
 510
 511     /* Calculate the byte address if necessary */
 512
 513     if (st == ABBREVIATION)
 514
 515         byte_addr = (long) addr << 1;
 516
 517     else if (st == HIGH_STRING) {
 518
 519         if (h_version <= V3)
 520             byte_addr = (long) addr << 1;
 521         else if (h_version <= V5)
 522             byte_addr = (long) addr << 2;
 523         else if (h_version <= V7)
 524             byte_addr = ((long) addr << 2) + ((long) h_strings_offset << 3);
 525         else if (h_version <= V8)
 526             byte_addr = (long) addr << 3;
 527         else /* h_version == V9 */ {
 528             long indirect = (long) addr << 2;
 529             HIGH_LONG(indirect, byte_addr);
 530         }
 531
 532         if (byte_addr >= story_size)
 533             runtime_error (ERR_ILL_PRINT_ADDR);
 534
 535     }
 536
 537     /* Loop until a 16bit word has the highest bit set */
 538
 539     if (st == VOCABULARY)
 540         ptr = decoded;
 541
 542     do {
 543
 544         int i;
 545
 546         /* Fetch the next 16bit word */
 547
 548         if (st == LOW_STRING || st == VOCABULARY) {
 549             LOW_WORD (addr, code)
 550             addr += 2;
 551         } else if (st == HIGH_STRING || st == ABBREVIATION) {
 552             HIGH_WORD (byte_addr, code)
 553             byte_addr += 2;
 554         } else
 555             CODE_WORD (code)
 556
 557         /* Read its three Z-characters */
 558
 559         for (i = 10; i >= 0; i -= 5) {
 560
 561             zword abbr_addr;
 562             zword ptr_addr;
 563             zchar zc;
 564
 565             c = (code >> i) & 0x1f;
 566
 567             switch (status) {
 568
 569             case 0:     /* normal operation */
 570
 571                 if (shift_state == 2 && c == 6)
 572                     status = 2;
 573
 574                 else if (h_version == V1 && c == 1)
 575                     new_line ();
 576
 577                 else if (h_version >= V2 && shift_state == 2 && c == 7)
 578                     new_line ();
 579
 580                 else if (c >= 6)
 581                     outchar (alphabet (shift_state, c - 6));
 582
 583                 else if (c == 0)
 584                     outchar (' ');
 585
 586                 else if (h_version >= V2 && c == 1)
 587                     status = 1;
 588
 589                 else if (h_version >= V3 && c <= 3)
 590                     status = 1;
 591
 592                 else {
 593
 594                     shift_state = (shift_lock + (c & 1) + 1) % 3;
 595
 596                     if (h_version <= V2 && c >= 4)
 597                         shift_lock = shift_state;
 598
 599                     break;
 600
 601                 }
 602
 603                 shift_state = shift_lock;
 604
 605                 break;
 606
 607             case 1:     /* abbreviation */
 608
 609                 ptr_addr = h_abbreviations + 64 * (prev_c - 1) + 2 * c;
 610
 611                 LOW_WORD (ptr_addr, abbr_addr)
 612                 decode_text (ABBREVIATION, abbr_addr);
 613
 614                 status = 0;
 615                 break;
 616
 617             case 2:     /* ZSCII character - first part */
 618
 619                 status = 3;
 620                 break;
 621
 622             case 3:     /* ZSCII character - second part */
 623
 624                 zc = (prev_c << 5) | c;
 625
 626                 if (zc > 767) { /* Unicode escape */
 627
 628                     while (zc-- > 767) {
 629
 630                         if (st == LOW_STRING || st == VOCABULARY) {
 631                             LOW_WORD (addr, c2)
 632                             addr += 2;
 633                         } else if (st == HIGH_STRING || st == ABBREVIATION) {
 634                             HIGH_WORD (byte_addr, c2)
 635                             byte_addr += 2;
 636                         } else
 637                             CODE_WORD (c2)
 638
 639                         outchar (c2 ^ 0xFFFF);
 640                     }
 641
 642                 } else {
 643
 644                     c2 = translate_from_zscii (zc);
 645                     outchar (c2);
 646
 647                 }
 648
 649                 status = 0;
 650                 break;
 651
 652             }
 653
 654             prev_c = c;
 655
 656         }
 657
 658     } while (!(code & 0x8000));
 659
 660     if (st == VOCABULARY)
 661         *ptr = 0;
 662
 663 }/* decode_text */
 664
 665 #undef outchar
 666
 667 /*
 668  * z_new_line, print a new line.
 669  *
 670  *      no zargs used
 671  *
 672  */
 673
 674 void z_new_line (void)
 675 {
 676
 677     new_line ();
 678
 679 }/* z_new_line */
 680
 681 /*
 682  * z_print, print a string embedded in the instruction stream.
 683  *
 684  *      no zargs used
 685  *
 686  */
 687
 688 void z_print (void)
 689 {
 690
 691     decode_text (EMBEDDED_STRING, 0);
 692
 693 }/* z_print */
 694
 695 /*
 696  * z_print_addr, print a string from the lower 64KB.
 697  *
 698  *      zargs[0] = address of string to print
 699  *
 700  */
 701
 702 void z_print_addr (void)
 703 {
 704
 705     decode_text (LOW_STRING, zargs[0]);
 706
 707 }/* z_print_addr */
 708
 709 /*
 710  * z_print_char print a single ZSCII character.
 711  *
 712  *      zargs[0] = ZSCII character to be printed
 713  *
 714  */
 715
 716 void z_print_char (void)
 717 {
 718
 719     print_char (translate_from_zscii (zargs[0]));
 720
 721 }/* z_print_char */
 722
 723 /*
 724  * z_print_form, print a formatted table.
 725  *
 726  *      zargs[0] = address of formatted table to be printed
 727  *
 728  */
 729
 730 void z_print_form (void)
 731 {
 732     zword count;
 733     zword addr = zargs[0];
 734
 735     bool first = TRUE;
 736
 737     for (;;) {
 738
 739         LOW_WORD (addr, count)
 740         addr += 2;
 741
 742         if (count == 0)
 743             break;
 744
 745         if (!first)
 746             new_line ();
 747
 748         while (count--) {
 749
 750             zbyte c;
 751
 752             LOW_BYTE (addr, c)
 753             addr++;
 754
 755             print_char (translate_from_zscii (c));
 756
 757         }
 758
 759         first = FALSE;
 760
 761     }
 762
 763 }/* z_print_form */
 764
 765 /*
 766  * print_num
 767  *
 768  * Print a signed 16bit number.
 769  *
 770  */
 771
 772 void print_num (zword value)
 773 {
 774     int i;
 775
 776     /* Print sign */
 777
 778     if ((short) value < 0) {
 779         print_char ('-');
 780         value = - (short) value;
 781     }
 782
 783     /* Print absolute value */
 784
 785     for (i = 10000; i != 0; i /= 10)
 786         if (value >= i || i == 1)
 787             print_char ('0' + (value / i) % 10);
 788
 789 }/* print_num */
 790
 791 /*
 792  * z_print_num, print a signed number.
 793  *
 794  *      zargs[0] = number to print
 795  *
 796  */
 797
 798 void z_print_num (void)
 799 {
 800
 801     print_num (zargs[0]);
 802
 803 }/* z_print_num */
 804
 805 /*
 806  * print_object
 807  *
 808  * Print an object description.
 809  *
 810  */
 811
 812 void print_object (zword object)
 813 {
 814     zword addr = object_name (object);
 815     zword code = 0x94a5;
 816     zbyte length;
 817
 818     LOW_BYTE (addr, length)
 819     addr++;
 820
 821     if (length != 0)
 822         LOW_WORD (addr, code)
 823
 824     if (code == 0x94a5) {       /* encoded text 0x94a5 == empty string */
 825
 826         print_string ("object#");       /* supply a generic name */
 827         print_num (object);             /* for anonymous objects */
 828
 829     } else decode_text (LOW_STRING, addr);
 830
 831 }/* print_object */
 832
 833 /*
 834  * z_print_obj, print an object description.
 835  *
 836  *      zargs[0] = number of object to be printed
 837  *
 838  */
 839
 840 void z_print_obj (void)
 841 {
 842
 843     print_object (zargs[0]);
 844
 845 }/* z_print_obj */
 846
 847 /*
 848  * z_print_paddr, print the string at the given packed address.
 849  *
 850  *      zargs[0] = packed address of string to be printed
 851  *
 852  */
 853
 854 void z_print_paddr (void)
 855 {
 856
 857     decode_text (HIGH_STRING, zargs[0]);
 858
 859 }/* z_print_paddr */
 860
 861 /*
 862  * z_print_ret, print the string at PC, print newline then return true.
 863  *
 864  *      no zargs used
 865  *
 866  */
 867
 868 void z_print_ret (void)
 869 {
 870
 871     decode_text (EMBEDDED_STRING, 0);
 872     new_line ();
 873     ret (1);
 874
 875 }/* z_print_ret */
 876
 877 /*
 878  * print_string
 879  *
 880  * Print a string of ASCII characters.
 881  *
 882  */
 883
 884 void print_string (const char *s)
 885 {
 886     char c;
 887
 888     while ((c = *s++) != 0)
 889
 890         if (c == '\n')
 891             new_line ();
 892         else
 893             print_char (c);
 894
 895 }/* print_string */
 896
 897 /*
 898  * z_print_unicode
 899  *
 900  *      zargs[0] = Unicode
 901  *
 902  */
 903
 904 void z_print_unicode (void)
 905 {
 906
 907     if (zargs[0] < 0x20)
 908         print_char ('?');
 909     else
 910         print_char (zargs[0]);
 911
 912 }/* z_print_unicode */
 913
 914 /*
 915  * lookup_text
 916  *
 917  * Scan a dictionary searching for the given word. The first argument
 918  * can be
 919  *
 920  * 0x00 - find the first word which is >= the given one
 921  * 0x05 - find the word which exactly matches the given one
 922  * 0x1f - find the last word which is <= the given one
 923  *
 924  * The return value is 0 if the search fails.
 925  *
 926  */
 927
 928 static zword lookup_text (int padding, zword dct)
 929 {
 930     zword entry_addr;
 931     zword entry_count;
 932     zword entry;
 933     zword addr;
 934     zbyte entry_len;
 935     zbyte sep_count;
 936     int entry_number;
 937     int lower, upper;
 938     int i;
 939     bool sorted;
 940
 941     if (resolution == 0) find_resolution();
 942
 943     encode_text (padding);
 944
 945     LOW_BYTE (dct, sep_count)           /* skip word separators */
 946     dct += 1 + sep_count;
 947     LOW_BYTE (dct, entry_len)           /* get length of entries */
 948     dct += 1;
 949     LOW_WORD (dct, entry_count)         /* get number of entries */
 950     dct += 2;
 951
 952     if ((short) entry_count < 0) {      /* bad luck, entries aren't sorted */
 953
 954         entry_count = - (short) entry_count;
 955         sorted = FALSE;
 956
 957     } else sorted = TRUE;               /* entries are sorted */
 958
 959     lower = 0;
 960     upper = entry_count - 1;
 961
 962     while (lower <= upper) {
 963
 964         if (sorted)                             /* binary search */
 965             entry_number = (lower + upper) / 2;
 966         else                                    /* linear search */
 967             entry_number = lower;
 968
 969         entry_addr = dct + entry_number * entry_len;
 970
 971         /* Compare word to dictionary entry */
 972
 973         addr = entry_addr;
 974
 975         for (i = 0; i < resolution; i++) {
 976             LOW_WORD (addr, entry)
 977             if (encoded[i] != entry)
 978                 goto continuing;
 979             addr += 2;
 980         }
 981
 982         return entry_addr;              /* exact match found, return now */
 983
 984     continuing:
 985
 986         if (sorted)                             /* binary search */
 987
 988             if (encoded[i] > entry)
 989                 lower = entry_number + 1;
 990             else
 991                 upper = entry_number - 1;
 992
 993         else lower++;                           /* linear search */
 994
 995     }
 996
 997     /* No exact match has been found */
 998
 999     if (padding == 0x05)
1000         return 0;
1001
1002     entry_number = (padding == 0x00) ? lower : upper;
1003
1004     if (entry_number == -1 || entry_number == entry_count)
1005         return 0;
1006
1007     return dct + entry_number * entry_len;
1008
1009 }/* lookup_text */
1010
1011 /*
1012  * tokenise_text
1013  *
1014  * Translate a single word to a token and append it to the token
1015  * buffer. Every token consists of the address of the dictionary
1016  * entry, the length of the word and the offset of the word from
1017  * the start of the text buffer. Unknown words cause empty slots
1018  * if the flag is set (such that the text can be scanned several
1019  * times with different dictionaries); otherwise they are zero.
1020  *
1021  */
1022
1023 static void tokenise_text (zword text, zword length, zword from, zword parse, zword dct, bool flag)
1024 {
1025     zword addr;
1026     zbyte token_max, token_count;
1027
1028     LOW_BYTE (parse, token_max)
1029     parse++;
1030     LOW_BYTE (parse, token_count)
1031
1032     if (token_count < token_max) {      /* sufficient space left for token? */
1033
1034         storeb (parse++, token_count + 1);
1035
1036         load_string ((zword) (text + from), length);
1037
1038         addr = lookup_text (0x05, dct);
1039
1040         if (addr != 0 || !flag) {
1041
1042             parse += 4 * token_count;
1043
1044             storew ((zword) (parse + 0), addr);
1045             storeb ((zword) (parse + 2), length);
1046             storeb ((zword) (parse + 3), from);
1047
1048         }
1049
1050     }
1051
1052 }/* tokenise_text */
1053
1054 /*
1055  * tokenise_line
1056  *
1057  * Split an input line into words and translate the words to tokens.
1058  *
1059  */
1060
1061 void tokenise_line (zword text, zword token, zword dct, bool flag)
1062 {
1063     zword addr1;
1064     zword addr2;
1065     zbyte length;
1066     zbyte c;
1067
1068     length = 0;         /* makes compilers shut up */
1069
1070     /* Use standard dictionary if the given dictionary is zero */
1071
1072     if (dct == 0)
1073         dct = h_dictionary;
1074
1075     /* Remove all tokens before inserting new ones */
1076
1077     storeb ((zword) (token + 1), 0);
1078
1079     /* Move the first pointer across the text buffer searching for the
1080        beginning of a word. If this succeeds, store the position in a
1081        second pointer. Move the first pointer searching for the end of
1082        the word. When it is found, "tokenise" the word. Continue until
1083        the end of the buffer is reached. */
1084
1085     addr1 = text;
1086     addr2 = 0;
1087
1088     if (h_version >= V5) {
1089         addr1++;
1090         LOW_BYTE (addr1, length)
1091     }
1092
1093     do {
1094
1095         zword sep_addr;
1096         zbyte sep_count;
1097         zbyte separator;
1098
1099         /* Fetch next ZSCII character */
1100
1101         addr1++;
1102
1103         if (h_version >= V5 && addr1 == text + 2 + length)
1104             c = 0;
1105         else
1106             LOW_BYTE (addr1, c)
1107
1108         /* Check for separator */
1109
1110         sep_addr = dct;
1111
1112         LOW_BYTE (sep_addr, sep_count)
1113         sep_addr++;
1114
1115         do {
1116
1117             LOW_BYTE (sep_addr, separator)
1118             sep_addr++;
1119
1120         } while (c != separator && --sep_count != 0);
1121
1122         /* This could be the start or the end of a word */
1123
1124         if (sep_count == 0 && c != ' ' && c != 0) {
1125
1126             if (addr2 == 0)
1127                 addr2 = addr1;
1128
1129         } else if (addr2 != 0) {
1130
1131             tokenise_text (
1132                 text,
1133                 (zword) (addr1 - addr2),
1134                 (zword) (addr2 - text),
1135                 token, dct, flag );
1136
1137             addr2 = 0;
1138
1139         }
1140
1141         /* Translate separator (which is a word in its own right) */
1142
1143         if (sep_count != 0)
1144
1145             tokenise_text (
1146                 text,
1147                 (zword) (1),
1148                 (zword) (addr1 - text),
1149                 token, dct, flag );
1150
1151     } while (c != 0);
1152
1153 }/* tokenise_line */
1154
1155 /*
1156  * z_tokenise, make a lexical analysis of a ZSCII string.
1157  *
1158  *      zargs[0] = address of string to analyze
1159  *      zargs[1] = address of token buffer
1160  *      zargs[2] = address of dictionary (optional)
1161  *      zargs[3] = set when unknown words cause empty slots (optional)
1162  *
1163  */
1164
1165 void z_tokenise (void)
1166 {
1167
1168     /* Supply default arguments */
1169
1170     if (zargc < 3)
1171         zargs[2] = 0;
1172     if (zargc < 4)
1173         zargs[3] = 0;
1174
1175     /* Call tokenise_line to do the real work */
1176
1177     tokenise_line (zargs[0], zargs[1], zargs[2], zargs[3] != 0);
1178
1179 }/* z_tokenise */
1180
1181 /*
1182  * completion
1183  *
1184  * Scan the vocabulary to complete the last word on the input line
1185  * (similar to "tcsh" under Unix). The return value is
1186  *
1187  *    2 ==> completion is impossible
1188  *    1 ==> completion is ambiguous
1189  *    0 ==> completion is successful
1190  *
1191  * The function also returns a string in its second argument. In case
1192  * of 2, the string is empty; in case of 1, the string is the longest
1193  * extension of the last word on the input line that is common to all
1194  * possible completions (for instance, if the last word on the input
1195  * is "fo" and its only possible completions are "follow" and "folly"
1196  * then the string is "ll"); in case of 0, the string is an extension
1197  * to the last word that results in the only possible completion.
1198  *
1199  */
1200
1201 int completion (const zchar *buffer, zchar *result)
1202 {
1203     zword minaddr;
1204     zword maxaddr;
1205     zchar *ptr;
1206     zchar c;
1207     int len;
1208     int i;
1209
1210     *result = 0;
1211
1212     if (resolution == 0) find_resolution();
1213
1214     /* Copy last word to "decoded" string */
1215
1216     len = 0;
1217
1218     while ((c = *buffer++) != 0)
1219
1220         if (c != ' ') {
1221
1222             if (len < 3 * resolution)
1223                 decoded[len++] = c;
1224
1225         } else len = 0;
1226
1227     decoded[len] = 0;
1228
1229     /* Search the dictionary for first and last possible extensions */
1230
1231     minaddr = lookup_text (0x00, h_dictionary);
1232     maxaddr = lookup_text (0x1f, h_dictionary);
1233
1234     if (minaddr == 0 || maxaddr == 0 || minaddr > maxaddr)
1235         return 2;
1236
1237     /* Copy first extension to "result" string */
1238
1239     decode_text (VOCABULARY, minaddr);
1240
1241     ptr = result;
1242
1243     for (i = len; (c = decoded[i]) != 0; i++)
1244         *ptr++ = c;
1245     *ptr = 0;
1246
1247     /* Merge second extension with "result" string */
1248
1249     decode_text (VOCABULARY, maxaddr);
1250
1251     for (i = len, ptr = result; (c = decoded[i]) != 0; i++, ptr++)
1252         if (*ptr != c) break;
1253     *ptr = 0;
1254
1255     /* Search was ambiguous or successful */
1256
1257     return (minaddr == maxaddr) ? 0 : 1;
1258
1259 }/* completion */
1260
1261 /*
1262  * unicode_tolower
1263  *
1264  * Convert a Unicode character to lowercase.
1265  * Taken from Zip2000 by Kevin Bracey.
1266  *
1267  */
1268
1269 zchar unicode_tolower (zchar c)
1270 {
1271     const static unsigned char tolower_basic_latin[0x100] = {
1272         0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
1273         0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
1274         0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
1275         0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
1276         0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
1277         0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
1278         0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
1279         0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
1280         0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
1281         0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
1282         0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
1283         0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
1284         0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
1285         0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xD7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xDF,
1286         0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
1287         0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
1288     };
1289     const static unsigned char tolower_latin_extended_a[0x80] = {
1290         0x01,0x01,0x03,0x03,0x05,0x05,0x07,0x07,0x09,0x09,0x0B,0x0B,0x0D,0x0D,0x0F,0x0F,
1291         0x11,0x11,0x13,0x13,0x15,0x15,0x17,0x17,0x19,0x19,0x1B,0x1B,0x1D,0x1D,0x1F,0x1F,
1292         0x21,0x21,0x23,0x23,0x25,0x25,0x27,0x27,0x29,0x29,0x2B,0x2B,0x2D,0x2D,0x2F,0x2F,
1293         0x00,0x31,0x33,0x33,0x35,0x35,0x37,0x37,0x38,0x3A,0x3A,0x3C,0x3C,0x3E,0x3E,0x40,
1294         0x40,0x42,0x42,0x44,0x44,0x46,0x46,0x48,0x48,0x49,0x4B,0x4B,0x4D,0x4D,0x4F,0x4F,
1295         0x51,0x51,0x53,0x53,0x55,0x55,0x57,0x57,0x59,0x59,0x5B,0x5B,0x5D,0x5D,0x5F,0x5F,
1296         0x61,0x61,0x63,0x63,0x65,0x65,0x67,0x67,0x69,0x69,0x6B,0x6B,0x6D,0x6D,0x6F,0x6F,
1297         0x71,0x71,0x73,0x73,0x75,0x75,0x77,0x77,0x00,0x7A,0x7A,0x7C,0x7C,0x7E,0x7E,0x7F
1298     };
1299     const static unsigned char tolower_greek[0x50] = {
1300         0x80,0x81,0x82,0x83,0x84,0x85,0xAC,0x87,0xAD,0xAE,0xAF,0x8B,0xCC,0x8D,0xCD,0xCE,
1301         0x90,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
1302         0xC0,0xC1,0xA2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0xAC,0xAD,0xAE,0xAF,
1303         0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
1304         0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF
1305     };
1306     const static unsigned char tolower_cyrillic[0x60] = {
1307         0x00,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
1308         0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
1309         0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
1310         0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
1311         0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
1312         0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F
1313     };
1314
1315     if (c < 0x0100)
1316         c = tolower_basic_latin[c];
1317     else if (c == 0x0130)
1318         c = 0x0069;     /* Capital I with dot -> lower case i */
1319     else if (c == 0x0178)
1320         c = 0x00FF;     /* Capital Y diaeresis -> lower case y diaeresis */
1321     else if (c < 0x0180)
1322         c = tolower_latin_extended_a[c-0x100] + 0x100;
1323     else if (c >= 0x380 && c < 0x3D0)
1324         c = tolower_greek[c-0x380] + 0x300;
1325     else if (c >= 0x400 && c < 0x460)
1326         c = tolower_cyrillic[c-0x400] + 0x400;
1327
1328     return c;
1329 }
1330