/*
 * Copyright 2009-2012 Chris Spiegel.
 *
 * This file is part of Bocfel.
 *
 * Bocfel is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version
 * 2 or 3, as published by the Free Software Foundation.
 *
 * Bocfel is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bocfel.  If not, see <http://www.gnu.org/licenses/>.
 */
/* Word-separator table of the dictionary currently being used:
 * “separators” is the address of its first entry and “num_separators”
 * is how many one-byte entries it holds.  Both are set by tokenize()
 * before any lookup is made.
 */
static uint16_t separators;
static uint8_t num_separators;
/* Read the 16-bit big-endian word stored in base[0] (high byte) and
 * base[1] (low byte).
 */
static uint16_t GET_WORD(const uint8_t *base)
{
  return (uint16_t)((base[0] << 8) | base[1]);
}
/* Store the 16-bit value “val” at base[0] (high byte) and base[1] (low
 * byte), i.e. in big-endian order.
 *
 * NOTE(review): only the signature of this function was present; the
 * body is restored as the big-endian store that add_zchar() needs to
 * write back a word it read high-byte-first.  Confirm against upstream.
 */
static void MAKE_WORD(uint8_t *base, uint16_t val)
{
  base[0] = val >> 8;
  base[1] = val & 0xff;
}
/* Add the character c to the nth position of the encoded text.  c is a
 * 5-bit value (either a shift character, which selects an alphabet, or
 * the index into the current alphabet).  Only the low five bits of c
 * are used.  “encoded” must have been zeroed before the first call,
 * because each call ORs its bits into the existing word.
 */
static void add_zchar(int c, int n, uint8_t *encoded)
{
  /* Three Z-characters share one 16-bit word, so character n lives in
   * word n / 3, which starts at byte 2 * (n / 3).
   */
  uint8_t *word = &encoded[2 * (n / 3)];
  uint16_t w = GET_WORD(word);

  /* Layout of an encoded word:
   *   --first byte-------   --second byte---
   *   7    6 5 4 3 2  1 0   7 6 5  4 3 2 1 0
   *   bit  --first--  --second---  --third--
   *
   * So to figure out which third of the word to store to:
   * If n is 0, 3, 6, ... then store to the first (left shift 10).
   * If n is 1, 4, 7, ... then store to the second (left shift 5).
   * If n is 2, 5, 8, ... then store to the third (left shift 0).
   * “Or” into the previous value because, unless this is the first
   * character, there are already values we’ve stored there.
   */
  w |= (c & 0x1f) << (5 * (2 - (n % 3)));

  MAKE_WORD(word, w);
}
69 /* Encode the text at “s”, of length “len” (there is not necessarily a
70 * terminating null character) into the buffer “encoded”.
72 * For V3 the encoded text is 6 Z-characters (4 bytes); for V4 and above
73 * it’s 9 characters (6 bytes). Due to the nature of the loop here,
74 * it’s possible to encode too many bytes. For example, if the string
75 * given is "aaa<" in a V3 game, the three 'a' characters will take up a
76 * word (all three being packed into one), but the single '<' character
77 * will take up two words (one full word and a third of the next) due to
78 * the fact that '<' is not in the alphabet table. Thus the encoded
79 * text will be 7 characters. This is OK because routines that use the
80 * encoded string are smart enough to only pay attention to the first 6
81 * or 9 Z-characters; and partial Z-characters are OK per §3.6.1.
83 * 1.1 of the standard revises the encoding for V1 and V2 games. I am
84 * not implementing the new rules for two basic reasons:
85 * 1) It apparently only affects three (unnecessary) dictionary words in
86 * the known V1-2 games.
87 * 2) Because of 1, it is not worth the effort to peek ahead and see
88 * what the next character is to determine whether to shift once or
91 * Z-character 0 is a space (§3.5.1), so theoretically a space should be
92 * encoded simply with a zero. However, Inform 6.32 encodes space
93 * (which has the value 32) as a 10-bit ZSCII code, which is the
94 * Z-characters 5, 6, 1, 0. Assume this is correct.
96 static void encode_string(const uint8_t *s, size_t len, uint8_t encoded[8])
99 const int res = zversion <= 3 ? 6 : 9;
100 const int shiftbase = zversion <= 2 ? 1 : 3;
102 memset(encoded, 0, 8);
104 for(size_t i = 0; i < len && n < res; i++)
108 pos = atable_pos[s[i]];
111 int shift = pos / 26;
114 if(shift) add_zchar(shiftbase + shift, n++, encoded);
115 add_zchar(c + 6, n++, encoded);
119 add_zchar(shiftbase + 2, n++, encoded);
120 add_zchar(6, n++, encoded);
121 add_zchar(s[i] >> 5, n++, encoded);
122 add_zchar(s[i] & 0x1f, n++, encoded);
128 add_zchar(5, n++, encoded);
131 /* §3.2: the MSB of the last encoded word must be set. */
132 if(zversion <= 3) encoded[2] |= 0x80;
133 else encoded[4] |= 0x80;
136 static int dict_compar(const void *a, const void *b)
138 return memcmp(a, b, zversion <= 3 ? 4 : 6);
140 static uint16_t dict_find(const uint8_t *token, size_t len, uint16_t dictionary)
148 encode_string(token, len, encoded);
150 elength = user_byte(dictionary + num_separators + 1);
151 nentries = (int16_t)user_word(dictionary + num_separators + 2);
152 base = dictionary + num_separators + 2 + 2;
154 ZASSERT(elength >= (zversion <= 3 ? 4 : 6), "dictionary entry length (%d) too small", elength);
155 ZASSERT(base + (labs(nentries) * elength) < memory_size, "reported dictionary length extends beyond memory size");
159 ret = bsearch(encoded, &memory[base], nentries, elength, dict_compar);
163 for(long i = 0; i < -nentries; i++)
165 uint8_t *entry = &memory[base + (i * elength)];
167 if(dict_compar(encoded, entry) == 0)
175 if(ret == NULL) return 0;
177 return base + (ret - &memory[base]);
180 static int is_sep(uint8_t c)
182 if(c == ZSCII_SPACE) return 1;
184 for(uint16_t i = 0; i < num_separators; i++) if(user_byte(separators + i) == c) return 1;
189 static void handle_token(const uint8_t *base, const uint8_t *token, int len, uint16_t parse, uint16_t dictionary, int found, int flag)
192 const uint8_t examine[] = { 'e', 'x', 'a', 'm', 'i', 'n', 'e' };
193 const uint8_t again[] = { 'a', 'g', 'a', 'i', 'n' };
194 const uint8_t wait[] = { 'w', 'a', 'i', 't' };
196 d = dict_find(token, len, dictionary);
198 if(!options.disable_abbreviations && base == token && len == 1)
200 if (*token == 'x') d = dict_find(examine, sizeof examine, dictionary);
201 else if(*token == 'g') d = dict_find(again, sizeof again, dictionary);
202 else if(*token == 'z') d = dict_find(wait, sizeof wait, dictionary);
205 if(flag && d == 0) return;
207 parse = parse + 2 + (found * 4);
209 user_store_word(parse, d);
211 user_store_byte(parse + 2, len);
213 if(zversion <= 4) user_store_byte(parse + 3, token - base + 1);
214 else user_store_byte(parse + 3, token - base + 2);
217 /* The behavior of tokenize is described in §15 (under the read opcode)
220 * For the text buffer, byte 0 is ignored in both V3/4 and V5+.
221 * Byte 1 of V3/4 is the start of the string, while in V5+ it is the
222 * length of the string.
223 * Byte 2 of V5+ is the start of the string. V3/4 strings have a null
224 * terminator, while V5+ do not.
226 * For the parse buffer, byte 0 contains the maximum number of tokens
228 * The number of tokens found is stored in byte 1.
229 * Each token is then represented by a 4-byte chunk with the following
231 * • The first two bytes are the byte address of the dictionary entry
232 * for the token, or 0 if the token was not found in the dictionary.
233 * • The next byte is the length of the token.
234 * • The final byte is the offset in the string of the token.
236 void tokenize(uint16_t text, uint16_t parse, uint16_t dictionary, int flag)
238 const uint8_t *p, *lastp;
240 uint32_t text_len = 0;
241 const int maxwords = user_byte(parse);
245 if(dictionary == 0) dictionary = header.dictionary;
247 ZASSERT(dictionary != 0, "attempt to tokenize without a valid dictionary");
249 num_separators = user_byte(dictionary);
250 separators = dictionary + 1;
252 if(zversion >= 5) text_len = user_byte(text + 1);
253 else while(user_byte(text + 1 + text_len) != 0) text_len++;
255 ZASSERT(text + 1 + (zversion >= 5) + text_len < memory_size, "attempt to tokenize out-of-bounds string");
257 string = &memory[text + 1 + (zversion >= 5)];
259 for(p = string; p - string < text_len && *p == ZSCII_SPACE; p++);
262 text_len -= (p - string);
266 if(!in_word && text_len != 0 && !is_sep(*p))
272 if(text_len == 0 || is_sep(*p))
276 handle_token(string, lastp, p - lastp, parse, dictionary, found++, flag);
279 /* §13.6.1: Separators (apart from a space) are tokens too. */
280 if(text_len != 0 && *p != ZSCII_SPACE)
282 handle_token(string, p, 1, parse, dictionary, found++, flag);
285 if(found == maxwords) break;
294 user_store_byte(parse + 1, found);
297 static void encode_text(uint32_t text, uint16_t len, uint16_t coded)
301 ZASSERT(text + len < memory_size, "reported text length extends beyond memory size");
303 encode_string(&memory[text], len, encoded);
305 for(int i = 0; i < 6; i++) user_store_byte(coded + i, encoded[i]);
310 if(znargs < 3) zargs[2] = 0;
311 if(znargs < 4) zargs[3] = 0;
313 tokenize(zargs[0], zargs[1], zargs[2], zargs[3]);
316 void zencode_text(void)
318 encode_text(zargs[0] + zargs[2], zargs[1], zargs[3]);