Line data Source code
1 : /*
2 : * (c) Thomas Pornin 1999 - 2002
3 : *
4 : * Redistribution and use in source and binary forms, with or without
5 : * modification, are permitted provided that the following conditions
6 : * are met:
7 : * 1. Redistributions of source code must retain the above copyright
8 : * notice, this list of conditions and the following disclaimer.
9 : * 2. Redistributions in binary form must reproduce the above copyright
10 : * notice, this list of conditions and the following disclaimer in the
11 : * documentation and/or other materials provided with the distribution.
12 : * 4. The name of the authors may not be used to endorse or promote
13 : * products derived from this software without specific prior written
14 : * permission.
15 : *
16 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17 : * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 : * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 : * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
20 : * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 : * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
22 : * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
23 : * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
25 : * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
26 : * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 : *
28 : */
29 :
30 : #include "tune.h"
31 : #include <stdio.h>
32 : #include <string.h>
33 : #include <stddef.h>
34 : #include <limits.h>
35 : #include "ucppi.h"
36 : #include "mem.h"
37 : #ifdef UCPP_MMAP
38 : #include <unistd.h>
39 : #include <sys/types.h>
40 : #include <sys/mman.h>
41 : #endif
42 :
/*
 * Character classes for description of the automaton.
 * Each class is denoted by an ordinary character constant, so these
 * characters should not appear explicitly as the input of an automaton
 * rule: init_cppm() always expands them into the whole class.
 */
#define SPC	' '	/* whitespace characters */
#define ALP	'Z'	/* A-Z, a-z, _ */
#define NUM	'9'	/* 0-9 */
#define ANY	'Y'	/* any character */
#define VCH	'F'	/* void character (for end of input) */
53 :
/*
 * Modifier flags carried by an automaton action, above its low eight
 * bits, with one macro to set and one macro to test each of them:
 *   STO / ttSTO: the currently read string is a complete token
 *   FRZ / ttFRZ: the currently read character must be kept and read again
 *   PUT / ttPUT: the currently read character must be added to the string
 * noMOD() strips all the flags and keeps the low eight bits only.
 */
#define MOD_MK		0xff
#define noMOD(x)	((x) & MOD_MK)
#define STO(x)		((x) | 0x100)
#define ttSTO(x)	((x) & 0x100)
#define FRZ(x)		((x) | 0x200)
#define ttFRZ(x)	((x) & 0x200)
#define PUT(x)		((x) | 0x400)
#define ttPUT(x)	((x) & 0x400)
68 :
/*
 * States of the automaton. Order is important:
 * -- the values before MSTATE are the real states; each of them owns a
 *    row in cppm[][] and an entry in cppm_vch[];
 * -- MSTATE itself is the number of real states;
 * -- the values after MSTATE are pseudo-states which have no row in the
 *    tables; read_token() turns them into an error, a warning or
 *    another action. S_OUCH is the filler put by init_cppm() in every
 *    transition that no rule defines.
 */
enum {
	S_START, S_SPACE, S_BANG, S_STRING, S_STRING2, S_COLON,
	S_SHARP, S_PCT, S_PCT2, S_PCT3, S_AMPER, S_CHAR, S_CHAR2, S_STAR,
	S_PLUS, S_MINUS, S_DOT, S_DOT2, S_SLASH, S_NUMBER, S_NUMBER2, S_LT,
	S_LT2, S_EQ, S_GT, S_GT2, S_CIRC, S_PIPE, S_BACKSLASH,
	S_COMMENT, S_COMMENT2, S_COMMENT3, S_COMMENT4, S_COMMENT5,
	S_NAME, S_NAME_BS, S_LCHAR,
	MSTATE,
	S_ILL, S_DDOT, S_DDSHARP, S_BS, S_ROGUE_BS, S_BEHEAD, S_DECAY,
	S_TRUNC, S_TRUNCC, S_OUCH
};

/* true if x is one of the comment states (relies on the enum order) */
#define CMT(x)		((x) >= S_COMMENT && (x) <= S_COMMENT5)

/* maximum number of input characters (or classes) in one automaton rule */
#define CMCR	2
85 :
86 : /*
87 : * This is the description of the automaton. It is not used "as is"
88 : * but copied at execution time into a table.
89 : *
90 : * To my utmost displeasure, there are a few hacks in read_token()
91 : * (which uses the transformed automaton) about the special handling
92 : * of slashes, sharps, and the letter L.
93 : */
94 : static struct machine_state {
95 : int state;
96 : unsigned char input[CMCR];
97 : int new_state;
98 : } cppms[] = {
99 : /* S_START is the generic beginning state */
100 : { S_START, { ANY }, S_ILL },
101 : #ifdef SEMPER_FIDELIS
102 : { S_START, { SPC }, PUT(S_SPACE) },
103 : #else
104 : { S_START, { SPC }, S_SPACE },
105 : #endif
106 : { S_START, { '\n' }, STO(NEWLINE) },
107 : { S_START, { '!' }, S_BANG },
108 : { S_START, { '"' }, PUT(S_STRING) },
109 : { S_START, { '#' }, S_SHARP },
110 : { S_START, { '%' }, S_PCT },
111 : { S_START, { '&' }, S_AMPER },
112 : { S_START, { '\'' }, PUT(S_CHAR) },
113 : { S_START, { '(' }, STO(LPAR) },
114 : { S_START, { ')' }, STO(RPAR) },
115 : { S_START, { '*' }, S_STAR },
116 : { S_START, { '+' }, S_PLUS },
117 : { S_START, { ',' }, STO(COMMA) },
118 : { S_START, { '-' }, S_MINUS },
119 : { S_START, { '.' }, PUT(S_DOT) },
120 : #ifdef SEMPER_FIDELIS
121 : { S_START, { '/' }, PUT(S_SLASH) },
122 : #else
123 : { S_START, { '/' }, S_SLASH },
124 : #endif
125 : { S_START, { NUM }, PUT(S_NUMBER) },
126 : { S_START, { ':' }, S_COLON },
127 : { S_START, { ';' }, STO(SEMIC) },
128 : { S_START, { '<' }, S_LT },
129 : { S_START, { '=' }, S_EQ },
130 : { S_START, { '>' }, S_GT },
131 : { S_START, { '?' }, STO(QUEST) },
132 : { S_START, { ALP }, PUT(S_NAME) },
133 : { S_START, { 'L' }, PUT(S_LCHAR) },
134 : { S_START, { '[' }, STO(LBRK) },
135 : { S_START, { ']' }, STO(RBRK) },
136 : { S_START, { '^' }, S_CIRC },
137 : { S_START, { '{' }, STO(LBRA) },
138 : { S_START, { '|' }, S_PIPE },
139 : { S_START, { '}' }, STO(RBRA) },
140 : { S_START, { '~' }, STO(NOT) },
141 : { S_START, { '\\' }, S_BACKSLASH },
142 :
143 : /* after a space */
144 : { S_SPACE, { ANY }, FRZ(STO(NONE)) },
145 : #ifdef SEMPER_FIDELIS
146 : { S_SPACE, { SPC }, PUT(S_SPACE) },
147 : #else
148 : { S_SPACE, { SPC }, S_SPACE },
149 : #endif
150 :
151 : /* after a ! */
152 : { S_BANG, { ANY }, FRZ(STO(LNOT)) },
153 : { S_BANG, { '=' }, STO(NEQ) },
154 :
155 : /* after a " */
156 : { S_STRING, { ANY }, PUT(S_STRING) },
157 : { S_STRING, { VCH }, FRZ(S_TRUNC) },
158 : { S_STRING, { '\n' }, FRZ(S_BEHEAD) },
159 : { S_STRING, { '\\' }, PUT(S_STRING2) },
160 : { S_STRING, { '"' }, PUT(STO(STRING)) },
161 :
162 : { S_STRING2, { ANY }, PUT(S_STRING) },
163 : { S_STRING2, { VCH }, FRZ(S_TRUNC) },
164 :
165 : /* after a # */
166 : { S_SHARP, { ANY }, FRZ(STO(SHARP)) },
167 : { S_SHARP, { '#' }, STO(DSHARP) },
168 :
169 : /* after a : */
170 : { S_COLON, { ANY }, FRZ(STO(COLON)) },
171 : { S_COLON, { '>' }, STO(DIG_RBRK) },
172 :
173 : /* after a % */
174 : { S_PCT, { ANY }, FRZ(STO(PCT)) },
175 : { S_PCT, { '=' }, STO(ASPCT) },
176 : { S_PCT, { '>' }, STO(DIG_RBRA) },
177 : { S_PCT, { ':' }, S_PCT2 },
178 :
179 : /* after a %: */
180 : { S_PCT2, { ANY }, FRZ(STO(DIG_SHARP)) },
181 : { S_PCT2, { '%' }, S_PCT3 },
182 :
183 : /* after a %:% */
184 : { S_PCT3, { ANY }, FRZ(S_DDSHARP) },
185 : { S_PCT3, { ':' }, STO(DIG_DSHARP) },
186 :
187 : /* after a & */
188 : { S_AMPER, { ANY }, FRZ(STO(AND)) },
189 : { S_AMPER, { '=' }, STO(ASAND) },
190 : { S_AMPER, { '&' }, STO(LAND) },
191 :
192 : /* after a ' */
193 : { S_CHAR, { ANY }, PUT(S_CHAR) },
194 : { S_CHAR, { VCH }, FRZ(S_TRUNC) },
195 : { S_CHAR, { '\'' }, PUT(STO(CHAR)) },
196 : { S_CHAR, { '\\' }, PUT(S_CHAR2) },
197 :
198 : /* after a \ in a character constant
199 : useful only for '\'' */
200 : { S_CHAR2, { ANY }, PUT(S_CHAR) },
201 : { S_CHAR2, { VCH }, FRZ(S_TRUNC) },
202 :
203 : /* after a * */
204 : { S_STAR, { ANY }, FRZ(STO(STAR)) },
205 : { S_STAR, { '=' }, STO(ASSTAR) },
206 :
207 : /* after a + */
208 : { S_PLUS, { ANY }, FRZ(STO(PLUS)) },
209 : { S_PLUS, { '+' }, STO(PPLUS) },
210 : { S_PLUS, { '=' }, STO(ASPLUS) },
211 :
212 : /* after a - */
213 : { S_MINUS, { ANY }, FRZ(STO(MINUS)) },
214 : { S_MINUS, { '-' }, STO(MMINUS) },
215 : { S_MINUS, { '=' }, STO(ASMINUS) },
216 : { S_MINUS, { '>' }, STO(ARROW) },
217 :
218 : /* after a . */
219 : { S_DOT, { ANY }, FRZ(STO(DOT)) },
220 : { S_DOT, { NUM }, PUT(S_NUMBER) },
221 : { S_DOT, { '.' }, S_DOT2 },
222 :
223 : /* after .. */
224 : { S_DOT2, { ANY }, FRZ(S_DDOT) },
225 : { S_DOT2, { '.' }, STO(MDOTS) },
226 :
227 : /* after a / */
228 : { S_SLASH, { ANY }, FRZ(STO(SLASH)) },
229 : { S_SLASH, { '=' }, STO(ASSLASH) },
230 : #ifdef SEMPER_FIDELIS
231 : { S_SLASH, { '*' }, PUT(S_COMMENT) },
232 : { S_SLASH, { '/' }, PUT(S_COMMENT5) },
233 : #else
234 : { S_SLASH, { '*' }, S_COMMENT },
235 : { S_SLASH, { '/' }, S_COMMENT5 },
236 : #endif
237 : /*
238 : * There is a little hack in read_token() to disable
239 : * this last rule, if C++ (C99) comments are not enabled.
240 : */
241 :
242 : /* after a number */
243 : { S_NUMBER, { ANY }, FRZ(STO(NUMBER)) },
244 : { S_NUMBER, { ALP, NUM }, PUT(S_NUMBER) },
245 : { S_NUMBER, { '.' }, PUT(S_NUMBER) },
246 : { S_NUMBER, { 'E', 'e' }, PUT(S_NUMBER2) },
247 : { S_NUMBER, { 'P', 'p' }, PUT(S_NUMBER2) },
248 :
249 : { S_NUMBER2, { ANY }, FRZ(STO(NUMBER)) },
250 : { S_NUMBER2, { ALP, NUM }, PUT(S_NUMBER) },
251 : { S_NUMBER2, { '+', '-' }, PUT(S_NUMBER) },
252 :
253 : /* after a < */
254 : { S_LT, { ANY }, FRZ(STO(LT)) },
255 : { S_LT, { '=' }, STO(LEQ) },
256 : { S_LT, { '<' }, S_LT2 },
257 : { S_LT, { ':' }, STO(DIG_LBRK) },
258 : { S_LT, { '%' }, STO(DIG_LBRA) },
259 :
260 : { S_LT2, { ANY }, FRZ(STO(LSH)) },
261 : { S_LT2, { '=' }, STO(ASLSH) },
262 :
263 : /* after a > */
264 : { S_GT, { ANY }, FRZ(STO(GT)) },
265 : { S_GT, { '=' }, STO(GEQ) },
266 : { S_GT, { '>' }, S_GT2 },
267 :
268 : { S_GT2, { ANY }, FRZ(STO(RSH)) },
269 : { S_GT2, { '=' }, STO(ASRSH) },
270 :
271 : /* after a = */
272 : { S_EQ, { ANY }, FRZ(STO(ASGN)) },
273 : { S_EQ, { '=' }, STO(SAME) },
274 : #ifdef CAST_OP
275 : { S_EQ, { '>' }, STO(CAST) },
276 : #endif
277 :
278 : /* after a \ */
279 : { S_BACKSLASH, { ANY }, FRZ(S_BS) },
280 : { S_BACKSLASH, { 'U', 'u' }, FRZ(S_NAME_BS) },
281 :
282 : /* after a letter */
283 : { S_NAME, { ANY }, FRZ(STO(NAME)) },
284 : { S_NAME, { ALP, NUM }, PUT(S_NAME) },
285 : { S_NAME, { '\\' }, S_NAME_BS },
286 :
287 : /* after a \ in an identifier */
288 : { S_NAME_BS, { ANY }, FRZ(S_ROGUE_BS) },
289 : { S_NAME_BS, { 'u', 'U' }, PUT(S_NAME) },
290 :
291 : /* after a L */
292 : { S_LCHAR, { ANY }, FRZ(S_NAME) },
293 : { S_LCHAR, { '"' }, PUT(S_STRING) },
294 : { S_LCHAR, { '\'' }, PUT(S_CHAR) },
295 :
296 : /* after a ^ */
297 : { S_CIRC, { ANY }, FRZ(STO(CIRC)) },
298 : { S_CIRC, { '=' }, STO(ASCIRC) },
299 :
300 : /* after a | */
301 : { S_PIPE, { ANY }, FRZ(STO(OR)) },
302 : { S_PIPE, { '=' }, STO(ASOR) },
303 : { S_PIPE, { '|' }, STO(LOR) },
304 :
305 : /* after a / and * */
306 : #ifdef SEMPER_FIDELIS
307 : { S_COMMENT, { ANY }, PUT(S_COMMENT) },
308 : { S_COMMENT, { VCH }, FRZ(S_TRUNCC) },
309 : { S_COMMENT, { '*' }, PUT(S_COMMENT2) },
310 :
311 : { S_COMMENT2, { ANY }, FRZ(S_COMMENT) },
312 : { S_COMMENT2, { VCH }, FRZ(S_TRUNCC) },
313 : { S_COMMENT2, { '*' }, PUT(S_COMMENT2) },
314 : { S_COMMENT2, { '/' }, STO(PUT(COMMENT)) },
315 :
316 : { S_COMMENT5, { ANY }, PUT(S_COMMENT5) },
317 : { S_COMMENT5, { VCH }, FRZ(S_DECAY) },
318 : { S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) },
319 : #else
320 : { S_COMMENT, { ANY }, S_COMMENT },
321 : { S_COMMENT, { VCH }, FRZ(S_TRUNCC) },
322 : { S_COMMENT, { '*' }, S_COMMENT2 },
323 :
324 : { S_COMMENT2, { ANY }, FRZ(S_COMMENT) },
325 : { S_COMMENT2, { VCH }, FRZ(S_TRUNCC) },
326 : { S_COMMENT2, { '*' }, S_COMMENT2 },
327 : { S_COMMENT2, { '/' }, STO(COMMENT) },
328 :
329 : { S_COMMENT5, { ANY }, S_COMMENT5 },
330 : { S_COMMENT5, { VCH }, FRZ(S_DECAY) },
331 : { S_COMMENT5, { '\n' }, FRZ(STO(COMMENT)) },
332 : #endif
333 :
334 : /* dummy end of machine description */
335 : { 0, { 0 }, 0 }
336 : };
337 :
/*
 * cppm is the table used to store the automaton: if we are in state s
 * and we read character c, we apply the action cppm[s][c] (jumping to
 * another state, or emitting a token).
 * cppm_vch is the table for the special virtual character "end of input".
 *
 * Both tables have one row per real state (the enum values before
 * MSTATE) and are filled by init_cppm().
 */
static int cppm[MSTATE][MAX_CHAR_VAL];
static int cppm_vch[MSTATE];
346 :
347 : /*
348 : * init_cppm() fills cppm[][] with the information stored in cppms[].
349 : * It must be called before beginning the lexing process.
350 : */
351 0 : void init_cppm(void)
352 : {
353 : int i, j, k, c;
354 : static unsigned char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
355 : static unsigned char lower[] = "abcdefghijklmnopqrstuvwxyz";
356 : unsigned char *cp;
357 :
358 0 : for (i = 0; i < MSTATE; i ++) {
359 0 : for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[i][j] = S_OUCH;
360 0 : cppm_vch[i] = S_OUCH;
361 : }
362 0 : for (i = 0; cppms[i].input[0]; i ++) for (k = 0; k < CMCR; k ++) {
363 0 : int s = cppms[i].state;
364 0 : int ns = cppms[i].new_state;
365 :
366 0 : switch (c = cppms[i].input[k]) {
367 : case 0:
368 0 : break;
369 : case SPC:
370 : /* see space_char() also */
371 0 : cppm[s][' '] = ns;
372 0 : cppm[s]['\t'] = ns;
373 0 : cppm[s]['\v'] = ns;
374 0 : cppm[s]['\f'] = ns;
375 : #ifdef UNBREAKABLE_SPACE
376 : if (MAX_CHAR_VAL > UNBREAKABLE_SPACE)
377 : cppm[s][UNBREAKABLE_SPACE] = ns;
378 : #endif
379 0 : break;
380 : case ALP:
381 0 : for (cp = upper; *cp; cp ++) cppm[s][(int)*cp] = ns;
382 0 : for (cp = lower; *cp; cp ++) cppm[s][(int)*cp] = ns;
383 0 : cppm[s]['_'] = ns;
384 0 : break;
385 : case NUM:
386 0 : for (j = '0'; j <= '9'; j ++) cppm[s][j] = ns;
387 0 : break;
388 : case ANY:
389 0 : for (j = 0; j < MAX_CHAR_VAL; j ++) cppm[s][j] = ns;
390 0 : cppm_vch[s] = ns;
391 0 : break;
392 : case VCH:
393 0 : cppm_vch[s] = ns;
394 0 : break;
395 : default:
396 0 : cppm[s][c] = ns;
397 0 : break;
398 : }
399 : }
400 0 : }
401 :
402 : /*
403 : * Make some character as equivalent to a letter for identifiers.
404 : */
405 0 : void set_identifier_char(int c)
406 : {
407 0 : cppm[S_START][c] = PUT(S_NAME);
408 0 : cppm[S_NAME][c] = PUT(S_NAME);
409 0 : }
410 :
411 : /*
412 : * Remove the "identifier" status from a character.
413 : */
414 0 : void unset_identifier_char(int c)
415 : {
416 0 : cppm[S_START][c] = S_ILL;
417 0 : cppm[S_NAME][c] = FRZ(STO(NAME));
418 0 : }
419 :
/*
 * Returns 1 if c is a whitespace character other than a newline
 * (space, horizontal or vertical tabulation, form feed and, when
 * defined, UNBREAKABLE_SPACE), 0 otherwise.
 */
int space_char(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\v':
	case '\f':
		return 1;
	default:
		break;
	}
#ifdef UNBREAKABLE_SPACE
	if (c == UNBREAKABLE_SPACE) return 1;
#endif
	return 0;
}
429 :
430 : #ifndef NO_UCPP_BUF
431 : /*
432 : * our output buffer is full, flush it
433 : */
434 : void flush_output(struct lexer_state *ls)
435 : {
436 : size_t x = ls->sbuf, y = 0, z;
437 :
438 : if (ls->sbuf == 0) return;
439 : do {
440 : z = fwrite(ls->output_buf + y, 1, x, ls->output);
441 : x -= z;
442 : y += z;
443 : } while (z && x > 0);
444 : if (!y) {
445 : error(ls->line, "could not flush output (disk full ?)");
446 : die();
447 : }
448 : ls->sbuf = 0;
449 : }
450 : #endif
451 :
452 : /*
453 : * Output one character; flush the buffer if needed.
454 : * This function should not be called, except by put_char().
455 : */
456 0 : static inline void write_char(struct lexer_state *ls, unsigned char c)
457 : {
458 : #ifndef NO_UCPP_BUF
459 : ls->output_buf[ls->sbuf ++] = c;
460 : if (ls->sbuf == OUTPUT_BUF_MEMG) flush_output(ls);
461 : #else
462 0 : if (putc((int)c, ls->output) == EOF) {
463 0 : error(ls->line, "output write error (disk full ?)");
464 0 : die();
465 : }
466 : #endif
467 0 : if (c == '\n') {
468 0 : ls->oline ++;
469 : }
470 0 : }
471 :
472 : /*
473 : * schedule a character for output
474 : */
475 0 : void put_char(struct lexer_state *ls, unsigned char c)
476 : {
477 0 : if (ls->flags & KEEP_OUTPUT) write_char(ls, c);
478 0 : }
479 :
480 : /*
481 : * get next raw input character
482 : */
483 0 : static inline int read_char(struct lexer_state *ls)
484 : {
485 : unsigned char c;
486 :
487 0 : if (!ls->input) {
488 0 : return ((ls->pbuf ++) < ls->ebuf) ?
489 0 : ls->input_string[ls->pbuf - 1] : -1;
490 : }
491 : while (1) {
492 : #ifndef NO_UCPP_BUF
493 : if (ls->pbuf == ls->ebuf) {
494 : #ifdef UCPP_MMAP
495 : if (ls->from_mmap) {
496 : munmap((void *)ls->input_buf, ls->ebuf);
497 : ls->from_mmap = 0;
498 : ls->input_buf = ls->input_buf_sav;
499 : }
500 : #endif
501 : ls->ebuf = fread(ls->input_buf, 1,
502 : INPUT_BUF_MEMG, ls->input);
503 : ls->pbuf = 0;
504 : }
505 : if (ls->ebuf == 0) return -1;
506 : c = ls->input_buf[ls->pbuf ++];
507 : #else
508 0 : int x = getc(ls->input);
509 :
510 0 : if (x == EOF) return -1;
511 0 : c = x;
512 : #endif
513 0 : if (ls->flags & COPY_LINE) {
514 0 : if (c == '\n') {
515 0 : ls->copy_line[ls->cli] = 0;
516 0 : ls->cli = 0;
517 0 : } else if (ls->cli < (COPY_LINE_LENGTH - 1)) {
518 0 : ls->copy_line[ls->cli ++] = c;
519 : }
520 : }
521 0 : if (ls->macfile && c == '\n') {
522 0 : ls->macfile = 0;
523 0 : continue;
524 : }
525 0 : ls->macfile = 0;
526 0 : if (c == '\r') {
527 : /*
528 : * We found a '\r'; we handle it as a newline
529 : * and ignore the next newline. This should work
530 : * with all combinations of Msdos, MacIntosh and
531 : * Unix files on these three platforms. On other
532 : * platforms, native file formats are always
533 : * supported.
534 : */
535 0 : ls->macfile = 1;
536 0 : c = '\n';
537 : }
538 0 : break;
539 0 : }
540 0 : return c;
541 : }
542 :
543 : /*
544 : * next_fifo_char(), char_lka1() and char_lka2() give a two character
545 : * look-ahead on the input stream; this is needed for trigraphs
546 : */
547 0 : static inline int next_fifo_char(struct lexer_state *ls)
548 : {
549 : int c;
550 :
551 0 : if (ls->nlka != 0) {
552 0 : c = ls->lka[0];
553 0 : ls->lka[0] = ls->lka[1];
554 0 : ls->nlka --;
555 0 : } else c = read_char(ls);
556 0 : return c;
557 : }
558 :
559 0 : static inline int char_lka1(struct lexer_state *ls)
560 : {
561 0 : if (ls->nlka == 0) {
562 0 : ls->lka[0] = read_char(ls);
563 0 : ls->nlka ++;
564 : }
565 0 : return ls->lka[0];
566 : }
567 :
568 0 : static inline int char_lka2(struct lexer_state *ls)
569 : {
570 : #ifdef AUDIT
571 : if (ls->nlka == 0) ouch("always in motion future is");
572 : #endif
573 0 : if (ls->nlka == 1) {
574 0 : ls->lka[1] = read_char(ls);
575 0 : ls->nlka ++;
576 : }
577 0 : return ls->lka[1];
578 : }
579 :
/*
 * The nine trigraphs: the sequence ??x, where x is the `old' character,
 * stands for the `new' character.
 */
struct trigraph {
	int old, new;
};

static struct trigraph trig[9] = {
	{ '=',	'#'	},	/* ??= */
	{ '/',	'\\'	},	/* ??/ */
	{ '\'',	'^'	},	/* ??' */
	{ '(',	'['	},	/* ??( */
	{ ')',	']'	},	/* ??) */
	{ '!',	'|'	},	/* ??! */
	{ '<',	'{'	},	/* ??< */
	{ '>',	'}'	},	/* ??> */
	{ '-',	'~'	}	/* ??- */
};
593 :
594 : /*
595 : * Returns the next character, after treatment of trigraphs and terminating
596 : * backslashes. Return value is -1 if there is no more input.
597 : */
598 0 : static inline int next_char(struct lexer_state *ls)
599 : {
600 : int c;
601 :
602 0 : if (!ls->discard) return ls->last;
603 0 : ls->discard = 0;
604 : do {
605 0 : c = next_fifo_char(ls);
606 : /* check trigraphs */
607 0 : if (c == '?' && char_lka1(ls) == '?'
608 0 : && (ls->flags & HANDLE_TRIGRAPHS)) {
609 : int i, d;
610 :
611 0 : d = char_lka2(ls);
612 0 : for (i = 0; i < 9; i ++) if (d == trig[i].old) {
613 0 : if (ls->flags & WARN_TRIGRAPHS) {
614 0 : ls->count_trigraphs ++;
615 : }
616 0 : if (ls->flags & WARN_TRIGRAPHS_MORE) {
617 0 : warning(ls->line, "trigraph ?""?%c "
618 : "encountered", d);
619 : }
620 0 : next_fifo_char(ls);
621 0 : next_fifo_char(ls);
622 0 : c = trig[i].new;
623 0 : break;
624 : }
625 : }
626 0 : if (c == '\\' && char_lka1(ls) == '\n') {
627 0 : ls->line ++;
628 0 : next_fifo_char(ls);
629 0 : } else if (c == '\r' && char_lka1(ls) == '\n') {
630 0 : ls->line ++;
631 0 : next_fifo_char(ls);
632 0 : c = '\n';
633 0 : return c;
634 : } else {
635 0 : ls->last = c;
636 0 : return c;
637 : }
638 0 : } while (1);
639 : }
640 :
/*
 * Public entry point for next_char(): returns the current character
 * of the input stream (used by #error, #include directives)
 */
int grap_char(struct lexer_state *ls)
{
	int current = next_char(ls);

	return current;
}
649 :
650 : /*
651 : * Discard the current character, so that the next call to next_char()
652 : * will step into the input stream.
653 : */
654 0 : void discard_char(struct lexer_state *ls)
655 : {
656 : #ifdef AUDIT
657 : if (ls->discard) ouch("overcollecting garbage");
658 : #endif
659 0 : ls->discard = 1;
660 0 : ls->utf8 = 0;
661 0 : if (ls->last == '\n') ls->line ++;
662 0 : }
663 :
/*
 * Convert an UTF-8 encoded character to a Universal Character Name
 * using \u (or \U when appropriate).
 *
 * utf8 holds the bytes of the encoded character, the first byte being
 * the most significant one and the last byte the least significant
 * one (a value without the 0x80 bit is taken as the character itself).
 * buf receives the zero-terminated result and must have room for at
 * least 11 bytes:
 * -- the character itself if its value is lower than 128
 * -- \uxxxx if the value is at most 0xffff
 * -- \U00xxxxxx otherwise
 * Hexadecimal digits are in lower case. The returned value is the
 * length of the string, not counting the terminating zero.
 */
static int utf8_to_string(unsigned char buf[], unsigned long utf8)
{
	unsigned long val = 0;
	static const char hex[] = "0123456789abcdef";

	if (utf8 & 0x80UL) {
		unsigned long x1, x2, x3, x4;

		x1 = (utf8 >> 24) & 0x7fUL;
		x2 = (utf8 >> 16) & 0x7fUL;
		x3 = (utf8 >> 8) & 0x7fUL;
		x4 = (utf8) & 0x3fUL;
		/*
		 * A byte with bit 0x40 still set is the lead byte of the
		 * sequence, which carries less than six payload bits;
		 * a continuation byte carries six of them.
		 */
		x1 &= 0x07UL;
		if (x2 & 0x40UL) x2 &= 0x0fUL;
		if (x3 & 0x40UL) x3 &= 0x1fUL;
		/* six bits per continuation byte: shifts are 6, 12 and 18 */
		val = x4 | (x3 << 6) | (x2 << 12) | (x1 << 18);
	} else val = utf8;
	if (val < 128) {
		buf[0] = val;
		buf[1] = 0;
		return 1;
	}
	if (val <= 0xffffUL) {
		buf[0] = '\\';
		buf[1] = 'u';
		buf[2] = hex[(size_t)(val >> 12)];
		buf[3] = hex[(size_t)((val >> 8) & 0xfU)];
		buf[4] = hex[(size_t)((val >> 4) & 0xfU)];
		buf[5] = hex[(size_t)(val & 0xfU)];
		buf[6] = 0;
		return 6;
	}
	buf[0] = '\\';
	buf[1] = 'U';
	buf[2] = '0';
	buf[3] = '0';
	buf[4] = hex[(size_t)((val >> 20) & 0xfU)];
	buf[5] = hex[(size_t)((val >> 16) & 0xfU)];
	buf[6] = hex[(size_t)((val >> 12) & 0xfU)];
	buf[7] = hex[(size_t)((val >> 8) & 0xfU)];
	buf[8] = hex[(size_t)((val >> 4) & 0xfU)];
	buf[9] = hex[(size_t)(val & 0xfU)];
	buf[10] = 0;
	return 10;
}
712 :
713 : /*
714 : * Scan the identifier and put it in canonical form:
715 : * -- tranform \U0000xxxx into \uxxxx
716 : * -- inside \u and \U, make letters low case
717 : * -- report (some) incorrect use of UCN
718 : */
719 0 : static void canonize_id(struct lexer_state *ls, char *id)
720 : {
721 : char *c, *d;
722 :
723 0 : for (c = d = id; *c;) {
724 0 : if (*c == '\\') {
725 : int i;
726 :
727 0 : if (!*(c + 1)) goto canon_error;
728 0 : if (*(c + 1) == 'U') {
729 0 : for (i = 0; i < 8 && *(c + i + 2); i ++);
730 0 : if (i != 8) goto canon_error;
731 0 : *(d ++) = '\\';
732 0 : c += 2;
733 0 : for (i = 0; i < 4 && *(c + i) == '0'; i ++);
734 0 : if (i == 4) {
735 0 : *(d ++) = 'u';
736 0 : c += 4;
737 : } else {
738 0 : *(d ++) = 'U';
739 0 : i = 8;
740 : }
741 0 : for (; i > 0; i --) {
742 0 : switch (*c) {
743 0 : case 'A': *(d ++) = 'a'; break;
744 0 : case 'B': *(d ++) = 'b'; break;
745 0 : case 'C': *(d ++) = 'c'; break;
746 0 : case 'D': *(d ++) = 'd'; break;
747 0 : case 'E': *(d ++) = 'e'; break;
748 0 : case 'F': *(d ++) = 'f'; break;
749 0 : default: *(d ++) = *c; break;
750 : }
751 0 : c ++;
752 : }
753 0 : } else if (*(c + 1) == 'u') {
754 0 : for (i = 0; i < 4 && *(c + i + 2); i ++);
755 0 : if (i != 4) goto canon_error;
756 0 : *(d ++) = '\\';
757 0 : *(d ++) = 'u';
758 0 : c += 2;
759 0 : for (; i > 0; i --) {
760 0 : switch (*c) {
761 0 : case 'A': *(d ++) = 'a'; break;
762 0 : case 'B': *(d ++) = 'b'; break;
763 0 : case 'C': *(d ++) = 'c'; break;
764 0 : case 'D': *(d ++) = 'd'; break;
765 0 : case 'E': *(d ++) = 'e'; break;
766 0 : case 'F': *(d ++) = 'f'; break;
767 0 : default: *(d ++) = *c; break;
768 : }
769 0 : c ++;
770 : }
771 0 : } else goto canon_error;
772 0 : continue;
773 : }
774 0 : *(d ++) = *(c ++);
775 : }
776 0 : *d = 0;
777 0 : return;
778 :
779 : canon_error:
780 0 : for (; *c; *(d ++) = *(c ++));
781 0 : if (ls->flags & WARN_STANDARD) {
782 0 : warning(ls->line, "malformed identifier with UCN: '%s'", id);
783 : }
784 0 : *d = 0;
785 : }
786 :
/*
 * Run the automaton, in order to get the next token.
 * This function should not be called, except by next_token()
 *
 * The token is stored in ls->ctok: its type, the line on which it
 * begins and, for the token types selected by S_TOKEN(), its text.
 * When ls is not used as a lexer (LEXER flag not set), the characters
 * read are also copied to the output with put_char() while
 * ls->condcomp is set.
 *
 * return value: 1 on error, 2 on end-of-file, 0 otherwise.
 */
static inline int read_token(struct lexer_state *ls)
{
	/* cstat is the current state, nstat the action for the character */
	int cstat = S_START, nstat;
	/* number of characters stored so far in ls->ctok->name */
	size_t ltok = 0;
	/* outc: character (or negative code for %: and %:%) whose output
	   has been postponed */
	int c, outc = 0, ucn_in_id = 0;
	/* number of UTF-8 continuation bytes still expected; this and
	   utf8 are used only when UTF8_SOURCE is set */
	int shift_state;
	unsigned long utf8;
	long l = ls->line;

	ls->ctok->line = l;
	/* a previous call left a second token to deliver: return it now */
	if (ls->pending_token) {
		if ((ls->ctok->type = ls->pending_token) == BUNCH) {
			ls->ctok->name[0] = '\\';
			ls->ctok->name[1] = 0;
		}
		ls->pending_token = 0;
		return 0;
	}
	if (ls->flags & UTF8_SOURCE) {
		utf8 = ls->utf8;
		shift_state = 0;
	}
	/* emit newlines until the output line number matches the input */
	if (!(ls->flags & LEXER) && (ls->flags & KEEP_OUTPUT))
		for (; ls->line > ls->oline;) put_char(ls, '\n');
	do {
		c = next_char(ls);
		if (c < 0) {
			/* end of input */
			if ((ls->flags & UTF8_SOURCE) && shift_state) {
				if (ls->flags & WARN_STANDARD)
					warning(ls->line, "truncated UTF-8 "
						"character");
				shift_state = 0;
				utf8 = 0;
			}
			if (cstat == S_START) return 2;
			nstat = cppm_vch[cstat];
		} else {
			if (ls->flags & UTF8_SOURCE) {
				if (shift_state) {
					/* inside a multi-byte character */
					if ((c & 0xc0) != 0x80) {
						/* not a continuation byte */
						if (ls->flags & WARN_STANDARD)
							warning(ls->line,
								"truncated "
								"UTF-8 "
								"character");
						shift_state = 0;
						utf8 = 0;
						c = '_';
					} else {
						utf8 = (utf8 << 8) | c;
						if (-- shift_state) {
							/* more bytes expected */
							ls->discard = 1;
							continue;
						}
						/* the character is complete:
						   the automaton sees it as
						   an underscore */
						c = '_';
					}
				} else if ((c & 0xc0) == 0xc0) {
					/* lead byte: its high bits give the
					   number of continuation bytes */
					if ((c & 0x30) == 0x30) {
						shift_state = 3;
					} else if (c & 0x20) {
						shift_state = 2;
					} else {
						shift_state = 1;
					}
					utf8 = c;
					ls->discard = 1;
					continue;
				} else utf8 = 0;
			}
			/* characters beyond the table use column 0 */
			nstat = cppm[cstat][c < MAX_CHAR_VAL ? c : 0];
		}
#ifdef AUDIT
		if (nstat == S_OUCH) {
			ouch("bad move...");
		}
#endif
		/*
		 * disable C++-like comments
		 */
		if (nstat == S_COMMENT5 && !(ls->flags & CPLUSPLUS_COMMENTS))
			nstat = FRZ(STO(SLASH));

		/* pseudo-states (beyond MSTATE) are translated here */
		if (noMOD(nstat) >= MSTATE && !ttSTO(nstat))
			switch (noMOD(nstat)) {
			case S_ILL:
				if (ls->flags & CCHARSET) {
					error(ls->line, "illegal character '%c'", c);
					return 1;
				}
				nstat = PUT(STO(BUNCH));
				break;
			case S_BS:
				/* lone backslash: the token is the backslash,
				   the current character is read again */
				ls->ctok->name[0] = '\\';
				ltok ++;
				nstat = FRZ(STO(BUNCH));
				if (!(ls->flags & LEXER)) put_char(ls, '\\');
				break;
			case S_ROGUE_BS:
				/* the name ends here; a BUNCH token is
				   delivered by the next call */
				ls->pending_token = BUNCH;
				nstat = FRZ(STO(NAME));
				break;
			case S_DDOT:
				/* two dots only: two DOT tokens */
				ls->pending_token = DOT;
				nstat = FRZ(STO(DOT));
				break;
			case S_DDSHARP:
				/* %:% is not followed by a colon: DIG_SHARP
				   now, PCT by the next call */
				ls->pending_token = PCT;
				nstat = FRZ(STO(DIG_SHARP));
				break;
			case S_BEHEAD:
				error(l, "unfinished string at end of line");
				return 1;
			case S_DECAY:
				warning(l, "unterminated // comment");
				nstat = FRZ(STO(COMMENT));
				break;
			case S_TRUNC:
				error(l, "truncated token");
				return 1;
			case S_TRUNCC:
				error(l, "truncated comment");
				return 1;
#ifdef AUDIT
			case S_OUCH:
				ouch("machine went out of control");
				break;
#endif
			}
		if (!ttFRZ(nstat)) {
			/* the character is consumed */
			discard_char(ls);
			if (!(ls->flags & LEXER) && ls->condcomp) {
				/* z is the next state, or S_ILL if the
				   token is complete */
				int z = ttSTO(nstat) ? S_ILL : noMOD(nstat);

				if (cstat == S_NAME || z == S_NAME
					|| ((CMT(cstat) || CMT(z))
					&& (ls->flags & DISCARD_COMMENTS))) {
					/* not output here */
					outc = 0;
				} else if (z == S_LCHAR || z == S_SLASH
					   || (z == S_SHARP && ls->ltwnl)
					   || (z == S_PCT && ls->ltwnl)
					   || (z == S_BACKSLASH)) {
					/* output is postponed until the
					   next character is known */
					outc = c;
				} else if (z == S_PCT2 && ls->ltwnl) {
					outc = -1;
				} else if (z == S_PCT3 && ls->ltwnl) {
					/* we have %:% but this still might
					   not be a %:%: */
					outc = -2;
				} else {
					/* output what was postponed, then
					   the current character */
					if (outc < 0) {
						put_char(ls, '%');
						put_char(ls, ':');
						if (outc == -2)
							put_char(ls, '%');
						outc = 0;
					} else if (outc) {
						put_char(ls, outc);
						outc = 0;
					}
					put_char(ls, c);
				}
			}
		} else if (outc == '/' && !(ls->flags & LEXER)
			&& ls->condcomp) {
			/* this is a hack: we need to dump a pending slash */
			put_char(ls, outc);
			outc = 0;
		}
		if (ttPUT(nstat)) {
			/* the character is part of the token text */
			if (cstat == S_NAME_BS) {
				/* the backslash of a UCN was not stored
				   when it was read */
				ucn_in_id = 1;
				wan(ls->ctok->name, ltok, '\\', ls->tknl);
			}
			if ((ls->flags & UTF8_SOURCE) && utf8) {
				/* store the UCN for the UTF-8 character */
				unsigned char buf[11];
				int i, j;

				for (i = 0, j = utf8_to_string(buf, utf8);
					i < j; i ++)
					wan(ls->ctok->name, ltok, buf[i],
						ls->tknl);
				/* if (j > 1) ucn_in_id = 1; */
			} else wan(ls->ctok->name, ltok,
				(unsigned char)c, ls->tknl);
		}
		if (ttSTO(nstat)) {
			/* the token is complete */
			if (S_TOKEN(noMOD(nstat))) {
				wan(ls->ctok->name, ltok,
					(unsigned char)0, ls->tknl);
			}
			ls->ctok->type = noMOD(nstat);
			break;
		}
		cstat = noMOD(nstat);
	} while (1);
	/* a discarded comment is replaced by a space in the output */
	if (!(ls->flags & LEXER) && (ls->flags & DISCARD_COMMENTS)
			&& ls->ctok->type == COMMENT) put_char(ls, ' ');
	if (ucn_in_id && ls->ctok->type == NAME)
		canonize_id(ls, ls->ctok->name);
	return 0;
}
994 :
995 : /*
996 : * fills ls->ctok with the next token
997 : */
998 0 : int next_token(struct lexer_state *ls)
999 : {
1000 0 : if (ls->flags & READ_AGAIN) {
1001 0 : ls->flags &= ~READ_AGAIN;
1002 0 : if (!(ls->flags & LEXER)) {
1003 0 : char *c = S_TOKEN(ls->ctok->type) ?
1004 0 : ls->ctok->name : token_name(ls->ctok);
1005 0 : if (ls->ctok->type == OPT_NONE) {
1006 0 : ls->ctok->type = NONE;
1007 : #ifdef SEMPER_FIDELIS
1008 : ls->ctok->name[0] = ' ';
1009 : ls->ctok->name[1] = 0;
1010 : #endif
1011 0 : put_char(ls, ' ');
1012 0 : } else if (ls->ctok->type != NAME &&
1013 0 : !(ls->ltwnl && (ls->ctok->type == SHARP
1014 0 : || ls->ctok->type == DIG_SHARP)))
1015 0 : for (; *c; c ++) put_char(ls, *c);
1016 : }
1017 0 : return 0;
1018 : }
1019 0 : return read_token(ls);
1020 : }
|