Line data Source code
1 :
2 : /* Tokenizer implementation */
3 :
4 : #include "Python.h"
5 : #include "pgenheaders.h"
6 :
7 : #include <ctype.h>
8 : #include <assert.h>
9 :
10 : #include "tokenizer.h"
11 : #include "errcode.h"
12 :
13 : #ifndef PGEN
14 : #include "unicodeobject.h"
15 : #include "bytesobject.h"
16 : #include "fileobject.h"
17 : #include "codecs.h"
18 : #include "abstract.h"
19 : #endif /* PGEN */
20 :
21 : #define is_potential_identifier_start(c) (\
22 : (c >= 'a' && c <= 'z')\
23 : || (c >= 'A' && c <= 'Z')\
24 : || c == '_'\
25 : || (c >= 128))
26 :
27 : #define is_potential_identifier_char(c) (\
28 : (c >= 'a' && c <= 'z')\
29 : || (c >= 'A' && c <= 'Z')\
30 : || (c >= '0' && c <= '9')\
31 : || c == '_'\
32 : || (c >= 128))
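/* Illustrative behaviour of these macros (not part of the original file):
   is_potential_identifier_start('x') and ('_') are true, ('1') is false,
   while is_potential_identifier_char('1') is true.  Any byte >= 128 is
   accepted here; the complete identifier is validated later by
   verify_identifier(). */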
33 :
34 : extern char *PyOS_Readline(FILE *, FILE *, char *);
35 : /* Return malloc'ed string including trailing \n;
36 : empty malloc'ed string for EOF;
37 : NULL if interrupted */
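/* A minimal caller sketch (not part of this file) for the contract above;
   the helper name and prompt string are placeholders. */
#if 0
static int
readline_example(void)
{
    char *line = PyOS_Readline(stdin, stdout, ">>> ");
    if (line == NULL)
        return -1;               /* interrupted */
    if (*line == '\0') {         /* empty string signals EOF */
        PyMem_FREE(line);
        return 0;
    }
    /* normal case: line includes the trailing '\n' */
    PyMem_FREE(line);            /* tok_nextc() below frees it the same way */
    return 1;
}
#endif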
38 :
39 : /* Don't ever change this -- it would break the portability of Python code */
40 : #define TABSIZE 8
41 :
42 : /* Forward */
43 : static struct tok_state *tok_new(void);
44 : static int tok_nextc(struct tok_state *tok);
45 : static void tok_backup(struct tok_state *tok, int c);
46 :
47 :
48 : /* Token names */
49 :
50 : char *_PyParser_TokenNames[] = {
51 : "ENDMARKER",
52 : "NAME",
53 : "NUMBER",
54 : "STRING",
55 : "NEWLINE",
56 : "INDENT",
57 : "DEDENT",
58 : "LPAR",
59 : "RPAR",
60 : "LSQB",
61 : "RSQB",
62 : "COLON",
63 : "COMMA",
64 : "SEMI",
65 : "PLUS",
66 : "MINUS",
67 : "STAR",
68 : "SLASH",
69 : "VBAR",
70 : "AMPER",
71 : "LESS",
72 : "GREATER",
73 : "EQUAL",
74 : "DOT",
75 : "PERCENT",
76 : "LBRACE",
77 : "RBRACE",
78 : "EQEQUAL",
79 : "NOTEQUAL",
80 : "LESSEQUAL",
81 : "GREATEREQUAL",
82 : "TILDE",
83 : "CIRCUMFLEX",
84 : "LEFTSHIFT",
85 : "RIGHTSHIFT",
86 : "DOUBLESTAR",
87 : "PLUSEQUAL",
88 : "MINEQUAL",
89 : "STAREQUAL",
90 : "SLASHEQUAL",
91 : "PERCENTEQUAL",
92 : "AMPEREQUAL",
93 : "VBAREQUAL",
94 : "CIRCUMFLEXEQUAL",
95 : "LEFTSHIFTEQUAL",
96 : "RIGHTSHIFTEQUAL",
97 : "DOUBLESTAREQUAL",
98 : "DOUBLESLASH",
99 : "DOUBLESLASHEQUAL",
100 : "AT",
101 : "RARROW",
102 : "ELLIPSIS",
103 : /* This table must match the #defines in token.h! */
104 : "OP",
105 : "<ERRORTOKEN>",
106 : "<N_TOKENS>"
107 : };
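/* Illustrative correspondence (not in the original source): the token codes
   defined in token.h index directly into the table above.  The helper name
   is hypothetical. */
#if 0
static void
token_name_example(void)
{
    assert(strcmp(_PyParser_TokenNames[NAME], "NAME") == 0);
    assert(strcmp(_PyParser_TokenNames[DOUBLESTAR], "DOUBLESTAR") == 0);
    assert(strcmp(_PyParser_TokenNames[OP], "OP") == 0);
}
#endif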
108 :
109 :
110 : /* Create and initialize a new tok_state structure */
111 :
112 : static struct tok_state *
113 3 : tok_new(void)
114 : {
115 3 : struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 : sizeof(struct tok_state));
117 3 : if (tok == NULL)
118 0 : return NULL;
119 3 : tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 3 : tok->done = E_OK;
121 3 : tok->fp = NULL;
122 3 : tok->input = NULL;
123 3 : tok->tabsize = TABSIZE;
124 3 : tok->indent = 0;
125 3 : tok->indstack[0] = 0;
126 3 : tok->atbol = 1;
127 3 : tok->pendin = 0;
128 3 : tok->prompt = tok->nextprompt = NULL;
129 3 : tok->lineno = 0;
130 3 : tok->level = 0;
131 3 : tok->altwarning = 1;
132 3 : tok->alterror = 1;
133 3 : tok->alttabsize = 1;
134 3 : tok->altindstack[0] = 0;
135 3 : tok->decoding_state = STATE_INIT;
136 3 : tok->decoding_erred = 0;
137 3 : tok->read_coding_spec = 0;
138 3 : tok->enc = NULL;
139 3 : tok->encoding = NULL;
140 3 : tok->cont_line = 0;
141 : #ifndef PGEN
142 3 : tok->filename = NULL;
143 3 : tok->decoding_readline = NULL;
144 3 : tok->decoding_buffer = NULL;
145 : #endif
146 3 : return tok;
147 : }
148 :
149 : static char *
150 0 : new_string(const char *s, Py_ssize_t len)
151 : {
152 0 : char* result = (char *)PyMem_MALLOC(len + 1);
153 0 : if (result != NULL) {
154 0 : memcpy(result, s, len);
155 0 : result[len] = '\0';
156 : }
157 0 : return result;
158 : }
159 :
160 : #ifdef PGEN
161 :
162 : static char *
163 0 : decoding_fgets(char *s, int size, struct tok_state *tok)
164 : {
165 0 : return fgets(s, size, tok->fp);
166 : }
167 :
168 : static int
169 0 : decoding_feof(struct tok_state *tok)
170 : {
171 0 : return feof(tok->fp);
172 : }
173 :
174 : static char *
175 0 : decode_str(const char *str, int exec_input, struct tok_state *tok)
176 : {
177 0 : return new_string(str, strlen(str));
178 : }
179 :
180 : #else /* PGEN */
181 :
182 : static char *
183 0 : error_ret(struct tok_state *tok) /* XXX */
184 : {
185 0 : tok->decoding_erred = 1;
186 0 : if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187 0 : PyMem_FREE(tok->buf);
188 0 : tok->buf = NULL;
189 0 : return NULL; /* as if it were EOF */
190 : }
191 :
192 :
193 : static char *
194 0 : get_normal_name(char *s) /* for utf-8 and latin-1 */
195 : {
196 : char buf[13];
197 : int i;
198 0 : for (i = 0; i < 12; i++) {
199 0 : int c = s[i];
200 0 : if (c == '\0')
201 0 : break;
202 0 : else if (c == '_')
203 0 : buf[i] = '-';
204 : else
205 0 : buf[i] = tolower(c);
206 : }
207 0 : buf[i] = '\0';
208 0 : if (strcmp(buf, "utf-8") == 0 ||
209 0 : strncmp(buf, "utf-8-", 6) == 0)
210 0 : return "utf-8";
211 0 : else if (strcmp(buf, "latin-1") == 0 ||
212 0 : strcmp(buf, "iso-8859-1") == 0 ||
213 0 : strcmp(buf, "iso-latin-1") == 0 ||
214 0 : strncmp(buf, "latin-1-", 8) == 0 ||
215 0 : strncmp(buf, "iso-8859-1-", 11) == 0 ||
216 0 : strncmp(buf, "iso-latin-1-", 12) == 0)
217 0 : return "iso-8859-1";
218 : else
219 0 : return s;
220 : }
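/* Worked examples (illustrative only, hypothetical helper) of the
   normalization performed by get_normal_name(): */
#if 0
static void
get_normal_name_example(void)
{
    assert(strcmp(get_normal_name("UTF_8"), "utf-8") == 0);
    assert(strcmp(get_normal_name("Latin-1"), "iso-8859-1") == 0);
    assert(strcmp(get_normal_name("ascii"), "ascii") == 0);  /* unknown: returned as-is */
}
#endif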
221 :
222 : /* Return the coding spec in S, or NULL if none is found. */
223 :
224 : static char *
225 0 : get_coding_spec(const char *s, Py_ssize_t size)
226 : {
227 : Py_ssize_t i;
228 : /* Coding spec must be in a comment, and that comment must be
229 : * the only statement on the source code line. */
230 0 : for (i = 0; i < size - 6; i++) {
231 0 : if (s[i] == '#')
232 0 : break;
233 0 : if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234 0 : return NULL;
235 : }
236 0 : for (; i < size - 6; i++) { /* XXX inefficient search */
237 0 : const char* t = s + i;
238 0 : if (strncmp(t, "coding", 6) == 0) {
239 0 : const char* begin = NULL;
240 0 : t += 6;
241 0 : if (t[0] != ':' && t[0] != '=')
242 0 : continue;
243 : do {
244 0 : t++;
245 0 : } while (t[0] == '\x20' || t[0] == '\t');
246 :
247 0 : begin = t;
248 0 : while (Py_ISALNUM(t[0]) ||
249 0 : t[0] == '-' || t[0] == '_' || t[0] == '.')
250 0 : t++;
251 :
252 0 : if (begin < t) {
253 0 : char* r = new_string(begin, t - begin);
254 0 : char* q = get_normal_name(r);
255 0 : if (r != q) {
256 0 : PyMem_FREE(r);
257 0 : r = new_string(q, strlen(q));
258 : }
259 0 : return r;
260 : }
261 : }
262 : }
263 0 : return NULL;
264 : }
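/* Illustrative use (not part of the original file); the line text is just an
   example of a PEP 263 cookie and the helper name is hypothetical. */
#if 0
static void
get_coding_spec_example(void)
{
    const char *line = "# -*- coding: Latin-1 -*-\n";
    char *spec = get_coding_spec(line, strlen(line));
    /* spec is the malloc'ed, already normalized name "iso-8859-1" */
    if (spec != NULL)
        PyMem_FREE(spec);
}
#endif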
265 :
266 : /* Check whether the line contains a coding spec. If it does,
267 : invoke the set_readline function for the new encoding.
268 : This function receives the tok_state and the new encoding.
269 : Return 1 on success, 0 on failure. */
270 :
271 : static int
272 0 : check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
273 : int set_readline(struct tok_state *, const char *))
274 : {
275 : char * cs;
276 0 : int r = 1;
277 :
278 0 : if (tok->cont_line)
279 : /* It's a continuation line, so it can't be a coding spec. */
280 0 : return 1;
281 0 : cs = get_coding_spec(line, size);
282 0 : if (cs != NULL) {
283 0 : tok->read_coding_spec = 1;
284 0 : if (tok->encoding == NULL) {
285 : assert(tok->decoding_state == STATE_RAW);
286 0 : if (strcmp(cs, "utf-8") == 0) {
287 0 : tok->encoding = cs;
288 : } else {
289 0 : r = set_readline(tok, cs);
290 0 : if (r) {
291 0 : tok->encoding = cs;
292 0 : tok->decoding_state = STATE_NORMAL;
293 : }
294 : else
295 0 : PyMem_FREE(cs);
296 : }
297 : } else { /* then, compare cs with BOM */
298 0 : r = (strcmp(tok->encoding, cs) == 0);
299 0 : PyMem_FREE(cs);
300 : }
301 : }
302 0 : if (!r) {
303 0 : cs = tok->encoding;
304 0 : if (!cs)
305 0 : cs = "with BOM";
306 0 : PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
307 : }
308 0 : return r;
309 : }
310 :
311 : /* See whether the file starts with a BOM. If it does,
312 : invoke the set_readline function with the new encoding.
313 : Return 1 on success, 0 on failure. */
314 :
315 : static int
316 0 : check_bom(int get_char(struct tok_state *),
317 : void unget_char(int, struct tok_state *),
318 : int set_readline(struct tok_state *, const char *),
319 : struct tok_state *tok)
320 : {
321 : int ch1, ch2, ch3;
322 0 : ch1 = get_char(tok);
323 0 : tok->decoding_state = STATE_RAW;
324 0 : if (ch1 == EOF) {
325 0 : return 1;
326 0 : } else if (ch1 == 0xEF) {
327 0 : ch2 = get_char(tok);
328 0 : if (ch2 != 0xBB) {
329 0 : unget_char(ch2, tok);
330 0 : unget_char(ch1, tok);
331 0 : return 1;
332 : }
333 0 : ch3 = get_char(tok);
334 0 : if (ch3 != 0xBF) {
335 0 : unget_char(ch3, tok);
336 0 : unget_char(ch2, tok);
337 0 : unget_char(ch1, tok);
338 0 : return 1;
339 : }
340 : #if 0
341 : /* Disable support for UTF-16 BOMs until a decision
342 : is made whether this needs to be supported. */
343 : } else if (ch1 == 0xFE) {
344 : ch2 = get_char(tok);
345 : if (ch2 != 0xFF) {
346 : unget_char(ch2, tok);
347 : unget_char(ch1, tok);
348 : return 1;
349 : }
350 : if (!set_readline(tok, "utf-16-be"))
351 : return 0;
352 : tok->decoding_state = STATE_NORMAL;
353 : } else if (ch1 == 0xFF) {
354 : ch2 = get_char(tok);
355 : if (ch2 != 0xFE) {
356 : unget_char(ch2, tok);
357 : unget_char(ch1, tok);
358 : return 1;
359 : }
360 : if (!set_readline(tok, "utf-16-le"))
361 : return 0;
362 : tok->decoding_state = STATE_NORMAL;
363 : #endif
364 : } else {
365 0 : unget_char(ch1, tok);
366 0 : return 1;
367 : }
368 0 : if (tok->encoding != NULL)
369 0 : PyMem_FREE(tok->encoding);
370 0 : tok->encoding = new_string("utf-8", 5); /* the resulting encoding is utf-8 */
371 : /* No need to set_readline: input is already utf-8 */
372 0 : return 1;
373 : }
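/* For reference (not in the original source): the only BOM acted on above is
   UTF-8, the byte sequence 0xEF 0xBB 0xBF; the UTF-16 BOMs 0xFE 0xFF and
   0xFF 0xFE are recognized only by the disabled #if 0 block. */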
374 :
375 : /* Read a line of text from TOK into S, using the stream in TOK.
376 : Return NULL on failure, else S.
377 :
378 : On entry, tok->decoding_buffer will be one of:
379 : 1) NULL: need to call tok->decoding_readline to get a new line
380 : 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
381 : stored the result in tok->decoding_buffer
382 : 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
383 : (in the s buffer) to copy entire contents of the line read
384 : by tok->decoding_readline. tok->decoding_buffer has the overflow.
385 : In this case, fp_readl is called in a loop (with an expanded buffer)
386 : until the buffer ends with a '\n' (or until the end of the file is
387 : reached): see tok_nextc and its calls to decoding_fgets.
388 : */
389 :
390 : static char *
391 0 : fp_readl(char *s, int size, struct tok_state *tok)
392 : {
393 : PyObject* bufobj;
394 : const char *buf;
395 : Py_ssize_t buflen;
396 :
397 : /* Ask for one less byte so we can terminate it */
398 : assert(size > 0);
399 0 : size--;
400 :
401 0 : if (tok->decoding_buffer) {
402 0 : bufobj = tok->decoding_buffer;
403 0 : Py_INCREF(bufobj);
404 : }
405 : else
406 : {
407 0 : bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
408 0 : if (bufobj == NULL)
409 0 : goto error;
410 : }
411 0 : if (PyUnicode_CheckExact(bufobj))
412 : {
413 0 : buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
414 0 : if (buf == NULL) {
415 0 : goto error;
416 : }
417 : }
418 : else
419 : {
420 0 : buf = PyByteArray_AsString(bufobj);
421 0 : if (buf == NULL) {
422 0 : goto error;
423 : }
424 0 : buflen = PyByteArray_GET_SIZE(bufobj);
425 : }
426 :
427 0 : Py_XDECREF(tok->decoding_buffer);
428 0 : if (buflen > size) {
429 : /* Too many chars, the rest goes into tok->decoding_buffer */
430 0 : tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
431 : buflen-size);
432 0 : if (tok->decoding_buffer == NULL)
433 0 : goto error;
434 0 : buflen = size;
435 : }
436 : else
437 0 : tok->decoding_buffer = NULL;
438 :
439 0 : memcpy(s, buf, buflen);
440 0 : s[buflen] = '\0';
441 0 : if (buflen == 0) /* EOF */
442 0 : s = NULL;
443 0 : Py_DECREF(bufobj);
444 0 : return s;
445 :
446 : error:
447 0 : Py_XDECREF(bufobj);
448 0 : return error_ret(tok);
449 : }
450 :
451 : /* Set the readline function for TOK to the readline method of a text
452 : stream opened over TOK's file, decoding with the encoding ENC.
453 :
454 : This function is called from check_bom and check_coding_spec.
455 :
456 : ENC is usually identical to the future value of tok->encoding,
457 : except for the (currently unsupported) case of UTF-16.
458 :
459 : Return 1 on success, 0 on failure. */
460 :
461 : static int
462 0 : fp_setreadl(struct tok_state *tok, const char* enc)
463 : {
464 0 : PyObject *readline = NULL, *stream = NULL, *io = NULL;
465 : _Py_IDENTIFIER(open);
466 : _Py_IDENTIFIER(readline);
467 : int fd;
468 :
469 0 : io = PyImport_ImportModuleNoBlock("io");
470 0 : if (io == NULL)
471 0 : goto cleanup;
472 :
473 0 : fd = fileno(tok->fp);
474 0 : if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
475 0 : PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
476 0 : goto cleanup;
477 : }
478 :
479 0 : stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
480 : fd, "r", -1, enc, Py_None, Py_None, Py_False);
481 0 : if (stream == NULL)
482 0 : goto cleanup;
483 :
484 0 : Py_XDECREF(tok->decoding_readline);
485 0 : readline = _PyObject_GetAttrId(stream, &PyId_readline);
486 0 : tok->decoding_readline = readline;
487 :
488 : /* The file has been reopened; parsing will restart from
489 : * the beginning of the file, we have to reset the line number.
490 : * But this function has been called from inside tok_nextc() which
491 : * will increment lineno before it returns. So we set it to -1 so that
492 : * the next call to tok_nextc() will start with tok->lineno == 0.
493 : */
494 0 : tok->lineno = -1;
495 :
496 : cleanup:
497 0 : Py_XDECREF(stream);
498 0 : Py_XDECREF(io);
499 0 : return readline != NULL;
500 : }
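/* For reference (not part of the original file): the _PyObject_CallMethodId()
   call above is roughly the C spelling of the Python expression
       io.open(fd, "r", -1, enc, None, None, False)
   i.e. a text stream over the same descriptor with encoding ENC and
   closefd=False; its readline() then feeds fp_readl(). */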
501 :
502 : /* Fetch the next byte from TOK. */
503 :
504 0 : static int fp_getc(struct tok_state *tok) {
505 0 : return getc(tok->fp);
506 : }
507 :
508 : /* Unfetch the last byte back into TOK. */
509 :
510 0 : static void fp_ungetc(int c, struct tok_state *tok) {
511 0 : ungetc(c, tok->fp);
512 0 : }
513 :
514 : /* Check whether the characters at s start a valid
515 : UTF-8 sequence. Return the number of characters forming
516 : the sequence if yes, 0 if not. */
517 0 : static int valid_utf8(const unsigned char* s)
518 : {
519 0 : int expected = 0;
520 : int length;
521 0 : if (*s < 0x80)
522 : /* single-byte code */
523 0 : return 1;
524 0 : if (*s < 0xc0)
525 : /* following byte */
526 0 : return 0;
527 0 : if (*s < 0xE0)
528 0 : expected = 1;
529 0 : else if (*s < 0xF0)
530 0 : expected = 2;
531 0 : else if (*s < 0xF8)
532 0 : expected = 3;
533 : else
534 0 : return 0;
535 0 : length = expected + 1;
536 0 : for (; expected; expected--)
537 0 : if (s[expected] < 0x80 || s[expected] >= 0xC0)
538 0 : return 0;
539 0 : return length;
540 : }
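/* Worked examples (illustrative, not part of the original file; the helper
   name is hypothetical): */
#if 0
static void
valid_utf8_example(void)
{
    assert(valid_utf8((const unsigned char *)"A") == 1);             /* ASCII */
    assert(valid_utf8((const unsigned char *)"\xC3\xA9") == 2);      /* U+00E9 */
    assert(valid_utf8((const unsigned char *)"\xE2\x82\xAC") == 3);  /* U+20AC */
    assert(valid_utf8((const unsigned char *)"\xC3" "A") == 0);      /* truncated sequence */
}
#endif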
541 :
542 : /* Read a line of input from TOK. Determine encoding
543 : if necessary. */
544 :
545 : static char *
546 0 : decoding_fgets(char *s, int size, struct tok_state *tok)
547 : {
548 0 : char *line = NULL;
549 0 : int badchar = 0;
550 : for (;;) {
551 0 : if (tok->decoding_state == STATE_NORMAL) {
552 : /* We already have a codec associated with
553 : this input. */
554 0 : line = fp_readl(s, size, tok);
555 0 : break;
556 0 : } else if (tok->decoding_state == STATE_RAW) {
557 : /* We want a 'raw' read. */
558 0 : line = Py_UniversalNewlineFgets(s, size,
559 : tok->fp, NULL);
560 0 : break;
561 : } else {
562 : /* We have not yet determined the encoding.
563 : If an encoding is found, use the file-pointer
564 : reader functions from now on. */
565 0 : if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
566 0 : return error_ret(tok);
567 : assert(tok->decoding_state != STATE_INIT);
568 : }
569 0 : }
570 0 : if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
571 0 : if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
572 0 : return error_ret(tok);
573 : }
574 : }
575 : #ifndef PGEN
576 : /* The default encoding is UTF-8, so make sure we don't have any
577 : non-UTF-8 sequences in it. */
578 0 : if (line && !tok->encoding) {
579 : unsigned char *c;
580 : int length;
581 0 : for (c = (unsigned char *)line; *c; c += length)
582 0 : if (!(length = valid_utf8(c))) {
583 0 : badchar = *c;
584 0 : break;
585 : }
586 : }
587 0 : if (badchar) {
588 : /* Need to add 1 to the line number, since this line
589 : has not been counted yet. */
590 0 : PyErr_Format(PyExc_SyntaxError,
591 : "Non-UTF-8 code starting with '\\x%.2x' "
592 : "in file %U on line %i, "
593 : "but no encoding declared; "
594 : "see http://python.org/dev/peps/pep-0263/ for details",
595 0 : badchar, tok->filename, tok->lineno + 1);
596 0 : return error_ret(tok);
597 : }
598 : #endif
599 0 : return line;
600 : }
601 :
602 : static int
603 0 : decoding_feof(struct tok_state *tok)
604 : {
605 0 : if (tok->decoding_state != STATE_NORMAL) {
606 0 : return feof(tok->fp);
607 : } else {
608 0 : PyObject* buf = tok->decoding_buffer;
609 0 : if (buf == NULL) {
610 0 : buf = PyObject_CallObject(tok->decoding_readline, NULL);
611 0 : if (buf == NULL) {
612 0 : error_ret(tok);
613 0 : return 1;
614 : } else {
615 0 : tok->decoding_buffer = buf;
616 : }
617 : }
618 0 : return PyObject_Length(buf) == 0;
619 : }
620 : }
621 :
622 : /* Fetch a byte from TOK, using the string buffer. */
623 :
624 : static int
625 0 : buf_getc(struct tok_state *tok) {
626 0 : return Py_CHARMASK(*tok->str++);
627 : }
628 :
629 : /* Unfetch a byte from TOK, using the string buffer. */
630 :
631 : static void
632 0 : buf_ungetc(int c, struct tok_state *tok) {
633 0 : tok->str--;
634 : assert(Py_CHARMASK(*tok->str) == c); /* tok->str may point to a read-only segment */
635 0 : }
636 :
637 : /* Set the readline function for TOK to ENC. For the string-based
638 : tokenizer, this means to just record the encoding. */
639 :
640 : static int
641 0 : buf_setreadl(struct tok_state *tok, const char* enc) {
642 0 : tok->enc = enc;
643 0 : return 1;
644 : }
645 :
646 : /* Return a UTF-8 encoded Python bytes object from the
647 : C byte string STR, which is encoded with ENC. */
648 :
649 : static PyObject *
650 0 : translate_into_utf8(const char* str, const char* enc) {
651 : PyObject *utf8;
652 0 : PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
653 0 : if (buf == NULL)
654 0 : return NULL;
655 0 : utf8 = PyUnicode_AsUTF8String(buf);
656 0 : Py_DECREF(buf);
657 0 : return utf8;
658 : }
659 :
660 :
661 : static char *
662 3 : translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
663 3 : int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
664 : char *buf, *current;
665 3 : char c = '\0';
666 3 : buf = PyMem_MALLOC(needed_length);
667 3 : if (buf == NULL) {
668 0 : tok->done = E_NOMEM;
669 0 : return NULL;
670 : }
671 10395 : for (current = buf; *s; s++, current++) {
672 10392 : c = *s;
673 10392 : if (skip_next_lf) {
674 0 : skip_next_lf = 0;
675 0 : if (c == '\n') {
676 0 : c = *++s;
677 0 : if (!c)
678 0 : break;
679 : }
680 : }
681 10392 : if (c == '\r') {
682 0 : skip_next_lf = 1;
683 0 : c = '\n';
684 : }
685 10392 : *current = c;
686 : }
687 : /* If this is exec input, add a newline to the end of the string if
688 : there isn't one already. */
689 3 : if (exec_input && c != '\n') {
690 0 : *current = '\n';
691 0 : current++;
692 : }
693 3 : *current = '\0';
694 3 : final_length = current - buf + 1;
695 3 : if (final_length < needed_length && final_length)
696 : /* should never fail */
697 3 : buf = PyMem_REALLOC(buf, final_length);
698 3 : return buf;
699 : }
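/* A minimal sketch (not from the original file, hypothetical helper) of what
   translate_newlines() produces; tok is only needed so E_NOMEM can be
   recorded on allocation failure. */
#if 0
static void
translate_newlines_example(struct tok_state *tok)
{
    char *a = translate_newlines("a\r\nb\r", 0, tok);  /* -> "a\nb\n" */
    char *b = translate_newlines("x", 1, tok);         /* exec input: -> "x\n" */
    if (a != NULL)
        PyMem_FREE(a);
    if (b != NULL)
        PyMem_FREE(b);
}
#endif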
700 :
701 : /* Decode a byte string STR for use as the buffer of TOK.
702 : Look for encoding declarations inside STR, and record them
703 : inside TOK. */
704 :
705 : static const char *
706 0 : decode_str(const char *input, int single, struct tok_state *tok)
707 : {
708 0 : PyObject* utf8 = NULL;
709 : const char *str;
710 : const char *s;
711 0 : const char *newl[2] = {NULL, NULL};
712 0 : int lineno = 0;
713 0 : tok->input = str = translate_newlines(input, single, tok);
714 0 : if (str == NULL)
715 0 : return NULL;
716 0 : tok->enc = NULL;
717 0 : tok->str = str;
718 0 : if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
719 0 : return error_ret(tok);
720 0 : str = tok->str; /* string after BOM if any */
721 : assert(str);
722 0 : if (tok->enc != NULL) {
723 0 : utf8 = translate_into_utf8(str, tok->enc);
724 0 : if (utf8 == NULL)
725 0 : return error_ret(tok);
726 0 : str = PyBytes_AsString(utf8);
727 : }
728 0 : for (s = str;; s++) {
729 0 : if (*s == '\0') break;
730 0 : else if (*s == '\n') {
731 : assert(lineno < 2);
732 0 : newl[lineno] = s;
733 0 : lineno++;
734 0 : if (lineno == 2) break;
735 : }
736 0 : }
737 0 : tok->enc = NULL;
738 : /* need to check line 1 and 2 separately since check_coding_spec
739 : assumes a single line as input */
740 0 : if (newl[0]) {
741 0 : if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
742 0 : return error_ret(tok);
743 0 : if (tok->enc == NULL && newl[1]) {
744 0 : if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
745 : tok, buf_setreadl))
746 0 : return error_ret(tok);
747 : }
748 : }
749 0 : if (tok->enc != NULL) {
750 : assert(utf8 == NULL);
751 0 : utf8 = translate_into_utf8(str, tok->enc);
752 0 : if (utf8 == NULL)
753 0 : return error_ret(tok);
754 0 : str = PyBytes_AS_STRING(utf8);
755 : }
756 : assert(tok->decoding_buffer == NULL);
757 0 : tok->decoding_buffer = utf8; /* CAUTION */
758 0 : return str;
759 : }
760 :
761 : #endif /* PGEN */
762 :
763 : /* Set up tokenizer for string */
764 :
765 : struct tok_state *
766 0 : PyTokenizer_FromString(const char *str, int exec_input)
767 : {
768 0 : struct tok_state *tok = tok_new();
769 0 : if (tok == NULL)
770 0 : return NULL;
771 0 : str = (char *)decode_str(str, exec_input, tok);
772 0 : if (str == NULL) {
773 0 : PyTokenizer_Free(tok);
774 0 : return NULL;
775 : }
776 :
777 : /* XXX: constify members. */
778 0 : tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
779 0 : return tok;
780 : }
781 :
782 : struct tok_state *
783 3 : PyTokenizer_FromUTF8(const char *str, int exec_input)
784 : {
785 3 : struct tok_state *tok = tok_new();
786 3 : if (tok == NULL)
787 0 : return NULL;
788 : #ifndef PGEN
789 3 : tok->input = str = translate_newlines(str, exec_input, tok);
790 : #endif
791 3 : if (str == NULL) {
792 0 : PyTokenizer_Free(tok);
793 0 : return NULL;
794 : }
795 3 : tok->decoding_state = STATE_RAW;
796 3 : tok->read_coding_spec = 1;
797 3 : tok->enc = NULL;
798 3 : tok->str = str;
799 3 : tok->encoding = (char *)PyMem_MALLOC(6);
800 3 : if (!tok->encoding) {
801 0 : PyTokenizer_Free(tok);
802 0 : return NULL;
803 : }
804 3 : strcpy(tok->encoding, "utf-8");
805 :
806 : /* XXX: constify members. */
807 3 : tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
808 3 : return tok;
809 : }
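/* A minimal usage sketch (not part of this file, hypothetical helper):
   tokenize a UTF-8 string and print each token; the printing mirrors
   tok_dump() at the end of this file. */
#if 0
static int
tokenize_string_example(const char *source)
{
    char *start, *end;
    int type;
    struct tok_state *tok = PyTokenizer_FromUTF8(source, 1);
    if (tok == NULL)
        return -1;
    do {
        type = PyTokenizer_Get(tok, &start, &end);
        printf("%s", _PyParser_TokenNames[type]);
        if (type == NAME || type == NUMBER || type == STRING || type == OP)
            printf("(%.*s)", (int)(end - start), start);
        printf("\n");
    } while (type != ENDMARKER && type != ERRORTOKEN);
    PyTokenizer_Free(tok);
    return type == ENDMARKER ? 0 : -1;
}
#endif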
810 :
811 : /* Set up tokenizer for file */
812 :
813 : struct tok_state *
814 0 : PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
815 : {
816 0 : struct tok_state *tok = tok_new();
817 0 : if (tok == NULL)
818 0 : return NULL;
819 0 : if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
820 0 : PyTokenizer_Free(tok);
821 0 : return NULL;
822 : }
823 0 : tok->cur = tok->inp = tok->buf;
824 0 : tok->end = tok->buf + BUFSIZ;
825 0 : tok->fp = fp;
826 0 : tok->prompt = ps1;
827 0 : tok->nextprompt = ps2;
828 0 : if (enc != NULL) {
829 : /* Must copy encoding declaration since it
830 : gets copied into the parse tree. */
831 0 : tok->encoding = PyMem_MALLOC(strlen(enc)+1);
832 0 : if (!tok->encoding) {
833 0 : PyTokenizer_Free(tok);
834 0 : return NULL;
835 : }
836 0 : strcpy(tok->encoding, enc);
837 0 : tok->decoding_state = STATE_NORMAL;
838 : }
839 0 : return tok;
840 : }
841 :
842 :
843 : /* Free a tok_state structure */
844 :
845 : void
846 3 : PyTokenizer_Free(struct tok_state *tok)
847 : {
848 3 : if (tok->encoding != NULL)
849 0 : PyMem_FREE(tok->encoding);
850 : #ifndef PGEN
851 3 : Py_XDECREF(tok->decoding_readline);
852 3 : Py_XDECREF(tok->decoding_buffer);
853 3 : Py_XDECREF(tok->filename);
854 : #endif
855 3 : if (tok->fp != NULL && tok->buf != NULL)
856 0 : PyMem_FREE(tok->buf);
857 3 : if (tok->input)
858 3 : PyMem_FREE((char *)tok->input);
859 3 : PyMem_FREE(tok);
860 3 : }
861 :
862 : /* Get next char, updating state; error code goes into tok->done */
863 :
864 : static int
865 12167 : tok_nextc(register struct tok_state *tok)
866 : {
867 : for (;;) {
868 12167 : if (tok->cur != tok->inp) {
869 11897 : return Py_CHARMASK(*tok->cur++); /* Fast path */
870 : }
871 270 : if (tok->done != E_OK)
872 6 : return EOF;
873 264 : if (tok->fp == NULL) {
874 264 : char *end = strchr(tok->inp, '\n');
875 264 : if (end != NULL)
876 261 : end++;
877 : else {
878 3 : end = strchr(tok->inp, '\0');
879 3 : if (end == tok->inp) {
880 3 : tok->done = E_EOF;
881 3 : return EOF;
882 : }
883 : }
884 261 : if (tok->start == NULL)
885 261 : tok->buf = tok->cur;
886 261 : tok->line_start = tok->cur;
887 261 : tok->lineno++;
888 261 : tok->inp = end;
889 261 : return Py_CHARMASK(*tok->cur++);
890 : }
891 0 : if (tok->prompt != NULL) {
892 0 : char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
893 : #ifndef PGEN
894 0 : if (newtok != NULL) {
895 0 : char *translated = translate_newlines(newtok, 0, tok);
896 0 : PyMem_FREE(newtok);
897 0 : if (translated == NULL)
898 0 : return EOF;
899 0 : newtok = translated;
900 : }
901 0 : if (tok->encoding && newtok && *newtok) {
902 : /* Recode to UTF-8 */
903 : Py_ssize_t buflen;
904 : const char* buf;
905 0 : PyObject *u = translate_into_utf8(newtok, tok->encoding);
906 0 : PyMem_FREE(newtok);
907 0 : if (!u) {
908 0 : tok->done = E_DECODE;
909 0 : return EOF;
910 : }
911 0 : buflen = PyBytes_GET_SIZE(u);
912 0 : buf = PyBytes_AS_STRING(u);
913 0 : if (!buf) {
914 0 : Py_DECREF(u);
915 0 : tok->done = E_DECODE;
916 0 : return EOF;
917 : }
918 0 : newtok = PyMem_MALLOC(buflen+1);
919 0 : strcpy(newtok, buf);
920 0 : Py_DECREF(u);
921 : }
922 : #endif
923 0 : if (tok->nextprompt != NULL)
924 0 : tok->prompt = tok->nextprompt;
925 0 : if (newtok == NULL)
926 0 : tok->done = E_INTR;
927 0 : else if (*newtok == '\0') {
928 0 : PyMem_FREE(newtok);
929 0 : tok->done = E_EOF;
930 : }
931 0 : else if (tok->start != NULL) {
932 0 : size_t start = tok->start - tok->buf;
933 0 : size_t oldlen = tok->cur - tok->buf;
934 0 : size_t newlen = oldlen + strlen(newtok);
935 0 : char *buf = tok->buf;
936 0 : buf = (char *)PyMem_REALLOC(buf, newlen+1);
937 0 : tok->lineno++;
938 0 : if (buf == NULL) {
939 0 : PyMem_FREE(tok->buf);
940 0 : tok->buf = NULL;
941 0 : PyMem_FREE(newtok);
942 0 : tok->done = E_NOMEM;
943 0 : return EOF;
944 : }
945 0 : tok->buf = buf;
946 0 : tok->cur = tok->buf + oldlen;
947 0 : tok->line_start = tok->cur;
948 0 : strcpy(tok->buf + oldlen, newtok);
949 0 : PyMem_FREE(newtok);
950 0 : tok->inp = tok->buf + newlen;
951 0 : tok->end = tok->inp + 1;
952 0 : tok->start = tok->buf + start;
953 : }
954 : else {
955 0 : tok->lineno++;
956 0 : if (tok->buf != NULL)
957 0 : PyMem_FREE(tok->buf);
958 0 : tok->buf = newtok;
959 0 : tok->line_start = tok->buf;
960 0 : tok->cur = tok->buf;
961 0 : tok->line_start = tok->buf;
962 0 : tok->inp = strchr(tok->buf, '\0');
963 0 : tok->end = tok->inp + 1;
964 : }
965 : }
966 : else {
967 0 : int done = 0;
968 0 : Py_ssize_t cur = 0;
969 : char *pt;
970 0 : if (tok->start == NULL) {
971 0 : if (tok->buf == NULL) {
972 0 : tok->buf = (char *)
973 0 : PyMem_MALLOC(BUFSIZ);
974 0 : if (tok->buf == NULL) {
975 0 : tok->done = E_NOMEM;
976 0 : return EOF;
977 : }
978 0 : tok->end = tok->buf + BUFSIZ;
979 : }
980 0 : if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
981 : tok) == NULL) {
982 0 : tok->done = E_EOF;
983 0 : done = 1;
984 : }
985 : else {
986 0 : tok->done = E_OK;
987 0 : tok->inp = strchr(tok->buf, '\0');
988 0 : done = tok->inp[-1] == '\n';
989 : }
990 : }
991 : else {
992 0 : cur = tok->cur - tok->buf;
993 0 : if (decoding_feof(tok)) {
994 0 : tok->done = E_EOF;
995 0 : done = 1;
996 : }
997 : else
998 0 : tok->done = E_OK;
999 : }
1000 0 : tok->lineno++;
1001 : /* Read until '\n' or EOF */
1002 0 : while (!done) {
1003 0 : Py_ssize_t curstart = tok->start == NULL ? -1 :
1004 0 : tok->start - tok->buf;
1005 0 : Py_ssize_t curvalid = tok->inp - tok->buf;
1006 0 : Py_ssize_t newsize = curvalid + BUFSIZ;
1007 0 : char *newbuf = tok->buf;
1008 0 : newbuf = (char *)PyMem_REALLOC(newbuf,
1009 : newsize);
1010 0 : if (newbuf == NULL) {
1011 0 : tok->done = E_NOMEM;
1012 0 : tok->cur = tok->inp;
1013 0 : return EOF;
1014 : }
1015 0 : tok->buf = newbuf;
1016 0 : tok->inp = tok->buf + curvalid;
1017 0 : tok->end = tok->buf + newsize;
1018 0 : tok->start = curstart < 0 ? NULL :
1019 0 : tok->buf + curstart;
1020 0 : if (decoding_fgets(tok->inp,
1021 0 : (int)(tok->end - tok->inp),
1022 : tok) == NULL) {
1023 : /* Break out early on decoding
1024 : errors, as tok->buf will be NULL
1025 : */
1026 0 : if (tok->decoding_erred)
1027 0 : return EOF;
1028 : /* Last line does not end in \n,
1029 : fake one */
1030 0 : strcpy(tok->inp, "\n");
1031 : }
1032 0 : tok->inp = strchr(tok->inp, '\0');
1033 0 : done = tok->inp[-1] == '\n';
1034 : }
1035 0 : if (tok->buf != NULL) {
1036 0 : tok->cur = tok->buf + cur;
1037 0 : tok->line_start = tok->cur;
1038 : /* replace "\r\n" with "\n" */
1039 : /* For Mac leave the \r, giving a syntax error */
1040 0 : pt = tok->inp - 2;
1041 0 : if (pt >= tok->buf && *pt == '\r') {
1042 0 : *pt++ = '\n';
1043 0 : *pt = '\0';
1044 0 : tok->inp = pt;
1045 : }
1046 : }
1047 : }
1048 0 : if (tok->done != E_OK) {
1049 0 : if (tok->prompt != NULL)
1050 0 : PySys_WriteStderr("\n");
1051 0 : tok->cur = tok->inp;
1052 0 : return EOF;
1053 : }
1054 0 : }
1055 : /*NOTREACHED*/
1056 : }
1057 :
1058 :
1059 : /* Back-up one character */
1060 :
1061 : static void
1062 1769 : tok_backup(register struct tok_state *tok, register int c)
1063 : {
1064 1769 : if (c != EOF) {
1065 1766 : if (--tok->cur < tok->buf)
1066 0 : Py_FatalError("tok_backup: beginning of buffer");
1067 1766 : if (*tok->cur != c)
1068 0 : *tok->cur = c;
1069 : }
1070 1769 : }
1071 :
1072 :
1073 : /* Return the token corresponding to a single character */
1074 :
1075 : int
1076 564 : PyToken_OneChar(int c)
1077 : {
1078 564 : switch (c) {
1079 123 : case '(': return LPAR;
1080 123 : case ')': return RPAR;
1081 23 : case '[': return LSQB;
1082 23 : case ']': return RSQB;
1083 60 : case ':': return COLON;
1084 110 : case ',': return COMMA;
1085 0 : case ';': return SEMI;
1086 13 : case '+': return PLUS;
1087 1 : case '-': return MINUS;
1088 1 : case '*': return STAR;
1089 0 : case '/': return SLASH;
1090 0 : case '|': return VBAR;
1091 0 : case '&': return AMPER;
1092 0 : case '<': return LESS;
1093 2 : case '>': return GREATER;
1094 74 : case '=': return EQUAL;
1095 0 : case '.': return DOT;
1096 7 : case '%': return PERCENT;
1097 1 : case '{': return LBRACE;
1098 1 : case '}': return RBRACE;
1099 0 : case '^': return CIRCUMFLEX;
1100 0 : case '~': return TILDE;
1101 2 : case '@': return AT;
1102 0 : default: return OP;
1103 : }
1104 : }
1105 :
1106 :
1107 : int
1108 579 : PyToken_TwoChars(int c1, int c2)
1109 : {
1110 579 : switch (c1) {
1111 : case '=':
1112 83 : switch (c2) {
1113 9 : case '=': return EQEQUAL;
1114 : }
1115 74 : break;
1116 : case '!':
1117 3 : switch (c2) {
1118 3 : case '=': return NOTEQUAL;
1119 : }
1120 0 : break;
1121 : case '<':
1122 0 : switch (c2) {
1123 0 : case '>': return NOTEQUAL;
1124 0 : case '=': return LESSEQUAL;
1125 0 : case '<': return LEFTSHIFT;
1126 : }
1127 0 : break;
1128 : case '>':
1129 2 : switch (c2) {
1130 0 : case '=': return GREATEREQUAL;
1131 0 : case '>': return RIGHTSHIFT;
1132 : }
1133 2 : break;
1134 : case '+':
1135 14 : switch (c2) {
1136 1 : case '=': return PLUSEQUAL;
1137 : }
1138 13 : break;
1139 : case '-':
1140 1 : switch (c2) {
1141 0 : case '=': return MINEQUAL;
1142 0 : case '>': return RARROW;
1143 : }
1144 1 : break;
1145 : case '*':
1146 3 : switch (c2) {
1147 2 : case '*': return DOUBLESTAR;
1148 0 : case '=': return STAREQUAL;
1149 : }
1150 1 : break;
1151 : case '/':
1152 0 : switch (c2) {
1153 0 : case '/': return DOUBLESLASH;
1154 0 : case '=': return SLASHEQUAL;
1155 : }
1156 0 : break;
1157 : case '|':
1158 0 : switch (c2) {
1159 0 : case '=': return VBAREQUAL;
1160 : }
1161 0 : break;
1162 : case '%':
1163 7 : switch (c2) {
1164 0 : case '=': return PERCENTEQUAL;
1165 : }
1166 7 : break;
1167 : case '&':
1168 0 : switch (c2) {
1169 0 : case '=': return AMPEREQUAL;
1170 : }
1171 0 : break;
1172 : case '^':
1173 0 : switch (c2) {
1174 0 : case '=': return CIRCUMFLEXEQUAL;
1175 : }
1176 0 : break;
1177 : }
1178 564 : return OP;
1179 : }
1180 :
1181 : int
1182 15 : PyToken_ThreeChars(int c1, int c2, int c3)
1183 : {
1184 15 : switch (c1) {
1185 : case '<':
1186 0 : switch (c2) {
1187 : case '<':
1188 0 : switch (c3) {
1189 : case '=':
1190 0 : return LEFTSHIFTEQUAL;
1191 : }
1192 0 : break;
1193 : }
1194 0 : break;
1195 : case '>':
1196 0 : switch (c2) {
1197 : case '>':
1198 0 : switch (c3) {
1199 : case '=':
1200 0 : return RIGHTSHIFTEQUAL;
1201 : }
1202 0 : break;
1203 : }
1204 0 : break;
1205 : case '*':
1206 2 : switch (c2) {
1207 : case '*':
1208 2 : switch (c3) {
1209 : case '=':
1210 0 : return DOUBLESTAREQUAL;
1211 : }
1212 2 : break;
1213 : }
1214 2 : break;
1215 : case '/':
1216 0 : switch (c2) {
1217 : case '/':
1218 0 : switch (c3) {
1219 : case '=':
1220 0 : return DOUBLESLASHEQUAL;
1221 : }
1222 0 : break;
1223 : }
1224 0 : break;
1225 : case '.':
1226 0 : switch (c2) {
1227 : case '.':
1228 0 : switch (c3) {
1229 : case '.':
1230 0 : return ELLIPSIS;
1231 : }
1232 0 : break;
1233 : }
1234 0 : break;
1235 : }
1236 15 : return OP;
1237 : }
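/* Illustrative values (not part of the original file, hypothetical helper)
   showing the longest-match probing done by tok_get() below: */
#if 0
static void
operator_lookup_example(void)
{
    assert(PyToken_OneChar('*') == STAR);
    assert(PyToken_TwoChars('*', '*') == DOUBLESTAR);
    assert(PyToken_ThreeChars('*', '*', '=') == DOUBLESTAREQUAL);
    assert(PyToken_TwoChars('*', '+') == OP);  /* no match; tok_get backs c2 up */
}
#endif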
1238 :
1239 : static int
1240 0 : indenterror(struct tok_state *tok)
1241 : {
1242 0 : if (tok->alterror) {
1243 0 : tok->done = E_TABSPACE;
1244 0 : tok->cur = tok->inp;
1245 0 : return 1;
1246 : }
1247 0 : if (tok->altwarning) {
1248 : #ifdef PGEN
1249 0 : PySys_WriteStderr("inconsistent use of tabs and spaces "
1250 : "in indentation\n");
1251 : #else
1252 0 : PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1253 : "in indentation\n", tok->filename);
1254 : #endif
1255 0 : tok->altwarning = 0;
1256 : }
1257 0 : return 0;
1258 : }
1259 :
1260 : #ifdef PGEN
1261 : #define verify_identifier(tok) 1
1262 : #else
1263 : /* Verify that the identifier follows PEP 3131.
1264 : All identifier strings are guaranteed to be "ready" unicode objects.
1265 : */
1266 : static int
1267 0 : verify_identifier(struct tok_state *tok)
1268 : {
1269 : PyObject *s;
1270 : int result;
1271 0 : s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1272 0 : if (s == NULL || PyUnicode_READY(s) == -1) {
1273 0 : if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1274 0 : PyErr_Clear();
1275 0 : tok->done = E_IDENTIFIER;
1276 : } else {
1277 0 : tok->done = E_ERROR;
1278 : }
1279 0 : return 0;
1280 : }
1281 0 : result = PyUnicode_IsIdentifier(s);
1282 0 : Py_DECREF(s);
1283 0 : if (result == 0)
1284 0 : tok->done = E_IDENTIFIER;
1285 0 : return result;
1286 : }
1287 : #endif
1288 :
1289 : /* Get next token, after space stripping etc. */
1290 :
1291 : static int
1292 1823 : tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1293 : {
1294 : register int c;
1295 : int blankline, nonascii;
1296 :
1297 1823 : *p_start = *p_end = NULL;
1298 : nextline:
1299 1884 : tok->start = NULL;
1300 1884 : blankline = 0;
1301 :
1302 : /* Get indentation level */
1303 1884 : if (tok->atbol) {
1304 259 : register int col = 0;
1305 259 : register int altcol = 0;
1306 259 : tok->atbol = 0;
1307 : for (;;) {
1308 1991 : c = tok_nextc(tok);
1309 1991 : if (c == ' ')
1310 1732 : col++, altcol++;
1311 259 : else if (c == '\t') {
1312 0 : col = (col/tok->tabsize + 1) * tok->tabsize;
1313 0 : altcol = (altcol/tok->alttabsize + 1)
1314 0 : * tok->alttabsize;
1315 : }
1316 259 : else if (c == '\014') /* Control-L (formfeed) */
1317 0 : col = altcol = 0; /* For Emacs users */
1318 : else
1319 259 : break;
1320 1732 : }
1321 259 : tok_backup(tok, c);
1322 259 : if (c == '#' || c == '\n') {
1323 : /* Lines with only whitespace and/or comments
1324 : shouldn't affect the indentation and are
1325 : not passed to the parser as NEWLINE tokens,
1326 : except *totally* empty lines in interactive
1327 : mode, which signal the end of a command group. */
1328 59 : if (col == 0 && c == '\n' && tok->prompt != NULL)
1329 0 : blankline = 0; /* Let it through */
1330 : else
1331 59 : blankline = 1; /* Ignore completely */
1332 : /* We can't jump back right here since we still
1333 : may need to skip to the end of a comment */
1334 : }
1335 259 : if (!blankline && tok->level == 0) {
1336 198 : if (col == tok->indstack[tok->indent]) {
1337 : /* No change */
1338 93 : if (altcol != tok->altindstack[tok->indent]) {
1339 0 : if (indenterror(tok))
1340 0 : return ERRORTOKEN;
1341 : }
1342 : }
1343 105 : else if (col > tok->indstack[tok->indent]) {
1344 : /* Indent -- always one */
1345 56 : if (tok->indent+1 >= MAXINDENT) {
1346 0 : tok->done = E_TOODEEP;
1347 0 : tok->cur = tok->inp;
1348 0 : return ERRORTOKEN;
1349 : }
1350 56 : if (altcol <= tok->altindstack[tok->indent]) {
1351 0 : if (indenterror(tok))
1352 0 : return ERRORTOKEN;
1353 : }
1354 56 : tok->pendin++;
1355 56 : tok->indstack[++tok->indent] = col;
1356 56 : tok->altindstack[tok->indent] = altcol;
1357 : }
1358 : else /* col < tok->indstack[tok->indent] */ {
1359 : /* Dedent -- any number, must be consistent */
1360 254 : while (tok->indent > 0 &&
1361 100 : col < tok->indstack[tok->indent]) {
1362 56 : tok->pendin--;
1363 56 : tok->indent--;
1364 : }
1365 49 : if (col != tok->indstack[tok->indent]) {
1366 0 : tok->done = E_DEDENT;
1367 0 : tok->cur = tok->inp;
1368 0 : return ERRORTOKEN;
1369 : }
1370 49 : if (altcol != tok->altindstack[tok->indent]) {
1371 0 : if (indenterror(tok))
1372 0 : return ERRORTOKEN;
1373 : }
1374 : }
1375 : }
1376 : }
1377 :
1378 1884 : tok->start = tok->cur;
1379 :
1380 : /* Return pending indents/dedents */
1381 1884 : if (tok->pendin != 0) {
1382 112 : if (tok->pendin < 0) {
1383 56 : tok->pendin++;
1384 56 : return DEDENT;
1385 : }
1386 : else {
1387 56 : tok->pendin--;
1388 56 : return INDENT;
1389 : }
1390 : }
1391 :
1392 : again:
1393 1777 : tok->start = NULL;
1394 : /* Skip spaces */
1395 : do {
1396 2288 : c = tok_nextc(tok);
1397 2288 : } while (c == ' ' || c == '\t' || c == '\014');
1398 :
1399 : /* Set start of current token */
1400 1777 : tok->start = tok->cur - 1;
1401 :
1402 : /* Skip comment */
1403 1777 : if (c == '#')
1404 410 : while (c != EOF && c != '\n')
1405 390 : c = tok_nextc(tok);
1406 :
1407 : /* Check for EOF and errors now */
1408 1777 : if (c == EOF) {
1409 6 : return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1410 : }
1411 :
1412 : /* Identifier (most frequent token!) */
1413 1771 : nonascii = 0;
1414 1771 : if (is_potential_identifier_start(c)) {
1415 : /* Process b"", r"", u"", br"" and rb"" */
1416 701 : int saw_b = 0, saw_r = 0, saw_u = 0;
1417 : while (1) {
1418 778 : if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
1419 3 : saw_b = 1;
1420 : /* Since this literal prefix exists only for backwards compatibility, we
1421 : don't support it in arbitrary order the way we do for byte literals. */
1422 775 : else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
1423 6 : saw_u = 1;
1424 : /* ur"" and ru"" are not supported */
1425 769 : else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
1426 68 : saw_r = 1;
1427 : else
1428 : break;
1429 77 : c = tok_nextc(tok);
1430 77 : if (c == '"' || c == '\'')
1431 : goto letter_quote;
1432 77 : }
1433 6193 : while (is_potential_identifier_char(c)) {
1434 4791 : if (c >= 128)
1435 0 : nonascii = 1;
1436 4791 : c = tok_nextc(tok);
1437 : }
1438 701 : tok_backup(tok, c);
1439 701 : if (nonascii &&
1440 0 : !verify_identifier(tok)) {
1441 0 : tok->done = E_IDENTIFIER;
1442 0 : return ERRORTOKEN;
1443 : }
1444 701 : *p_start = tok->start;
1445 701 : *p_end = tok->cur;
1446 701 : return NAME;
1447 : }
1448 :
1449 : /* Newline */
1450 1070 : if (c == '\n') {
1451 256 : tok->atbol = 1;
1452 256 : if (blankline || tok->level > 0)
1453 : goto nextline;
1454 195 : *p_start = tok->start;
1455 195 : *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1456 195 : tok->cont_line = 0;
1457 195 : return NEWLINE;
1458 : }
1459 :
1460 : /* Period or number starting with period? */
1461 814 : if (c == '.') {
1462 120 : c = tok_nextc(tok);
1463 120 : if (isdigit(c)) {
1464 0 : goto fraction;
1465 120 : } else if (c == '.') {
1466 0 : c = tok_nextc(tok);
1467 0 : if (c == '.') {
1468 0 : *p_start = tok->start;
1469 0 : *p_end = tok->cur;
1470 0 : return ELLIPSIS;
1471 : } else {
1472 0 : tok_backup(tok, c);
1473 : }
1474 0 : tok_backup(tok, '.');
1475 : } else {
1476 120 : tok_backup(tok, c);
1477 : }
1478 120 : *p_start = tok->start;
1479 120 : *p_end = tok->cur;
1480 120 : return DOT;
1481 : }
1482 :
1483 : /* Number */
1484 694 : if (isdigit(c)) {
1485 33 : if (c == '0') {
1486 : /* Hex, octal or binary -- maybe. */
1487 9 : c = tok_nextc(tok);
1488 9 : if (c == '.')
1489 0 : goto fraction;
1490 9 : if (c == 'j' || c == 'J')
1491 : goto imaginary;
1492 9 : if (c == 'x' || c == 'X') {
1493 :
1494 : /* Hex */
1495 0 : c = tok_nextc(tok);
1496 0 : if (!isxdigit(c)) {
1497 0 : tok->done = E_TOKEN;
1498 0 : tok_backup(tok, c);
1499 0 : return ERRORTOKEN;
1500 : }
1501 : do {
1502 0 : c = tok_nextc(tok);
1503 0 : } while (isxdigit(c));
1504 : }
1505 9 : else if (c == 'o' || c == 'O') {
1506 : /* Octal */
1507 0 : c = tok_nextc(tok);
1508 0 : if (c < '0' || c >= '8') {
1509 0 : tok->done = E_TOKEN;
1510 0 : tok_backup(tok, c);
1511 0 : return ERRORTOKEN;
1512 : }
1513 : do {
1514 0 : c = tok_nextc(tok);
1515 0 : } while ('0' <= c && c < '8');
1516 : }
1517 9 : else if (c == 'b' || c == 'B') {
1518 : /* Binary */
1519 0 : c = tok_nextc(tok);
1520 0 : if (c != '0' && c != '1') {
1521 0 : tok->done = E_TOKEN;
1522 0 : tok_backup(tok, c);
1523 0 : return ERRORTOKEN;
1524 : }
1525 : do {
1526 0 : c = tok_nextc(tok);
1527 0 : } while (c == '0' || c == '1');
1528 : }
1529 : else {
1530 9 : int nonzero = 0;
1531 : /* maybe old-style octal; c is first char of it */
1532 : /* in any case, allow '0' as a literal */
1533 18 : while (c == '0')
1534 0 : c = tok_nextc(tok);
1535 18 : while (isdigit(c)) {
1536 0 : nonzero = 1;
1537 0 : c = tok_nextc(tok);
1538 : }
1539 9 : if (c == '.')
1540 0 : goto fraction;
1541 9 : else if (c == 'e' || c == 'E')
1542 : goto exponent;
1543 9 : else if (c == 'j' || c == 'J')
1544 : goto imaginary;
1545 9 : else if (nonzero) {
1546 0 : tok->done = E_TOKEN;
1547 0 : tok_backup(tok, c);
1548 0 : return ERRORTOKEN;
1549 : }
1550 : }
1551 : }
1552 : else {
1553 : /* Decimal */
1554 : do {
1555 24 : c = tok_nextc(tok);
1556 24 : } while (isdigit(c));
1557 : {
1558 : /* Accept floating point numbers. */
1559 24 : if (c == '.') {
1560 : fraction:
1561 : /* Fraction */
1562 : do {
1563 0 : c = tok_nextc(tok);
1564 0 : } while (isdigit(c));
1565 : }
1566 24 : if (c == 'e' || c == 'E') {
1567 : exponent:
1568 : /* Exponent part */
1569 0 : c = tok_nextc(tok);
1570 0 : if (c == '+' || c == '-')
1571 0 : c = tok_nextc(tok);
1572 0 : if (!isdigit(c)) {
1573 0 : tok->done = E_TOKEN;
1574 0 : tok_backup(tok, c);
1575 0 : return ERRORTOKEN;
1576 : }
1577 : do {
1578 0 : c = tok_nextc(tok);
1579 0 : } while (isdigit(c));
1580 : }
1581 24 : if (c == 'j' || c == 'J')
1582 : /* Imaginary part */
1583 : imaginary:
1584 0 : c = tok_nextc(tok);
1585 : }
1586 : }
1587 33 : tok_backup(tok, c);
1588 33 : *p_start = tok->start;
1589 33 : *p_end = tok->cur;
1590 33 : return NUMBER;
1591 : }
1592 :
1593 : letter_quote:
1594 : /* String */
1595 661 : if (c == '\'' || c == '"') {
1596 77 : int quote = c;
1597 77 : int quote_size = 1; /* 1 or 3 */
1598 77 : int end_quote_size = 0;
1599 :
1600 : /* Find the quote size and start of string */
1601 77 : c = tok_nextc(tok);
1602 77 : if (c == quote) {
1603 5 : c = tok_nextc(tok);
1604 5 : if (c == quote)
1605 0 : quote_size = 3;
1606 : else
1607 5 : end_quote_size = 1; /* empty string found */
1608 : }
1609 77 : if (c != quote)
1610 77 : tok_backup(tok, c);
1611 :
1612 : /* Get rest of string */
1613 1947 : while (end_quote_size != quote_size) {
1614 1793 : c = tok_nextc(tok);
1615 1793 : if (c == EOF) {
1616 0 : if (quote_size == 3)
1617 0 : tok->done = E_EOFS;
1618 : else
1619 0 : tok->done = E_EOLS;
1620 0 : tok->cur = tok->inp;
1621 0 : return ERRORTOKEN;
1622 : }
1623 1793 : if (quote_size == 1 && c == '\n') {
1624 0 : tok->done = E_EOLS;
1625 0 : tok->cur = tok->inp;
1626 0 : return ERRORTOKEN;
1627 : }
1628 1793 : if (c == quote)
1629 72 : end_quote_size += 1;
1630 : else {
1631 1721 : end_quote_size = 0;
1632 1721 : if (c == '\\')
1633 3 : c = tok_nextc(tok); /* skip escaped char */
1634 : }
1635 : }
1636 :
1637 77 : *p_start = tok->start;
1638 77 : *p_end = tok->cur;
1639 77 : return STRING;
1640 : }
1641 :
1642 : /* Line continuation */
1643 584 : if (c == '\\') {
1644 5 : c = tok_nextc(tok);
1645 5 : if (c != '\n') {
1646 0 : tok->done = E_LINECONT;
1647 0 : tok->cur = tok->inp;
1648 0 : return ERRORTOKEN;
1649 : }
1650 5 : tok->cont_line = 1;
1651 5 : goto again; /* Read next line */
1652 : }
1653 :
1654 : /* Check for two-character token */
1655 : {
1656 579 : int c2 = tok_nextc(tok);
1657 579 : int token = PyToken_TwoChars(c, c2);
1658 579 : if (token != OP) {
1659 15 : int c3 = tok_nextc(tok);
1660 15 : int token3 = PyToken_ThreeChars(c, c2, c3);
1661 15 : if (token3 != OP) {
1662 0 : token = token3;
1663 : } else {
1664 15 : tok_backup(tok, c3);
1665 : }
1666 15 : *p_start = tok->start;
1667 15 : *p_end = tok->cur;
1668 15 : return token;
1669 : }
1670 564 : tok_backup(tok, c2);
1671 : }
1672 :
1673 : /* Keep track of parentheses nesting level */
1674 564 : switch (c) {
1675 : case '(':
1676 : case '[':
1677 : case '{':
1678 147 : tok->level++;
1679 147 : break;
1680 : case ')':
1681 : case ']':
1682 : case '}':
1683 147 : tok->level--;
1684 147 : break;
1685 : }
1686 :
1687 : /* Punctuation character */
1688 564 : *p_start = tok->start;
1689 564 : *p_end = tok->cur;
1690 564 : return PyToken_OneChar(c);
1691 : }
1692 :
1693 : int
1694 1823 : PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1695 : {
1696 1823 : int result = tok_get(tok, p_start, p_end);
1697 1823 : if (tok->decoding_erred) {
1698 0 : result = ERRORTOKEN;
1699 0 : tok->done = E_DECODE;
1700 : }
1701 1823 : return result;
1702 : }
1703 :
1704 : /* Get the encoding of a Python file. Check for the coding cookie and check if
1705 : the file starts with a BOM.
1706 :
1707 : PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1708 : encoding in the first or second line of the file (in which case the encoding
1709 : should be assumed to be UTF-8).
1710 :
1711 : The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1712 : by the caller. */
1713 :
1714 : char *
1715 0 : PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1716 : {
1717 : struct tok_state *tok;
1718 : FILE *fp;
1719 0 : char *p_start = NULL, *p_end = NULL, *encoding = NULL;
1720 :
1721 0 : fd = dup(fd);
1722 0 : if (fd < 0) {
1723 0 : return NULL;
1724 : }
1725 0 : fp = fdopen(fd, "r");
1726 0 : if (fp == NULL) {
1727 0 : return NULL;
1728 : }
1729 0 : tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1730 0 : if (tok == NULL) {
1731 0 : fclose(fp);
1732 0 : return NULL;
1733 : }
1734 : #ifndef PGEN
1735 0 : if (filename != NULL) {
1736 0 : Py_INCREF(filename);
1737 0 : tok->filename = filename;
1738 : }
1739 : else {
1740 0 : tok->filename = PyUnicode_FromString("<string>");
1741 0 : if (tok->filename == NULL) {
1742 0 : fclose(fp);
1743 0 : PyTokenizer_Free(tok);
1744 0 : return encoding;
1745 : }
1746 : }
1747 : #endif
1748 0 : while (tok->lineno < 2 && tok->done == E_OK) {
1749 0 : PyTokenizer_Get(tok, &p_start, &p_end);
1750 : }
1751 0 : fclose(fp);
1752 0 : if (tok->encoding) {
1753 0 : encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1754 0 : if (encoding)
1755 0 : strcpy(encoding, tok->encoding);
1756 : }
1757 0 : PyTokenizer_Free(tok);
1758 0 : return encoding;
1759 : }
1760 :
1761 : char *
1762 0 : PyTokenizer_FindEncoding(int fd)
1763 : {
1764 0 : return PyTokenizer_FindEncodingFilename(fd, NULL);
1765 : }
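/* A minimal caller sketch (not part of this file, hypothetical helper);
   error handling beyond the NULL checks is elided.  The descriptor is
   dup()'ed internally and the duplicate is closed, so fd itself stays open. */
#if 0
static void
find_encoding_example(int fd)
{
    char *enc = PyTokenizer_FindEncodingFilename(fd, NULL);
    if (enc == NULL) {
        /* no cookie or BOM found (or an error occurred): assume UTF-8 */
    }
    else {
        /* ... use enc ... */
        PyMem_FREE(enc);
    }
}
#endif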
1766 :
1767 : #ifdef Py_DEBUG
1768 :
1769 : void
1770 : tok_dump(int type, char *start, char *end)
1771 : {
1772 : printf("%s", _PyParser_TokenNames[type]);
1773 : if (type == NAME || type == NUMBER || type == STRING || type == OP)
1774 : printf("(%.*s)", (int)(end - start), start);
1775 : }
1776 :
1777 : #endif