LCOV - code coverage report
Current view: top level - libreoffice/workdir/unxlngi6.pro/UnpackedTarball/python3/Parser - tokenizer.c (source / functions) Hit Total Coverage
Test: libreoffice_filtered.info Lines: 297 934 31.8 %
Date: 2012-12-17 Functions: 11 35 31.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : 
       2             : /* Tokenizer implementation */
       3             : 
       4             : #include "Python.h"
       5             : #include "pgenheaders.h"
       6             : 
       7             : #include <ctype.h>
       8             : #include <assert.h>
       9             : 
      10             : #include "tokenizer.h"
      11             : #include "errcode.h"
      12             : 
      13             : #ifndef PGEN
      14             : #include "unicodeobject.h"
      15             : #include "bytesobject.h"
      16             : #include "fileobject.h"
      17             : #include "codecs.h"
      18             : #include "abstract.h"
      19             : #endif /* PGEN */
      20             : 
      21             : #define is_potential_identifier_start(c) (\
      22             :               (c >= 'a' && c <= 'z')\
      23             :                || (c >= 'A' && c <= 'Z')\
      24             :                || c == '_'\
      25             :                || (c >= 128))
      26             : 
      27             : #define is_potential_identifier_char(c) (\
      28             :               (c >= 'a' && c <= 'z')\
      29             :                || (c >= 'A' && c <= 'Z')\
      30             :                || (c >= '0' && c <= '9')\
      31             :                || c == '_'\
      32             :                || (c >= 128))
      33             : 
      34             : extern char *PyOS_Readline(FILE *, FILE *, char *);
      35             : /* Return malloc'ed string including trailing \n;
      36             :    empty malloc'ed string for EOF;
      37             :    NULL if interrupted */
      38             : 
      39             : /* Don't ever change this -- it would break the portability of Python code */
      40             : #define TABSIZE 8
      41             : 
      42             : /* Forward */
      43             : static struct tok_state *tok_new(void);
      44             : static int tok_nextc(struct tok_state *tok);
      45             : static void tok_backup(struct tok_state *tok, int c);
      46             : 
      47             : 
      48             : /* Token names */
      49             : 
      50             : char *_PyParser_TokenNames[] = {
      51             :     "ENDMARKER",
      52             :     "NAME",
      53             :     "NUMBER",
      54             :     "STRING",
      55             :     "NEWLINE",
      56             :     "INDENT",
      57             :     "DEDENT",
      58             :     "LPAR",
      59             :     "RPAR",
      60             :     "LSQB",
      61             :     "RSQB",
      62             :     "COLON",
      63             :     "COMMA",
      64             :     "SEMI",
      65             :     "PLUS",
      66             :     "MINUS",
      67             :     "STAR",
      68             :     "SLASH",
      69             :     "VBAR",
      70             :     "AMPER",
      71             :     "LESS",
      72             :     "GREATER",
      73             :     "EQUAL",
      74             :     "DOT",
      75             :     "PERCENT",
      76             :     "LBRACE",
      77             :     "RBRACE",
      78             :     "EQEQUAL",
      79             :     "NOTEQUAL",
      80             :     "LESSEQUAL",
      81             :     "GREATEREQUAL",
      82             :     "TILDE",
      83             :     "CIRCUMFLEX",
      84             :     "LEFTSHIFT",
      85             :     "RIGHTSHIFT",
      86             :     "DOUBLESTAR",
      87             :     "PLUSEQUAL",
      88             :     "MINEQUAL",
      89             :     "STAREQUAL",
      90             :     "SLASHEQUAL",
      91             :     "PERCENTEQUAL",
      92             :     "AMPEREQUAL",
      93             :     "VBAREQUAL",
      94             :     "CIRCUMFLEXEQUAL",
      95             :     "LEFTSHIFTEQUAL",
      96             :     "RIGHTSHIFTEQUAL",
      97             :     "DOUBLESTAREQUAL",
      98             :     "DOUBLESLASH",
      99             :     "DOUBLESLASHEQUAL",
     100             :     "AT",
     101             :     "RARROW",
     102             :     "ELLIPSIS",
     103             :     /* This table must match the #defines in token.h! */
     104             :     "OP",
     105             :     "<ERRORTOKEN>",
     106             :     "<N_TOKENS>"
     107             : };
     108             : 
     109             : 
     110             : /* Create and initialize a new tok_state structure */
     111             : 
     112             : static struct tok_state *
     113           3 : tok_new(void)
     114             : {
     115           3 :     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
     116             :                                             sizeof(struct tok_state));
     117           3 :     if (tok == NULL)
     118           0 :         return NULL;
     119           3 :     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
     120           3 :     tok->done = E_OK;
     121           3 :     tok->fp = NULL;
     122           3 :     tok->input = NULL;
     123           3 :     tok->tabsize = TABSIZE;
     124           3 :     tok->indent = 0;
     125           3 :     tok->indstack[0] = 0;
     126           3 :     tok->atbol = 1;
     127           3 :     tok->pendin = 0;
     128           3 :     tok->prompt = tok->nextprompt = NULL;
     129           3 :     tok->lineno = 0;
     130           3 :     tok->level = 0;
     131           3 :     tok->altwarning = 1;
     132           3 :     tok->alterror = 1;
     133           3 :     tok->alttabsize = 1;
     134           3 :     tok->altindstack[0] = 0;
     135           3 :     tok->decoding_state = STATE_INIT;
     136           3 :     tok->decoding_erred = 0;
     137           3 :     tok->read_coding_spec = 0;
     138           3 :     tok->enc = NULL;
     139           3 :     tok->encoding = NULL;
     140           3 :     tok->cont_line = 0;
     141             : #ifndef PGEN
     142           3 :     tok->filename = NULL;
     143           3 :     tok->decoding_readline = NULL;
     144           3 :     tok->decoding_buffer = NULL;
     145             : #endif
     146           3 :     return tok;
     147             : }
     148             : 
     149             : static char *
     150           0 : new_string(const char *s, Py_ssize_t len)
     151             : {
     152           0 :     char* result = (char *)PyMem_MALLOC(len + 1);
     153           0 :     if (result != NULL) {
     154           0 :         memcpy(result, s, len);
     155           0 :         result[len] = '\0';
     156             :     }
     157           0 :     return result;
     158             : }
     159             : 
     160             : #ifdef PGEN
     161             : 
     162             : static char *
     163           0 : decoding_fgets(char *s, int size, struct tok_state *tok)
     164             : {
     165           0 :     return fgets(s, size, tok->fp);
     166             : }
     167             : 
     168             : static int
     169           0 : decoding_feof(struct tok_state *tok)
     170             : {
     171           0 :     return feof(tok->fp);
     172             : }
     173             : 
     174             : static char *
     175           0 : decode_str(const char *str, int exec_input, struct tok_state *tok)
     176             : {
     177           0 :     return new_string(str, strlen(str));
     178             : }
     179             : 
     180             : #else /* PGEN */
     181             : 
     182             : static char *
     183           0 : error_ret(struct tok_state *tok) /* XXX */
     184             : {
     185           0 :     tok->decoding_erred = 1;
     186           0 :     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
     187           0 :         PyMem_FREE(tok->buf);
     188           0 :     tok->buf = NULL;
     189           0 :     return NULL;                /* as if it were EOF */
     190             : }
     191             : 
     192             : 
     193             : static char *
     194           0 : get_normal_name(char *s)        /* for utf-8 and latin-1 */
     195             : {
     196             :     char buf[13];
     197             :     int i;
     198           0 :     for (i = 0; i < 12; i++) {
     199           0 :         int c = s[i];
     200           0 :         if (c == '\0')
     201           0 :             break;
     202           0 :         else if (c == '_')
     203           0 :             buf[i] = '-';
     204             :         else
     205           0 :             buf[i] = tolower(c);
     206             :     }
     207           0 :     buf[i] = '\0';
     208           0 :     if (strcmp(buf, "utf-8") == 0 ||
     209           0 :         strncmp(buf, "utf-8-", 6) == 0)
     210           0 :         return "utf-8";
     211           0 :     else if (strcmp(buf, "latin-1") == 0 ||
     212           0 :              strcmp(buf, "iso-8859-1") == 0 ||
     213           0 :              strcmp(buf, "iso-latin-1") == 0 ||
     214           0 :              strncmp(buf, "latin-1-", 8) == 0 ||
     215           0 :              strncmp(buf, "iso-8859-1-", 11) == 0 ||
     216           0 :              strncmp(buf, "iso-latin-1-", 12) == 0)
     217           0 :         return "iso-8859-1";
     218             :     else
     219           0 :         return s;
     220             : }
     221             : 
     222             : /* Return the coding spec in S, or NULL if none is found.  */
     223             : 
     224             : static char *
     225           0 : get_coding_spec(const char *s, Py_ssize_t size)
     226             : {
     227             :     Py_ssize_t i;
     228             :     /* Coding spec must be in a comment, and that comment must be
     229             :      * the only statement on the source code line. */
     230           0 :     for (i = 0; i < size - 6; i++) {
     231           0 :         if (s[i] == '#')
     232           0 :             break;
     233           0 :         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
     234           0 :             return NULL;
     235             :     }
     236           0 :     for (; i < size - 6; i++) { /* XXX inefficient search */
     237           0 :         const char* t = s + i;
     238           0 :         if (strncmp(t, "coding", 6) == 0) {
     239           0 :             const char* begin = NULL;
     240           0 :             t += 6;
     241           0 :             if (t[0] != ':' && t[0] != '=')
     242           0 :                 continue;
     243             :             do {
     244           0 :                 t++;
     245           0 :             } while (t[0] == '\x20' || t[0] == '\t');
     246             : 
     247           0 :             begin = t;
     248           0 :             while (Py_ISALNUM(t[0]) ||
     249           0 :                    t[0] == '-' || t[0] == '_' || t[0] == '.')
     250           0 :                 t++;
     251             : 
     252           0 :             if (begin < t) {
     253           0 :                 char* r = new_string(begin, t - begin);
     254           0 :                 char* q = get_normal_name(r);
     255           0 :                 if (r != q) {
     256           0 :                     PyMem_FREE(r);
     257           0 :                     r = new_string(q, strlen(q));
     258             :                 }
     259           0 :                 return r;
     260             :             }
     261             :         }
     262             :     }
     263           0 :     return NULL;
     264             : }
     265             : 
     266             : /* Check whether the line contains a coding spec. If it does,
     267             :    invoke the set_readline function for the new encoding.
     268             :    This function receives the tok_state and the new encoding.
     269             :    Return 1 on success, 0 on failure.  */
     270             : 
     271             : static int
     272           0 : check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
     273             :                   int set_readline(struct tok_state *, const char *))
     274             : {
     275             :     char * cs;
     276           0 :     int r = 1;
     277             : 
     278           0 :     if (tok->cont_line)
     279             :         /* It's a continuation line, so it can't be a coding spec. */
     280           0 :         return 1;
     281           0 :     cs = get_coding_spec(line, size);
     282           0 :     if (cs != NULL) {
     283           0 :         tok->read_coding_spec = 1;
     284           0 :         if (tok->encoding == NULL) {
     285             :             assert(tok->decoding_state == STATE_RAW);
     286           0 :             if (strcmp(cs, "utf-8") == 0) {
     287           0 :                 tok->encoding = cs;
     288             :             } else {
     289           0 :                 r = set_readline(tok, cs);
     290           0 :                 if (r) {
     291           0 :                     tok->encoding = cs;
     292           0 :                     tok->decoding_state = STATE_NORMAL;
     293             :                 }
     294             :                 else
     295           0 :                     PyMem_FREE(cs);
     296             :             }
     297             :         } else {                /* then, compare cs with BOM */
     298           0 :             r = (strcmp(tok->encoding, cs) == 0);
     299           0 :             PyMem_FREE(cs);
     300             :         }
     301             :     }
     302           0 :     if (!r) {
     303           0 :         cs = tok->encoding;
     304           0 :         if (!cs)
     305           0 :             cs = "with BOM";
     306           0 :         PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
     307             :     }
     308           0 :     return r;
     309             : }
     310             : 
     311             : /* See whether the file starts with a BOM. If it does,
     312             :    invoke the set_readline function with the new encoding.
     313             :    Return 1 on success, 0 on failure.  */
     314             : 
     315             : static int
     316           0 : check_bom(int get_char(struct tok_state *),
     317             :           void unget_char(int, struct tok_state *),
     318             :           int set_readline(struct tok_state *, const char *),
     319             :           struct tok_state *tok)
     320             : {
     321             :     int ch1, ch2, ch3;
     322           0 :     ch1 = get_char(tok);
     323           0 :     tok->decoding_state = STATE_RAW;
     324           0 :     if (ch1 == EOF) {
     325           0 :         return 1;
     326           0 :     } else if (ch1 == 0xEF) {
     327           0 :         ch2 = get_char(tok);
     328           0 :         if (ch2 != 0xBB) {
     329           0 :             unget_char(ch2, tok);
     330           0 :             unget_char(ch1, tok);
     331           0 :             return 1;
     332             :         }
     333           0 :         ch3 = get_char(tok);
     334           0 :         if (ch3 != 0xBF) {
     335           0 :             unget_char(ch3, tok);
     336           0 :             unget_char(ch2, tok);
     337           0 :             unget_char(ch1, tok);
     338           0 :             return 1;
     339             :         }
     340             : #if 0
     341             :     /* Disable support for UTF-16 BOMs until a decision
     342             :        is made whether this needs to be supported.  */
     343             :     } else if (ch1 == 0xFE) {
     344             :         ch2 = get_char(tok);
     345             :         if (ch2 != 0xFF) {
     346             :             unget_char(ch2, tok);
     347             :             unget_char(ch1, tok);
     348             :             return 1;
     349             :         }
     350             :         if (!set_readline(tok, "utf-16-be"))
     351             :             return 0;
     352             :         tok->decoding_state = STATE_NORMAL;
     353             :     } else if (ch1 == 0xFF) {
     354             :         ch2 = get_char(tok);
     355             :         if (ch2 != 0xFE) {
     356             :             unget_char(ch2, tok);
     357             :             unget_char(ch1, tok);
     358             :             return 1;
     359             :         }
     360             :         if (!set_readline(tok, "utf-16-le"))
     361             :             return 0;
     362             :         tok->decoding_state = STATE_NORMAL;
     363             : #endif
     364             :     } else {
     365           0 :         unget_char(ch1, tok);
     366           0 :         return 1;
     367             :     }
     368           0 :     if (tok->encoding != NULL)
     369           0 :         PyMem_FREE(tok->encoding);
     370           0 :     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
     371             :     /* No need to set_readline: input is already utf-8 */
     372           0 :     return 1;
     373             : }
     374             : 
     375             : /* Read a line of text from TOK into S, using the stream in TOK.
     376             :    Return NULL on failure, else S.
     377             : 
     378             :    On entry, tok->decoding_buffer will be one of:
     379             :      1) NULL: need to call tok->decoding_readline to get a new line
     380             :      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
     381             :        stored the result in tok->decoding_buffer
     382             :      3) PyByteArrayObject *: previous call to fp_readl did not have enough room
     383             :        (in the s buffer) to copy entire contents of the line read
     384             :        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
     385             :        In this case, fp_readl is called in a loop (with an expanded buffer)
     386             :        until the buffer ends with a '\n' (or until the end of the file is
     387             :        reached): see tok_nextc and its calls to decoding_fgets.
     388             : */
     389             : 
     390             : static char *
     391           0 : fp_readl(char *s, int size, struct tok_state *tok)
     392             : {
     393             :     PyObject* bufobj;
     394             :     const char *buf;
     395             :     Py_ssize_t buflen;
     396             : 
     397             :     /* Ask for one less byte so we can terminate it */
     398             :     assert(size > 0);
     399           0 :     size--;
     400             : 
     401           0 :     if (tok->decoding_buffer) {
     402           0 :         bufobj = tok->decoding_buffer;
     403           0 :         Py_INCREF(bufobj);
     404             :     }
     405             :     else
     406             :     {
     407           0 :         bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
     408           0 :         if (bufobj == NULL)
     409           0 :             goto error;
     410             :     }
     411           0 :     if (PyUnicode_CheckExact(bufobj))
     412             :     {
     413           0 :         buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
     414           0 :         if (buf == NULL) {
     415           0 :             goto error;
     416             :         }
     417             :     }
     418             :     else
     419             :     {
     420           0 :         buf = PyByteArray_AsString(bufobj);
     421           0 :         if (buf == NULL) {
     422           0 :             goto error;
     423             :         }
     424           0 :         buflen = PyByteArray_GET_SIZE(bufobj);
     425             :     }
     426             : 
     427           0 :     Py_XDECREF(tok->decoding_buffer);
     428           0 :     if (buflen > size) {
     429             :         /* Too many chars, the rest goes into tok->decoding_buffer */
     430           0 :         tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
     431             :                                                          buflen-size);
     432           0 :         if (tok->decoding_buffer == NULL)
     433           0 :             goto error;
     434           0 :         buflen = size;
     435             :     }
     436             :     else
     437           0 :         tok->decoding_buffer = NULL;
     438             : 
     439           0 :     memcpy(s, buf, buflen);
     440           0 :     s[buflen] = '\0';
     441           0 :     if (buflen == 0) /* EOF */
     442           0 :         s = NULL;
     443           0 :     Py_DECREF(bufobj);
     444           0 :     return s;
     445             : 
     446             : error:
     447           0 :     Py_XDECREF(bufobj);
     448           0 :     return error_ret(tok);
     449             : }
     450             : 
     451             : /* Set the readline function for TOK to a StreamReader's
     452             :    readline function. The StreamReader is named ENC.
     453             : 
     454             :    This function is called from check_bom and check_coding_spec.
     455             : 
     456             :    ENC is usually identical to the future value of tok->encoding,
     457             :    except for the (currently unsupported) case of UTF-16.
     458             : 
     459             :    Return 1 on success, 0 on failure. */
     460             : 
     461             : static int
     462           0 : fp_setreadl(struct tok_state *tok, const char* enc)
     463             : {
     464           0 :     PyObject *readline = NULL, *stream = NULL, *io = NULL;
     465             :     _Py_IDENTIFIER(open);
     466             :     _Py_IDENTIFIER(readline);
     467             :     int fd;
     468             : 
     469           0 :     io = PyImport_ImportModuleNoBlock("io");
     470           0 :     if (io == NULL)
     471           0 :         goto cleanup;
     472             : 
     473           0 :     fd = fileno(tok->fp);
     474           0 :     if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
     475           0 :         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
     476           0 :         goto cleanup;
     477             :     }
     478             : 
     479           0 :     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
     480             :                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
     481           0 :     if (stream == NULL)
     482           0 :         goto cleanup;
     483             : 
     484           0 :     Py_XDECREF(tok->decoding_readline);
     485           0 :     readline = _PyObject_GetAttrId(stream, &PyId_readline);
     486           0 :     tok->decoding_readline = readline;
     487             : 
     488             :     /* The file has been reopened; parsing will restart from
     489             :      * the beginning of the file, we have to reset the line number.
     490             :      * But this function has been called from inside tok_nextc() which
     491             :      * will increment lineno before it returns. So we set it -1 so that
     492             :      * the next call to tok_nextc() will start with tok->lineno == 0.
     493             :      */
     494           0 :     tok->lineno = -1;
     495             : 
     496             :   cleanup:
     497           0 :     Py_XDECREF(stream);
     498           0 :     Py_XDECREF(io);
     499           0 :     return readline != NULL;
     500             : }
     501             : 
     502             : /* Fetch the next byte from TOK. */
     503             : 
     504           0 : static int fp_getc(struct tok_state *tok) {
     505           0 :     return getc(tok->fp);
     506             : }
     507             : 
     508             : /* Unfetch the last byte back into TOK.  */
     509             : 
     510           0 : static void fp_ungetc(int c, struct tok_state *tok) {
     511           0 :     ungetc(c, tok->fp);
     512           0 : }
     513             : 
     514             : /* Check whether the characters at s start a valid
     515             :    UTF-8 sequence. Return the number of characters forming
     516             :    the sequence if yes, 0 if not.  */
     517           0 : static int valid_utf8(const unsigned char* s)
     518             : {
     519           0 :     int expected = 0;
     520             :     int length;
     521           0 :     if (*s < 0x80)
     522             :         /* single-byte code */
     523           0 :         return 1;
     524           0 :     if (*s < 0xc0)
     525             :         /* following byte */
     526           0 :         return 0;
     527           0 :     if (*s < 0xE0)
     528           0 :         expected = 1;
     529           0 :     else if (*s < 0xF0)
     530           0 :         expected = 2;
     531           0 :     else if (*s < 0xF8)
     532           0 :         expected = 3;
     533             :     else
     534           0 :         return 0;
     535           0 :     length = expected + 1;
     536           0 :     for (; expected; expected--)
     537           0 :         if (s[expected] < 0x80 || s[expected] >= 0xC0)
     538           0 :             return 0;
     539           0 :     return length;
     540             : }
     541             : 
     542             : /* Read a line of input from TOK. Determine encoding
     543             :    if necessary.  */
     544             : 
     545             : static char *
     546           0 : decoding_fgets(char *s, int size, struct tok_state *tok)
     547             : {
     548           0 :     char *line = NULL;
     549           0 :     int badchar = 0;
     550             :     for (;;) {
     551           0 :         if (tok->decoding_state == STATE_NORMAL) {
     552             :             /* We already have a codec associated with
     553             :                this input. */
     554           0 :             line = fp_readl(s, size, tok);
     555           0 :             break;
     556           0 :         } else if (tok->decoding_state == STATE_RAW) {
     557             :             /* We want a 'raw' read. */
     558           0 :             line = Py_UniversalNewlineFgets(s, size,
     559             :                                             tok->fp, NULL);
     560           0 :             break;
     561             :         } else {
     562             :             /* We have not yet determined the encoding.
     563             :                If an encoding is found, use the file-pointer
     564             :                reader functions from now on. */
     565           0 :             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
     566           0 :                 return error_ret(tok);
     567             :             assert(tok->decoding_state != STATE_INIT);
     568             :         }
     569           0 :     }
     570           0 :     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
     571           0 :         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
     572           0 :             return error_ret(tok);
     573             :         }
     574             :     }
     575             : #ifndef PGEN
     576             :     /* The default encoding is UTF-8, so make sure we don't have any
     577             :        non-UTF-8 sequences in it. */
     578           0 :     if (line && !tok->encoding) {
     579             :         unsigned char *c;
     580             :         int length;
     581           0 :         for (c = (unsigned char *)line; *c; c += length)
     582           0 :             if (!(length = valid_utf8(c))) {
     583           0 :                 badchar = *c;
     584           0 :                 break;
     585             :             }
     586             :     }
     587           0 :     if (badchar) {
     588             :         /* Need to add 1 to the line number, since this line
     589             :            has not been counted, yet.  */
     590           0 :         PyErr_Format(PyExc_SyntaxError,
     591             :                 "Non-UTF-8 code starting with '\\x%.2x' "
     592             :                 "in file %U on line %i, "
     593             :                 "but no encoding declared; "
     594             :                 "see http://python.org/dev/peps/pep-0263/ for details",
     595           0 :                 badchar, tok->filename, tok->lineno + 1);
     596           0 :         return error_ret(tok);
     597             :     }
     598             : #endif
     599           0 :     return line;
     600             : }
     601             : 
     602             : static int
     603           0 : decoding_feof(struct tok_state *tok)
     604             : {
     605           0 :     if (tok->decoding_state != STATE_NORMAL) {
     606           0 :         return feof(tok->fp);
     607             :     } else {
     608           0 :         PyObject* buf = tok->decoding_buffer;
     609           0 :         if (buf == NULL) {
     610           0 :             buf = PyObject_CallObject(tok->decoding_readline, NULL);
     611           0 :             if (buf == NULL) {
     612           0 :                 error_ret(tok);
     613           0 :                 return 1;
     614             :             } else {
     615           0 :                 tok->decoding_buffer = buf;
     616             :             }
     617             :         }
     618           0 :         return PyObject_Length(buf) == 0;
     619             :     }
     620             : }
     621             : 
     622             : /* Fetch a byte from TOK, using the string buffer. */
     623             : 
     624             : static int
     625           0 : buf_getc(struct tok_state *tok) {
     626           0 :     return Py_CHARMASK(*tok->str++);
     627             : }
     628             : 
     629             : /* Unfetch a byte from TOK, using the string buffer. */
     630             : 
     631             : static void
     632           0 : buf_ungetc(int c, struct tok_state *tok) {
     633           0 :     tok->str--;
     634             :     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
     635           0 : }
     636             : 
     637             : /* Set the readline function for TOK to ENC. For the string-based
     638             :    tokenizer, this means to just record the encoding: the ENC pointer itself is stored in tok->enc (the string is not copied).  Always returns 1. */
     639             : 
     640             : static int
     641           0 : buf_setreadl(struct tok_state *tok, const char* enc) {
     642           0 :     tok->enc = enc;
     643           0 :     return 1;
     644             : }
     645             : 
     646             : /* Return a UTF-8 encoding Python string object from the
     647             :    C byte string STR, which is encoded with ENC.  STR must be NUL-terminated (its length is taken with strlen).  Returns NULL if the decode step fails; otherwise returns whatever PyUnicode_AsUTF8String yields for the decoded text. */
     648             : 
     649             : static PyObject *
     650           0 : translate_into_utf8(const char* str, const char* enc) {
     651             :     PyObject *utf8;
     652           0 :     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);  /* bytes in ENC -> unicode object */
     653           0 :     if (buf == NULL)
     654           0 :         return NULL;
     655           0 :     utf8 = PyUnicode_AsUTF8String(buf);  /* unicode object -> UTF-8 bytes object */
     656           0 :     Py_DECREF(buf);  /* the intermediate unicode object is released whether or not the encode succeeded */
     657           0 :     return utf8;
     658             : }
     659             : 
     660             : 
     661             : static char *  /* Returns a new PyMem buffer holding the converted text, or NULL with tok->done = E_NOMEM if it cannot be allocated. */
     662           3 : translate_newlines(const char *s, int exec_input, struct tok_state *tok) {  /* Copy S, turning each CR LF pair and each lone CR into a single LF. */
     663           3 :     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;  /* +2 leaves room for an added newline and the NUL; NOTE(review): strlen result is narrowed to int */
     664             :     char *buf, *current;
     665           3 :     char c = '\0';
     666           3 :     buf = PyMem_MALLOC(needed_length);
     667           3 :     if (buf == NULL) {
     668           0 :         tok->done = E_NOMEM;
     669           0 :         return NULL;
     670             :     }
     671       10395 :     for (current = buf; *s; s++, current++) {
     672       10392 :         c = *s;
     673       10392 :         if (skip_next_lf) {  /* the previous input byte was a CR that has already been written out as LF */
     674           0 :             skip_next_lf = 0;
     675           0 :             if (c == '\n') {  /* LF directly after CR: drop it and take the following byte instead */
     676           0 :                 c = *++s;
     677           0 :                 if (!c)
     678           0 :                     break;  /* the input ended with CR LF; c is NUL when the loop is left this way */
     679             :             }
     680             :         }
     681       10392 :         if (c == '\r') {
     682           0 :             skip_next_lf = 1;
     683           0 :             c = '\n';
     684             :         }
     685       10392 :         *current = c;
     686             :     }
     687             :     /* If this is exec input, add a newline to the end of the string if
     688             :        there isn't one already. */
     689           3 :     if (exec_input && c != '\n') {  /* NOTE(review): after a trailing CR LF c is NUL here, so a second newline is appended in that case */
     690           0 :         *current = '\n';
     691           0 :         current++;
     692             :     }
     693           3 :     *current = '\0';
     694           3 :     final_length = current - buf + 1;  /* bytes actually used, including the NUL */
     695           3 :     if (final_length < needed_length && final_length)
     696             :         /* should never fail */
     697           3 :         buf = PyMem_REALLOC(buf, final_length);  /* shrink to fit; the result is not checked for NULL */
     698           3 :     return buf;
     699             : }
     700             : 
     701             : /* Decode a byte string STR for use as the buffer of TOK.
     702             :    Look for encoding declarations inside STR, and record them
     703             :    inside TOK.  INPUT is first copied with normalized newlines and the copy is stored in tok->input.  Returns a pointer to the text to tokenize, NULL if the copy cannot be made, or the value of error_ret(tok) on other failures.  */
     704             : 
     705             : static const char *
     706           0 : decode_str(const char *input, int single, struct tok_state *tok)
     707             : {
     708           0 :     PyObject* utf8 = NULL;
     709             :     const char *str;
     710             :     const char *s;
     711           0 :     const char *newl[2] = {NULL, NULL};  /* positions of the first two newline characters, if present */
     712           0 :     int lineno = 0;
     713           0 :     tok->input = str = translate_newlines(input, single, tok);  /* SINGLE is passed as the exec_input flag */
     714           0 :     if (str == NULL)
     715           0 :         return NULL;
     716           0 :     tok->enc = NULL;
     717           0 :     tok->str = str;
     718           0 :     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))  /* reads through tok->str; buf_setreadl lets it record an encoding in tok->enc */
     719           0 :         return error_ret(tok);
     720           0 :     str = tok->str;             /* string after BOM if any */
     721             :     assert(str);
     722           0 :     if (tok->enc != NULL) {  /* an encoding was recorded during the BOM check: recode the text to UTF-8 */
     723           0 :         utf8 = translate_into_utf8(str, tok->enc);
     724           0 :         if (utf8 == NULL)
     725           0 :             return error_ret(tok);
     726           0 :         str = PyBytes_AsString(utf8);
     727             :     }
     728           0 :     for (s = str;; s++) {  /* find where the first two lines end */
     729           0 :         if (*s == '\0') break;
     730           0 :         else if (*s == '\n') {
     731             :             assert(lineno < 2);
     732           0 :             newl[lineno] = s;
     733           0 :             lineno++;
     734           0 :             if (lineno == 2) break;
     735             :         }
     736           0 :     }
     737           0 :     tok->enc = NULL;  /* cleared so that a non-NULL value below means a coding spec was found */
     738             :     /* need to check line 1 and 2 separately since check_coding_spec
     739             :        assumes a single line as input */
     740           0 :     if (newl[0]) {
     741           0 :         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
     742           0 :             return error_ret(tok);  /* NOTE(review): utf8 from the BOM step, if any, is not released here unless error_ret does so; confirm */
     743           0 :         if (tok->enc == NULL && newl[1]) {  /* nothing on line 1: try line 2 */
     744           0 :             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
     745             :                                    tok, buf_setreadl))
     746           0 :                 return error_ret(tok);  /* NOTE(review): same possible utf8 leak as above; confirm */
     747             :         }
     748             :     }
     749           0 :     if (tok->enc != NULL) {  /* a coding spec recorded an encoding: recode the text to UTF-8 */
     750             :         assert(utf8 == NULL);
     751           0 :         utf8 = translate_into_utf8(str, tok->enc);
     752           0 :         if (utf8 == NULL)
     753           0 :             return error_ret(tok);
     754           0 :         str = PyBytes_AS_STRING(utf8);
     755             :     }
     756             :     assert(tok->decoding_buffer == NULL);
     757           0 :     tok->decoding_buffer = utf8; /* CAUTION: when utf8 is non-NULL the returned str points into it, so tok holds the reference */
     758           0 :     return str;
     759             : }
     760             : 
     761             : #endif /* PGEN */
     762             : 
     763             : /* Set up tokenizer for string: STR is passed through decode_str and the result becomes the buffer.  Returns a new tok_state, or NULL if allocation or decoding fails. */
     764             : 
     765             : struct tok_state *
     766           0 : PyTokenizer_FromString(const char *str, int exec_input)
     767             : {
     768           0 :     struct tok_state *tok = tok_new();
     769           0 :     if (tok == NULL)
     770           0 :         return NULL;
     771           0 :     str = (char *)decode_str(str, exec_input, tok);
     772           0 :     if (str == NULL) {
     773           0 :         PyTokenizer_Free(tok);  /* also releases whatever decode_str already stored on tok */
     774           0 :         return NULL;
     775             :     }
     776             : 
     777             :     /* XXX: constify members. */
     778           0 :     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;  /* all four pointers start at the beginning of the decoded text */
     779           0 :     return tok;
     780             : }
     781             : 
     782             : struct tok_state *  /* Set up tokenizer for a string that is already UTF-8.  Returns a new tok_state, or NULL on allocation failure. */
     783           3 : PyTokenizer_FromUTF8(const char *str, int exec_input)
     784             : {
     785           3 :     struct tok_state *tok = tok_new();
     786           3 :     if (tok == NULL)
     787           0 :         return NULL;
     788             : #ifndef PGEN
     789           3 :     tok->input = str = translate_newlines(str, exec_input, tok);  /* non-PGEN builds tokenize a newline-normalized copy stored in tok->input */
     790             : #endif
     791           3 :     if (str == NULL) {
     792           0 :         PyTokenizer_Free(tok);
     793           0 :         return NULL;
     794             :     }
     795           3 :     tok->decoding_state = STATE_RAW;
     796           3 :     tok->read_coding_spec = 1;
     797           3 :     tok->enc = NULL;
     798           3 :     tok->str = str;
     799           3 :     tok->encoding = (char *)PyMem_MALLOC(6);  /* 6 bytes: the five characters of utf-8 plus the NUL */
     800           3 :     if (!tok->encoding) {
     801           0 :         PyTokenizer_Free(tok);
     802           0 :         return NULL;
     803             :     }
     804           3 :     strcpy(tok->encoding, "utf-8");
     805             : 
     806             :     /* XXX: constify members. */
     807           3 :     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;  /* all four pointers start at the beginning of the text */
     808           3 :     return tok;
     809             : }
     810             : 
     811             : /* Set up tokenizer for file: allocates a BUFSIZ read buffer and records FP and the two prompts on the new tok_state.  Returns NULL on allocation failure. */
     812             : 
     813             : struct tok_state *
     814           0 : PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
     815             : {
     816           0 :     struct tok_state *tok = tok_new();
     817           0 :     if (tok == NULL)
     818           0 :         return NULL;
     819           0 :     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
     820           0 :         PyTokenizer_Free(tok);
     821           0 :         return NULL;
     822             :     }
     823           0 :     tok->cur = tok->inp = tok->buf;  /* buffer starts out empty */
     824           0 :     tok->end = tok->buf + BUFSIZ;
     825           0 :     tok->fp = fp;
     826           0 :     tok->prompt = ps1;  /* the prompt pointers are stored as given, not copied */
     827           0 :     tok->nextprompt = ps2;
     828           0 :     if (enc != NULL) {
     829             :         /* Must copy encoding declaration since it
     830             :            gets copied into the parse tree. */
     831           0 :         tok->encoding = PyMem_MALLOC(strlen(enc)+1);
     832           0 :         if (!tok->encoding) {
     833           0 :             PyTokenizer_Free(tok);
     834           0 :             return NULL;
     835             :         }
     836           0 :         strcpy(tok->encoding, enc);
     837           0 :         tok->decoding_state = STATE_NORMAL;  /* only set when the caller supplied an encoding */
     838             :     }
     839           0 :     return tok;
     840             : }
     841             : 
     842             : 
     843             : /* Free a tok_state structure together with the encoding string, Python object references and buffers it holds.  TOK must not be NULL (it is dereferenced unconditionally). */
     844             : 
     845             : void
     846           3 : PyTokenizer_Free(struct tok_state *tok)
     847             : {
     848           3 :     if (tok->encoding != NULL)
     849           0 :         PyMem_FREE(tok->encoding);
     850             : #ifndef PGEN
     851           3 :     Py_XDECREF(tok->decoding_readline);
     852           3 :     Py_XDECREF(tok->decoding_buffer);
     853           3 :     Py_XDECREF(tok->filename);
     854             : #endif
     855           3 :     if (tok->fp != NULL && tok->buf != NULL)  /* buf is freed only for file input; the string set-up functions point buf at text held elsewhere on tok */
     856           0 :         PyMem_FREE(tok->buf);
     857           3 :     if (tok->input)
     858           3 :         PyMem_FREE((char *)tok->input);  /* the newline-normalized copy made by translate_newlines */
     859           3 :     PyMem_FREE(tok);
     860           3 : }
     861             : 
     862             : /* Get next char, updating state; error code goes into tok->done.  Returns the next byte passed through Py_CHARMASK, or EOF.  Input comes from one of three places: an in-memory string (tok->fp == NULL), an interactive prompt (tok->prompt != NULL), or a file read with decoding_fgets. */
     863             : 
     864             : static int
     865       12167 : tok_nextc(register struct tok_state *tok)
     866             : {
     867             :     for (;;) {
     868       12167 :         if (tok->cur != tok->inp) {
     869       11897 :             return Py_CHARMASK(*tok->cur++); /* Fast path */
     870             :         }
     871         270 :         if (tok->done != E_OK)  /* an earlier error or EOF stays in effect */
     872           6 :             return EOF;
     873         264 :         if (tok->fp == NULL) {  /* string input: the whole text is already in memory */
     874         264 :             char *end = strchr(tok->inp, '\n');
     875         264 :             if (end != NULL)
     876         261 :                 end++;  /* the line includes its newline */
     877             :             else {
     878           3 :                 end = strchr(tok->inp, '\0');  /* last line has no newline: it runs to the NUL */
     879           3 :                 if (end == tok->inp) {
     880           3 :                     tok->done = E_EOF;
     881           3 :                     return EOF;
     882             :                 }
     883             :             }
     884         261 :             if (tok->start == NULL)  /* no token in progress: buf moves up to this line */
     885         261 :                 tok->buf = tok->cur;
     886         261 :             tok->line_start = tok->cur;
     887         261 :             tok->lineno++;
     888         261 :             tok->inp = end;
     889         261 :             return Py_CHARMASK(*tok->cur++);
     890             :         }
     891           0 :         if (tok->prompt != NULL) {  /* interactive input: read one line with PyOS_Readline */
     892           0 :             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
     893             : #ifndef PGEN
     894           0 :             if (newtok != NULL) {
     895           0 :                 char *translated = translate_newlines(newtok, 0, tok);  /* normalized copy replaces the raw line */
     896           0 :                 PyMem_FREE(newtok);
     897           0 :                 if (translated == NULL)
     898           0 :                     return EOF;  /* translate_newlines has already set tok->done = E_NOMEM */
     899           0 :                 newtok = translated;
     900             :             }
     901           0 :             if (tok->encoding && newtok && *newtok) {
     902             :                 /* Recode to UTF-8 */
     903             :                 Py_ssize_t buflen;
     904             :                 const char* buf;
     905           0 :                 PyObject *u = translate_into_utf8(newtok, tok->encoding);
     906           0 :                 PyMem_FREE(newtok);
     907           0 :                 if (!u) {
     908           0 :                     tok->done = E_DECODE;
     909           0 :                     return EOF;
     910             :                 }
     911           0 :                 buflen = PyBytes_GET_SIZE(u);
     912           0 :                 buf = PyBytes_AS_STRING(u);
     913           0 :                 if (!buf) {
     914           0 :                     Py_DECREF(u);
     915           0 :                     tok->done = E_DECODE;
     916           0 :                     return EOF;
     917             :                 }
     918           0 :                 newtok = PyMem_MALLOC(buflen+1);  /* NOTE(review): the result is not checked for NULL before the strcpy below */
     919           0 :                 strcpy(newtok, buf);
     920           0 :                 Py_DECREF(u);
     921             :             }
     922             : #endif
     923           0 :             if (tok->nextprompt != NULL)  /* later lines use the continuation prompt */
     924           0 :                 tok->prompt = tok->nextprompt;
     925           0 :             if (newtok == NULL)  /* no line was returned */
     926           0 :                 tok->done = E_INTR;
     927           0 :             else if (*newtok == '\0') {  /* an empty line means end of input */
     928           0 :                 PyMem_FREE(newtok);
     929           0 :                 tok->done = E_EOF;
     930             :             }
     931           0 :             else if (tok->start != NULL) {  /* a token is in progress: append the new line to the existing buffer */
     932           0 :                 size_t start = tok->start - tok->buf;  /* kept as offsets because the realloc below may move the buffer */
     933           0 :                 size_t oldlen = tok->cur - tok->buf;
     934           0 :                 size_t newlen = oldlen + strlen(newtok);
     935           0 :                 char *buf = tok->buf;
     936           0 :                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
     937           0 :                 tok->lineno++;
     938           0 :                 if (buf == NULL) {
     939           0 :                     PyMem_FREE(tok->buf);  /* tok->buf still holds the old pointer here, so it is freed explicitly */
     940           0 :                     tok->buf = NULL;
     941           0 :                     PyMem_FREE(newtok);
     942           0 :                     tok->done = E_NOMEM;
     943           0 :                     return EOF;
     944             :                 }
     945           0 :                 tok->buf = buf;
     946           0 :                 tok->cur = tok->buf + oldlen;
     947           0 :                 tok->line_start = tok->cur;
     948           0 :                 strcpy(tok->buf + oldlen, newtok);
     949           0 :                 PyMem_FREE(newtok);
     950           0 :                 tok->inp = tok->buf + newlen;
     951           0 :                 tok->end = tok->inp + 1;
     952           0 :                 tok->start = tok->buf + start;  /* re-point start into the possibly moved buffer */
     953             :             }
     954             :             else {  /* no token in progress: the new line replaces the buffer */
     955           0 :                 tok->lineno++;
     956           0 :                 if (tok->buf != NULL)
     957           0 :                     PyMem_FREE(tok->buf);
     958           0 :                 tok->buf = newtok;
     959           0 :                 tok->line_start = tok->buf;
     960           0 :                 tok->cur = tok->buf;
     961           0 :                 tok->line_start = tok->buf;  /* repeats the assignment made two statements earlier */
     962           0 :                 tok->inp = strchr(tok->buf, '\0');
     963           0 :                 tok->end = tok->inp + 1;
     964             :             }
     965             :         }
     966             :         else {  /* file input without a prompt */
     967           0 :             int done = 0;  /* set once a complete line, or end of file, has been read */
     968           0 :             Py_ssize_t cur = 0;  /* offset into buf at which reading resumes */
     969             :             char *pt;
     970           0 :             if (tok->start == NULL) {  /* no token in progress: read into the start of the buffer */
     971           0 :                 if (tok->buf == NULL) {
     972           0 :                     tok->buf = (char *)
     973           0 :                         PyMem_MALLOC(BUFSIZ);
     974           0 :                     if (tok->buf == NULL) {
     975           0 :                         tok->done = E_NOMEM;
     976           0 :                         return EOF;
     977             :                     }
     978           0 :                     tok->end = tok->buf + BUFSIZ;
     979             :                 }
     980           0 :                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
     981             :                           tok) == NULL) {
     982           0 :                     tok->done = E_EOF;
     983           0 :                     done = 1;
     984             :                 }
     985             :                 else {
     986           0 :                     tok->done = E_OK;
     987           0 :                     tok->inp = strchr(tok->buf, '\0');
     988           0 :                     done = tok->inp[-1] == '\n';  /* NOTE(review): reads one byte before buf if an empty string was stored; confirm decoding_fgets cannot do that */
     989             :                 }
     990             :             }
     991             :             else {  /* a token is in progress: keep the buffered data and read more after it */
     992           0 :                 cur = tok->cur - tok->buf;
     993           0 :                 if (decoding_feof(tok)) {
     994           0 :                     tok->done = E_EOF;
     995           0 :                     done = 1;
     996             :                 }
     997             :                 else
     998           0 :                     tok->done = E_OK;
     999             :             }
    1000           0 :             tok->lineno++;
    1001             :             /* Read until '\n' or EOF */
    1002           0 :             while (!done) {
    1003           0 :                 Py_ssize_t curstart = tok->start == NULL ? -1 :
    1004           0 :                           tok->start - tok->buf;  /* start is saved as an offset, -1 standing for NULL */
    1005           0 :                 Py_ssize_t curvalid = tok->inp - tok->buf;
    1006           0 :                 Py_ssize_t newsize = curvalid + BUFSIZ;  /* grow so that BUFSIZ bytes are free after the valid data */
    1007           0 :                 char *newbuf = tok->buf;
    1008           0 :                 newbuf = (char *)PyMem_REALLOC(newbuf,
    1009             :                                                newsize);
    1010           0 :                 if (newbuf == NULL) {
    1011           0 :                     tok->done = E_NOMEM;
    1012           0 :                     tok->cur = tok->inp;
    1013           0 :                     return EOF;
    1014             :                 }
    1015           0 :                 tok->buf = newbuf;
    1016           0 :                 tok->inp = tok->buf + curvalid;
    1017           0 :                 tok->end = tok->buf + newsize;
    1018           0 :                 tok->start = curstart < 0 ? NULL :
    1019           0 :                          tok->buf + curstart;
    1020           0 :                 if (decoding_fgets(tok->inp,
    1021           0 :                                (int)(tok->end - tok->inp),
    1022             :                                tok) == NULL) {
    1023             :                     /* Break out early on decoding
    1024             :                        errors, as tok->buf will be NULL
    1025             :                      */
    1026           0 :                     if (tok->decoding_erred)
    1027           0 :                         return EOF;
    1028             :                     /* Last line does not end in \n,
    1029             :                        fake one */
    1030           0 :                     strcpy(tok->inp, "\n");
    1031             :                 }
    1032           0 :                 tok->inp = strchr(tok->inp, '\0');
    1033           0 :                 done = tok->inp[-1] == '\n';
    1034             :             }
    1035           0 :             if (tok->buf != NULL) {
    1036           0 :                 tok->cur = tok->buf + cur;
    1037           0 :                 tok->line_start = tok->cur;
    1038             :                 /* replace "\r\n" with "\n" */
    1039             :                 /* For Mac leave the \r, giving a syntax error */
    1040           0 :                 pt = tok->inp - 2;  /* the byte just before the final character of the line */
    1041           0 :                 if (pt >= tok->buf && *pt == '\r') {
    1042           0 :                     *pt++ = '\n';
    1043           0 :                     *pt = '\0';
    1044           0 :                     tok->inp = pt;  /* the line is now one byte shorter */
    1045             :                 }
    1046             :             }
    1047             :         }
    1048           0 :         if (tok->done != E_OK) {  /* common exit for EOF and errors raised in the prompt and file branches */
    1049           0 :             if (tok->prompt != NULL)
    1050           0 :                 PySys_WriteStderr("\n");
    1051           0 :             tok->cur = tok->inp;
    1052           0 :             return EOF;
    1053             :         }
    1054           0 :     }
    1055             :     /*NOTREACHED*/
    1056             : }
    1057             : 
    1058             : 
    1059             : /* Back-up one character: move tok->cur back by one and make sure the byte there equals C.  Does nothing when C is EOF. */
    1060             : 
    1061             : static void
    1062        1769 : tok_backup(register struct tok_state *tok, register int c)
    1063             : {
    1064        1769 :     if (c != EOF) {
    1065        1766 :         if (--tok->cur < tok->buf)
    1066           0 :             Py_FatalError("tok_backup: beginning of buffer");
    1067        1766 :         if (*tok->cur != c)  /* the buffer is written only when the byte differs */
    1068           0 :             *tok->cur = c;
    1069             :     }
    1070        1769 : }
    1071             : 
    1072             : 
    1073             : /* Return the token corresponding to a single character; any character not listed below yields OP. */
    1074             : 
    1075             : int
    1076         564 : PyToken_OneChar(int c)
    1077             : {
    1078         564 :     switch (c) {
    1079         123 :     case '(':           return LPAR;
    1080         123 :     case ')':           return RPAR;
    1081          23 :     case '[':           return LSQB;
    1082          23 :     case ']':           return RSQB;
    1083          60 :     case ':':           return COLON;
    1084         110 :     case ',':           return COMMA;
    1085           0 :     case ';':           return SEMI;
    1086          13 :     case '+':           return PLUS;
    1087           1 :     case '-':           return MINUS;
    1088           1 :     case '*':           return STAR;
    1089           0 :     case '/':           return SLASH;
    1090           0 :     case '|':           return VBAR;
    1091           0 :     case '&':           return AMPER;
    1092           0 :     case '<':           return LESS;
    1093           2 :     case '>':           return GREATER;
    1094          74 :     case '=':           return EQUAL;
    1095           0 :     case '.':           return DOT;
    1096           7 :     case '%':           return PERCENT;
    1097           1 :     case '{':           return LBRACE;
    1098           1 :     case '}':           return RBRACE;
    1099           0 :     case '^':           return CIRCUMFLEX;
    1100           0 :     case '~':           return TILDE;
    1101           2 :     case '@':       return AT;
    1102           0 :     default:            return OP;
    1103             :     }
    1104             : }
    1105             : 
    1106             : 
    1107             : int  /* Return the token for the two-character operator formed by C1 followed by C2, or OP if the pair is not one of those listed. */
    1108         579 : PyToken_TwoChars(int c1, int c2)
    1109             : {
    1110         579 :     switch (c1) {
    1111             :     case '=':
    1112          83 :         switch (c2) {
    1113           9 :         case '=':               return EQEQUAL;
    1114             :         }
    1115          74 :         break;
    1116             :     case '!':
    1117           3 :         switch (c2) {
    1118           3 :         case '=':               return NOTEQUAL;
    1119             :         }
    1120           0 :         break;
    1121             :     case '<':
    1122           0 :         switch (c2) {
    1123           0 :         case '>':               return NOTEQUAL;  /* the <> spelling maps to the same token as != */
    1124           0 :         case '=':               return LESSEQUAL;
    1125           0 :         case '<':               return LEFTSHIFT;
    1126             :         }
    1127           0 :         break;
    1128             :     case '>':
    1129           2 :         switch (c2) {
    1130           0 :         case '=':               return GREATEREQUAL;
    1131           0 :         case '>':               return RIGHTSHIFT;
    1132             :         }
    1133           2 :         break;
    1134             :     case '+':
    1135          14 :         switch (c2) {
    1136           1 :         case '=':               return PLUSEQUAL;
    1137             :         }
    1138          13 :         break;
    1139             :     case '-':
    1140           1 :         switch (c2) {
    1141           0 :         case '=':               return MINEQUAL;
    1142           0 :         case '>':               return RARROW;
    1143             :         }
    1144           1 :         break;
    1145             :     case '*':
    1146           3 :         switch (c2) {
    1147           2 :         case '*':               return DOUBLESTAR;
    1148           0 :         case '=':               return STAREQUAL;
    1149             :         }
    1150           1 :         break;
    1151             :     case '/':
    1152           0 :         switch (c2) {
    1153           0 :         case '/':               return DOUBLESLASH;
    1154           0 :         case '=':               return SLASHEQUAL;
    1155             :         }
    1156           0 :         break;
    1157             :     case '|':
    1158           0 :         switch (c2) {
    1159           0 :         case '=':               return VBAREQUAL;
    1160             :         }
    1161           0 :         break;
    1162             :     case '%':
    1163           7 :         switch (c2) {
    1164           0 :         case '=':               return PERCENTEQUAL;
    1165             :         }
    1166           7 :         break;
    1167             :     case '&':
    1168           0 :         switch (c2) {
    1169           0 :         case '=':               return AMPEREQUAL;
    1170             :         }
    1171           0 :         break;
    1172             :     case '^':
    1173           0 :         switch (c2) {
    1174           0 :         case '=':               return CIRCUMFLEXEQUAL;
    1175             :         }
    1176           0 :         break;
    1177             :     }
    1178         564 :     return OP;  /* reached whenever no case above returned */
    1179             : }
    1180             : 
    1181             : int  /* Return the token for the three-character operator formed by C1, C2, C3 in that order, or OP if the triple is not one of those listed. */
    1182          15 : PyToken_ThreeChars(int c1, int c2, int c3)
    1183             : {
    1184          15 :     switch (c1) {
    1185             :     case '<':
    1186           0 :         switch (c2) {
    1187             :         case '<':
    1188           0 :             switch (c3) {
    1189             :             case '=':
    1190           0 :                 return LEFTSHIFTEQUAL;
    1191             :             }
    1192           0 :             break;
    1193             :         }
    1194           0 :         break;
    1195             :     case '>':
    1196           0 :         switch (c2) {
    1197             :         case '>':
    1198           0 :             switch (c3) {
    1199             :             case '=':
    1200           0 :                 return RIGHTSHIFTEQUAL;
    1201             :             }
    1202           0 :             break;
    1203             :         }
    1204           0 :         break;
    1205             :     case '*':
    1206           2 :         switch (c2) {
    1207             :         case '*':
    1208           2 :             switch (c3) {
    1209             :             case '=':
    1210           0 :                 return DOUBLESTAREQUAL;
    1211             :             }
    1212           2 :             break;
    1213             :         }
    1214           2 :         break;
    1215             :     case '/':
    1216           0 :         switch (c2) {
    1217             :         case '/':
    1218           0 :             switch (c3) {
    1219             :             case '=':
    1220           0 :                 return DOUBLESLASHEQUAL;
    1221             :             }
    1222           0 :             break;
    1223             :         }
    1224           0 :         break;
    1225             :     case '.':
    1226           0 :         switch (c2) {
    1227             :         case '.':
    1228           0 :             switch (c3) {
    1229             :             case '.':
    1230           0 :                 return ELLIPSIS;
    1231             :             }
    1232           0 :             break;
    1233             :         }
    1234           0 :         break;
    1235             :     }
    1236          15 :     return OP;  /* reached whenever no case above returned */
    1237             : }
    1238             : 
    1239             : static int  /* Returns 1 when the problem is recorded as an error on TOK, 0 when it is at most warned about. */
    1240           0 : indenterror(struct tok_state *tok)
    1241             : {
    1242           0 :     if (tok->alterror) {
    1243           0 :         tok->done = E_TABSPACE;
    1244           0 :         tok->cur = tok->inp;  /* move the read position to the end of the buffered input */
    1245           0 :         return 1;
    1246             :     }
    1247           0 :     if (tok->altwarning) {
    1248             : #ifdef PGEN
    1249           0 :         PySys_WriteStderr("inconsistent use of tabs and spaces "
    1250             :                           "in indentation\n");
    1251             : #else
    1252           0 :         PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
    1253             :                           "in indentation\n", tok->filename);
    1254             : #endif
    1255           0 :         tok->altwarning = 0;  /* cleared so the warning is printed only once per tok */
    1256             :     }
    1257           0 :     return 0;
    1258             : }
    1259             : 
    1260             : #ifdef PGEN
    1261             : #define verify_identifier(tok) 1  /* PGEN builds accept every identifier without checking */
    1262             : #else
    1263             : /* Verify that the identifier follows PEP 3131.
    1264             :    All identifier strings are guaranteed to be "ready" unicode objects.
    1265             :    The identifier is the byte range from tok->start up to tok->cur, decoded as UTF-8.  Returns the result of PyUnicode_IsIdentifier, or 0 with tok->done set when the text cannot be decoded or is not an identifier.
    1266             :  */
    1267           0 : static int
    1268           0 : verify_identifier(struct tok_state *tok)
    1269             : {
    1270             :     PyObject *s;
    1271             :     int result;
    1272           0 :     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    1273           0 :     if (s == NULL || PyUnicode_READY(s) == -1) {  /* NOTE(review): when PyUnicode_READY fails, s is not released on this path */
    1274           0 :         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
    1275           0 :             PyErr_Clear();  /* a decode error is reported as an invalid identifier rather than as an exception */
    1276           0 :             tok->done = E_IDENTIFIER;
    1277             :         } else {
    1278           0 :             tok->done = E_ERROR;  /* any other exception is left set */
    1279             :         }
    1280           0 :         return 0;
    1281             :     }
    1282           0 :     result = PyUnicode_IsIdentifier(s);
    1283           0 :     Py_DECREF(s);
    1284           0 :     if (result == 0)
    1285           0 :         tok->done = E_IDENTIFIER;
    1286           0 :     return result;
    1287             : }
    1288             : #endif
    1288             : 
    1289             : /* Get next token, after space stripping etc.

                          Returns the token type.  *p_start and *p_end are reset to NULL on
                          entry.  For NAME, NUMBER, STRING, NEWLINE, DOT, ELLIPSIS and
                          operator/punctuation tokens they are set to the start of the token
                          and to the position just after it (for NEWLINE the '\n' itself is
                          left out).  They stay NULL for INDENT, DEDENT, ENDMARKER and
                          ERRORTOKEN.  Every error return visible here either stores an E_*
                          code in tok->done or, at EOF, relies on the value already there. */
    1290             : 
    1291             : static int
    1292        1823 : tok_get(register struct tok_state *tok, char **p_start, char **p_end)
    1293             : {
    1294             :     register int c;
    1295             :     int blankline, nonascii;
    1296             : 
    1297        1823 :     *p_start = *p_end = NULL;
                           /* Re-entered after a newline that produces no NEWLINE token. */
    1298             :   nextline:
    1299        1884 :     tok->start = NULL;
    1300        1884 :     blankline = 0;
    1301             : 
    1302             :     /* Get indentation level */
    1303        1884 :     if (tok->atbol) {
                               /* col uses tok->tabsize, altcol uses tok->alttabsize; the two are
                                  compared below to detect inconsistent tab/space usage. */
    1304         259 :         register int col = 0;
    1305         259 :         register int altcol = 0;
    1306         259 :         tok->atbol = 0;
    1307             :         for (;;) {
    1308        1991 :             c = tok_nextc(tok);
    1309        1991 :             if (c == ' ')
    1310        1732 :                 col++, altcol++;
    1311         259 :             else if (c == '\t') {
                                       /* Advance each column to its next tab stop. */
    1312           0 :                 col = (col/tok->tabsize + 1) * tok->tabsize;
    1313           0 :                 altcol = (altcol/tok->alttabsize + 1)
    1314           0 :                     * tok->alttabsize;
    1315             :             }
    1316         259 :             else if (c == '\014') /* Control-L (formfeed) */
    1317           0 :                 col = altcol = 0; /* For Emacs users */
    1318             :             else
    1319         259 :                 break;
    1320        1732 :         }
    1321         259 :         tok_backup(tok, c);
    1322         259 :         if (c == '#' || c == '\n') {
    1323             :             /* Lines with only whitespace and/or comments
    1324             :                shouldn't affect the indentation and are
    1325             :                not passed to the parser as NEWLINE tokens,
    1326             :                except *totally* empty lines in interactive
    1327             :                mode, which signal the end of a command group. */
    1328          59 :             if (col == 0 && c == '\n' && tok->prompt != NULL)
    1329           0 :                 blankline = 0; /* Let it through */
    1330             :             else
    1331          59 :                 blankline = 1; /* Ignore completely */
    1332             :             /* We can't jump back right here since we still
    1333             :                may need to skip to the end of a comment */
    1334             :         }
                               /* Indentation is only interpreted outside brackets (level == 0). */
    1335         259 :         if (!blankline && tok->level == 0) {
    1336         198 :             if (col == tok->indstack[tok->indent]) {
    1337             :                 /* No change */
    1338          93 :                 if (altcol != tok->altindstack[tok->indent]) {
    1339           0 :                     if (indenterror(tok))
    1340           0 :                         return ERRORTOKEN;
    1341             :                 }
    1342             :             }
    1343         105 :             else if (col > tok->indstack[tok->indent]) {
    1344             :                 /* Indent -- always one */
    1345          56 :                 if (tok->indent+1 >= MAXINDENT) {
    1346           0 :                     tok->done = E_TOODEEP;
    1347           0 :                     tok->cur = tok->inp;
    1348           0 :                     return ERRORTOKEN;
    1349             :                 }
                                       /* Deeper by the primary measure but not by the alternate
                                          one: tabs and spaces are mixed inconsistently. */
    1350          56 :                 if (altcol <= tok->altindstack[tok->indent]) {
    1351           0 :                     if (indenterror(tok))
    1352           0 :                         return ERRORTOKEN;
    1353             :                 }
    1354          56 :                 tok->pendin++;
    1355          56 :                 tok->indstack[++tok->indent] = col;
    1356          56 :                 tok->altindstack[tok->indent] = altcol;
    1357             :             }
    1358             :             else /* col < tok->indstack[tok->indent] */ {
    1359             :                 /* Dedent -- any number, must be consistent */
    1360         254 :                 while (tok->indent > 0 &&
    1361         100 :                     col < tok->indstack[tok->indent]) {
    1362          56 :                     tok->pendin--;
    1363          56 :                     tok->indent--;
    1364             :                 }
                                       /* The new column must match a level already on the stack. */
    1365          49 :                 if (col != tok->indstack[tok->indent]) {
    1366           0 :                     tok->done = E_DEDENT;
    1367           0 :                     tok->cur = tok->inp;
    1368           0 :                     return ERRORTOKEN;
    1369             :                 }
    1370          49 :                 if (altcol != tok->altindstack[tok->indent]) {
    1371           0 :                     if (indenterror(tok))
    1372           0 :                         return ERRORTOKEN;
    1373             :                 }
    1374             :             }
    1375             :         }
    1376             :     }
    1377             : 
    1378        1884 :     tok->start = tok->cur;
    1379             : 
    1380             :     /* Return pending indents/dedents */
                           /* pendin < 0 means DEDENT tokens are owed, pendin > 0 means INDENT
                              tokens are owed; one is handed out per call. */
    1381        1884 :     if (tok->pendin != 0) {
    1382         112 :         if (tok->pendin < 0) {
    1383          56 :             tok->pendin++;
    1384          56 :             return DEDENT;
    1385             :         }
    1386             :         else {
    1387          56 :             tok->pendin--;
    1388          56 :             return INDENT;
    1389             :         }
    1390             :     }
    1391             : 
                           /* Re-entered after a backslash line continuation. */
    1392             :  again:
    1393        1777 :     tok->start = NULL;
    1394             :     /* Skip spaces */
    1395             :     do {
    1396        2288 :         c = tok_nextc(tok);
    1397        2288 :     } while (c == ' ' || c == '\t' || c == '\014');
    1398             : 
    1399             :     /* Set start of current token */
    1400        1777 :     tok->start = tok->cur - 1;
    1401             : 
    1402             :     /* Skip comment */
                           /* Leaves c at the terminating '\n' or at EOF. */
    1403        1777 :     if (c == '#')
    1404         410 :         while (c != EOF && c != '\n')
    1405         390 :             c = tok_nextc(tok);
    1406             : 
    1407             :     /* Check for EOF and errors now */
    1408        1777 :     if (c == EOF) {
    1409           6 :         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    1410             :     }
    1411             : 
    1412             :     /* Identifier (most frequent token!) */
    1413        1771 :     nonascii = 0;
    1414        1771 :     if (is_potential_identifier_start(c)) {
    1415             :         /* Process b"", r"", u"", br"" and rb"" */
    1416         701 :         int saw_b = 0, saw_r = 0, saw_u = 0;
    1417             :         while (1) {
    1418         778 :             if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
    1419           3 :                 saw_b = 1;
    1420             :             /* Since this is a backwards compatibility support literal we don't
    1421             :                want to support it in arbitrary order like byte literals. */
    1422         775 :             else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
    1423           6 :                 saw_u = 1;
    1424             :             /* ur"" and ru"" are not supported */
    1425         769 :             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
    1426          68 :                 saw_r = 1;
    1427             :             else
    1428             :                 break;
    1429          77 :             c = tok_nextc(tok);
                                   /* A quote right after the prefix letters starts a string. */
    1430          77 :             if (c == '"' || c == '\'')
    1431             :                 goto letter_quote;
    1432          77 :         }
    1433        6193 :         while (is_potential_identifier_char(c)) {
    1434        4791 :             if (c >= 128)
    1435           0 :                 nonascii = 1;
    1436        4791 :             c = tok_nextc(tok);
    1437             :         }
    1438         701 :         tok_backup(tok, c);
                               /* Only names containing a byte >= 128 need the full check.
                                  NOTE(review): verify_identifier() already stores E_IDENTIFIER
                                  or E_ERROR in tok->done; the assignment below turns an
                                  E_ERROR into E_IDENTIFIER -- confirm this is intended. */
    1439         701 :         if (nonascii &&
    1440           0 :             !verify_identifier(tok)) {
    1441           0 :             tok->done = E_IDENTIFIER;
    1442           0 :             return ERRORTOKEN;
    1443             :         }
    1444         701 :         *p_start = tok->start;
    1445         701 :         *p_end = tok->cur;
    1446         701 :         return NAME;
    1447             :     }
    1448             : 
    1449             :     /* Newline */
    1450        1070 :     if (c == '\n') {
    1451         256 :         tok->atbol = 1;
                               /* Blank lines and newlines inside brackets yield no token. */
    1452         256 :         if (blankline || tok->level > 0)
    1453             :             goto nextline;
    1454         195 :         *p_start = tok->start;
    1455         195 :         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
    1456         195 :         tok->cont_line = 0;
    1457         195 :         return NEWLINE;
    1458             :     }
    1459             : 
    1460             :     /* Period or number starting with period? */
    1461         814 :     if (c == '.') {
    1462         120 :         c = tok_nextc(tok);
    1463         120 :         if (isdigit(c)) {
    1464           0 :             goto fraction;
    1465         120 :         } else if (c == '.') {
    1466           0 :             c = tok_nextc(tok);
    1467           0 :             if (c == '.') {
    1468           0 :                 *p_start = tok->start;
    1469           0 :                 *p_end = tok->cur;
    1470           0 :                 return ELLIPSIS;
    1471             :             } else {
    1472           0 :                 tok_backup(tok, c);
    1473             :             }
                                   /* Only two dots: push the second one back and return a
                                      single DOT. */
    1474           0 :             tok_backup(tok, '.');
    1475             :         } else {
    1476         120 :             tok_backup(tok, c);
    1477             :         }
    1478         120 :         *p_start = tok->start;
    1479         120 :         *p_end = tok->cur;
    1480         120 :         return DOT;
    1481             :     }
    1482             : 
    1483             :     /* Number */
    1484         694 :     if (isdigit(c)) {
    1485          33 :         if (c == '0') {
    1486             :             /* Hex, octal or binary -- maybe. */
    1487           9 :             c = tok_nextc(tok);
    1488           9 :             if (c == '.')
    1489           0 :                 goto fraction;
    1490           9 :             if (c == 'j' || c == 'J')
    1491             :                 goto imaginary;
    1492           9 :             if (c == 'x' || c == 'X') {
    1493             : 
    1494             :                 /* Hex */
                                       /* At least one digit must follow the prefix. */
    1495           0 :                 c = tok_nextc(tok);
    1496           0 :                 if (!isxdigit(c)) {
    1497           0 :                     tok->done = E_TOKEN;
    1498           0 :                     tok_backup(tok, c);
    1499           0 :                     return ERRORTOKEN;
    1500             :                 }
    1501             :                 do {
    1502           0 :                     c = tok_nextc(tok);
    1503           0 :                 } while (isxdigit(c));
    1504             :             }
    1505           9 :             else if (c == 'o' || c == 'O') {
    1506             :                 /* Octal */
    1507           0 :                 c = tok_nextc(tok);
    1508           0 :                 if (c < '0' || c >= '8') {
    1509           0 :                     tok->done = E_TOKEN;
    1510           0 :                     tok_backup(tok, c);
    1511           0 :                     return ERRORTOKEN;
    1512             :                 }
    1513             :                 do {
    1514           0 :                     c = tok_nextc(tok);
    1515           0 :                 } while ('0' <= c && c < '8');
    1516             :             }
    1517           9 :             else if (c == 'b' || c == 'B') {
    1518             :                 /* Binary */
    1519           0 :                 c = tok_nextc(tok);
    1520           0 :                 if (c != '0' && c != '1') {
    1521           0 :                     tok->done = E_TOKEN;
    1522           0 :                     tok_backup(tok, c);
    1523           0 :                     return ERRORTOKEN;
    1524             :                 }
    1525             :                 do {
    1526           0 :                     c = tok_nextc(tok);
    1527           0 :                 } while (c == '0' || c == '1');
    1528             :             }
    1529             :             else {
    1530           9 :                 int nonzero = 0;
    1531             :                 /* maybe old-style octal; c is first char of it */
    1532             :                 /* in any case, allow '0' as a literal */
    1533          18 :                 while (c == '0')
    1534           0 :                     c = tok_nextc(tok);
    1535          18 :                 while (isdigit(c)) {
    1536           0 :                     nonzero = 1;
    1537           0 :                     c = tok_nextc(tok);
    1538             :                 }
                                       /* Digits after a leading zero are accepted only when a
                                          fraction, exponent or imaginary suffix follows. */
    1539           9 :                 if (c == '.')
    1540           0 :                     goto fraction;
    1541           9 :                 else if (c == 'e' || c == 'E')
    1542             :                     goto exponent;
    1543           9 :                 else if (c == 'j' || c == 'J')
    1544             :                     goto imaginary;
    1545           9 :                 else if (nonzero) {
    1546           0 :                     tok->done = E_TOKEN;
    1547           0 :                     tok_backup(tok, c);
    1548           0 :                     return ERRORTOKEN;
    1549             :                 }
    1550             :             }
    1551             :         }
    1552             :         else {
    1553             :             /* Decimal */
    1554             :             do {
    1555          24 :                 c = tok_nextc(tok);
    1556          24 :             } while (isdigit(c));
                                   /* The fraction, exponent and imaginary labels below are also
                                      jump targets for the '.'-prefixed and '0'-prefixed paths
                                      above. */
    1557             :             {
    1558             :                 /* Accept floating point numbers. */
    1559          24 :                 if (c == '.') {
    1560             :         fraction:
    1561             :                     /* Fraction */
    1562             :                     do {
    1563           0 :                         c = tok_nextc(tok);
    1564           0 :                     } while (isdigit(c));
    1565             :                 }
    1566          24 :                 if (c == 'e' || c == 'E') {
    1567             :         exponent:
    1568             :                     /* Exponent part */
    1569           0 :                     c = tok_nextc(tok);
    1570           0 :                     if (c == '+' || c == '-')
    1571           0 :                         c = tok_nextc(tok);
    1572           0 :                     if (!isdigit(c)) {
    1573           0 :                         tok->done = E_TOKEN;
    1574           0 :                         tok_backup(tok, c);
    1575           0 :                         return ERRORTOKEN;
    1576             :                     }
    1577             :                     do {
    1578           0 :                         c = tok_nextc(tok);
    1579           0 :                     } while (isdigit(c));
    1580             :                 }
    1581          24 :                 if (c == 'j' || c == 'J')
    1582             :                     /* Imaginary part */
    1583             :         imaginary:
    1584           0 :                     c = tok_nextc(tok);
    1585             :             }
    1586             :         }
                               /* c is the first character after the literal; push it back. */
    1587          33 :         tok_backup(tok, c);
    1588          33 :         *p_start = tok->start;
    1589          33 :         *p_end = tok->cur;
    1590          33 :         return NUMBER;
    1591             :     }
    1592             : 
    1593             :   letter_quote:
    1594             :     /* String */
    1595         661 :     if (c == '\'' || c == '"') {
    1596          77 :         int quote = c;
    1597          77 :         int quote_size = 1;             /* 1 or 3 */
                               /* Number of consecutive closing quote characters seen so far. */
    1598          77 :         int end_quote_size = 0;
    1599             : 
    1600             :         /* Find the quote size and start of string */
    1601          77 :         c = tok_nextc(tok);
    1602          77 :         if (c == quote) {
    1603           5 :             c = tok_nextc(tok);
    1604           5 :             if (c == quote)
    1605           0 :                 quote_size = 3;
    1606             :             else
    1607           5 :                 end_quote_size = 1;     /* empty string found */
    1608             :         }
                               /* Anything read that was not part of the opening quotes goes back. */
    1609          77 :         if (c != quote)
    1610          77 :             tok_backup(tok, c);
    1611             : 
    1612             :         /* Get rest of string */
    1613        1947 :         while (end_quote_size != quote_size) {
    1614        1793 :             c = tok_nextc(tok);
    1615        1793 :             if (c == EOF) {
    1616           0 :                 if (quote_size == 3)
    1617           0 :                     tok->done = E_EOFS;
    1618             :                 else
    1619           0 :                     tok->done = E_EOLS;
    1620           0 :                 tok->cur = tok->inp;
    1621           0 :                 return ERRORTOKEN;
    1622             :             }
                                   /* A single-quoted string may not span a raw newline. */
    1623        1793 :             if (quote_size == 1 && c == '\n') {
    1624           0 :                 tok->done = E_EOLS;
    1625           0 :                 tok->cur = tok->inp;
    1626           0 :                 return ERRORTOKEN;
    1627             :             }
    1628        1793 :             if (c == quote)
    1629          72 :                 end_quote_size += 1;
    1630             :             else {
    1631        1721 :                 end_quote_size = 0;
    1632        1721 :                 if (c == '\\')
    1633           3 :                 c = tok_nextc(tok);  /* skip escaped char */
    1634             :             }
    1635             :         }
    1636             : 
    1637          77 :         *p_start = tok->start;
    1638          77 :         *p_end = tok->cur;
    1639          77 :         return STRING;
    1640             :     }
    1641             : 
    1642             :     /* Line continuation */
    1643         584 :     if (c == '\\') {
    1644           5 :         c = tok_nextc(tok);
                               /* The backslash must be the last character on the line. */
    1645           5 :         if (c != '\n') {
    1646           0 :             tok->done = E_LINECONT;
    1647           0 :             tok->cur = tok->inp;
    1648           0 :             return ERRORTOKEN;
    1649             :         }
    1650           5 :         tok->cont_line = 1;
    1651           5 :         goto again; /* Read next line */
    1652             :     }
    1653             : 
    1654             :     /* Check for two-character token */
    1655             :     {
    1656         579 :         int c2 = tok_nextc(tok);
    1657         579 :         int token = PyToken_TwoChars(c, c2);
    1658         579 :         if (token != OP) {
                                   /* A two-character operator matched; try to extend it to three. */
    1659          15 :             int c3 = tok_nextc(tok);
    1660          15 :             int token3 = PyToken_ThreeChars(c, c2, c3);
    1661          15 :             if (token3 != OP) {
    1662           0 :                 token = token3;
    1663             :             } else {
    1664          15 :                 tok_backup(tok, c3);
    1665             :             }
    1666          15 :             *p_start = tok->start;
    1667          15 :             *p_end = tok->cur;
    1668          15 :             return token;
    1669             :         }
    1670         564 :         tok_backup(tok, c2);
    1671             :     }
    1672             : 
    1673             :     /* Keep track of parentheses nesting level */
                           /* NOTE(review): the level is decremented without a lower-bound check
                              here, so unbalanced closers can make it negative -- confirm that
                              this is handled elsewhere. */
    1674         564 :     switch (c) {
    1675             :     case '(':
    1676             :     case '[':
    1677             :     case '{':
    1678         147 :         tok->level++;
    1679         147 :         break;
    1680             :     case ')':
    1681             :     case ']':
    1682             :     case '}':
    1683         147 :         tok->level--;
    1684         147 :         break;
    1685             :     }
    1686             : 
    1687             :     /* Punctuation character */
    1688         564 :     *p_start = tok->start;
    1689         564 :     *p_end = tok->cur;
    1690         564 :     return PyToken_OneChar(c);
    1691             : }
    1692             : /* Public wrapper around tok_get(): return the next token type and its
                          bounds through p_start/p_end.  If tok->decoding_erred is set after
                          the call, the result is replaced by ERRORTOKEN and tok->done is set
                          to E_DECODE. */
    1693             : int
    1694        1823 : PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
    1695             : {
    1696        1823 :     int result = tok_get(tok, p_start, p_end);
    1697        1823 :     if (tok->decoding_erred) {
    1698           0 :         result = ERRORTOKEN;
    1699           0 :         tok->done = E_DECODE;
    1700             :     }
    1701        1823 :     return result;
    1702             : }
    1703             : 
    1704             : /* Get the encoding of a Python file. Check for the coding cookie and check if
    1705             :    the file starts with a BOM.
    1706             : 
    1707             :    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
    1708             :    encoding in the first or second line of the file (in which case the encoding
    1709             :    should be assumed to be UTF-8).
    1710             : 
    1711             :    The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
    1712             :    by the caller.

                          NULL is also returned when dup(), fdopen(), tokenizer creation, the
                          default file name allocation or PyMem_MALLOC() fails, so a NULL result
                          does not by itself distinguish "no encoding declared" from an error.

                          When filename is NULL (non-PGEN builds), "<string>" is used as the
                          tokenizer's file name. */
    1713             : 
    1714             : char *
    1715           0 : PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
    1716             : {
    1717             :     struct tok_state *tok;
    1718             :     FILE *fp;
    1719           0 :     char *p_start =NULL , *p_end =NULL , *encoding = NULL;
    1720             : 
                           /* Work on a duplicate descriptor; the fclose() calls below then act on
                              the duplicate (presumably so the caller's fd stays open). */
    1721           0 :     fd = dup(fd);
    1722           0 :     if (fd < 0) {
    1723           0 :         return NULL;
    1724             :     }
    1725           0 :     fp = fdopen(fd, "r");
    1726           0 :     if (fp == NULL) {
                               /* NOTE(review): the descriptor obtained from dup() is not closed
                                  on this path -- looks like a descriptor leak; confirm. */
    1727           0 :         return NULL;
    1728             :     }
    1729           0 :     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    1730           0 :     if (tok == NULL) {
    1731           0 :         fclose(fp);
    1732           0 :         return NULL;
    1733             :     }
    1734             : #ifndef PGEN
    1735           0 :     if (filename != NULL) {
                               /* The tokenizer state keeps its own reference to the file name. */
    1736           0 :         Py_INCREF(filename);
    1737           0 :         tok->filename = filename;
    1738             :     }
    1739             :     else {
    1740           0 :         tok->filename = PyUnicode_FromString("<string>");
    1741           0 :         if (tok->filename == NULL) {
    1742           0 :             fclose(fp);
    1743           0 :             PyTokenizer_Free(tok);
                                   /* encoding is still NULL here. */
    1744           0 :             return encoding;
    1745             :         }
    1746             :     }
    1747             : #endif
                           /* Pull tokens until the tokenizer's line counter reaches 2 or it stops;
                              the tokens themselves are discarded, only tok->encoding is read
                              afterwards. */
    1748           0 :     while (tok->lineno < 2 && tok->done == E_OK) {
    1749           0 :         PyTokenizer_Get(tok, &p_start, &p_end);
    1750             :     }
    1751           0 :     fclose(fp);
    1752           0 :     if (tok->encoding) {
                               /* Copy the name: tok is released by PyTokenizer_Free() below. */
    1753           0 :         encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
    1754           0 :         if (encoding)
    1755           0 :         strcpy(encoding, tok->encoding);
    1756             :     }
    1757           0 :     PyTokenizer_Free(tok);
    1758           0 :     return encoding;
    1759             : }
    1760             : /* Same as PyTokenizer_FindEncodingFilename() with a NULL filename; the
                          return value and its ownership are those of that function. */
    1761             : char *
    1762           0 : PyTokenizer_FindEncoding(int fd)
    1763             : {
    1764           0 :     return PyTokenizer_FindEncodingFilename(fd, NULL);
    1765             : }
    1766             : 
    1767             : #ifdef Py_DEBUG
    1768             : /* Debug helper (Py_DEBUG builds only): print the name of token `type` to
                          stdout, looked up in _PyParser_TokenNames.  For NAME, NUMBER, STRING
                          and OP tokens the bytes in [start, end) follow in parentheses.  No
                          newline is written. */
    1769             : void
    1770             : tok_dump(int type, char *start, char *end)
    1771             : {
    1772             :     printf("%s", _PyParser_TokenNames[type]);
    1773             :     if (type == NAME || type == NUMBER || type == STRING || type == OP)
                               /* "%.*s" limits the output to end - start bytes, so the text
                                  need not be NUL-terminated at end. */
    1774             :         printf("(%.*s)", (int)(end - start), start);
    1775             : }
    1776             : 
    1777             : #endif

Generated by: LCOV version 1.10