LCOV - libreoffice_filtered.info - libreoffice/workdir/unxlngi6.pro/UnpackedTarball/python3/Objects/unicodeobject.c

LCOV - code coverage report

Current view:	top level - libreoffice/workdir/unxlngi6.pro/UnpackedTarball/python3/Objects - unicodeobject.c (source / functions)		Hit	Total	Coverage
Test:	libreoffice_filtered.info	Lines:	1709	6505	26.3 %
Date:	2012-12-17	Functions:	118	291	40.5 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             : 
       3             : Unicode implementation based on original code by Fredrik Lundh,
       4             : modified by Marc-Andre Lemburg <mal@lemburg.com>.
       5             : 
       6             : Major speed upgrades to the method implementations at the Reykjavik
       7             : NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
       8             : 
       9             : Copyright (c) Corporation for National Research Initiatives.
      10             : 
      11             : --------------------------------------------------------------------
      12             : The original string type implementation is:
      13             : 
      14             :   Copyright (c) 1999 by Secret Labs AB
      15             :   Copyright (c) 1999 by Fredrik Lundh
      16             : 
      17             : By obtaining, using, and/or copying this software and/or its
      18             : associated documentation, you agree that you have read, understood,
      19             : and will comply with the following terms and conditions:
      20             : 
      21             : Permission to use, copy, modify, and distribute this software and its
      22             : associated documentation for any purpose and without fee is hereby
      23             : granted, provided that the above copyright notice appears in all
      24             : copies, and that both that copyright notice and this permission notice
      25             : appear in supporting documentation, and that the name of Secret Labs
      26             : AB or the author not be used in advertising or publicity pertaining to
      27             : distribution of the software without specific, written prior
      28             : permission.
      29             : 
      30             : SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
      31             : THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
      32             : FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
      33             : ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
      34             : WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
      35             : ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
      36             : OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      37             : --------------------------------------------------------------------
      38             : 
      39             : */
      40             : 
      41             : #define PY_SSIZE_T_CLEAN
      42             : #include "Python.h"
      43             : #include "ucnhash.h"
      44             : #include "bytes_methods.h"
      45             : 
      46             : #ifdef MS_WINDOWS
      47             : #include <windows.h>
      48             : #endif
      49             : 
      50             : /* Endianness switches; defaults to little endian */
      51             : 
      52             : #ifdef WORDS_BIGENDIAN
      53             : # define BYTEORDER_IS_BIG_ENDIAN
      54             : #else
      55             : # define BYTEORDER_IS_LITTLE_ENDIAN
      56             : #endif
      57             : 
      58             : /* --- Globals ------------------------------------------------------------
      59             : 
      60             :    The globals are initialized by the _PyUnicode_Init() API and should
      61             :    not be used before calling that API.
      62             : 
      63             : */
      64             : 
      65             : 
      66             : #ifdef __cplusplus
      67             : extern "C" {
      68             : #endif
      69             : 
      70             : /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
      71             : #define MAX_UNICODE 0x10ffff
      72             : 
      73             : #ifdef Py_DEBUG
      74             : #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
      75             : #else
      76             : #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
      77             : #endif
      78             : 
      79             : #define _PyUnicode_UTF8(op)                             \
      80             :     (((PyCompactUnicodeObject*)(op))->utf8)
      81             : #define PyUnicode_UTF8(op)                              \
      82             :     (assert(_PyUnicode_CHECK(op)),                      \
      83             :      assert(PyUnicode_IS_READY(op)),                    \
      84             :      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
      85             :          ((char*)((PyASCIIObject*)(op) + 1)) :          \
      86             :          _PyUnicode_UTF8(op))
      87             : #define _PyUnicode_UTF8_LENGTH(op)                      \
      88             :     (((PyCompactUnicodeObject*)(op))->utf8_length)
      89             : #define PyUnicode_UTF8_LENGTH(op)                       \
      90             :     (assert(_PyUnicode_CHECK(op)),                      \
      91             :      assert(PyUnicode_IS_READY(op)),                    \
      92             :      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
      93             :          ((PyASCIIObject*)(op))->length :               \
      94             :          _PyUnicode_UTF8_LENGTH(op))
      95             : #define _PyUnicode_WSTR(op)                             \
      96             :     (((PyASCIIObject*)(op))->wstr)
      97             : #define _PyUnicode_WSTR_LENGTH(op)                      \
      98             :     (((PyCompactUnicodeObject*)(op))->wstr_length)
      99             : #define _PyUnicode_LENGTH(op)                           \
     100             :     (((PyASCIIObject *)(op))->length)
     101             : #define _PyUnicode_STATE(op)                            \
     102             :     (((PyASCIIObject *)(op))->state)
     103             : #define _PyUnicode_HASH(op)                             \
     104             :     (((PyASCIIObject *)(op))->hash)
     105             : #define _PyUnicode_KIND(op)                             \
     106             :     (assert(_PyUnicode_CHECK(op)),                      \
     107             :      ((PyASCIIObject *)(op))->state.kind)
     108             : #define _PyUnicode_GET_LENGTH(op)                       \
     109             :     (assert(_PyUnicode_CHECK(op)),                      \
     110             :      ((PyASCIIObject *)(op))->length)
     111             : #define _PyUnicode_DATA_ANY(op)                         \
     112             :     (((PyUnicodeObject*)(op))->data.any)
     113             : 
     114             : /* Optimized stand-in for Py_MAX(): the bitwise OR of two non-negative values is >= both of them (an upper bound, not necessarily the exact maximum);
     115             :    use it when you are computing the second argument of PyUnicode_New() */
     116             : #define MAX_MAXCHAR(maxchar1, maxchar2)                 \
     117             :     ((maxchar1) | (maxchar2))
     118             : 
     119             : #undef PyUnicode_READY
     120             : #define PyUnicode_READY(op)                             \
     121             :     (assert(_PyUnicode_CHECK(op)),                      \
     122             :      (PyUnicode_IS_READY(op) ?                          \
     123             :       0 :                                               \
     124             :       _PyUnicode_Ready(op)))
     125             : 
     126             : #define _PyUnicode_SHARE_UTF8(op)                       \
     127             :     (assert(_PyUnicode_CHECK(op)),                      \
     128             :      assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     129             :      (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
     130             : #define _PyUnicode_SHARE_WSTR(op)                       \
     131             :     (assert(_PyUnicode_CHECK(op)),                      \
     132             :      (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))  /* true if the wstr pointer of op is the same pointer as its character data */
     133             : 
     134             : /* true if the Unicode object has an allocated UTF-8 memory block
     135             :    (not shared with other data) */
     136             : #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
     137             :     (assert(_PyUnicode_CHECK(op)),                      \
     138             :      (!PyUnicode_IS_COMPACT_ASCII(op)                   \
     139             :       && _PyUnicode_UTF8(op)                            \
     140             :       && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
     141             : 
     142             : /* true if the Unicode object has an allocated wstr memory block
     143             :    (not shared with other data) */
     144             : #define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
     145             :     (assert(_PyUnicode_CHECK(op)),                      \
     146             :      (_PyUnicode_WSTR(op) &&                            \
     147             :       (!PyUnicode_IS_READY(op) ||                       \
     148             :        _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
     149             : 
     150             : /* Generic helper macro to convert characters of different types.
     151             :    from_type and to_type have to be valid type names, begin and end
     152             :    are pointers to the source characters which should be of type
     153             :    "from_type *".  to is a pointer of type "to_type *" and points to the
     154             :    buffer where the result characters are written to.  begin, end and to may be arbitrary expressions; each is evaluated exactly once.  The copy is unrolled four characters at a time, followed by a one-at-a-time tail loop. */
     155             : #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
     156             :     do {                                                \
     157             :         to_type *_to = (to_type *) (to);                \
     158             :         const from_type *_iter = (begin);               \
     159             :         const from_type *_end = (end);                  \
     160             :         Py_ssize_t _n = (_end) - (_iter);               \
     161             :         const from_type *_unrolled_end =                \
     162             :             _iter + (_n & ~ (Py_ssize_t) 3);            \
     163             :         while (_iter < (_unrolled_end)) {               \
     164             :             _to[0] = (to_type) _iter[0];                \
     165             :             _to[1] = (to_type) _iter[1];                \
     166             :             _to[2] = (to_type) _iter[2];                \
     167             :             _to[3] = (to_type) _iter[3];                \
     168             :             _iter += 4; _to += 4;                       \
     169             :         }                                               \
     170             :         while (_iter < (_end))                          \
     171             :             *_to++ = (to_type) *_iter++;                \
     172             :     } while (0)
     173             : 
     174             : /* This dictionary holds all interned unicode strings.  Note that references
     175             :    to strings in this dictionary are *not* counted in the string's ob_refcnt.
     176             :    When the interned string reaches a refcnt of 0 the string deallocation
     177             :    function will delete the reference from this dictionary.
     178             : 
     179             :    Another way to look at this is to say that the actual reference
     180             :    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
     181             : */
     182             : static PyObject *interned;
     183             : 
     184             : /* The empty Unicode object is shared to improve performance. */
     185             : static PyObject *unicode_empty;
     186             : 
     187             : /* List of static strings. */
     188             : static _Py_Identifier *static_strings;
     189             : 
     190             : /* Single character Unicode strings in the Latin-1 range are being
     191             :    shared as well. */
     192             : static PyObject *unicode_latin1[256];
     193             : 
     194             : /* Fast detection of the most frequent whitespace characters */
     195             : const unsigned char _Py_ascii_whitespace[] = {
     196             :     0, 0, 0, 0, 0, 0, 0, 0,
     197             : /*     case 0x0009: * CHARACTER TABULATION */
     198             : /*     case 0x000A: * LINE FEED */
     199             : /*     case 0x000B: * LINE TABULATION */
     200             : /*     case 0x000C: * FORM FEED */
     201             : /*     case 0x000D: * CARRIAGE RETURN */
     202             :     0, 1, 1, 1, 1, 1, 0, 0,
     203             :     0, 0, 0, 0, 0, 0, 0, 0,
     204             : /*     case 0x001C: * FILE SEPARATOR */
     205             : /*     case 0x001D: * GROUP SEPARATOR */
     206             : /*     case 0x001E: * RECORD SEPARATOR */
     207             : /*     case 0x001F: * UNIT SEPARATOR */
     208             :     0, 0, 0, 0, 1, 1, 1, 1,
     209             : /*     case 0x0020: * SPACE */
     210             :     1, 0, 0, 0, 0, 0, 0, 0,
     211             :     0, 0, 0, 0, 0, 0, 0, 0,
     212             :     0, 0, 0, 0, 0, 0, 0, 0,
     213             :     0, 0, 0, 0, 0, 0, 0, 0,
     214             : 
     215             :     0, 0, 0, 0, 0, 0, 0, 0,
     216             :     0, 0, 0, 0, 0, 0, 0, 0,
     217             :     0, 0, 0, 0, 0, 0, 0, 0,
     218             :     0, 0, 0, 0, 0, 0, 0, 0,
     219             :     0, 0, 0, 0, 0, 0, 0, 0,
     220             :     0, 0, 0, 0, 0, 0, 0, 0,
     221             :     0, 0, 0, 0, 0, 0, 0, 0,
     222             :     0, 0, 0, 0, 0, 0, 0, 0
     223             : };
     224             : 
     225             : /* forward */
     226             : static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
     227             : static PyObject* get_latin1_char(unsigned char ch);
     228             : static int unicode_modifiable(PyObject *unicode);
     229             : 
     230             : 
     231             : static PyObject *
     232             : _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
     233             : static PyObject *
     234             : _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
     235             : static PyObject *
     236             : _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
     237             : 
     238             : static PyObject *
     239             : unicode_encode_call_errorhandler(const char *errors,
     240             :        PyObject **errorHandler,const char *encoding, const char *reason,
     241             :        PyObject *unicode, PyObject **exceptionObject,
     242             :        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
     243             : 
     244             : static void
     245             : raise_encode_exception(PyObject **exceptionObject,
     246             :                        const char *encoding,
     247             :                        PyObject *unicode,
     248             :                        Py_ssize_t startpos, Py_ssize_t endpos,
     249             :                        const char *reason);
     250             : 
     251             : /* Fast detection of the ASCII line break characters (same layout as _Py_ascii_whitespace) */
     252             : static unsigned char ascii_linebreak[] = {
     253             :     0, 0, 0, 0, 0, 0, 0, 0,
     254             : /*         0x000A, * LINE FEED */
     255             : /*         0x000B, * LINE TABULATION */
     256             : /*         0x000C, * FORM FEED */
     257             : /*         0x000D, * CARRIAGE RETURN */
     258             :     0, 0, 1, 1, 1, 1, 0, 0,
     259             :     0, 0, 0, 0, 0, 0, 0, 0,
     260             : /*         0x001C, * FILE SEPARATOR */
     261             : /*         0x001D, * GROUP SEPARATOR */
     262             : /*         0x001E, * RECORD SEPARATOR */
     263             :     0, 0, 0, 0, 1, 1, 1, 0,
     264             :     0, 0, 0, 0, 0, 0, 0, 0,
     265             :     0, 0, 0, 0, 0, 0, 0, 0,
     266             :     0, 0, 0, 0, 0, 0, 0, 0,
     267             :     0, 0, 0, 0, 0, 0, 0, 0,
     268             : 
     269             :     0, 0, 0, 0, 0, 0, 0, 0,
     270             :     0, 0, 0, 0, 0, 0, 0, 0,
     271             :     0, 0, 0, 0, 0, 0, 0, 0,
     272             :     0, 0, 0, 0, 0, 0, 0, 0,
     273             :     0, 0, 0, 0, 0, 0, 0, 0,
     274             :     0, 0, 0, 0, 0, 0, 0, 0,
     275             :     0, 0, 0, 0, 0, 0, 0, 0,
     276             :     0, 0, 0, 0, 0, 0, 0, 0
     277             : };
     278             : 
     279             : /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
     280             :    This function is kept for backward compatibility with the old API; it returns 0x10FFFF when Py_UNICODE_WIDE is defined and 0xFFFF otherwise. */
     281             : Py_UNICODE
     282           0 : PyUnicode_GetMax(void)
     283             : {
     284             : #ifdef Py_UNICODE_WIDE
     285           0 :     return 0x10FFFF;
     286             : #else
     287             :     /* This is actually an illegal character, so it should
     288             :        not be passed to unichr. */
     289             :     return 0xFFFF;
     290             : #endif
     291             : }
     292             : 
     293             : #ifdef Py_DEBUG
     294             : int  /* Debug-build checker: asserts the internal invariants of the string object op and returns 1, so it can itself be wrapped in assert(). */
     295             : _PyUnicode_CheckConsistency(PyObject *op, int check_content)
     296             : {
     297             :     PyASCIIObject *ascii;
     298             :     unsigned int kind;
     299             : 
     300             :     assert(PyUnicode_Check(op));
     301             : 
     302             :     ascii = (PyASCIIObject *)op;
     303             :     kind = ascii->state.kind;
     304             : 
     305             :     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {  /* compact ASCII object */
     306             :         assert(kind == PyUnicode_1BYTE_KIND);
     307             :         assert(ascii->state.ready == 1);
     308             :     }
     309             :     else {
     310             :         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
     311             :         void *data;
     312             : 
     313             :         if (ascii->state.compact == 1) {  /* compact, non-ASCII: character data starts right after the struct */
     314             :             data = compact + 1;
     315             :             assert(kind == PyUnicode_1BYTE_KIND
     316             :                    || kind == PyUnicode_2BYTE_KIND
     317             :                    || kind == PyUnicode_4BYTE_KIND);
     318             :             assert(ascii->state.ascii == 0);
     319             :             assert(ascii->state.ready == 1);
     320             :             assert (compact->utf8 != data);
     321             :         }
     322             :         else {  /* non-compact: character data is reached through data.any */
     323             :             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
     324             : 
     325             :             data = unicode->data.any;
     326             :             if (kind == PyUnicode_WCHAR_KIND) {  /* not ready yet: only wstr is set, everything else must be in its initial state */
     327             :                 assert(ascii->length == 0);
     328             :                 assert(ascii->hash == -1);
     329             :                 assert(ascii->state.compact == 0);
     330             :                 assert(ascii->state.ascii == 0);
     331             :                 assert(ascii->state.ready == 0);
     332             :                 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
     333             :                 assert(ascii->wstr != NULL);
     334             :                 assert(data == NULL);
     335             :                 assert(compact->utf8 == NULL);
     336             :             }
     337             :             else {
     338             :                 assert(kind == PyUnicode_1BYTE_KIND
     339             :                        || kind == PyUnicode_2BYTE_KIND
     340             :                        || kind == PyUnicode_4BYTE_KIND);
     341             :                 assert(ascii->state.compact == 0);
     342             :                 assert(ascii->state.ready == 1);
     343             :                 assert(data != NULL);
     344             :                 if (ascii->state.ascii) {  /* ASCII data doubles as the UTF-8 representation */
     345             :                     assert (compact->utf8 == data);
     346             :                     assert (compact->utf8_length == ascii->length);
     347             :                 }
     348             :                 else
     349             :                     assert (compact->utf8 != data);
     350             :             }
     351             :         }
     352             :         if (kind != PyUnicode_WCHAR_KIND) {  /* wstr must alias the data exactly when the kind matches sizeof(wchar_t) */
     353             :             if (
     354             : #if SIZEOF_WCHAR_T == 2
     355             :                 kind == PyUnicode_2BYTE_KIND
     356             : #else
     357             :                 kind == PyUnicode_4BYTE_KIND
     358             : #endif
     359             :                )
     360             :             {
     361             :                 assert(ascii->wstr == data);
     362             :                 assert(compact->wstr_length == ascii->length);
     363             :             } else
     364             :                 assert(ascii->wstr != data);
     365             :         }
     366             : 
     367             :         if (compact->utf8 == NULL)
     368             :             assert(compact->utf8_length == 0);
     369             :         if (ascii->wstr == NULL)
     370             :             assert(compact->wstr_length == 0);
     371             :     }
     372             :     /* check that the best kind is used: only done when check_content is
     373             :        nonzero, because it reads every character to find the largest one */
     374             :     {
     375             :         Py_ssize_t i;
     376             :         Py_UCS4 maxchar = 0;
     377             :         void *data;
     378             :         Py_UCS4 ch;
     379             : 
     380             :         data = PyUnicode_DATA(ascii);
     381             :         for (i=0; i < ascii->length; i++)
     382             :         {
     383             :             ch = PyUnicode_READ(kind, data, i);
     384             :             if (ch > maxchar)
     385             :                 maxchar = ch;
     386             :         }
     387             :         if (kind == PyUnicode_1BYTE_KIND) {
     388             :             if (ascii->state.ascii == 0) {
     389             :                 assert(maxchar >= 128);
     390             :                 assert(maxchar <= 255);
     391             :             }
     392             :             else
     393             :                 assert(maxchar < 128);
     394             :         }
     395             :         else if (kind == PyUnicode_2BYTE_KIND) {
     396             :             assert(maxchar >= 0x100);
     397             :             assert(maxchar <= 0xFFFF);
     398             :         }
     399             :         else {
     400             :             assert(maxchar >= 0x10000);
     401             :             assert(maxchar <= MAX_UNICODE);
     402             :         }
     403             :         assert(PyUnicode_READ(kind, data, ascii->length) == 0);  /* the element one past the end must be 0 */
     404             :     }
     405             :     return 1;
     406             : }
     407             : #endif
     408             : 
     409             : static PyObject*  /* Finalize a not-yet-ready (wstr) result; the caller's reference to unicode is either returned or released. */
     410           0 : unicode_result_wchar(PyObject *unicode)
     411             : {
     412             : #ifndef Py_DEBUG
     413             :     Py_ssize_t len;
     414             : 
     415             :     assert(Py_REFCNT(unicode) == 1);
     416             : 
     417           0 :     len = _PyUnicode_WSTR_LENGTH(unicode);
     418           0 :     if (len == 0) {  /* empty: hand back the shared empty string instead */
     419           0 :         Py_INCREF(unicode_empty);
     420           0 :         Py_DECREF(unicode);
     421           0 :         return unicode_empty;
     422             :     }
     423             : 
     424           0 :     if (len == 1) {
     425           0 :         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
     426           0 :         if (ch < 256) {  /* single Latin-1 character: replace with the object from get_latin1_char() */
     427           0 :             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
     428           0 :             Py_DECREF(unicode);
     429           0 :             return latin1_char;
     430             :         }
     431             :     }
     432             : 
     433           0 :     if (_PyUnicode_Ready(unicode) < 0) {  /* readying failed: drop the object and report NULL */
     434           0 :         Py_XDECREF(unicode);
     435           0 :         return NULL;
     436             :     }
     437             : #else
     438             :     /* don't make the result ready in debug mode to ensure that the caller
     439             :        makes the string ready before using it */
     440             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
     441             : #endif
     442           0 :     return unicode;
     443             : }
     444             : 
     445             : static PyObject*  /* Canonicalize a ready result: empty strings and single Latin-1 characters are mapped onto the shared objects. */
     446        3396 : unicode_result_ready(PyObject *unicode)
     447             : {
     448             :     Py_ssize_t length;
     449             : 
     450        3396 :     length = PyUnicode_GET_LENGTH(unicode);
     451        3396 :     if (length == 0) {
     452           0 :         if (unicode != unicode_empty) {  /* swap the caller's reference for one on the shared empty string */
     453           0 :             Py_INCREF(unicode_empty);
     454           0 :             Py_DECREF(unicode);
     455             :         }
     456           0 :         return unicode_empty;
     457             :     }
     458             : 
     459        3396 :     if (length == 1) {
     460           7 :         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
     461           7 :         if (ch < 256) {  /* Latin-1 range: served from the unicode_latin1[] cache */
     462           7 :             PyObject *latin1_char = unicode_latin1[ch];
     463           7 :             if (latin1_char != NULL) {
     464           7 :                 if (unicode != latin1_char) {  /* swap the caller's reference for one on the cached object */
     465           7 :                     Py_INCREF(latin1_char);
     466           7 :                     Py_DECREF(unicode);
     467             :                 }
     468           7 :                 return latin1_char;
     469             :             }
     470             :             else {  /* nothing cached for this character yet: this object becomes the cache entry */
     471             :                 assert(_PyUnicode_CheckConsistency(unicode, 1));
     472           0 :                 Py_INCREF(unicode);  /* extra reference held by unicode_latin1[] */
     473           0 :                 unicode_latin1[ch] = unicode;
     474           0 :                 return unicode;
     475             :             }
     476             :         }
     477             :     }
     478             : 
     479             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
     480        3389 :     return unicode;
     481             : }
     482             : 
     483             : static PyObject*  /* Normalize a freshly built result by dispatching on whether it is already in ready state. */
     484        3396 : unicode_result(PyObject *unicode)
     485             : {
     486             :     assert(_PyUnicode_CHECK(unicode));
     487        3396 :     if (PyUnicode_IS_READY(unicode))
     488        3396 :         return unicode_result_ready(unicode);
     489             :     else
     490           0 :         return unicode_result_wchar(unicode);
     491             : }
     492             : 
     493             : static PyObject*  /* Return unicode itself with an added reference when it is an exact str, otherwise the result of _PyUnicode_Copy(). */
     494         128 : unicode_result_unchanged(PyObject *unicode)
     495             : {
     496         128 :     if (PyUnicode_CheckExact(unicode)) {
     497         128 :         if (PyUnicode_READY(unicode) == -1)  /* -1 from PyUnicode_READY is reported to the caller as NULL */
     498           0 :             return NULL;
     499         128 :         Py_INCREF(unicode);
     500         128 :         return unicode;
     501             :     }
     502             :     else
     503             :         /* Subtype -- return genuine unicode string with the same value. */
     504           0 :         return _PyUnicode_Copy(unicode);
     505             : }
     506             : 
     507             : #ifdef HAVE_MBCS
     508             : static OSVERSIONINFOEX winver;
     509             : #endif
     510             : 
     511             : /* --- Bloom Filters ----------------------------------------------------- */
     512             : 
     513             : /* stuff to implement simple "bloom filters" for Unicode characters.
     514             :    to keep things simple, we use a single bitmask, using the low
     515             :    log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */
     516             : 
     517             : /* the linebreak mask is set up by Unicode_Init below */
     518             : 
     519             : #if LONG_BIT >= 128
     520             : #define BLOOM_WIDTH 128
     521             : #elif LONG_BIT >= 64
     522             : #define BLOOM_WIDTH 64
     523             : #elif LONG_BIT >= 32
     524             : #define BLOOM_WIDTH 32
     525             : #else
     526             : #error "LONG_BIT is smaller than 32"
     527             : #endif
     528             : 
     529             : #define BLOOM_MASK unsigned long
     530             : 
     531             : static BLOOM_MASK bloom_linebreak;
     532             : 
     533             : #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
     534             : #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
     535             : 
     536             : #define BLOOM_LINEBREAK(ch)                                             \
     537             :     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     538             :      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
     539             : 
     540             : Py_LOCAL_INLINE(BLOOM_MASK)
     541           4 : make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
     542             : {
     543             :     /* calculate simple bloom-style bitmask for a given unicode string: kind and ptr are handed to PyUnicode_READ() to fetch each of the len characters, and every character sets bit (ch & (BLOOM_WIDTH - 1)) of the result */
     544             : 
     545             :     BLOOM_MASK mask;
     546             :     Py_ssize_t i;
     547             : 
     548           4 :     mask = 0;  /* len == 0 therefore yields an empty mask */
     549          15 :     for (i = 0; i < len; i++)
     550          11 :         BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
     551             : 
     552           4 :     return mask;
     553             : }
     554             : 
     555             : #define BLOOM_MEMBER(mask, chr, str) \
     556             :     (BLOOM(mask, chr) \
     557             :      && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
     558             : 
     559             : /* Compilation of templated routines */
     560             : 
     561             : #include "stringlib/asciilib.h"
     562             : #include "stringlib/fastsearch.h"
     563             : #include "stringlib/partition.h"
     564             : #include "stringlib/split.h"
     565             : #include "stringlib/count.h"
     566             : #include "stringlib/find.h"
     567             : #include "stringlib/find_max_char.h"
     568             : #include "stringlib/localeutil.h"
     569             : #include "stringlib/undef.h"
     570             : 
     571             : #include "stringlib/ucs1lib.h"
     572             : #include "stringlib/fastsearch.h"
     573             : #include "stringlib/partition.h"
     574             : #include "stringlib/split.h"
     575             : #include "stringlib/count.h"
     576             : #include "stringlib/find.h"
     577             : #include "stringlib/find_max_char.h"
     578             : #include "stringlib/localeutil.h"
     579             : #include "stringlib/undef.h"
     580             : 
     581             : #include "stringlib/ucs2lib.h"
     582             : #include "stringlib/fastsearch.h"
     583             : #include "stringlib/partition.h"
     584             : #include "stringlib/split.h"
     585             : #include "stringlib/count.h"
     586             : #include "stringlib/find.h"
     587             : #include "stringlib/find_max_char.h"
     588             : #include "stringlib/localeutil.h"
     589             : #include "stringlib/undef.h"
     590             : 
     591             : #include "stringlib/ucs4lib.h"
     592             : #include "stringlib/fastsearch.h"
     593             : #include "stringlib/partition.h"
     594             : #include "stringlib/split.h"
     595             : #include "stringlib/count.h"
     596             : #include "stringlib/find.h"
     597             : #include "stringlib/find_max_char.h"
     598             : #include "stringlib/localeutil.h"
     599             : #include "stringlib/undef.h"
     600             : 
     601             : #include "stringlib/unicodedefs.h"
     602             : #include "stringlib/fastsearch.h"
     603             : #include "stringlib/count.h"
     604             : #include "stringlib/find.h"
     605             : #include "stringlib/undef.h"
     606             : 
     607             : /* --- Unicode Object ----------------------------------------------------- */
     608             : 
                       /* Forward declaration; the definition is not part of this section.
                          fixfct is a callback that receives a PyObject * and returns a
                          Py_UCS4. */
     609             : static PyObject *
     610             : fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
     611             : 
                       /* Look for the single code point ch in the buffer s, which holds
                          size characters of the given kind.  direction == 1 selects
                          FAST_SEARCH, any other value FAST_RSEARCH.

                          Returns whatever the kind-specific *_fastsearch() routine returns
                          (presumably the index of the match, or -1 when there is none --
                          see stringlib/fastsearch.h), and -1 without searching when ch does
                          not fit into the character width of the buffer. */
     612          66 : Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
     613             :                                      Py_ssize_t size, Py_UCS4 ch,
     614             :                                      int direction)
     615             : {
     616          66 :     int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
     617             : 
     618          66 :     switch (kind) {
     619             :     case PyUnicode_1BYTE_KIND:
     620             :         {
     621          66 :             Py_UCS1 ch1 = (Py_UCS1) ch;
                                   /* the narrowed value equals ch only if ch fits in
                                      a Py_UCS1 */
     622          66 :             if (ch1 == ch)
     623          66 :                 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
     624             :             else
     625           0 :                 return -1;
     626             :         }
     627             :     case PyUnicode_2BYTE_KIND:
     628             :         {
     629           0 :             Py_UCS2 ch2 = (Py_UCS2) ch;
                                   /* same test for the Py_UCS2 width */
     630           0 :             if (ch2 == ch)
     631           0 :                 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
     632             :             else
     633           0 :                 return -1;
     634             :         }
     635             :     case PyUnicode_4BYTE_KIND:
     636           0 :         return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
     637             :     default:
                           /* unknown kind: asserts, and returns -1 when asserts are
                              compiled out */
     638             :         assert(0);
     639           0 :         return -1;
     640             :     }
     641             : }
     642             : 
                       /* Resize a compact string by reallocating the whole object: the
                          header struct and the character data form a single allocation of
                          struct_size + (length + 1) * char_size bytes.

                          unicode must be modifiable, ready and compact (asserted).  length
                          is the new number of characters; one extra terminating character
                          is always included.

                          Returns the object to use from now on -- its address may differ
                          from the one passed in -- or NULL with MemoryError set.  When
                          PyObject_REALLOC() fails, the original object is registered again
                          with _Py_NewReference() and is not released by this function.

                          NOTE(review): unlike resize_inplace(), nothing here frees or
                          resets a separately allocated utf8 buffer; confirm that callers
                          guarantee there is none, otherwise it would describe the old
                          contents. */
     643             : static PyObject*
     644         548 : resize_compact(PyObject *unicode, Py_ssize_t length)
     645             : {
     646             :     Py_ssize_t char_size;
     647             :     Py_ssize_t struct_size;
     648             :     Py_ssize_t new_size;
     649             :     int share_wstr;
     650             :     PyObject *new_unicode;
     651             :     assert(unicode_modifiable(unicode));
     652             :     assert(PyUnicode_IS_READY(unicode));
     653             :     assert(PyUnicode_IS_COMPACT(unicode));
     654             : 
                           /* the kind value is used directly as the byte width of one
                              character */
     655         548 :     char_size = PyUnicode_KIND(unicode);
     656         548 :     if (PyUnicode_IS_ASCII(unicode))
     657         461 :         struct_size = sizeof(PyASCIIObject);
     658             :     else
     659          87 :         struct_size = sizeof(PyCompactUnicodeObject);
     660         548 :     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
     661             : 
                           /* reject lengths for which new_size below would exceed
                              PY_SSIZE_T_MAX */
     662         548 :     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
     663           0 :         PyErr_NoMemory();
     664           0 :         return NULL;
     665             :     }
     666         548 :     new_size = (struct_size + (length + 1) * char_size);
     667             : 
                           /* Take the object out of the reference bookkeeping before the
                              realloc; it is put back with _Py_NewReference() afterwards,
                              whether or not the realloc succeeded (presumably this only
                              matters in reference-tracing builds -- TODO confirm). */
     668             :     _Py_DEC_REFTOTAL;
     669             :     _Py_ForgetReference(unicode);
     670             : 
     671         548 :     new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
     672         548 :     if (new_unicode == NULL) {
     673           0 :         _Py_NewReference(unicode);
     674           0 :         PyErr_NoMemory();
     675           0 :         return NULL;
     676             :     }
     677         548 :     unicode = new_unicode;
     678         548 :     _Py_NewReference(unicode);
     679             : 
     680         548 :     _PyUnicode_LENGTH(unicode) = length;
                           /* when wstr was shared before the realloc, point it at the
                              current character data again */
     681         548 :     if (share_wstr) {
     682           0 :         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
     683           0 :         if (!PyUnicode_IS_ASCII(unicode))
     684           0 :             _PyUnicode_WSTR_LENGTH(unicode) = length;
     685             :     }
                           /* write the terminating character at index length */
     686         548 :     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
     687             :                     length, 0);
     688             :     assert(_PyUnicode_CheckConsistency(unicode, 0));
     689         548 :     return unicode;
     690             : }
     691             : 
                       /* Resize a non-compact string.  Its buffers are reallocated
                          separately from the object, so the object itself stays where it
                          is.

                          unicode must be non-compact and have a reference count of 1
                          (asserted).  length is the new number of characters; each buffer
                          is sized for length + 1 elements and terminated with 0.

                          A ready string gets its character buffer reallocated; if its wstr
                          is shared with that buffer or is NULL, the function is done.  In
                          the remaining cases (string not ready, or ready with a separate
                          wstr buffer) the wstr buffer is reallocated as well.

                          Returns 0 on success, -1 with MemoryError set on failure. */
     692             : static int
     693           0 : resize_inplace(PyObject *unicode, Py_ssize_t length)
     694             : {
     695             :     wchar_t *wstr;
     696             :     Py_ssize_t new_size;
     697             :     assert(!PyUnicode_IS_COMPACT(unicode));
     698             :     assert(Py_REFCNT(unicode) == 1);
     699             : 
     700           0 :     if (PyUnicode_IS_READY(unicode)) {
     701             :         Py_ssize_t char_size;
     702             :         int share_wstr, share_utf8;
     703             :         void *data;
     704             : 
     705           0 :         data = _PyUnicode_DATA_ANY(unicode);
     706           0 :         char_size = PyUnicode_KIND(unicode);
     707           0 :         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
     708           0 :         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
     709             : 
                               /* reject lengths for which (length + 1) * char_size would
                                  exceed PY_SSIZE_T_MAX */
     710           0 :         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
     711           0 :             PyErr_NoMemory();
     712           0 :             return -1;
     713             :         }
     714           0 :         new_size = (length + 1) * char_size;
     715             : 
                               /* a UTF-8 buffer with memory of its own is dropped rather
                                  than resized */
     716           0 :         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
     717             :         {
     718           0 :             PyObject_DEL(_PyUnicode_UTF8(unicode));
     719           0 :             _PyUnicode_UTF8(unicode) = NULL;
     720           0 :             _PyUnicode_UTF8_LENGTH(unicode) = 0;
     721             :         }
     722             : 
                               /* NOTE(review): the (PyObject *) cast is misleading; data
                                  is a void * character buffer, not an object.  On failure
                                  only the local variable is overwritten; the pointer
                                  stored in the object is left as it was. */
     723           0 :         data = (PyObject *)PyObject_REALLOC(data, new_size);
     724           0 :         if (data == NULL) {
     725           0 :             PyErr_NoMemory();
     726           0 :             return -1;
     727             :         }
     728           0 :         _PyUnicode_DATA_ANY(unicode) = data;
                               /* pointers that were shared with the data follow it to its
                                  new location */
     729           0 :         if (share_wstr) {
     730           0 :             _PyUnicode_WSTR(unicode) = data;
     731           0 :             _PyUnicode_WSTR_LENGTH(unicode) = length;
     732             :         }
     733           0 :         if (share_utf8) {
     734           0 :             _PyUnicode_UTF8(unicode) = data;
     735           0 :             _PyUnicode_UTF8_LENGTH(unicode) = length;
     736             :         }
     737           0 :         _PyUnicode_LENGTH(unicode) = length;
     738           0 :         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
                               /* no separate wstr buffer left to resize: done */
     739           0 :         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
     740             :             assert(_PyUnicode_CheckConsistency(unicode, 0));
     741           0 :             return 0;
     742             :         }
     743             :     }
     744             :     assert(_PyUnicode_WSTR(unicode) != NULL);
     745             : 
     746             :     /* check for integer overflow */
                           /* NOTE(review): sizeof makes the right-hand side unsigned, so
                              length is presumably converted to unsigned for this
                              comparison -- confirm the intended handling of negative
                              values. */
     747           0 :     if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
     748           0 :         PyErr_NoMemory();
     749           0 :         return -1;
     750             :     }
     751           0 :     new_size = sizeof(wchar_t) * (length + 1);
     752           0 :     wstr =  _PyUnicode_WSTR(unicode);
     753           0 :     wstr = PyObject_REALLOC(wstr, new_size);
     754           0 :     if (!wstr) {
     755           0 :         PyErr_NoMemory();
     756           0 :         return -1;
     757             :     }
     758           0 :     _PyUnicode_WSTR(unicode) = wstr;
     759           0 :     _PyUnicode_WSTR(unicode)[length] = 0;
     760           0 :     _PyUnicode_WSTR_LENGTH(unicode) = length;
     761             :     assert(_PyUnicode_CheckConsistency(unicode, 0));
     762           0 :     return 0;
     763             : }
     764             : 
                       /* Build a new string of `length` characters that starts with the
                          first min(length, current length) characters of unicode.  The
                          argument is not released here and, apart from possibly being made
                          ready, not modified.

                          A string that is not of the WCHAR kind is made ready and copied
                          into a PyUnicode_New() object created with the same maximum
                          character value.  A WCHAR kind string is copied wstr-to-wstr into
                          a _PyUnicode_New() object.  Characters after the copied prefix
                          are not written by this function.

                          Returns the new object, or NULL if PyUnicode_READY() or the
                          allocation fails. */
     765             : static PyObject*
     766           0 : resize_copy(PyObject *unicode, Py_ssize_t length)
     767             : {
     768             :     Py_ssize_t copy_length;
     769           0 :     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
     770             :         PyObject *copy;
     771             : 
     772           0 :         if (PyUnicode_READY(unicode) == -1)
     773           0 :             return NULL;
     774             : 
     775           0 :         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
     776           0 :         if (copy == NULL)
     777           0 :             return NULL;
     778             : 
                               /* never copy more than either string can hold */
     779           0 :         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
     780           0 :         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
     781           0 :         return copy;
     782             :     }
     783             :     else {
     784             :         PyObject *w;
     785             : 
     786           0 :         w = (PyObject*)_PyUnicode_New(length);
     787           0 :         if (w == NULL)
     788           0 :             return NULL;
     789           0 :         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
     790           0 :         copy_length = Py_MIN(copy_length, length);
     791           0 :         Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
     792             :                         copy_length);
     793           0 :         return w;
     794             :     }
     795             : }
     796             : 
     797             : /* We allocate one extra Py_UNICODE element to make sure the
     798             :    string is always U+0000 terminated; some code (e.g.
     799             :    new_identifier) relies on that.
     800             : 
     801             :    XXX This allocator could further be enhanced by assuring that the
     802             :    free list never reduces its size below 1.
     803             : 
     804             : */
     805             : 
                       /* Create a non-compact, not-ready string object that owns a wstr
                          buffer of length + 1 Py_UNICODE elements.

                          Only the first and the terminating element of the buffer are
                          initialised; the caller fills in the rest.  Returns the new
                          object, or NULL with an exception set.  length == 0 returns a new
                          reference to the shared unicode_empty object once that exists.

                          NOTE(review): the overflow test runs before the length < 0 test
                          and its right-hand side is unsigned (sizeof), so a negative length
                          is presumably converted to a huge value and reported as
                          MemoryError, leaving the SystemError branch unreachable --
                          confirm, and consider testing for a negative length first.

                          NOTE(review): when PyObject_MALLOC() fails, Py_DECREF() is applied
                          to an object whose string fields (other than wstr) have not been
                          assigned yet; verify that the deallocator tolerates this, or
                          assign the fields before allocating the buffer. */
     806             : static PyUnicodeObject *
     807           2 : _PyUnicode_New(Py_ssize_t length)
     808             : {
     809             :     register PyUnicodeObject *unicode;
     810             :     size_t new_size;
     811             : 
     812             :     /* Optimization for empty strings */
     813           2 :     if (length == 0 && unicode_empty != NULL) {
     814           2 :         Py_INCREF(unicode_empty);
     815           2 :         return (PyUnicodeObject*)unicode_empty;
     816             :     }
     817             : 
     818             :     /* Ensure we won't overflow the size. */
     819           0 :     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
     820           0 :         return (PyUnicodeObject *)PyErr_NoMemory();
     821             :     }
     822           0 :     if (length < 0) {
     823           0 :         PyErr_SetString(PyExc_SystemError,
     824             :                         "Negative size passed to _PyUnicode_New");
     825           0 :         return NULL;
     826             :     }
     827             : 
     828           0 :     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
     829           0 :     if (unicode == NULL)
     830           0 :         return NULL;
                           /* buffer size in bytes, including the terminating element */
     831           0 :     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
     832           0 :     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
     833           0 :     if (!_PyUnicode_WSTR(unicode)) {
     834           0 :         Py_DECREF(unicode);
     835           0 :         PyErr_NoMemory();
     836           0 :         return NULL;
     837             :     }
     838             : 
     839             :     /* Initialize the first element to guard against cases where
     840             :      * the caller fails before initializing str -- unicode_resize()
     841             :      * reads str[0], and the Keep-Alive optimization can keep memory
     842             :      * allocated for str alive across a call to unicode_dealloc(unicode).
     843             :      * We don't want unicode_resize to read uninitialized memory in
     844             :      * that case.
     845             :      */
     846           0 :     _PyUnicode_WSTR(unicode)[0] = 0;
     847           0 :     _PyUnicode_WSTR(unicode)[length] = 0;
     848           0 :     _PyUnicode_WSTR_LENGTH(unicode) = length;
                           /* Everything except the wstr buffer starts out empty: hash
                              unset (-1), all state bits 0, no character data and no
                              UTF-8 buffer. */
     849           0 :     _PyUnicode_HASH(unicode) = -1;
     850           0 :     _PyUnicode_STATE(unicode).interned = 0;
     851           0 :     _PyUnicode_STATE(unicode).kind = 0;
     852           0 :     _PyUnicode_STATE(unicode).compact = 0;
     853           0 :     _PyUnicode_STATE(unicode).ready = 0;
     854           0 :     _PyUnicode_STATE(unicode).ascii = 0;
     855           0 :     _PyUnicode_DATA_ANY(unicode) = NULL;
     856           0 :     _PyUnicode_LENGTH(unicode) = 0;
     857           0 :     _PyUnicode_UTF8(unicode) = NULL;
     858           0 :     _PyUnicode_UTF8_LENGTH(unicode) = 0;
     859             :     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
     860           0 :     return unicode;
     861             : }
     862             : 
                       /* Return a string literal naming the representation of unicode.

                          Non-compact objects yield "wstr" when not ready, otherwise
                          "legacy ascii", "legacy latin1", "legacy UCS2" or "legacy UCS4";
                          compact objects yield "ascii", "latin1", "UCS2" or "UCS4".  An
                          unexpected kind yields one of the "<... invalid ...>" strings
                          instead of failing.  The returned pointer refers to a literal and
                          must not be freed. */
     863             : static const char*
     864           0 : unicode_kind_name(PyObject *unicode)
     865             : {
     866             :     /* don't check consistency: unicode_kind_name() is called from
     867             :        _PyUnicode_Dump() */
     868           0 :     if (!PyUnicode_IS_COMPACT(unicode))
     869             :     {
     870           0 :         if (!PyUnicode_IS_READY(unicode))
     871           0 :             return "wstr";
     872           0 :         switch (PyUnicode_KIND(unicode))
     873             :         {
     874             :         case PyUnicode_1BYTE_KIND:
     875           0 :             if (PyUnicode_IS_ASCII(unicode))
     876           0 :                 return "legacy ascii";
     877             :             else
     878           0 :                 return "legacy latin1";
     879             :         case PyUnicode_2BYTE_KIND:
     880           0 :             return "legacy UCS2";
     881             :         case PyUnicode_4BYTE_KIND:
     882           0 :             return "legacy UCS4";
     883             :         default:
     884           0 :             return "<legacy invalid kind>";
     885             :         }
     886             :     }
                           /* compact objects are expected to be ready */
     887             :     assert(PyUnicode_IS_READY(unicode));
     888           0 :     switch (PyUnicode_KIND(unicode)) {
     889             :     case PyUnicode_1BYTE_KIND:
     890           0 :         if (PyUnicode_IS_ASCII(unicode))
     891           0 :             return "ascii";
     892             :         else
     893           0 :             return "latin1";
     894             :     case PyUnicode_2BYTE_KIND:
     895           0 :         return "UCS2";
     896             :     case PyUnicode_4BYTE_KIND:
     897           0 :         return "UCS4";
     898             :     default:
     899           0 :         return "<invalid compact kind>";
     900             :     }
     901             : }
     902             : 
     903             : #ifdef Py_DEBUG
     904             : /* Functions wrapping macros so that they can be called from a debugger;
                          they are only compiled when Py_DEBUG is defined. */
                       /* Return the value of the PyUnicode_UTF8() macro for unicode. */
     905             : char *_PyUnicode_utf8(void *unicode){
     906             :     return PyUnicode_UTF8(unicode);
     907             : }
     908             : 
                       /* Return the value of the _PyUnicode_COMPACT_DATA() macro for
                          unicode. */
     909             : void *_PyUnicode_compact_data(void *unicode) {
     910             :     return _PyUnicode_COMPACT_DATA(unicode);
     911             : }
                       /* Print the object address, its compact / compact-ASCII flags and
                          the candidate data addresses (just past a PyASCIIObject header,
                          just past a PyCompactUnicodeObject header, and the
                          _PyUnicode_COMPACT_DATA() value) to stdout, then return
                          PyUnicode_DATA(unicode). */
     912             : void *_PyUnicode_data(void *unicode){
     913             :     printf("obj %p\n", unicode);
     914             :     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
     915             :     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
     916             :     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
     917             :     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
     918             :     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
     919             :     return PyUnicode_DATA(unicode);
     920             : }
     921             : 
                       /* Print a one-line description of the string object op to stdout:
                          representation name, length, the wstr pointer (prefixed with
                          "shared" when it equals the data pointer) and the data pointer.
                          Unless the object is compact ASCII, the wstr length and the utf8
                          pointer and length are printed too.  Only compiled when Py_DEBUG
                          is defined.

                          NOTE(review): %zu expects a size_t argument; confirm the types of
                          length, wstr_length and utf8_length -- if they are signed
                          (Py_ssize_t), the conversion specifier does not match. */
     922             : void
     923             : _PyUnicode_Dump(PyObject *op)
     924             : {
                           /* three views of the same object, one per struct layout */
     925             :     PyASCIIObject *ascii = (PyASCIIObject *)op;
     926             :     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
     927             :     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
     928             :     void *data;
     929             : 
                           /* compact objects keep the characters right after their header
                              struct; other objects store a pointer to them */
     930             :     if (ascii->state.compact)
     931             :     {
     932             :         if (ascii->state.ascii)
     933             :             data = (ascii + 1);
     934             :         else
     935             :             data = (compact + 1);
     936             :     }
     937             :     else
     938             :         data = unicode->data.any;
     939             :     printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
     940             : 
     941             :     if (ascii->wstr == data)
     942             :         printf("shared ");
     943             :     printf("wstr=%p", ascii->wstr);
     944             : 
                           /* the fields read through `compact` are skipped for compact
                              ASCII objects */
     945             :     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
     946             :         printf(" (%zu), ", compact->wstr_length);
     947             :         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
     948             :             printf("shared ");
     949             :         printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
     950             :     }
     951             :     printf(", data=%p\n", data);
     952             : }
     953             : #endif
     954             : 
                       /* Create a compact, ready string object with room for size
                          characters plus a terminator, using the narrowest storage that
                          can hold maxchar:

                            maxchar < 128     1-byte kind, ASCII, PyASCIIObject header
                            maxchar < 256     1-byte kind
                            maxchar < 65536   2-byte kind
                            otherwise         4-byte kind (maxchar > MAX_UNICODE is
                                              rejected)

                          Only the terminating character is written; the size characters
                          themselves are left for the caller to fill in (Py_DEBUG builds
                          pre-fill them with 0xff bytes).

                          Returns the new object, or NULL with SystemError (negative size,
                          invalid maxchar) or MemoryError set.  size == 0 returns a new
                          reference to the shared unicode_empty object once that exists,
                          whatever maxchar is. */
     955             : PyObject *
     956       51351 : PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     957             : {
     958             :     PyObject *obj;
     959             :     PyCompactUnicodeObject *unicode;
     960             :     void *data;
     961             :     enum PyUnicode_Kind kind;
     962             :     int is_sharing, is_ascii;
     963             :     Py_ssize_t char_size;
     964             :     Py_ssize_t struct_size;
     965             : 
     966             :     /* Optimization for empty strings */
     967       51351 :     if (size == 0 && unicode_empty != NULL) {
     968          96 :         Py_INCREF(unicode_empty);
     969          96 :         return unicode_empty;
     970             :     }
     971             : 
                           /* Pick the kind, the bytes per character and the header struct
                              from maxchar.  is_sharing is set when wchar_t has the same
                              width as the characters, in which case wstr is made to point
                              at the character data further down. */
     972       51255 :     is_ascii = 0;
     973       51255 :     is_sharing = 0;
     974       51255 :     struct_size = sizeof(PyCompactUnicodeObject);
     975       51255 :     if (maxchar < 128) {
     976       50791 :         kind = PyUnicode_1BYTE_KIND;
     977       50791 :         char_size = 1;
     978       50791 :         is_ascii = 1;
     979       50791 :         struct_size = sizeof(PyASCIIObject);
     980             :     }
     981         464 :     else if (maxchar < 256) {
     982          14 :         kind = PyUnicode_1BYTE_KIND;
     983          14 :         char_size = 1;
     984             :     }
     985         450 :     else if (maxchar < 65536) {
     986         450 :         kind = PyUnicode_2BYTE_KIND;
     987         450 :         char_size = 2;
     988             :         if (sizeof(wchar_t) == 2)
     989             :             is_sharing = 1;
     990             :     }
     991             :     else {
     992           0 :         if (maxchar > MAX_UNICODE) {
     993           0 :             PyErr_SetString(PyExc_SystemError,
     994             :                             "invalid maximum character passed to PyUnicode_New");
     995           0 :             return NULL;
     996             :         }
     997           0 :         kind = PyUnicode_4BYTE_KIND;
     998           0 :         char_size = 4;
     999             :         if (sizeof(wchar_t) == 4)
    1000           0 :             is_sharing = 1;
    1001             :     }
    1002             : 
    1003             :     /* Ensure we won't overflow the size. */
    1004       51255 :     if (size < 0) {
    1005           0 :         PyErr_SetString(PyExc_SystemError,
    1006             :                         "Negative size passed to PyUnicode_New");
    1007           0 :         return NULL;
    1008             :     }
                           /* struct_size + (size + 1) * char_size must not exceed
                              PY_SSIZE_T_MAX */
    1009       51255 :     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
    1010           0 :         return PyErr_NoMemory();
    1011             : 
    1012             :     /* Duplicated allocation code from _PyObject_New() instead of a call to
    1013             :      * PyObject_New() so we are able to allocate space for the object and
    1014             :      * its data buffer.
    1015             :      */
    1016       51255 :     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
    1017       51255 :     if (obj == NULL)
    1018           0 :         return PyErr_NoMemory();
    1019       51255 :     obj = PyObject_INIT(obj, &PyUnicode_Type);
    1020       51255 :     if (obj == NULL)
    1021           0 :         return NULL;
    1022             : 
                           /* the characters start right after the header struct chosen
                              above */
    1023       51255 :     unicode = (PyCompactUnicodeObject *)obj;
    1024       51255 :     if (is_ascii)
    1025       50791 :         data = ((PyASCIIObject*)obj) + 1;
    1026             :     else
    1027         464 :         data = unicode + 1;
    1028       51255 :     _PyUnicode_LENGTH(unicode) = size;
    1029       51255 :     _PyUnicode_HASH(unicode) = -1;
    1030       51255 :     _PyUnicode_STATE(unicode).interned = 0;
    1031       51255 :     _PyUnicode_STATE(unicode).kind = kind;
    1032       51255 :     _PyUnicode_STATE(unicode).compact = 1;
    1033       51255 :     _PyUnicode_STATE(unicode).ready = 1;
    1034       51255 :     _PyUnicode_STATE(unicode).ascii = is_ascii;
                           /* Write the terminator and set up the wstr / utf8 fields.  The
                              ASCII branch leaves wstr_length and the utf8 fields alone:
                              only sizeof(PyASCIIObject) bytes of header were allocated
                              (presumably those fields are not part of PyASCIIObject --
                              TODO confirm). */
    1035       51255 :     if (is_ascii) {
    1036       50791 :         ((char*)data)[size] = 0;
    1037       50791 :         _PyUnicode_WSTR(unicode) = NULL;
    1038             :     }
    1039         464 :     else if (kind == PyUnicode_1BYTE_KIND) {
    1040          14 :         ((char*)data)[size] = 0;
    1041          14 :         _PyUnicode_WSTR(unicode) = NULL;
    1042          14 :         _PyUnicode_WSTR_LENGTH(unicode) = 0;
    1043          14 :         unicode->utf8 = NULL;
    1044          14 :         unicode->utf8_length = 0;
    1045             :     }
    1046             :     else {
    1047         450 :         unicode->utf8 = NULL;
    1048         450 :         unicode->utf8_length = 0;
    1049         450 :         if (kind == PyUnicode_2BYTE_KIND)
    1050         450 :             ((Py_UCS2*)data)[size] = 0;
    1051             :         else /* kind == PyUnicode_4BYTE_KIND */
    1052           0 :             ((Py_UCS4*)data)[size] = 0;
    1053         450 :         if (is_sharing) {
    1054           0 :             _PyUnicode_WSTR_LENGTH(unicode) = size;
    1055           0 :             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
    1056             :         }
    1057             :         else {
    1058         450 :             _PyUnicode_WSTR_LENGTH(unicode) = 0;
    1059         450 :             _PyUnicode_WSTR(unicode) = NULL;
    1060             :         }
    1061             :     }
    1062             : #ifdef Py_DEBUG
    1063             :     /* Fill the data with invalid characters to detect bugs earlier.
    1064             :        _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
    1065             :        at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
    1066             :        and U+FFFFFFFF is an invalid character in Unicode 6.0. */
                           /* size * kind is used as the byte count, i.e. the kind value
                              doubles as the character width; the terminator written above
                              lies past that range and is kept. */
    1067             :     memset(data, 0xff, size * kind);
    1068             : #endif
    1069             :     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    1070       51255 :     return obj;
    1071             : }
    1072             : 
    1073             : #if SIZEOF_WCHAR_T == 2
    1074             : /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
    1075             :    will decode surrogate pairs, the other conversions are implemented as macros
    1076             :    for efficiency.
    1077             : 
    1078             :    This function assumes that unicode can hold one more code point than wstr
    1079             :    characters for a terminating null character. */
                       /* begin and end delimit the wchar_t input; end is exclusive.
                          unicode must be a 4-byte kind string (asserted).  The final assert
                          checks that exactly _PyUnicode_GET_LENGTH(unicode) code points
                          were written, so the caller has to size unicode to the decoded
                          length beforehand.  A surrogate that is not part of a high+low
                          pair inside [begin, end) is copied through unchanged.  Only
                          compiled when SIZEOF_WCHAR_T is 2. */
    1080             : static void
    1081             : unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
    1082             :                               PyObject *unicode)
    1083             : {
    1084             :     const wchar_t *iter;
    1085             :     Py_UCS4 *ucs4_out;
    1086             : 
    1087             :     assert(unicode != NULL);
    1088             :     assert(_PyUnicode_CHECK(unicode));
    1089             :     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    1090             :     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
    1091             : 
    1092             :     for (iter = begin; iter < end; ) {
                               /* the output must not run past the declared length */
    1093             :         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
    1094             :                            _PyUnicode_GET_LENGTH(unicode)));
                               /* a high surrogate followed, within the input, by a low
                                  surrogate becomes one code point and consumes two input
                                  units */
    1095             :         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
    1096             :             && (iter+1) < end
    1097             :             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
    1098             :         {
    1099             :             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
    1100             :             iter += 2;
    1101             :         }
    1102             :         else {
    1103             :             *ucs4_out++ = *iter;
    1104             :             iter++;
    1105             :         }
    1106             :     }
    1107             :     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
    1108             :                         _PyUnicode_GET_LENGTH(unicode)));
    1109             : 
    1110             : }
    1111             : #endif
    1112             : 
                       /* Return 0 when unicode_modifiable(unicode) is true.  Otherwise set
                          SystemError ("Cannot modify a string currently used") and return
                          -1. */
    1113             : static int
    1114           0 : unicode_check_modifiable(PyObject *unicode)
    1115             : {
    1116           0 :     if (!unicode_modifiable(unicode)) {
    1117           0 :         PyErr_SetString(PyExc_SystemError,
    1118             :                         "Cannot modify a string currently used");
    1119           0 :         return -1;
    1120             :     }
    1121           0 :     return 0;
    1122             : }
    1123             : 
    1124             : static int
    1125        9691 : _copy_characters(PyObject *to, Py_ssize_t to_start,
    1126             :                  PyObject *from, Py_ssize_t from_start,
    1127             :                  Py_ssize_t how_many, int check_maxchar)
    1128             : {
    1129             :     unsigned int from_kind, to_kind;
    1130             :     void *from_data, *to_data;
    1131             : 
    1132             :     assert(0 <= how_many);
    1133             :     assert(0 <= from_start);
    1134             :     assert(0 <= to_start);
    1135             :     assert(PyUnicode_Check(from));
    1136             :     assert(PyUnicode_IS_READY(from));
    1137             :     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
    1138             : 
    1139             :     assert(PyUnicode_Check(to));
    1140             :     assert(PyUnicode_IS_READY(to));
    1141             :     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
    1142             : 
    1143        9691 :     if (how_many == 0)
    1144          11 :         return 0;
    1145             : 
    1146        9680 :     from_kind = PyUnicode_KIND(from);
    1147        9680 :     from_data = PyUnicode_DATA(from);
    1148        9680 :     to_kind = PyUnicode_KIND(to);
    1149        9680 :     to_data = PyUnicode_DATA(to);
    1150             : 
    1151             : #ifdef Py_DEBUG
    1152             :     if (!check_maxchar
    1153             :         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
    1154             :     {
    1155             :         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
    1156             :         Py_UCS4 ch;
    1157             :         Py_ssize_t i;
    1158             :         for (i=0; i < how_many; i++) {
    1159             :             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
    1160             :             assert(ch <= to_maxchar);
    1161             :         }
    1162             :     }
    1163             : #endif
    1164             : 
    1165        9680 :     if (from_kind == to_kind) {
    1166        9607 :         if (check_maxchar
    1167           0 :             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
    1168             :         {
    1169             :             /* Writing Latin-1 characters into an ASCII string requires to
    1170             :                check that all written characters are pure ASCII */
    1171             :             Py_UCS4 max_char;
    1172           0 :             max_char = ucs1lib_find_max_char(from_data,
    1173             :                                              (Py_UCS1*)from_data + how_many);
    1174           0 :             if (max_char >= 128)
    1175           0 :                 return -1;
    1176             :         }
    1177       19214 :         Py_MEMCPY((char*)to_data + to_kind * to_start,
    1178        9607 :                   (char*)from_data + from_kind * from_start,
    1179             :                   to_kind * how_many);
    1180             :     }
    1181          73 :     else if (from_kind == PyUnicode_1BYTE_KIND
    1182          73 :              && to_kind == PyUnicode_2BYTE_KIND)
    1183             :     {
    1184          73 :         _PyUnicode_CONVERT_BYTES(
    1185             :             Py_UCS1, Py_UCS2,
    1186             :             PyUnicode_1BYTE_DATA(from) + from_start,
    1187             :             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
    1188             :             PyUnicode_2BYTE_DATA(to) + to_start
    1189             :             );
    1190             :     }
    1191           0 :     else if (from_kind == PyUnicode_1BYTE_KIND
    1192           0 :              && to_kind == PyUnicode_4BYTE_KIND)
    1193             :     {
    1194           0 :         _PyUnicode_CONVERT_BYTES(
    1195             :             Py_UCS1, Py_UCS4,
    1196             :             PyUnicode_1BYTE_DATA(from) + from_start,
    1197             :             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
    1198             :             PyUnicode_4BYTE_DATA(to) + to_start
    1199             :             );
    1200             :     }
    1201           0 :     else if (from_kind == PyUnicode_2BYTE_KIND
    1202           0 :              && to_kind == PyUnicode_4BYTE_KIND)
    1203             :     {
    1204           0 :         _PyUnicode_CONVERT_BYTES(
    1205             :             Py_UCS2, Py_UCS4,
    1206             :             PyUnicode_2BYTE_DATA(from) + from_start,
    1207             :             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
    1208             :             PyUnicode_4BYTE_DATA(to) + to_start
    1209             :             );
    1210             :     }
    1211             :     else {
    1212             :         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
    1213             : 
    1214           0 :         if (!check_maxchar) {
    1215           0 :             if (from_kind == PyUnicode_2BYTE_KIND
    1216           0 :                 && to_kind == PyUnicode_1BYTE_KIND)
    1217             :             {
    1218           0 :                 _PyUnicode_CONVERT_BYTES(
    1219             :                     Py_UCS2, Py_UCS1,
    1220             :                     PyUnicode_2BYTE_DATA(from) + from_start,
    1221             :                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
    1222             :                     PyUnicode_1BYTE_DATA(to) + to_start
    1223             :                     );
    1224             :             }
    1225           0 :             else if (from_kind == PyUnicode_4BYTE_KIND
    1226           0 :                      && to_kind == PyUnicode_1BYTE_KIND)
    1227             :             {
    1228           0 :                 _PyUnicode_CONVERT_BYTES(
    1229             :                     Py_UCS4, Py_UCS1,
    1230             :                     PyUnicode_4BYTE_DATA(from) + from_start,
    1231             :                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
    1232             :                     PyUnicode_1BYTE_DATA(to) + to_start
    1233             :                     );
    1234             :             }
    1235           0 :             else if (from_kind == PyUnicode_4BYTE_KIND
    1236           0 :                      && to_kind == PyUnicode_2BYTE_KIND)
    1237             :             {
    1238           0 :                 _PyUnicode_CONVERT_BYTES(
    1239             :                     Py_UCS4, Py_UCS2,
    1240             :                     PyUnicode_4BYTE_DATA(from) + from_start,
    1241             :                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
    1242             :                     PyUnicode_2BYTE_DATA(to) + to_start
    1243             :                     );
    1244             :             }
    1245             :             else {
    1246             :                 assert(0);
    1247           0 :                 return -1;
    1248             :             }
    1249             :         }
    1250             :         else {
    1251           0 :             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
    1252             :             Py_UCS4 ch;
    1253             :             Py_ssize_t i;
    1254             : 
    1255           0 :             for (i=0; i < how_many; i++) {
    1256           0 :                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
    1257           0 :                 if (ch > to_maxchar)
    1258           0 :                     return -1;
    1259           0 :                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
    1260             :             }
    1261             :         }
    1262             :     }
    1263        9680 :     return 0;
    1264             : }
    1265             : 
    1266             : void
    1267        9691 : _PyUnicode_FastCopyCharacters(
    1268             :     PyObject *to, Py_ssize_t to_start,
    1269             :     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
    1270             : {
    1271        9691 :     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
    1272        9691 : }
    1273             : 
    1274             : Py_ssize_t
    1275           0 : PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
    1276             :                          PyObject *from, Py_ssize_t from_start,
    1277             :                          Py_ssize_t how_many)
    1278             : {
    1279             :     int err;
    1280             : 
    1281           0 :     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
    1282           0 :         PyErr_BadInternalCall();
    1283           0 :         return -1;
    1284             :     }
    1285             : 
    1286           0 :     if (PyUnicode_READY(from) == -1)
    1287           0 :         return -1;
    1288           0 :     if (PyUnicode_READY(to) == -1)
    1289           0 :         return -1;
    1290             : 
    1291           0 :     if (from_start < 0) {
    1292           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    1293           0 :         return -1;
    1294             :     }
    1295           0 :     if (to_start < 0) {
    1296           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    1297           0 :         return -1;
    1298             :     }
    1299           0 :     how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
    1300           0 :     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
    1301           0 :         PyErr_Format(PyExc_SystemError,
    1302             :                      "Cannot write %zi characters at %zi "
    1303             :                      "in a string of %zi characters",
    1304             :                      how_many, to_start, PyUnicode_GET_LENGTH(to));
    1305           0 :         return -1;
    1306             :     }
    1307             : 
    1308           0 :     if (how_many == 0)
    1309           0 :         return 0;
    1310             : 
    1311           0 :     if (unicode_check_modifiable(to))
    1312           0 :         return -1;
    1313             : 
    1314           0 :     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
    1315           0 :     if (err) {
    1316           0 :         PyErr_Format(PyExc_SystemError,
    1317             :                      "Cannot copy %s characters "
    1318             :                      "into a string of %s characters",
    1319             :                      unicode_kind_name(from),
    1320             :                      unicode_kind_name(to));
    1321           0 :         return -1;
    1322             :     }
    1323           0 :     return how_many;
    1324             : }
    1325             : 
    1326             : /* Find the maximum code point and count the number of surrogate pairs so a
    1327             :    correct string length can be computed before converting a string to UCS4.
    1328             :    This function counts single surrogates as a character and not as a pair.
    1329             : 
    1330             :    Return 0 on success, or -1 on error. */
    1331             : static int
    1332         355 : find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
    1333             :                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
    1334             : {
    1335             :     const wchar_t *iter;
    1336             :     Py_UCS4 ch;
    1337             : 
    1338             :     assert(num_surrogates != NULL && maxchar != NULL);
    1339         355 :     *num_surrogates = 0;
    1340         355 :     *maxchar = 0;
    1341             : 
    1342        6299 :     for (iter = begin; iter < end; ) {
    1343             : #if SIZEOF_WCHAR_T == 2
    1344             :         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
    1345             :             && (iter+1) < end
    1346             :             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
    1347             :         {
    1348             :             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
    1349             :             ++(*num_surrogates);
    1350             :             iter += 2;
    1351             :         }
    1352             :         else
    1353             : #endif
    1354             :         {
    1355        5589 :             ch = *iter;
    1356        5589 :             iter++;
    1357             :         }
    1358        5589 :         if (ch > *maxchar) {
    1359        1170 :             *maxchar = ch;
    1360        1170 :             if (*maxchar > MAX_UNICODE) {
    1361           0 :                 PyErr_Format(PyExc_ValueError,
    1362             :                              "character U+%x is not in range [U+0000; U+10ffff]",
    1363             :                              ch);
    1364           0 :                 return -1;
    1365             :             }
    1366             :         }
    1367             :     }
    1368         355 :     return 0;
    1369             : }
    1370             : 
    /* Convert a string that only has a wchar_t buffer (wstr) into a ready
       string: find the largest code point in the buffer, store the characters
       as 1-, 2- or 4-byte units accordingly, fill in the length, kind and
       UTF-8 fields, and finally set the ready flag.

       Returns 0 on success.  Returns -1 when find_maxchar_surrogates() fails,
       or after calling PyErr_NoMemory() when a data buffer cannot be
       allocated. */
    1371             : int
    1372           0 : _PyUnicode_Ready(PyObject *unicode)
    1373             : {
    1374             :     wchar_t *end;
    1375           0 :     Py_UCS4 maxchar = 0;
    1376             :     Py_ssize_t num_surrogates;
    1377             : #if SIZEOF_WCHAR_T == 2
    1378             :     Py_ssize_t length_wo_surrogates;
    1379             : #endif
    1380             : 
    1381             :     /* _PyUnicode_Ready() is only intended for old-style API usage where
    1382             :        strings were created using _PyObject_New() and where no canonical
    1383             :        representation (the str field) has been set yet, i.e. strings
    1384             :        which are not yet ready. */
    1385             :     assert(_PyUnicode_CHECK(unicode));
    1386             :     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    1387             :     assert(_PyUnicode_WSTR(unicode) != NULL);
    1388             :     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    1389             :     assert(_PyUnicode_UTF8(unicode) == NULL);
    1390             :     /* Actually, it should neither be interned nor be anything else: */
    1391             :     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
    1392             : 
                           /* `end` points one past the last wchar_t of the wstr buffer. */
    1393           0 :     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    1394           0 :     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
    1395             :                                 &maxchar, &num_surrogates) == -1)
    1396           0 :         return -1;
    1397             : 
                           /* Every character fits in one byte: allocate a fresh buffer with
                              room for a terminator and narrow each wchar_t into it. */
    1398           0 :     if (maxchar < 256) {
    1399           0 :         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
    1400           0 :         if (!_PyUnicode_DATA_ANY(unicode)) {
    1401           0 :             PyErr_NoMemory();
    1402           0 :             return -1;
    1403             :         }
    1404           0 :         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
    1405             :                                 _PyUnicode_WSTR(unicode), end,
    1406             :                                 PyUnicode_1BYTE_DATA(unicode));
    1407           0 :         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
    1408           0 :         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
    1409           0 :         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
                               /* For pure ASCII content the UTF-8 pointer is made to alias the
                                  data buffer; otherwise no UTF-8 form is recorded. */
    1410           0 :         if (maxchar < 128) {
    1411           0 :             _PyUnicode_STATE(unicode).ascii = 1;
    1412           0 :             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
    1413           0 :             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
    1414             :         }
    1415             :         else {
    1416           0 :             _PyUnicode_STATE(unicode).ascii = 0;
    1417           0 :             _PyUnicode_UTF8(unicode) = NULL;
    1418           0 :             _PyUnicode_UTF8_LENGTH(unicode) = 0;
    1419             :         }
                               /* The characters now live in the new buffer; release wstr. */
    1420           0 :         PyObject_FREE(_PyUnicode_WSTR(unicode));
    1421           0 :         _PyUnicode_WSTR(unicode) = NULL;
    1422           0 :         _PyUnicode_WSTR_LENGTH(unicode) = 0;
    1423             :     }
    1424             :     /* In this case we might have to convert down from 4-byte native
    1425             :        wchar_t to 2-byte unicode. */
    1426           0 :     else if (maxchar < 65536) {
    1427             :         assert(num_surrogates == 0 &&
    1428             :                "FindMaxCharAndNumSurrogatePairs() messed up");
    1429             : 
    1430             : #if SIZEOF_WCHAR_T == 2
    1431             :         /* We can share representations and are done. */
    1432             :         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
    1433             :         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
    1434             :         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
    1435             :         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
    1436             :         _PyUnicode_UTF8(unicode) = NULL;
    1437             :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    1438             : #else
    1439             :         /* sizeof(wchar_t) == 4 */
                               /* Allocate two bytes per character (plus terminator), narrow the
                                  wchar_t data into it, then release the wstr buffer. */
    1440           0 :         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
    1441           0 :             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
    1442           0 :         if (!_PyUnicode_DATA_ANY(unicode)) {
    1443           0 :             PyErr_NoMemory();
    1444           0 :             return -1;
    1445             :         }
    1446           0 :         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
    1447             :                                 _PyUnicode_WSTR(unicode), end,
    1448             :                                 PyUnicode_2BYTE_DATA(unicode));
    1449           0 :         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
    1450           0 :         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
    1451           0 :         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
    1452           0 :         _PyUnicode_UTF8(unicode) = NULL;
    1453           0 :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    1454           0 :         PyObject_FREE(_PyUnicode_WSTR(unicode));
    1455           0 :         _PyUnicode_WSTR(unicode) = NULL;
    1456           0 :         _PyUnicode_WSTR_LENGTH(unicode) = 0;
    1457             : #endif
    1458             :     }
    1459             :     /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    1460             :     else {
    1461             : #if SIZEOF_WCHAR_T == 2
    1462             :         /* in case the native representation is 2-bytes, we need to allocate a
    1463             :            new normalized 4-byte version. */
                               /* Each joined surrogate pair becomes a single character, so the
                                  resulting length is the wstr length minus the number of pairs. */
    1464             :         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
    1465             :         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
    1466             :         if (!_PyUnicode_DATA_ANY(unicode)) {
    1467             :             PyErr_NoMemory();
    1468             :             return -1;
    1469             :         }
    1470             :         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
    1471             :         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
    1472             :         _PyUnicode_UTF8(unicode) = NULL;
    1473             :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    1474             :         /* unicode_convert_wchar_to_ucs4() requires a ready string */
    1475             :         _PyUnicode_STATE(unicode).ready = 1;
    1476             :         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
    1477             :         PyObject_FREE(_PyUnicode_WSTR(unicode));
    1478             :         _PyUnicode_WSTR(unicode) = NULL;
    1479             :         _PyUnicode_WSTR_LENGTH(unicode) = 0;
    1480             : #else
    1481             :         assert(num_surrogates == 0);
    1482             : 
                               /* The wstr buffer is reused directly as the data buffer here, so
                                  it is neither copied nor freed. */
    1483           0 :         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
    1484           0 :         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
    1485           0 :         _PyUnicode_UTF8(unicode) = NULL;
    1486           0 :         _PyUnicode_UTF8_LENGTH(unicode) = 0;
    1487           0 :         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
    1488             : #endif
                               /* Terminate the 4-byte data in both configurations. */
    1489           0 :         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    1490             :     }
    1491           0 :     _PyUnicode_STATE(unicode).ready = 1;
    1492             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    1493           0 :     return 0;
    1494             : }
    1495             : 
    1496             : static void
    1497       38402 : unicode_dealloc(register PyObject *unicode)
    1498             : {
    1499       38402 :     switch (PyUnicode_CHECK_INTERNED(unicode)) {
    1500             :     case SSTATE_NOT_INTERNED:
    1501       38241 :         break;
    1502             : 
    1503             :     case SSTATE_INTERNED_MORTAL:
    1504             :         /* revive dead object temporarily for DelItem */
    1505         161 :         Py_REFCNT(unicode) = 3;
    1506         161 :         if (PyDict_DelItem(interned, unicode) != 0)
    1507           0 :             Py_FatalError(
    1508             :                 "deletion of interned string failed");
    1509         161 :         break;
    1510             : 
    1511             :     case SSTATE_INTERNED_IMMORTAL:
    1512           0 :         Py_FatalError("Immortal interned string died.");
    1513             : 
    1514             :     default:
    1515           0 :         Py_FatalError("Inconsistent interned string state.");
    1516             :     }
    1517             : 
    1518       38402 :     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
    1519           8 :         PyObject_DEL(_PyUnicode_WSTR(unicode));
    1520       38402 :     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
    1521           1 :         PyObject_DEL(_PyUnicode_UTF8(unicode));
    1522       38402 :     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
    1523           0 :         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
    1524             : 
    1525       38402 :     Py_TYPE(unicode)->tp_free(unicode);
    1526       38402 : }
    1527             : 
    #ifdef Py_DEBUG
    /* Debug helper: return 1 if `unicode` is the shared empty string or the
       cached one-character string stored in unicode_latin1[], 0 otherwise. */
    static int
    unicode_is_singleton(PyObject *unicode)
    {
        PyASCIIObject *header = (PyASCIIObject *)unicode;
        Py_UCS4 only_char;

        if (unicode == unicode_empty)
            return 1;

        /* Only strings of exactly one character in a non-wchar kind can be
           cached Latin-1 singletons. */
        if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
            return 0;

        only_char = PyUnicode_READ_CHAR(unicode, 0);
        if (only_char < 256 && unicode_latin1[only_char] == unicode)
            return 1;
        return 0;
    }
    #endif
    1544             : 
    1545             : static int
    1546        2490 : unicode_modifiable(PyObject *unicode)
    1547             : {
    1548             :     assert(_PyUnicode_CHECK(unicode));
    1549        2490 :     if (Py_REFCNT(unicode) != 1)
    1550        1613 :         return 0;
    1551         877 :     if (_PyUnicode_HASH(unicode) != -1)
    1552           0 :         return 0;
    1553         877 :     if (PyUnicode_CHECK_INTERNED(unicode))
    1554           0 :         return 0;
    1555         877 :     if (!PyUnicode_CheckExact(unicode))
    1556           0 :         return 0;
    1557             : #ifdef Py_DEBUG
    1558             :     /* singleton refcount is greater than 1 */
    1559             :     assert(!unicode_is_singleton(unicode));
    1560             : #endif
    1561         877 :     return 1;
    1562             : }
    1563             : 
    1564             : static int
    1565       43157 : unicode_resize(PyObject **p_unicode, Py_ssize_t length)
    1566             : {
    1567             :     PyObject *unicode;
    1568             :     Py_ssize_t old_length;
    1569             : 
    1570             :     assert(p_unicode != NULL);
    1571       43157 :     unicode = *p_unicode;
    1572             : 
    1573             :     assert(unicode != NULL);
    1574             :     assert(PyUnicode_Check(unicode));
    1575             :     assert(0 <= length);
    1576             : 
    1577       43157 :     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
    1578           0 :         old_length = PyUnicode_WSTR_LENGTH(unicode);
    1579             :     else
    1580       43157 :         old_length = PyUnicode_GET_LENGTH(unicode);
    1581       43157 :     if (old_length == length)
    1582       42675 :         return 0;
    1583             : 
    1584         482 :     if (length == 0) {
    1585           0 :         Py_DECREF(*p_unicode);
    1586           0 :         *p_unicode = unicode_empty;
    1587           0 :         Py_INCREF(*p_unicode);
    1588           0 :         return 0;
    1589             :     }
    1590             : 
    1591         482 :     if (!unicode_modifiable(unicode)) {
    1592           0 :         PyObject *copy = resize_copy(unicode, length);
    1593           0 :         if (copy == NULL)
    1594           0 :             return -1;
    1595           0 :         Py_DECREF(*p_unicode);
    1596           0 :         *p_unicode = copy;
    1597           0 :         return 0;
    1598             :     }
    1599             : 
    1600         482 :     if (PyUnicode_IS_COMPACT(unicode)) {
    1601         482 :         PyObject *new_unicode = resize_compact(unicode, length);
    1602         482 :         if (new_unicode == NULL)
    1603           0 :             return -1;
    1604         482 :         *p_unicode = new_unicode;
    1605         482 :         return 0;
    1606             :     }
    1607           0 :     return resize_inplace(unicode, length);
    1608             : }
    1609             : 
    1610             : int
    1611           0 : PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
    1612             : {
    1613             :     PyObject *unicode;
    1614           0 :     if (p_unicode == NULL) {
    1615           0 :         PyErr_BadInternalCall();
    1616           0 :         return -1;
    1617             :     }
    1618           0 :     unicode = *p_unicode;
    1619           0 :     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
    1620             :     {
    1621           0 :         PyErr_BadInternalCall();
    1622           0 :         return -1;
    1623             :     }
    1624           0 :     return unicode_resize(p_unicode, length);
    1625             : }
    1626             : 
    1627             : static int
    1628        1808 : unicode_widen(PyObject **p_unicode, Py_ssize_t length,
    1629             :               unsigned int maxchar)
    1630             : {
    1631             :     PyObject *result;
    1632             :     assert(PyUnicode_IS_READY(*p_unicode));
    1633             :     assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
    1634        1808 :     if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
    1635        1721 :         return 0;
    1636          87 :     result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
    1637             :                            maxchar);
    1638          87 :     if (result == NULL)
    1639           0 :         return -1;
    1640          87 :     _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
    1641          87 :     Py_DECREF(*p_unicode);
    1642          87 :     *p_unicode = result;
    1643          87 :     return 0;
    1644             : }
    1645             : 
    1646             : static int
    1647        1808 : unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
    1648             :                 Py_UCS4 ch)
    1649             : {
    1650             :     assert(ch <= MAX_UNICODE);
    1651        1808 :     if (unicode_widen(p_unicode, *pos, ch) < 0)
    1652           0 :         return -1;
    1653        1808 :     PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
    1654             :                     PyUnicode_DATA(*p_unicode),
    1655             :                     (*pos)++, ch);
    1656        1808 :     return 0;
    1657             : }
    1658             : 
    1659             : /* Copy a ASCII or latin1 char* string into a Python Unicode string.
    1660             : 
    1661             :    WARNING: The function doesn't copy the terminating null character and
    1662             :    doesn't check the maximum character (may write a latin1 character in an
    1663             :    ASCII string). */
    1664             : static void
    1665        1890 : unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
    1666             :                    const char *str, Py_ssize_t len)
    1667             : {
    1668        1890 :     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
    1669        1890 :     void *data = PyUnicode_DATA(unicode);
    1670        1890 :     const char *end = str + len;
    1671             : 
    1672        1890 :     switch (kind) {
    1673             :     case PyUnicode_1BYTE_KIND: {
    1674             :         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
    1675        1890 :         memcpy((char *) data + index, str, len);
    1676        1890 :         break;
    1677             :     }
    1678             :     case PyUnicode_2BYTE_KIND: {
    1679           0 :         Py_UCS2 *start = (Py_UCS2 *)data + index;
    1680           0 :         Py_UCS2 *ucs2 = start;
    1681             :         assert(index <= PyUnicode_GET_LENGTH(unicode));
    1682             : 
    1683           0 :         for (; str < end; ++ucs2, ++str)
    1684           0 :             *ucs2 = (Py_UCS2)*str;
    1685             : 
    1686             :         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
    1687           0 :         break;
    1688             :     }
    1689             :     default: {
    1690           0 :         Py_UCS4 *start = (Py_UCS4 *)data + index;
    1691           0 :         Py_UCS4 *ucs4 = start;
    1692             :         assert(kind == PyUnicode_4BYTE_KIND);
    1693             :         assert(index <= PyUnicode_GET_LENGTH(unicode));
    1694             : 
    1695           0 :         for (; str < end; ++ucs4, ++str)
    1696           0 :             *ucs4 = (Py_UCS4)*str;
    1697             : 
    1698             :         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
    1699             :     }
    1700             :     }
    1701        1890 : }
    1702             : 
    1703             : 
    1704             : static PyObject*
    1705       11308 : get_latin1_char(unsigned char ch)
    1706             : {
    1707       11308 :     PyObject *unicode = unicode_latin1[ch];
    1708       11308 :     if (!unicode) {
    1709         105 :         unicode = PyUnicode_New(1, ch);
    1710         105 :         if (!unicode)
    1711           0 :             return NULL;
    1712         105 :         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
    1713             :         assert(_PyUnicode_CheckConsistency(unicode, 1));
    1714         105 :         unicode_latin1[ch] = unicode;
    1715             :     }
    1716       11308 :     Py_INCREF(unicode);
    1717       11308 :     return unicode;
    1718             : }
    1719             : 
    1720             : PyObject *
    1721         357 : PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
    1722             : {
    1723             :     PyObject *unicode;
    1724         357 :     Py_UCS4 maxchar = 0;
    1725             :     Py_ssize_t num_surrogates;
    1726             : 
    1727         357 :     if (u == NULL)
    1728           0 :         return (PyObject*)_PyUnicode_New(size);
    1729             : 
    1730             :     /* If the Unicode data is known at construction time, we can apply
    1731             :        some optimizations which share commonly used objects. */
    1732             : 
    1733             :     /* Optimization for empty strings */
    1734         357 :     if (size == 0 && unicode_empty != NULL) {
    1735           2 :         Py_INCREF(unicode_empty);
    1736           2 :         return unicode_empty;
    1737             :     }
    1738             : 
    1739             :     /* Single character Unicode objects in the Latin-1 range are
    1740             :        shared when using this constructor */
    1741         355 :     if (size == 1 && *u < 256)
    1742           0 :         return get_latin1_char((unsigned char)*u);
    1743             : 
    1744             :     /* If not empty and not single character, copy the Unicode data
    1745             :        into the new object */
    1746         355 :     if (find_maxchar_surrogates(u, u + size,
    1747             :                                 &maxchar, &num_surrogates) == -1)
    1748           0 :         return NULL;
    1749             : 
    1750         355 :     unicode = PyUnicode_New(size - num_surrogates, maxchar);
    1751         355 :     if (!unicode)
    1752           0 :         return NULL;
    1753             : 
    1754         355 :     switch (PyUnicode_KIND(unicode)) {
    1755             :     case PyUnicode_1BYTE_KIND:
    1756         355 :         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
    1757             :                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
    1758         355 :         break;
    1759             :     case PyUnicode_2BYTE_KIND:
    1760             : #if Py_UNICODE_SIZE == 2
    1761             :         Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
    1762             : #else
    1763           0 :         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
    1764             :                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
    1765             : #endif
    1766           0 :         break;
    1767             :     case PyUnicode_4BYTE_KIND:
    1768             : #if SIZEOF_WCHAR_T == 2
    1769             :         /* This is the only case which has to process surrogates, thus
    1770             :            a simple copy loop is not enough and we need a function. */
    1771             :         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
    1772             : #else
    1773             :         assert(num_surrogates == 0);
    1774           0 :         Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
    1775             : #endif
    1776           0 :         break;
    1777             :     default:
    1778             :         assert(0 && "Impossible state");
    1779             :     }
    1780             : 
    1781         355 :     return unicode_result(unicode);
    1782             : }
    1783             : 
    1784             : PyObject *
    1785        2345 : PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
    1786             : {
    1787        2345 :     if (size < 0) {
    1788           0 :         PyErr_SetString(PyExc_SystemError,
    1789             :                         "Negative size passed to PyUnicode_FromStringAndSize");
    1790           0 :         return NULL;
    1791             :     }
    1792        2345 :     if (u != NULL)
    1793        2343 :         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
    1794             :     else
    1795           2 :         return (PyObject *)_PyUnicode_New(size);
    1796             : }
    1797             : 
    1798             : PyObject *
    1799       16695 : PyUnicode_FromString(const char *u)
    1800             : {
    1801       16695 :     size_t size = strlen(u);
    1802       16695 :     if (size > PY_SSIZE_T_MAX) {
    1803           0 :         PyErr_SetString(PyExc_OverflowError, "input too long");
    1804           0 :         return NULL;
    1805             :     }
    1806       16695 :     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
    1807             : }
    1808             : 
    1809             : PyObject *
    1810       21728 : _PyUnicode_FromId(_Py_Identifier *id)
    1811             : {
    1812       21728 :     if (!id->object) {
    1813          79 :         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
    1814          79 :                                                   strlen(id->string),
    1815             :                                                   NULL, NULL);
    1816          79 :         if (!id->object)
    1817           0 :             return NULL;
    1818          79 :         PyUnicode_InternInPlace(&id->object);
    1819             :         assert(!id->next);
    1820          79 :         id->next = static_strings;
    1821          79 :         static_strings = id;
    1822             :     }
    1823       21728 :     return id->object;
    1824             : }
    1825             : 
    1826             : void
    1827           0 : _PyUnicode_ClearStaticStrings()
    1828             : {
    1829             :     _Py_Identifier *i;
    1830           0 :     for (i = static_strings; i; i = i->next) {
    1831           0 :         Py_DECREF(i->object);
    1832           0 :         i->object = NULL;
    1833           0 :         i->next = NULL;
    1834             :     }
    1835           0 : }
    1836             : 
    1837             : /* Internal function, doesn't check maximum character */
    1838             : 
    1839             : PyObject*
    1840        3415 : _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
    1841             : {
    1842        3415 :     const unsigned char *s = (const unsigned char *)buffer;
    1843             :     PyObject *unicode;
    1844        3415 :     if (size == 1) {
    1845             : #ifdef Py_DEBUG
    1846             :         assert(s[0] < 128);
    1847             : #endif
    1848         962 :         return get_latin1_char(s[0]);
    1849             :     }
    1850        2453 :     unicode = PyUnicode_New(size, 127);
    1851        2453 :     if (!unicode)
    1852           0 :         return NULL;
    1853        2453 :     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
    1854             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    1855        2453 :     return unicode;
    1856             : }
    1857             : 
    1858             : static Py_UCS4
    1859           0 : kind_maxchar_limit(unsigned int kind)
    1860             : {
    1861           0 :     switch (kind) {
    1862             :     case PyUnicode_1BYTE_KIND:
    1863           0 :         return 0x80;
    1864             :     case PyUnicode_2BYTE_KIND:
    1865           0 :         return 0x100;
    1866             :     case PyUnicode_4BYTE_KIND:
    1867           0 :         return 0x10000;
    1868             :     default:
    1869             :         assert(0 && "invalid kind");
    1870           0 :         return MAX_UNICODE;
    1871             :     }
    1872             : }
    1873             : 
    1874             : Py_LOCAL_INLINE(Py_UCS4)
    1875           0 : align_maxchar(Py_UCS4 maxchar)
    1876             : {
    1877           0 :     if (maxchar <= 127)
    1878           0 :         return 127;
    1879           0 :     else if (maxchar <= 255)
    1880           0 :         return 255;
    1881           0 :     else if (maxchar <= 65535)
    1882           0 :         return 65535;
    1883             :     else
    1884           0 :         return MAX_UNICODE;
    1885             : }
    1886             : 
    1887             : static PyObject*
    1888           0 : _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
    1889             : {
    1890             :     PyObject *res;
    1891             :     unsigned char max_char;
    1892             : 
    1893           0 :     if (size == 0) {
    1894           0 :         Py_INCREF(unicode_empty);
    1895           0 :         return unicode_empty;
    1896             :     }
    1897             :     assert(size > 0);
    1898           0 :     if (size == 1)
    1899           0 :         return get_latin1_char(u[0]);
    1900             : 
    1901           0 :     max_char = ucs1lib_find_max_char(u, u + size);
    1902           0 :     res = PyUnicode_New(size, max_char);
    1903           0 :     if (!res)
    1904           0 :         return NULL;
    1905           0 :     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
    1906             :     assert(_PyUnicode_CheckConsistency(res, 1));
    1907           0 :     return res;
    1908             : }
    1909             : 
    1910             : static PyObject*
    1911        3123 : _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
    1912             : {
    1913             :     PyObject *res;
    1914             :     Py_UCS2 max_char;
    1915             : 
    1916        3123 :     if (size == 0) {
    1917           0 :         Py_INCREF(unicode_empty);
    1918           0 :         return unicode_empty;
    1919             :     }
    1920             :     assert(size > 0);
    1921        3123 :     if (size == 1) {
    1922        3116 :         Py_UCS4 ch = u[0];
    1923        3116 :         if (ch < 256)
    1924        2936 :             return get_latin1_char((unsigned char)ch);
    1925             : 
    1926         180 :         res = PyUnicode_New(1, ch);
    1927         180 :         if (res == NULL)
    1928           0 :             return NULL;
    1929         180 :         PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
    1930             :         assert(_PyUnicode_CheckConsistency(res, 1));
    1931         180 :         return res;
    1932             :     }
    1933             : 
    1934           7 :     max_char = ucs2lib_find_max_char(u, u + size);
    1935           7 :     res = PyUnicode_New(size, max_char);
    1936           7 :     if (!res)
    1937           0 :         return NULL;
    1938           7 :     if (max_char >= 256)
    1939           7 :         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
    1940             :     else {
    1941           0 :         _PyUnicode_CONVERT_BYTES(
    1942             :             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
    1943             :     }
    1944             :     assert(_PyUnicode_CheckConsistency(res, 1));
    1945           7 :     return res;
    1946             : }
    1947             : 
    1948             : static PyObject*
    1949           0 : _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
    1950             : {
    1951             :     PyObject *res;
    1952             :     Py_UCS4 max_char;
    1953             : 
    1954           0 :     if (size == 0) {
    1955           0 :         Py_INCREF(unicode_empty);
    1956           0 :         return unicode_empty;
    1957             :     }
    1958             :     assert(size > 0);
    1959           0 :     if (size == 1) {
    1960           0 :         Py_UCS4 ch = u[0];
    1961           0 :         if (ch < 256)
    1962           0 :             return get_latin1_char((unsigned char)ch);
    1963             : 
    1964           0 :         res = PyUnicode_New(1, ch);
    1965           0 :         if (res == NULL)
    1966           0 :             return NULL;
    1967           0 :         PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
    1968             :         assert(_PyUnicode_CheckConsistency(res, 1));
    1969           0 :         return res;
    1970             :     }
    1971             : 
    1972           0 :     max_char = ucs4lib_find_max_char(u, u + size);
    1973           0 :     res = PyUnicode_New(size, max_char);
    1974           0 :     if (!res)
    1975           0 :         return NULL;
    1976           0 :     if (max_char < 256)
    1977           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
    1978             :                                  PyUnicode_1BYTE_DATA(res));
    1979           0 :     else if (max_char < 0x10000)
    1980           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
    1981             :                                  PyUnicode_2BYTE_DATA(res));
    1982             :     else
    1983           0 :         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
    1984             :     assert(_PyUnicode_CheckConsistency(res, 1));
    1985           0 :     return res;
    1986             : }
    1987             : 
    1988             : PyObject*
    1989        3123 : PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
    1990             : {
    1991        3123 :     if (size < 0) {
    1992           0 :         PyErr_SetString(PyExc_ValueError, "size must be positive");
    1993           0 :         return NULL;
    1994             :     }
    1995        3123 :     switch (kind) {
    1996             :     case PyUnicode_1BYTE_KIND:
    1997           0 :         return _PyUnicode_FromUCS1(buffer, size);
    1998             :     case PyUnicode_2BYTE_KIND:
    1999        3123 :         return _PyUnicode_FromUCS2(buffer, size);
    2000             :     case PyUnicode_4BYTE_KIND:
    2001           0 :         return _PyUnicode_FromUCS4(buffer, size);
    2002             :     default:
    2003           0 :         PyErr_SetString(PyExc_SystemError, "invalid kind");
    2004           0 :         return NULL;
    2005             :     }
    2006             : }
    2007             : 
    2008             : Py_UCS4
    2009         130 : _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
    2010             : {
    2011             :     enum PyUnicode_Kind kind;
    2012             :     void *startptr, *endptr;
    2013             : 
    2014             :     assert(PyUnicode_IS_READY(unicode));
    2015             :     assert(0 <= start);
    2016             :     assert(end <= PyUnicode_GET_LENGTH(unicode));
    2017             :     assert(start <= end);
    2018             : 
    2019         130 :     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
    2020           0 :         return PyUnicode_MAX_CHAR_VALUE(unicode);
    2021             : 
    2022         130 :     if (start == end)
    2023           0 :         return 127;
    2024             : 
    2025         130 :     if (PyUnicode_IS_ASCII(unicode))
    2026         130 :         return 127;
    2027             : 
    2028           0 :     kind = PyUnicode_KIND(unicode);
    2029           0 :     startptr = PyUnicode_DATA(unicode);
    2030           0 :     endptr = (char *)startptr + end * kind;
    2031           0 :     startptr = (char *)startptr + start * kind;
    2032           0 :     switch(kind) {
    2033             :     case PyUnicode_1BYTE_KIND:
    2034           0 :         return ucs1lib_find_max_char(startptr, endptr);
    2035             :     case PyUnicode_2BYTE_KIND:
    2036           0 :         return ucs2lib_find_max_char(startptr, endptr);
    2037             :     case PyUnicode_4BYTE_KIND:
    2038           0 :         return ucs4lib_find_max_char(startptr, endptr);
    2039             :     default:
    2040             :         assert(0);
    2041           0 :         return 0;
    2042             :     }
    2043             : }
    2044             : 
    2045             : /* Ensure that a string uses the most efficient storage, if it is not the
    2046             :    case: create a new string with of the right kind. Write NULL into *p_unicode
    2047             :    on error. */
    2048             : static void
    2049           0 : unicode_adjust_maxchar(PyObject **p_unicode)
    2050             : {
    2051             :     PyObject *unicode, *copy;
    2052             :     Py_UCS4 max_char;
    2053             :     Py_ssize_t len;
    2054             :     unsigned int kind;
    2055             : 
    2056             :     assert(p_unicode != NULL);
    2057           0 :     unicode = *p_unicode;
    2058             :     assert(PyUnicode_IS_READY(unicode));
    2059           0 :     if (PyUnicode_IS_ASCII(unicode))
    2060           0 :         return;
    2061             : 
    2062           0 :     len = PyUnicode_GET_LENGTH(unicode);
    2063           0 :     kind = PyUnicode_KIND(unicode);
    2064           0 :     if (kind == PyUnicode_1BYTE_KIND) {
    2065           0 :         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
    2066           0 :         max_char = ucs1lib_find_max_char(u, u + len);
    2067           0 :         if (max_char >= 128)
    2068           0 :             return;
    2069             :     }
    2070           0 :     else if (kind == PyUnicode_2BYTE_KIND) {
    2071           0 :         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
    2072           0 :         max_char = ucs2lib_find_max_char(u, u + len);
    2073           0 :         if (max_char >= 256)
    2074           0 :             return;
    2075             :     }
    2076             :     else {
    2077           0 :         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
    2078             :         assert(kind == PyUnicode_4BYTE_KIND);
    2079           0 :         max_char = ucs4lib_find_max_char(u, u + len);
    2080           0 :         if (max_char >= 0x10000)
    2081           0 :             return;
    2082             :     }
    2083           0 :     copy = PyUnicode_New(len, max_char);
    2084           0 :     if (copy != NULL)
    2085           0 :         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
    2086           0 :     Py_DECREF(unicode);
    2087           0 :     *p_unicode = copy;
    2088             : }
    2089             : 
    2090             : PyObject*
    2091           0 : _PyUnicode_Copy(PyObject *unicode)
    2092             : {
    2093             :     Py_ssize_t length;
    2094             :     PyObject *copy;
    2095             : 
    2096           0 :     if (!PyUnicode_Check(unicode)) {
    2097           0 :         PyErr_BadInternalCall();
    2098           0 :         return NULL;
    2099             :     }
    2100           0 :     if (PyUnicode_READY(unicode) == -1)
    2101           0 :         return NULL;
    2102             : 
    2103           0 :     length = PyUnicode_GET_LENGTH(unicode);
    2104           0 :     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    2105           0 :     if (!copy)
    2106           0 :         return NULL;
    2107             :     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
    2108             : 
    2109           0 :     Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
    2110           0 :               length * PyUnicode_KIND(unicode));
    2111             :     assert(_PyUnicode_CheckConsistency(copy, 1));
    2112           0 :     return copy;
    2113             : }
    2114             : 
    2115             : 
    2116             : /* Widen Unicode objects to larger buffers. Don't write terminating null
    2117             :    character. Return NULL on error. */
    2118             : 
    2119             : void*
    2120           0 : _PyUnicode_AsKind(PyObject *s, unsigned int kind)
    2121             : {
    2122             :     Py_ssize_t len;
    2123             :     void *result;
    2124             :     unsigned int skind;
    2125             : 
    2126           0 :     if (PyUnicode_READY(s) == -1)
    2127           0 :         return NULL;
    2128             : 
    2129           0 :     len = PyUnicode_GET_LENGTH(s);
    2130           0 :     skind = PyUnicode_KIND(s);
    2131           0 :     if (skind >= kind) {
    2132           0 :         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
    2133           0 :         return NULL;
    2134             :     }
    2135           0 :     switch (kind) {
    2136             :     case PyUnicode_2BYTE_KIND:
    2137           0 :         result = PyMem_Malloc(len * sizeof(Py_UCS2));
    2138           0 :         if (!result)
    2139           0 :             return PyErr_NoMemory();
    2140             :         assert(skind == PyUnicode_1BYTE_KIND);
    2141           0 :         _PyUnicode_CONVERT_BYTES(
    2142             :             Py_UCS1, Py_UCS2,
    2143             :             PyUnicode_1BYTE_DATA(s),
    2144             :             PyUnicode_1BYTE_DATA(s) + len,
    2145             :             result);
    2146           0 :         return result;
    2147             :     case PyUnicode_4BYTE_KIND:
    2148           0 :         result = PyMem_Malloc(len * sizeof(Py_UCS4));
    2149           0 :         if (!result)
    2150           0 :             return PyErr_NoMemory();
    2151           0 :         if (skind == PyUnicode_2BYTE_KIND) {
    2152           0 :             _PyUnicode_CONVERT_BYTES(
    2153             :                 Py_UCS2, Py_UCS4,
    2154             :                 PyUnicode_2BYTE_DATA(s),
    2155             :                 PyUnicode_2BYTE_DATA(s) + len,
    2156             :                 result);
    2157             :         }
    2158             :         else {
    2159             :             assert(skind == PyUnicode_1BYTE_KIND);
    2160           0 :             _PyUnicode_CONVERT_BYTES(
    2161             :                 Py_UCS1, Py_UCS4,
    2162             :                 PyUnicode_1BYTE_DATA(s),
    2163             :                 PyUnicode_1BYTE_DATA(s) + len,
    2164             :                 result);
    2165             :         }
    2166           0 :         return result;
    2167             :     default:
    2168           0 :         break;
    2169             :     }
    2170           0 :     PyErr_SetString(PyExc_SystemError, "invalid kind");
    2171           0 :     return NULL;
    2172             : }
    2173             : 
    2174             : static Py_UCS4*
    2175           0 : as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
    2176             :         int copy_null)
    2177             : {
    2178             :     int kind;
    2179             :     void *data;
    2180             :     Py_ssize_t len, targetlen;
    2181           0 :     if (PyUnicode_READY(string) == -1)
    2182           0 :         return NULL;
    2183           0 :     kind = PyUnicode_KIND(string);
    2184           0 :     data = PyUnicode_DATA(string);
    2185           0 :     len = PyUnicode_GET_LENGTH(string);
    2186           0 :     targetlen = len;
    2187           0 :     if (copy_null)
    2188           0 :         targetlen++;
    2189           0 :     if (!target) {
    2190           0 :         if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
    2191           0 :             PyErr_NoMemory();
    2192           0 :             return NULL;
    2193             :         }
    2194           0 :         target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
    2195           0 :         if (!target) {
    2196           0 :             PyErr_NoMemory();
    2197           0 :             return NULL;
    2198             :         }
    2199             :     }
    2200             :     else {
    2201           0 :         if (targetsize < targetlen) {
    2202           0 :             PyErr_Format(PyExc_SystemError,
    2203             :                          "string is longer than the buffer");
    2204           0 :             if (copy_null && 0 < targetsize)
    2205           0 :                 target[0] = 0;
    2206           0 :             return NULL;
    2207             :         }
    2208             :     }
    2209           0 :     if (kind == PyUnicode_1BYTE_KIND) {
    2210           0 :         Py_UCS1 *start = (Py_UCS1 *) data;
    2211           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
    2212             :     }
    2213           0 :     else if (kind == PyUnicode_2BYTE_KIND) {
    2214           0 :         Py_UCS2 *start = (Py_UCS2 *) data;
    2215           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
    2216             :     }
    2217             :     else {
    2218             :         assert(kind == PyUnicode_4BYTE_KIND);
    2219           0 :         Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
    2220             :     }
    2221           0 :     if (copy_null)
    2222           0 :         target[len] = 0;
    2223           0 :     return target;
    2224             : }
    2225             : 
    2226             : Py_UCS4*
    2227           0 : PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
    2228             :                  int copy_null)
    2229             : {
    2230           0 :     if (target == NULL || targetsize < 0) {
    2231           0 :         PyErr_BadInternalCall();
    2232           0 :         return NULL;
    2233             :     }
    2234           0 :     return as_ucs4(string, target, targetsize, copy_null);
    2235             : }
    2236             : 
    2237             : Py_UCS4*
    2238           0 : PyUnicode_AsUCS4Copy(PyObject *string)
    2239             : {
    2240           0 :     return as_ucs4(string, NULL, 0, 1);
    2241             : }
    2242             : 
    2243             : #ifdef HAVE_WCHAR_H
    2244             : 
    2245             : PyObject *
    2246         357 : PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
    2247             : {
    2248         357 :     if (w == NULL) {
    2249           0 :         if (size == 0) {
    2250           0 :             Py_INCREF(unicode_empty);
    2251           0 :             return unicode_empty;
    2252             :         }
    2253           0 :         PyErr_BadInternalCall();
    2254           0 :         return NULL;
    2255             :     }
    2256             : 
    2257         357 :     if (size == -1) {
    2258           5 :         size = wcslen(w);
    2259             :     }
    2260             : 
    2261         357 :     return PyUnicode_FromUnicode(w, size);
    2262             : }
    2263             : 
    2264             : #endif /* HAVE_WCHAR_H */
    2265             : 
    2266             : static void
    2267           0 : makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
    2268             :         int zeropad, int width, int precision, char c)
    2269             : {
    2270           0 :     *fmt++ = '%';
    2271           0 :     if (width) {
    2272           0 :         if (zeropad)
    2273           0 :             *fmt++ = '0';
    2274           0 :         fmt += sprintf(fmt, "%d", width);
    2275             :     }
    2276           0 :     if (precision)
    2277           0 :         fmt += sprintf(fmt, ".%d", precision);
    2278           0 :     if (longflag)
    2279           0 :         *fmt++ = 'l';
    2280           0 :     else if (longlongflag) {
    2281             :         /* longlongflag should only ever be nonzero on machines with
    2282             :            HAVE_LONG_LONG defined */
    2283             : #ifdef HAVE_LONG_LONG
    2284           0 :         char *f = PY_FORMAT_LONG_LONG;
    2285           0 :         while (*f)
    2286           0 :             *fmt++ = *f++;
    2287             : #else
    2288             :         /* we shouldn't ever get here */
    2289             :         assert(0);
    2290             :         *fmt++ = 'l';
    2291             : #endif
    2292             :     }
    2293           0 :     else if (size_tflag) {
    2294           0 :         char *f = PY_FORMAT_SIZE_T;
    2295           0 :         while (*f)
    2296           0 :             *fmt++ = *f++;
    2297             :     }
    2298           0 :     *fmt++ = c;
    2299           0 :     *fmt = '\0';
    2300           0 : }
    2301             : 
    2302             : /* helper for PyUnicode_FromFormatV() */
    2303             : 
    2304             : static const char*
    2305       22878 : parse_format_flags(const char *f,
    2306             :                    int *p_width, int *p_precision,
    2307             :                    int *p_longflag, int *p_longlongflag, int *p_size_tflag)
    2308             : {
    2309             :     int width, precision, longflag, longlongflag, size_tflag;
    2310             : 
    2311             :     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    2312       22878 :     f++;
    2313       22878 :     width = 0;
    2314       45756 :     while (Py_ISDIGIT((unsigned)*f))
    2315           0 :         width = (width*10) + *f++ - '0';
    2316       22878 :     precision = 0;
    2317       22878 :     if (*f == '.') {
    2318        3225 :         f++;
    2319       13497 :         while (Py_ISDIGIT((unsigned)*f))
    2320        7047 :             precision = (precision*10) + *f++ - '0';
    2321        3225 :         if (*f == '%') {
    2322             :             /* "%.3%s" => f points to "3" */
    2323           0 :             f--;
    2324             :         }
    2325             :     }
    2326       22878 :     if (*f == '\0') {
    2327             :         /* bogus format "%.1" => go backward, f points to "1" */
    2328           0 :         f--;
    2329             :     }
    2330       22878 :     if (p_width != NULL)
    2331       15252 :         *p_width = width;
    2332       22878 :     if (p_precision != NULL)
    2333        7626 :         *p_precision = precision;
    2334             : 
    2335             :     /* Handle %ld, %lu, %lld and %llu. */
    2336       22878 :     longflag = 0;
    2337       22878 :     longlongflag = 0;
    2338       22878 :     size_tflag = 0;
    2339             : 
    2340       22878 :     if (*f == 'l') {
    2341           0 :         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
    2342           0 :             longflag = 1;
    2343           0 :             ++f;
    2344             :         }
    2345             : #ifdef HAVE_LONG_LONG
    2346           0 :         else if (f[1] == 'l' &&
    2347           0 :                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
    2348           0 :             longlongflag = 1;
    2349           0 :             f += 2;
    2350             :         }
    2351             : #endif
    2352             :     }
    2353             :     /* handle the size_t flag. */
    2354       22878 :     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
    2355           0 :         size_tflag = 1;
    2356           0 :         ++f;
    2357             :     }
    2358       22878 :     if (p_longflag != NULL)
    2359        7626 :         *p_longflag = longflag;
    2360       22878 :     if (p_longlongflag != NULL)
    2361       15252 :         *p_longlongflag = longlongflag;
    2362       22878 :     if (p_size_tflag != NULL)
    2363        7626 :         *p_size_tflag = size_tflag;
    2364       22878 :     return f;
    2365             : }
    2366             : 
    2367             : /* maximum number of characters required for output of %ld.  21 characters
    2368             :    allows for 64-bit integers (in decimal) and an optional sign. */
    2369             : #define MAX_LONG_CHARS 21
    2370             : /* maximum number of characters required for output of %lld.
    2371             :    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
    2372             :    plus 1 for the sign.  53/22 is an upper bound for log10(256). */
    2373             : #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
    2374             : 
    2375             : PyObject *
    2376        2969 : PyUnicode_FromFormatV(const char *format, va_list vargs)
    2377             : {
    2378             :     va_list count;
    2379        2969 :     Py_ssize_t callcount = 0;
    2380        2969 :     PyObject **callresults = NULL;
    2381        2969 :     PyObject **callresult = NULL;
    2382        2969 :     Py_ssize_t n = 0;
    2383        2969 :     int width = 0;
    2384        2969 :     int precision = 0;
    2385             :     int zeropad;
    2386             :     const char* f;
    2387             :     PyObject *string;
    2388             :     /* used by sprintf */
    2389             :     char fmt[61]; /* should be enough for %0width.precisionlld */
    2390        2969 :     Py_UCS4 maxchar = 127; /* result is ASCII by default */
    2391             :     Py_UCS4 argmaxchar;
    2392        2969 :     Py_ssize_t numbersize = 0;
    2393        2969 :     char *numberresults = NULL;
    2394        2969 :     char *numberresult = NULL;
    2395             :     Py_ssize_t i;
    2396             :     int kind;
    2397             :     void *data;
    2398             : 
    2399        2969 :     Py_VA_COPY(count, vargs);
    2400             :     /* step 1: count the number of %S/%R/%A/%s format specifications
    2401             :      * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
    2402             :      * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
    2403             :      * result in an array)
    2404             :      * also estimate a upper bound for all the number formats in the string,
    2405             :      * numbers will be formatted in step 3 and be kept in a '\0'-separated
    2406             :      * buffer before putting everything together. */
    2407       68677 :     for (f = format; *f; f++) {
    2408       65708 :         if (*f == '%') {
    2409             :             int longlongflag;
    2410             :             /* skip width or width.precision (eg. "1.2" of "%1.2f") */
    2411        7626 :             f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
    2412        7626 :             if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
    2413        1079 :                 ++callcount;
    2414             : 
    2415        6547 :             else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
    2416             : #ifdef HAVE_LONG_LONG
    2417        1890 :                 if (longlongflag) {
    2418           0 :                     if (width < MAX_LONG_LONG_CHARS)
    2419           0 :                         width = MAX_LONG_LONG_CHARS;
    2420             :                 }
    2421             :                 else
    2422             : #endif
    2423             :                     /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
    2424             :                        including sign.  Decimal takes the most space.  This
    2425             :                        isn't enough for octal.  If a width is specified we
    2426             :                        need more (which we allocate later). */
    2427        1890 :                     if (width < MAX_LONG_CHARS)
    2428        1890 :                         width = MAX_LONG_CHARS;
    2429             : 
    2430             :                 /* account for the size + '\0' to separate numbers
    2431             :                    inside of the numberresults buffer */
    2432        1890 :                 numbersize += (width + 1);
    2433             :             }
    2434             :         }
    2435       58082 :         else if ((unsigned char)*f > 127) {
    2436           0 :             PyErr_Format(PyExc_ValueError,
    2437             :                 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
    2438             :                 "string, got a non-ASCII byte: 0x%02x",
    2439           0 :                 (unsigned char)*f);
    2440           0 :             return NULL;
    2441             :         }
    2442             :     }
    2443             :     /* step 2: allocate memory for the results of
    2444             :      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    2445        2969 :     if (callcount) {
    2446        1079 :         callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
    2447        1079 :         if (!callresults) {
    2448           0 :             PyErr_NoMemory();
    2449           0 :             return NULL;
    2450             :         }
    2451        1079 :         callresult = callresults;
    2452             :     }
    2453             :     /* step 2.5: allocate memory for the results of formating numbers */
    2454        2969 :     if (numbersize) {
    2455        1890 :         numberresults = PyObject_Malloc(numbersize);
    2456        1890 :         if (!numberresults) {
    2457           0 :             PyErr_NoMemory();
    2458           0 :             goto fail;
    2459             :         }
    2460        1890 :         numberresult = numberresults;
    2461             :     }
    2462             : 
    2463             :     /* step 3: format numbers and figure out how large a buffer we need */
    2464       68677 :     for (f = format; *f; f++) {
    2465       65708 :         if (*f == '%') {
    2466             :             const char* p;
    2467             :             int longflag;
    2468             :             int longlongflag;
    2469             :             int size_tflag;
    2470             :             int numprinted;
    2471             : 
    2472        7626 :             p = f;
    2473        7626 :             zeropad = (f[1] == '0');
    2474        7626 :             f = parse_format_flags(f, &width, &precision,
    2475             :                                    &longflag, &longlongflag, &size_tflag);
    2476        7626 :             switch (*f) {
    2477             :             case 'c':
    2478             :             {
    2479           0 :                 Py_UCS4 ordinal = va_arg(count, int);
    2480           0 :                 maxchar = MAX_MAXCHAR(maxchar, ordinal);
    2481           0 :                 n++;
    2482           0 :                 break;
    2483             :             }
    2484             :             case '%':
    2485           0 :                 n++;
    2486           0 :                 break;
    2487             :             case 'i':
    2488             :             case 'd':
    2489           0 :                 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
    2490           0 :                         width, precision, *f);
    2491           0 :                 if (longflag)
    2492           0 :                     numprinted = sprintf(numberresult, fmt,
    2493             :                                          va_arg(count, long));
    2494             : #ifdef HAVE_LONG_LONG
    2495           0 :                 else if (longlongflag)
    2496           0 :                     numprinted = sprintf(numberresult, fmt,
    2497             :                                          va_arg(count, PY_LONG_LONG));
    2498             : #endif
    2499           0 :                 else if (size_tflag)
    2500           0 :                     numprinted = sprintf(numberresult, fmt,
    2501             :                                          va_arg(count, Py_ssize_t));
    2502             :                 else
    2503           0 :                     numprinted = sprintf(numberresult, fmt,
    2504             :                                          va_arg(count, int));
    2505           0 :                 n += numprinted;
    2506             :                 /* advance by +1 to skip over the '\0' */
    2507           0 :                 numberresult += (numprinted + 1);
    2508             :                 assert(*(numberresult - 1) == '\0');
    2509             :                 assert(*(numberresult - 2) != '\0');
    2510             :                 assert(numprinted >= 0);
    2511             :                 assert(numberresult <= numberresults + numbersize);
    2512           0 :                 break;
    2513             :             case 'u':
    2514           0 :                 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
    2515             :                         width, precision, 'u');
    2516           0 :                 if (longflag)
    2517           0 :                     numprinted = sprintf(numberresult, fmt,
    2518             :                                          va_arg(count, unsigned long));
    2519             : #ifdef HAVE_LONG_LONG
    2520           0 :                 else if (longlongflag)
    2521           0 :                     numprinted = sprintf(numberresult, fmt,
    2522             :                                          va_arg(count, unsigned PY_LONG_LONG));
    2523             : #endif
    2524           0 :                 else if (size_tflag)
    2525           0 :                     numprinted = sprintf(numberresult, fmt,
    2526             :                                          va_arg(count, size_t));
    2527             :                 else
    2528           0 :                     numprinted = sprintf(numberresult, fmt,
    2529             :                                          va_arg(count, unsigned int));
    2530           0 :                 n += numprinted;
    2531           0 :                 numberresult += (numprinted + 1);
    2532             :                 assert(*(numberresult - 1) == '\0');
    2533             :                 assert(*(numberresult - 2) != '\0');
    2534             :                 assert(numprinted >= 0);
    2535             :                 assert(numberresult <= numberresults + numbersize);
    2536           0 :                 break;
    2537             :             case 'x':
    2538           0 :                 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
    2539           0 :                 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
    2540           0 :                 n += numprinted;
    2541           0 :                 numberresult += (numprinted + 1);
    2542             :                 assert(*(numberresult - 1) == '\0');
    2543             :                 assert(*(numberresult - 2) != '\0');
    2544             :                 assert(numprinted >= 0);
    2545             :                 assert(numberresult <= numberresults + numbersize);
    2546           0 :                 break;
    2547             :             case 'p':
    2548        1890 :                 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
    2549             :                 /* %p is ill-defined:  ensure leading 0x. */
    2550        1890 :                 if (numberresult[1] == 'X')
    2551           0 :                     numberresult[1] = 'x';
    2552        1890 :                 else if (numberresult[1] != 'x') {
    2553           0 :                     memmove(numberresult + 2, numberresult,
    2554           0 :                             strlen(numberresult) + 1);
    2555           0 :                     numberresult[0] = '0';
    2556           0 :                     numberresult[1] = 'x';
    2557           0 :                     numprinted += 2;
    2558             :                 }
    2559        1890 :                 n += numprinted;
    2560        1890 :                 numberresult += (numprinted + 1);
    2561             :                 assert(*(numberresult - 1) == '\0');
    2562             :                 assert(*(numberresult - 2) != '\0');
    2563             :                 assert(numprinted >= 0);
    2564             :                 assert(numberresult <= numberresults + numbersize);
    2565        1890 :                 break;
    2566             :             case 's':
    2567             :             {
    2568             :                 /* UTF-8 */
    2569        1075 :                 const char *s = va_arg(count, const char*);
    2570        1075 :                 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
    2571        1075 :                 if (!str)
    2572             :                     goto fail;
    2573             :                 /* since PyUnicode_DecodeUTF8 returns already flexible
    2574             :                    unicode objects, there is no need to call ready on them */
    2575        1075 :                 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
    2576        1075 :                 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2577        1075 :                 n += PyUnicode_GET_LENGTH(str);
    2578             :                 /* Remember the str and switch to the next slot */
    2579        1075 :                 *callresult++ = str;
    2580        1075 :                 break;
    2581             :             }
    2582             :             case 'U':
    2583             :             {
    2584        4657 :                 PyObject *obj = va_arg(count, PyObject *);
    2585             :                 assert(obj && _PyUnicode_CHECK(obj));
    2586        4657 :                 if (PyUnicode_READY(obj) == -1)
    2587             :                     goto fail;
    2588        4657 :                 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
    2589        4657 :                 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2590        4657 :                 n += PyUnicode_GET_LENGTH(obj);
    2591        4657 :                 break;
    2592             :             }
    2593             :             case 'V':
    2594             :             {
    2595           0 :                 PyObject *obj = va_arg(count, PyObject *);
    2596           0 :                 const char *str = va_arg(count, const char *);
    2597             :                 PyObject *str_obj;
    2598             :                 assert(obj || str);
    2599             :                 assert(!obj || _PyUnicode_CHECK(obj));
    2600           0 :                 if (obj) {
    2601           0 :                     if (PyUnicode_READY(obj) == -1)
    2602             :                         goto fail;
    2603           0 :                     argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
    2604           0 :                     maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2605           0 :                     n += PyUnicode_GET_LENGTH(obj);
    2606           0 :                     *callresult++ = NULL;
    2607             :                 }
    2608             :                 else {
    2609           0 :                     str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
    2610           0 :                     if (!str_obj)
    2611             :                         goto fail;
    2612           0 :                     if (PyUnicode_READY(str_obj) == -1) {
    2613           0 :                         Py_DECREF(str_obj);
    2614             :                         goto fail;
    2615             :                     }
    2616           0 :                     argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
    2617           0 :                     maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2618           0 :                     n += PyUnicode_GET_LENGTH(str_obj);
    2619           0 :                     *callresult++ = str_obj;
    2620             :                 }
    2621           0 :                 break;
    2622             :             }
    2623             :             case 'S':
    2624             :             {
    2625           0 :                 PyObject *obj = va_arg(count, PyObject *);
    2626             :                 PyObject *str;
    2627             :                 assert(obj);
    2628           0 :                 str = PyObject_Str(obj);
    2629           0 :                 if (!str)
    2630             :                     goto fail;
    2631           0 :                 if (PyUnicode_READY(str) == -1) {
    2632           0 :                     Py_DECREF(str);
    2633             :                     goto fail;
    2634             :                 }
    2635           0 :                 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
    2636           0 :                 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2637           0 :                 n += PyUnicode_GET_LENGTH(str);
    2638             :                 /* Remember the str and switch to the next slot */
    2639           0 :                 *callresult++ = str;
    2640           0 :                 break;
    2641             :             }
    2642             :             case 'R':
    2643             :             {
    2644           4 :                 PyObject *obj = va_arg(count, PyObject *);
    2645             :                 PyObject *repr;
    2646             :                 assert(obj);
    2647           4 :                 repr = PyObject_Repr(obj);
    2648           4 :                 if (!repr)
    2649             :                     goto fail;
    2650           4 :                 if (PyUnicode_READY(repr) == -1) {
    2651           0 :                     Py_DECREF(repr);
    2652             :                     goto fail;
    2653             :                 }
    2654           4 :                 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
    2655           4 :                 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2656           4 :                 n += PyUnicode_GET_LENGTH(repr);
    2657             :                 /* Remember the repr and switch to the next slot */
    2658           4 :                 *callresult++ = repr;
    2659           4 :                 break;
    2660             :             }
    2661             :             case 'A':
    2662             :             {
    2663           0 :                 PyObject *obj = va_arg(count, PyObject *);
    2664             :                 PyObject *ascii;
    2665             :                 assert(obj);
    2666           0 :                 ascii = PyObject_ASCII(obj);
    2667           0 :                 if (!ascii)
    2668             :                     goto fail;
    2669           0 :                 if (PyUnicode_READY(ascii) == -1) {
    2670           0 :                     Py_DECREF(ascii);
    2671             :                     goto fail;
    2672             :                 }
    2673           0 :                 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
    2674           0 :                 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
    2675           0 :                 n += PyUnicode_GET_LENGTH(ascii);
    2676             :                 /* Remember the repr and switch to the next slot */
    2677           0 :                 *callresult++ = ascii;
    2678           0 :                 break;
    2679             :             }
    2680             :             default:
    2681             :                 /* if we stumble upon an unknown
    2682             :                    formatting code, copy the rest of
    2683             :                    the format string to the output
    2684             :                    string. (we cannot just skip the
    2685             :                    code, since there's no way to know
    2686             :                    what's in the argument list) */
    2687           0 :                 n += strlen(p);
    2688             :                 goto expand;
    2689             :             }
    2690             :         } else
    2691       58082 :             n++;
    2692             :     }
    2693             :   expand:
    2694             :     /* step 4: fill the buffer */
    2695             :     /* Since we've analyzed how much space we need,
    2696             :        we don't have to resize the string.
    2697             :        There can be no errors beyond this point. */
    2698        2969 :     string = PyUnicode_New(n, maxchar);
    2699        2969 :     if (!string)
    2700           0 :         goto fail;
    2701        2969 :     kind = PyUnicode_KIND(string);
    2702        2969 :     data = PyUnicode_DATA(string);
    2703        2969 :     callresult = callresults;
    2704        2969 :     numberresult = numberresults;
    2705             : 
    2706       68677 :     for (i = 0, f = format; *f; f++) {
    2707       65708 :         if (*f == '%') {
    2708             :             const char* p;
    2709             : 
    2710        7626 :             p = f;
    2711        7626 :             f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
    2712             :             /* checking for == because the last argument could be a empty
    2713             :                string, which causes i to point to end, the assert at the end of
    2714             :                the loop */
    2715             :             assert(i <= PyUnicode_GET_LENGTH(string));
    2716             : 
    2717        7626 :             switch (*f) {
    2718             :             case 'c':
    2719             :             {
    2720           0 :                 const int ordinal = va_arg(vargs, int);
    2721           0 :                 PyUnicode_WRITE(kind, data, i++, ordinal);
    2722           0 :                 break;
    2723             :             }
    2724             :             case 'i':
    2725             :             case 'd':
    2726             :             case 'u':
    2727             :             case 'x':
    2728             :             case 'p':
    2729             :             {
    2730             :                 Py_ssize_t len;
    2731             :                 /* unused, since we already have the result */
    2732        1890 :                 if (*f == 'p')
    2733        1890 :                     (void) va_arg(vargs, void *);
    2734             :                 else
    2735           0 :                     (void) va_arg(vargs, int);
    2736             :                 /* extract the result from numberresults and append. */
    2737        1890 :                 len = strlen(numberresult);
    2738        1890 :                 unicode_write_cstr(string, i, numberresult, len);
    2739             :                 /* skip over the separating '\0' */
    2740        1890 :                 i += len;
    2741        1890 :                 numberresult += len;
    2742             :                 assert(*numberresult == '\0');
    2743        1890 :                 numberresult++;
    2744             :                 assert(numberresult <= numberresults + numbersize);
    2745        1890 :                 break;
    2746             :             }
    2747             :             case 's':
    2748             :             {
    2749             :                 /* unused, since we already have the result */
    2750             :                 Py_ssize_t size;
    2751        1075 :                 (void) va_arg(vargs, char *);
    2752        1075 :                 size = PyUnicode_GET_LENGTH(*callresult);
    2753             :                 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
    2754        1075 :                 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
    2755        1075 :                 i += size;
    2756             :                 /* We're done with the unicode()/repr() => forget it */
    2757        1075 :                 Py_DECREF(*callresult);
    2758             :                 /* switch to next unicode()/repr() result */
    2759        1075 :                 ++callresult;
    2760        1075 :                 break;
    2761             :             }
    2762             :             case 'U':
    2763             :             {
    2764        4657 :                 PyObject *obj = va_arg(vargs, PyObject *);
    2765             :                 Py_ssize_t size;
    2766             :                 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
    2767        4657 :                 size = PyUnicode_GET_LENGTH(obj);
    2768        4657 :                 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
    2769        4657 :                 i += size;
    2770        4657 :                 break;
    2771             :             }
    2772             :             case 'V':
    2773             :             {
    2774             :                 Py_ssize_t size;
    2775           0 :                 PyObject *obj = va_arg(vargs, PyObject *);
    2776           0 :                 va_arg(vargs, const char *);
    2777           0 :                 if (obj) {
    2778           0 :                     size = PyUnicode_GET_LENGTH(obj);
    2779             :                     assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
    2780           0 :                     _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
    2781           0 :                     i += size;
    2782             :                 } else {
    2783           0 :                     size = PyUnicode_GET_LENGTH(*callresult);
    2784             :                     assert(PyUnicode_KIND(*callresult) <=
    2785             :                            PyUnicode_KIND(string));
    2786           0 :                     _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
    2787           0 :                     i += size;
    2788           0 :                     Py_DECREF(*callresult);
    2789             :                 }
    2790           0 :                 ++callresult;
    2791           0 :                 break;
    2792             :             }
    2793             :             case 'S':
    2794             :             case 'R':
    2795             :             case 'A':
    2796             :             {
    2797           4 :                 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
    2798             :                 /* unused, since we already have the result */
    2799           4 :                 (void) va_arg(vargs, PyObject *);
    2800             :                 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
    2801           4 :                 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0,  size);
    2802           4 :                 i += size;
    2803             :                 /* We're done with the unicode()/repr() => forget it */
    2804           4 :                 Py_DECREF(*callresult);
    2805             :                 /* switch to next unicode()/repr() result */
    2806           4 :                 ++callresult;
    2807           4 :                 break;
    2808             :             }
    2809             :             case '%':
    2810           0 :                 PyUnicode_WRITE(kind, data, i++, '%');
    2811           0 :                 break;
    2812             :             default:
    2813             :             {
    2814           0 :                 Py_ssize_t len = strlen(p);
    2815           0 :                 unicode_write_cstr(string, i, p, len);
    2816           0 :                 i += len;
    2817             :                 assert(i == PyUnicode_GET_LENGTH(string));
    2818           0 :                 goto end;
    2819             :             }
    2820             :             }
    2821             :         }
    2822             :         else {
    2823             :             assert(i < PyUnicode_GET_LENGTH(string));
    2824       58082 :             PyUnicode_WRITE(kind, data, i++, *f);
    2825             :         }
    2826             :     }
    2827             :     assert(i == PyUnicode_GET_LENGTH(string));
    2828             : 
    2829             :   end:
    2830        2969 :     if (callresults)
    2831        1079 :         PyObject_Free(callresults);
    2832        2969 :     if (numberresults)
    2833        1890 :         PyObject_Free(numberresults);
    2834        2969 :     return unicode_result(string);
    2835             :   fail:
    2836           0 :     if (callresults) {
    2837           0 :         PyObject **callresult2 = callresults;
    2838           0 :         while (callresult2 < callresult) {
    2839           0 :             Py_XDECREF(*callresult2);
    2840           0 :             ++callresult2;
    2841             :         }
    2842           0 :         PyObject_Free(callresults);
    2843             :     }
    2844           0 :     if (numberresults)
    2845           0 :         PyObject_Free(numberresults);
    2846           0 :     return NULL;
    2847             : }
    2848             : 
    2849             : PyObject *
    2850        1893 : PyUnicode_FromFormat(const char *format, ...)
    2851             : {
    2852             :     PyObject* ret;
    2853             :     va_list vargs;
    2854             : 
    2855             : #ifdef HAVE_STDARG_PROTOTYPES
    2856        1893 :     va_start(vargs, format);
    2857             : #else
    2858             :     va_start(vargs);
    2859             : #endif
    2860        1893 :     ret = PyUnicode_FromFormatV(format, vargs);
    2861        1893 :     va_end(vargs);
    2862        1893 :     return ret;
    2863             : }
    2864             : 
    2865             : #ifdef HAVE_WCHAR_H
    2866             : 
    2867             : /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
    2868             :    convert a Unicode object to a wide character string.
    2869             : 
    2870             :    - If w is NULL: return the number of wide characters (including the null
    2871             :      character) required to convert the unicode object. Ignore size argument.
    2872             : 
    2873             :    - Otherwise: return the number of wide characters (excluding the null
    2874             :      character) written into w. Write at most size wide characters (including
    2875             :      the null character). */
    2876             : static Py_ssize_t
    2877          52 : unicode_aswidechar(PyObject *unicode,
    2878             :                    wchar_t *w,
    2879             :                    Py_ssize_t size)
    2880             : {
    2881             :     Py_ssize_t res;
    2882             :     const wchar_t *wstr;
    2883             : 
    2884          52 :     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
    2885          52 :     if (wstr == NULL)
    2886           0 :         return -1;
    2887             : 
    2888          52 :     if (w != NULL) {
    2889          26 :         if (size > res)
    2890          26 :             size = res + 1;
    2891             :         else
    2892           0 :             res = size;
    2893          26 :         Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
    2894          26 :         return res;
    2895             :     }
    2896             :     else
    2897          26 :         return res + 1;
    2898             : }
    2899             : 
    2900             : Py_ssize_t
    2901           0 : PyUnicode_AsWideChar(PyObject *unicode,
    2902             :                      wchar_t *w,
    2903             :                      Py_ssize_t size)
    2904             : {
    2905           0 :     if (unicode == NULL) {
    2906           0 :         PyErr_BadInternalCall();
    2907           0 :         return -1;
    2908             :     }
    2909           0 :     return unicode_aswidechar(unicode, w, size);
    2910             : }
    2911             : 
    2912             : wchar_t*
    2913          26 : PyUnicode_AsWideCharString(PyObject *unicode,
    2914             :                            Py_ssize_t *size)
    2915             : {
    2916             :     wchar_t* buffer;
    2917             :     Py_ssize_t buflen;
    2918             : 
    2919          26 :     if (unicode == NULL) {
    2920           0 :         PyErr_BadInternalCall();
    2921           0 :         return NULL;
    2922             :     }
    2923             : 
    2924          26 :     buflen = unicode_aswidechar(unicode, NULL, 0);
    2925          26 :     if (buflen == -1)
    2926           0 :         return NULL;
    2927          26 :     if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
    2928           0 :         PyErr_NoMemory();
    2929           0 :         return NULL;
    2930             :     }
    2931             : 
    2932          26 :     buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
    2933          26 :     if (buffer == NULL) {
    2934           0 :         PyErr_NoMemory();
    2935           0 :         return NULL;
    2936             :     }
    2937          26 :     buflen = unicode_aswidechar(unicode, buffer, buflen);
    2938          26 :     if (buflen == -1) {
    2939           0 :         PyMem_FREE(buffer);
    2940           0 :         return NULL;
    2941             :     }
    2942          26 :     if (size != NULL)
    2943          26 :         *size = buflen;
    2944          26 :     return buffer;
    2945             : }
    2946             : 
    2947             : #endif /* HAVE_WCHAR_H */
    2948             : 
    2949             : PyObject *
    2950         387 : PyUnicode_FromOrdinal(int ordinal)
    2951             : {
    2952             :     PyObject *v;
    2953         387 :     if (ordinal < 0 || ordinal > MAX_UNICODE) {
    2954           0 :         PyErr_SetString(PyExc_ValueError,
    2955             :                         "chr() arg not in range(0x110000)");
    2956           0 :         return NULL;
    2957             :     }
    2958             : 
    2959         387 :     if (ordinal < 256)
    2960         387 :         return get_latin1_char(ordinal);
    2961             : 
    2962           0 :     v = PyUnicode_New(1, ordinal);
    2963           0 :     if (v == NULL)
    2964           0 :         return NULL;
    2965           0 :     PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
    2966             :     assert(_PyUnicode_CheckConsistency(v, 1));
    2967           0 :     return v;
    2968             : }
    2969             : 
    2970             : PyObject *
    2971        7562 : PyUnicode_FromObject(register PyObject *obj)
    2972             : {
    2973             :     /* XXX Perhaps we should make this API an alias of
    2974             :        PyObject_Str() instead ?! */
    2975        7562 :     if (PyUnicode_CheckExact(obj)) {
    2976        7560 :         if (PyUnicode_READY(obj) == -1)
    2977           0 :             return NULL;
    2978        7560 :         Py_INCREF(obj);
    2979        7560 :         return obj;
    2980             :     }
    2981           2 :     if (PyUnicode_Check(obj)) {
    2982             :         /* For a Unicode subtype that's not a Unicode object,
    2983             :            return a true Unicode object with the same data. */
    2984           0 :         return _PyUnicode_Copy(obj);
    2985             :     }
    2986           2 :     PyErr_Format(PyExc_TypeError,
    2987             :                  "Can't convert '%.100s' object to str implicitly",
    2988           2 :                  Py_TYPE(obj)->tp_name);
    2989           2 :     return NULL;
    2990             : }
    2991             : 
    2992             : PyObject *
    2993           2 : PyUnicode_FromEncodedObject(register PyObject *obj,
    2994             :                             const char *encoding,
    2995             :                             const char *errors)
    2996             : {
    2997             :     Py_buffer buffer;
    2998             :     PyObject *v;
    2999             : 
    3000           2 :     if (obj == NULL) {
    3001           0 :         PyErr_BadInternalCall();
    3002           0 :         return NULL;
    3003             :     }
    3004             : 
    3005             :     /* Decoding bytes objects is the most common case and should be fast */
    3006           2 :     if (PyBytes_Check(obj)) {
    3007           2 :         if (PyBytes_GET_SIZE(obj) == 0) {
    3008           0 :             Py_INCREF(unicode_empty);
    3009           0 :             v = unicode_empty;
    3010             :         }
    3011             :         else {
    3012           4 :             v = PyUnicode_Decode(
    3013           2 :                     PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
    3014             :                     encoding, errors);
    3015             :         }
    3016           2 :         return v;
    3017             :     }
    3018             : 
    3019           0 :     if (PyUnicode_Check(obj)) {
    3020           0 :         PyErr_SetString(PyExc_TypeError,
    3021             :                         "decoding str is not supported");
    3022           0 :         return NULL;
    3023             :     }
    3024             : 
    3025             :     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    3026           0 :     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
    3027           0 :         PyErr_Format(PyExc_TypeError,
    3028             :                      "coercing to str: need bytes, bytearray "
    3029             :                      "or buffer-like object, %.80s found",
    3030           0 :                      Py_TYPE(obj)->tp_name);
    3031           0 :         return NULL;
    3032             :     }
    3033             : 
    3034           0 :     if (buffer.len == 0) {
    3035           0 :         Py_INCREF(unicode_empty);
    3036           0 :         v = unicode_empty;
    3037             :     }
    3038             :     else
    3039           0 :         v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
    3040             : 
    3041           0 :     PyBuffer_Release(&buffer);
    3042           0 :     return v;
    3043             : }
    3044             : 
    /* Write a normalized copy of `encoding` into `lower`: characters for which
       Py_ISUPPER() is true are passed through Py_TOLOWER() and '_' is replaced
       with '-', in order to catch e.g. UTF_8.  A NULL `encoding` produces
       "utf-8".

       `lower` must point to at least `lower_len` writable bytes.

       Return 1 on success.  Return 0 when the result, including its
       terminating NUL, does not fit in `lower_len` bytes; in that case `lower`
       may have been partially written and is not NUL-terminated. */
    static int
    normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
    {
        static const char default_encoding[] = "utf-8";
        const char *e;
        char *l;
        char *l_end;

        /* A zero-sized buffer cannot even hold the terminator.  Bail out before
           computing &lower[lower_len - 1], whose index would wrap around. */
        if (lower_len == 0)
            return 0;

        if (encoding == NULL) {
            /* sizeof counts the terminating NUL, so this is the exact space
               the copy below needs. */
            if (lower_len < sizeof(default_encoding))
                return 0;
            strcpy(lower, default_encoding);
            return 1;
        }

        e = encoding;
        l = lower;
        l_end = &lower[lower_len - 1];   /* last slot, reserved for the NUL */
        while (*e) {
            if (l == l_end)
                return 0;
            if (Py_ISUPPER(*e)) {
                *l++ = Py_TOLOWER(*e++);
            }
            else if (*e == '_') {
                *l++ = '-';
                e++;
            }
            else {
                *l++ = *e++;
            }
        }
        *l = '\0';
        return 1;
    }
    3081             : 
    3082             : PyObject *
    3083         634 : PyUnicode_Decode(const char *s,
    3084             :                  Py_ssize_t size,
    3085             :                  const char *encoding,
    3086             :                  const char *errors)
    3087             : {
    3088         634 :     PyObject *buffer = NULL, *unicode;
    3089             :     Py_buffer info;
    3090             :     char lower[11];  /* Enough for any encoding shortcut */
    3091             : 
    3092             :     /* Shortcuts for common default encodings */
    3093         634 :     if (normalize_encoding(encoding, lower, sizeof(lower))) {
    3094         634 :         if ((strcmp(lower, "utf-8") == 0) ||
    3095           0 :             (strcmp(lower, "utf8") == 0))
    3096         634 :             return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    3097           0 :         else if ((strcmp(lower, "latin-1") == 0) ||
    3098           0 :                  (strcmp(lower, "latin1") == 0) ||
    3099           0 :                  (strcmp(lower, "iso-8859-1") == 0))
    3100           0 :             return PyUnicode_DecodeLatin1(s, size, errors);
    3101             : #ifdef HAVE_MBCS
    3102             :         else if (strcmp(lower, "mbcs") == 0)
    3103             :             return PyUnicode_DecodeMBCS(s, size, errors);
    3104             : #endif
    3105           0 :         else if (strcmp(lower, "ascii") == 0)
    3106           0 :             return PyUnicode_DecodeASCII(s, size, errors);
    3107           0 :         else if (strcmp(lower, "utf-16") == 0)
    3108           0 :             return PyUnicode_DecodeUTF16(s, size, errors, 0);
    3109           0 :         else if (strcmp(lower, "utf-32") == 0)
    3110           0 :             return PyUnicode_DecodeUTF32(s, size, errors, 0);
    3111             :     }
    3112             : 
    3113             :     /* Decode via the codec registry */
    3114           0 :     buffer = NULL;
    3115           0 :     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
    3116           0 :         goto onError;
    3117           0 :     buffer = PyMemoryView_FromBuffer(&info);
    3118           0 :     if (buffer == NULL)
    3119           0 :         goto onError;
    3120           0 :     unicode = PyCodec_Decode(buffer, encoding, errors);
    3121           0 :     if (unicode == NULL)
    3122           0 :         goto onError;
    3123           0 :     if (!PyUnicode_Check(unicode)) {
    3124           0 :         PyErr_Format(PyExc_TypeError,
    3125             :                      "decoder did not return a str object (type=%.400s)",
    3126           0 :                      Py_TYPE(unicode)->tp_name);
    3127           0 :         Py_DECREF(unicode);
    3128           0 :         goto onError;
    3129             :     }
    3130           0 :     Py_DECREF(buffer);
    3131           0 :     return unicode_result(unicode);
    3132             : 
    3133             :   onError:
    3134           0 :     Py_XDECREF(buffer);
    3135           0 :     return NULL;
    3136             : }
    3137             : 
    3138             : PyObject *
    3139           0 : PyUnicode_AsDecodedObject(PyObject *unicode,
    3140             :                           const char *encoding,
    3141             :                           const char *errors)
    3142             : {
    3143             :     PyObject *v;
    3144             : 
    3145           0 :     if (!PyUnicode_Check(unicode)) {
    3146           0 :         PyErr_BadArgument();
    3147           0 :         goto onError;
    3148             :     }
    3149             : 
    3150           0 :     if (encoding == NULL)
    3151           0 :         encoding = PyUnicode_GetDefaultEncoding();
    3152             : 
    3153             :     /* Decode via the codec registry */
    3154           0 :     v = PyCodec_Decode(unicode, encoding, errors);
    3155           0 :     if (v == NULL)
    3156           0 :         goto onError;
    3157           0 :     return unicode_result(v);
    3158             : 
    3159             :   onError:
    3160           0 :     return NULL;
    3161             : }
    3162             : 
    3163             : PyObject *
    3164           0 : PyUnicode_AsDecodedUnicode(PyObject *unicode,
    3165             :                            const char *encoding,
    3166             :                            const char *errors)
    3167             : {
    3168             :     PyObject *v;
    3169             : 
    3170           0 :     if (!PyUnicode_Check(unicode)) {
    3171           0 :         PyErr_BadArgument();
    3172           0 :         goto onError;
    3173             :     }
    3174             : 
    3175           0 :     if (encoding == NULL)
    3176           0 :         encoding = PyUnicode_GetDefaultEncoding();
    3177             : 
    3178             :     /* Decode via the codec registry */
    3179           0 :     v = PyCodec_Decode(unicode, encoding, errors);
    3180           0 :     if (v == NULL)
    3181           0 :         goto onError;
    3182           0 :     if (!PyUnicode_Check(v)) {
    3183           0 :         PyErr_Format(PyExc_TypeError,
    3184             :                      "decoder did not return a str object (type=%.400s)",
    3185           0 :                      Py_TYPE(v)->tp_name);
    3186           0 :         Py_DECREF(v);
    3187           0 :         goto onError;
    3188             :     }
    3189           0 :     return unicode_result(v);
    3190             : 
    3191             :   onError:
    3192           0 :     return NULL;
    3193             : }
    3194             : 
    3195             : PyObject *
    3196           0 : PyUnicode_Encode(const Py_UNICODE *s,
    3197             :                  Py_ssize_t size,
    3198             :                  const char *encoding,
    3199             :                  const char *errors)
    3200             : {
    3201             :     PyObject *v, *unicode;
    3202             : 
    3203           0 :     unicode = PyUnicode_FromUnicode(s, size);
    3204           0 :     if (unicode == NULL)
    3205           0 :         return NULL;
    3206           0 :     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    3207           0 :     Py_DECREF(unicode);
    3208           0 :     return v;
    3209             : }
    3210             : 
    3211             : PyObject *
    3212           0 : PyUnicode_AsEncodedObject(PyObject *unicode,
    3213             :                           const char *encoding,
    3214             :                           const char *errors)
    3215             : {
    3216             :     PyObject *v;
    3217             : 
    3218           0 :     if (!PyUnicode_Check(unicode)) {
    3219           0 :         PyErr_BadArgument();
    3220           0 :         goto onError;
    3221             :     }
    3222             : 
    3223           0 :     if (encoding == NULL)
    3224           0 :         encoding = PyUnicode_GetDefaultEncoding();
    3225             : 
    3226             :     /* Encode via the codec registry */
    3227           0 :     v = PyCodec_Encode(unicode, encoding, errors);
    3228           0 :     if (v == NULL)
    3229           0 :         goto onError;
    3230           0 :     return v;
    3231             : 
    3232             :   onError:
    3233           0 :     return NULL;
    3234             : }
    3235             : 
    /* Locate the first wide character of `wstr` that wcstombs() refuses to
       convert, by converting the string one character (or, with a 2-byte
       wchar_t, one surrogate pair) at a time.

       Returns the index of the offending character, or 0 when none is found;
       the two cases are indistinguishable for index 0. */
    static size_t
    wcstombs_errorpos(const wchar_t *wstr)
    {
    #if SIZEOF_WCHAR_T == 2
        wchar_t chunk[3];           /* surrogate pair + terminator */
    #else
        wchar_t chunk[2];           /* single character + terminator */
    #endif
        char scratch[MB_LEN_MAX];
        const wchar_t *cursor = wstr;

        /* The last slot always holds the terminator. */
        chunk[sizeof(chunk) / sizeof(chunk[0]) - 1] = 0;

        while (*cursor != L'\0')
        {
            const wchar_t *current = cursor;
    #if SIZEOF_WCHAR_T == 2
            if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
                && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
            {
                chunk[0] = cursor[0];
                chunk[1] = cursor[1];
                cursor += 2;
            }
            else {
                chunk[0] = *cursor;
                chunk[1] = 0;
                cursor++;
            }
    #else
            chunk[0] = *cursor++;
    #endif
            if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
                return current - wstr;
        }

        /* failed to find the unencodable character */
        return 0;
    }
    3282             : 
    3283             : static int
    3284         368 : locale_error_handler(const char *errors, int *surrogateescape)
    3285             : {
    3286         368 :     if (errors == NULL) {
    3287           4 :         *surrogateescape = 0;
    3288           4 :         return 0;
    3289             :     }
    3290             : 
    3291         364 :     if (strcmp(errors, "strict") == 0) {
    3292           0 :         *surrogateescape = 0;
    3293           0 :         return 0;
    3294             :     }
    3295         364 :     if (strcmp(errors, "surrogateescape") == 0) {
    3296         364 :         *surrogateescape = 1;
    3297         364 :         return 0;
    3298             :     }
    3299           0 :     PyErr_Format(PyExc_ValueError,
    3300             :                  "only 'strict' and 'surrogateescape' error handlers "
    3301             :                  "are supported, not '%s'",
    3302             :                  errors);
    3303           0 :     return -1;
    3304             : }
    3305             : 
    3306             : PyObject *
    3307          26 : PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
    3308             : {
    3309             :     Py_ssize_t wlen, wlen2;
    3310             :     wchar_t *wstr;
    3311          26 :     PyObject *bytes = NULL;
    3312             :     char *errmsg;
    3313             :     PyObject *reason;
    3314             :     PyObject *exc;
    3315             :     size_t error_pos;
    3316             :     int surrogateescape;
    3317             : 
    3318          26 :     if (locale_error_handler(errors, &surrogateescape) < 0)
    3319           0 :         return NULL;
    3320             : 
    3321          26 :     wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    3322          26 :     if (wstr == NULL)
    3323           0 :         return NULL;
    3324             : 
    3325          26 :     wlen2 = wcslen(wstr);
    3326          26 :     if (wlen2 != wlen) {
    3327           0 :         PyMem_Free(wstr);
    3328           0 :         PyErr_SetString(PyExc_TypeError, "embedded null character");
    3329           0 :         return NULL;
    3330             :     }
    3331             : 
    3332          26 :     if (surrogateescape) {
    3333             :         /* locale encoding with surrogateescape */
    3334             :         char *str;
    3335             : 
    3336          26 :         str = _Py_wchar2char(wstr, &error_pos);
    3337          26 :         if (str == NULL) {
    3338           0 :             if (error_pos == (size_t)-1) {
    3339           0 :                 PyErr_NoMemory();
    3340           0 :                 PyMem_Free(wstr);
    3341           0 :                 return NULL;
    3342             :             }
    3343             :             else {
    3344           0 :                 goto encode_error;
    3345             :             }
    3346             :         }
    3347          26 :         PyMem_Free(wstr);
    3348             : 
    3349          26 :         bytes = PyBytes_FromString(str);
    3350          26 :         PyMem_Free(str);
    3351             :     }
    3352             :     else {
    3353             :         size_t len, len2;
    3354             : 
    3355           0 :         len = wcstombs(NULL, wstr, 0);
    3356           0 :         if (len == (size_t)-1) {
    3357           0 :             error_pos = (size_t)-1;
    3358           0 :             goto encode_error;
    3359             :         }
    3360             : 
    3361           0 :         bytes = PyBytes_FromStringAndSize(NULL, len);
    3362           0 :         if (bytes == NULL) {
    3363           0 :             PyMem_Free(wstr);
    3364           0 :             return NULL;
    3365             :         }
    3366             : 
    3367           0 :         len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
    3368           0 :         if (len2 == (size_t)-1 || len2 > len) {
    3369           0 :             error_pos = (size_t)-1;
    3370           0 :             goto encode_error;
    3371             :         }
    3372           0 :         PyMem_Free(wstr);
    3373             :     }
    3374          26 :     return bytes;
    3375             : 
    3376             : encode_error:
    3377           0 :     errmsg = strerror(errno);
    3378             :     assert(errmsg != NULL);
    3379             : 
    3380           0 :     if (error_pos == (size_t)-1)
    3381           0 :         error_pos = wcstombs_errorpos(wstr);
    3382             : 
    3383           0 :     PyMem_Free(wstr);
    3384           0 :     Py_XDECREF(bytes);
    3385             : 
    3386           0 :     if (errmsg != NULL) {
    3387             :         size_t errlen;
    3388           0 :         wstr = _Py_char2wchar(errmsg, &errlen);
    3389           0 :         if (wstr != NULL) {
    3390           0 :             reason = PyUnicode_FromWideChar(wstr, errlen);
    3391           0 :             PyMem_Free(wstr);
    3392             :         } else
    3393           0 :             errmsg = NULL;
    3394             :     }
    3395           0 :     if (errmsg == NULL)
    3396           0 :         reason = PyUnicode_FromString(
    3397             :             "wcstombs() encountered an unencodable "
    3398             :             "wide character");
    3399           0 :     if (reason == NULL)
    3400           0 :         return NULL;
    3401             : 
    3402           0 :     exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
    3403             :                                 "locale", unicode,
    3404             :                                 (Py_ssize_t)error_pos,
    3405           0 :                                 (Py_ssize_t)(error_pos+1),
    3406             :                                 reason);
    3407           0 :     Py_DECREF(reason);
    3408           0 :     if (exc != NULL) {
    3409           0 :         PyCodec_StrictErrors(exc);
    3410           0 :         Py_XDECREF(exc);
    3411             :     }
    3412           0 :     return NULL;
    3413             : }
    3414             : 
    3415             : PyObject *
    3416         431 : PyUnicode_EncodeFSDefault(PyObject *unicode)
    3417             : {
    3418             : #ifdef HAVE_MBCS
    3419             :     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    3420             : #elif defined(__APPLE__)
    3421             :     return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
    3422             : #else
    3423         431 :     PyInterpreterState *interp = PyThreadState_GET()->interp;
    3424             :     /* Bootstrap check: if the filesystem codec is implemented in Python, we
    3425             :        cannot use it to encode and decode filenames before it is loaded. Load
    3426             :        the Python codec requires to encode at least its own filename. Use the C
    3427             :        version of the locale codec until the codec registry is initialized and
    3428             :        the Python codec is loaded.
    3429             : 
    3430             :        Py_FileSystemDefaultEncoding is shared between all interpreters, we
    3431             :        cannot only rely on it: check also interp->fscodec_initialized for
    3432             :        subinterpreters. */
    3433         431 :     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
    3434         405 :         return PyUnicode_AsEncodedString(unicode,
    3435             :                                          Py_FileSystemDefaultEncoding,
    3436             :                                          "surrogateescape");
    3437             :     }
    3438             :     else {
    3439          26 :         return PyUnicode_EncodeLocale(unicode, "surrogateescape");
    3440             :     }
    3441             : #endif
    3442             : }
    3443             : 
    3444             : PyObject *
    3445         417 : PyUnicode_AsEncodedString(PyObject *unicode,
    3446             :                           const char *encoding,
    3447             :                           const char *errors)
    3448             : {
    3449             :     PyObject *v;
    3450             :     char lower[11];  /* Enough for any encoding shortcut */
    3451             : 
    3452         417 :     if (!PyUnicode_Check(unicode)) {
    3453           0 :         PyErr_BadArgument();
    3454           0 :         return NULL;
    3455             :     }
    3456             : 
    3457             :     /* Shortcuts for common default encodings */
    3458         417 :     if (normalize_encoding(encoding, lower, sizeof(lower))) {
    3459         421 :         if ((strcmp(lower, "utf-8") == 0) ||
    3460           4 :             (strcmp(lower, "utf8") == 0))
    3461             :         {
    3462         413 :             if (errors == NULL || strcmp(errors, "strict") == 0)
    3463           4 :                 return _PyUnicode_AsUTF8String(unicode, NULL);
    3464             :             else
    3465         409 :                 return _PyUnicode_AsUTF8String(unicode, errors);
    3466             :         }
    3467           8 :         else if ((strcmp(lower, "latin-1") == 0) ||
    3468           8 :                  (strcmp(lower, "latin1") == 0) ||
    3469           4 :                  (strcmp(lower, "iso-8859-1") == 0))
    3470           0 :             return _PyUnicode_AsLatin1String(unicode, errors);
    3471             : #ifdef HAVE_MBCS
    3472             :         else if (strcmp(lower, "mbcs") == 0)
    3473             :             return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
    3474             : #endif
    3475           4 :         else if (strcmp(lower, "ascii") == 0)
    3476           4 :             return _PyUnicode_AsASCIIString(unicode, errors);
    3477             :     }
    3478             : 
    3479             :     /* Encode via the codec registry */
    3480           0 :     v = PyCodec_Encode(unicode, encoding, errors);
    3481           0 :     if (v == NULL)
    3482           0 :         return NULL;
    3483             : 
    3484             :     /* The normal path */
    3485           0 :     if (PyBytes_Check(v))
    3486           0 :         return v;
    3487             : 
    3488             :     /* If the codec returns a buffer, raise a warning and convert to bytes */
    3489           0 :     if (PyByteArray_Check(v)) {
    3490             :         int error;
    3491             :         PyObject *b;
    3492             : 
    3493           0 :         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
    3494             :             "encoder %s returned bytearray instead of bytes",
    3495             :             encoding);
    3496           0 :         if (error) {
    3497           0 :             Py_DECREF(v);
    3498           0 :             return NULL;
    3499             :         }
    3500             : 
    3501           0 :         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
    3502           0 :         Py_DECREF(v);
    3503           0 :         return b;
    3504             :     }
    3505             : 
    3506           0 :     PyErr_Format(PyExc_TypeError,
    3507             :                  "encoder did not return a bytes object (type=%.400s)",
    3508           0 :                  Py_TYPE(v)->tp_name);
    3509           0 :     Py_DECREF(v);
    3510           0 :     return NULL;
    3511             : }
    3512             : 
    3513             : PyObject *
    3514           0 : PyUnicode_AsEncodedUnicode(PyObject *unicode,
    3515             :                            const char *encoding,
    3516             :                            const char *errors)
    3517             : {
    3518             :     PyObject *v;
    3519             : 
    3520           0 :     if (!PyUnicode_Check(unicode)) {
    3521           0 :         PyErr_BadArgument();
    3522           0 :         goto onError;
    3523             :     }
    3524             : 
    3525           0 :     if (encoding == NULL)
    3526           0 :         encoding = PyUnicode_GetDefaultEncoding();
    3527             : 
    3528             :     /* Encode via the codec registry */
    3529           0 :     v = PyCodec_Encode(unicode, encoding, errors);
    3530           0 :     if (v == NULL)
    3531           0 :         goto onError;
    3532           0 :     if (!PyUnicode_Check(v)) {
    3533           0 :         PyErr_Format(PyExc_TypeError,
    3534             :                      "encoder did not return an str object (type=%.400s)",
    3535           0 :                      Py_TYPE(v)->tp_name);
    3536           0 :         Py_DECREF(v);
    3537           0 :         goto onError;
    3538             :     }
    3539           0 :     return v;
    3540             : 
    3541             :   onError:
    3542           0 :     return NULL;
    3543             : }
    3544             : 
    /* Locate the first byte sequence in the `len` bytes at `str` that
       mbrtowc() cannot convert (invalid or incomplete character).

       Returns the byte offset of that sequence, or 0 when none is found or
       when mbrtowc() is not available (HAVE_MBRTOWC undefined); offset 0 and
       "not found" are indistinguishable. */
    static size_t
    mbstowcs_errorpos(const char *str, size_t len)
    {
    #ifdef HAVE_MBRTOWC
        const char *start = str;
        mbstate_t state;
        wchar_t ch;

        memset(&state, 0, sizeof state);
        while (len != 0)
        {
            size_t consumed = mbrtowc(&ch, (char*)str, len, &state);

            if (consumed == 0) {
                /* Reached end of string */
                break;
            }
            if (consumed == (size_t)-1 || consumed == (size_t)-2) {
                /* Conversion error or incomplete character */
                return str - start;
            }
            str += consumed;
            len -= consumed;
        }
        /* failed to find the undecodable byte sequence */
        return 0;
    #else
        /* No mbrtowc(): the position cannot be determined. */
        (void)str;
        (void)len;
        return 0;
    #endif
    }
    3575             : 
    3576             : PyObject*
    3577         342 : PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
    3578             :                               const char *errors)
    3579             : {
    3580             :     wchar_t smallbuf[256];
    3581         342 :     size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    3582             :     wchar_t *wstr;
    3583             :     size_t wlen, wlen2;
    3584             :     PyObject *unicode;
    3585             :     int surrogateescape;
    3586             :     size_t error_pos;
    3587             :     char *errmsg;
    3588             :     PyObject *reason, *exc;
    3589             : 
    3590         342 :     if (locale_error_handler(errors, &surrogateescape) < 0)
    3591           0 :         return NULL;
    3592             : 
    3593         342 :     if (str[len] != '\0' || len != strlen(str)) {
    3594           0 :         PyErr_SetString(PyExc_TypeError, "embedded null character");
    3595           0 :         return NULL;
    3596             :     }
    3597             : 
    3598         342 :     if (surrogateescape)
    3599             :     {
    3600         338 :         wstr = _Py_char2wchar(str, &wlen);
    3601         338 :         if (wstr == NULL) {
    3602           0 :             if (wlen == (size_t)-1)
    3603           0 :                 PyErr_NoMemory();
    3604             :             else
    3605           0 :                 PyErr_SetFromErrno(PyExc_OSError);
    3606           0 :             return NULL;
    3607             :         }
    3608             : 
    3609         338 :         unicode = PyUnicode_FromWideChar(wstr, wlen);
    3610         338 :         PyMem_Free(wstr);
    3611             :     }
    3612             :     else {
    3613             : #ifndef HAVE_BROKEN_MBSTOWCS
    3614           4 :         wlen = mbstowcs(NULL, str, 0);
    3615             : #else
    3616             :         wlen = len;
    3617             : #endif
    3618           4 :         if (wlen == (size_t)-1)
    3619           0 :             goto decode_error;
    3620           4 :         if (wlen+1 <= smallbuf_len) {
    3621           4 :             wstr = smallbuf;
    3622             :         }
    3623             :         else {
    3624           0 :             if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
    3625           0 :                 return PyErr_NoMemory();
    3626             : 
    3627           0 :             wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
    3628           0 :             if (!wstr)
    3629           0 :                 return PyErr_NoMemory();
    3630             :         }
    3631             : 
    3632             :         /* This shouldn't fail now */
    3633           4 :         wlen2 = mbstowcs(wstr, str, wlen+1);
    3634           4 :         if (wlen2 == (size_t)-1) {
    3635           0 :             if (wstr != smallbuf)
    3636           0 :                 PyMem_Free(wstr);
    3637           0 :             goto decode_error;
    3638             :         }
    3639             : #ifdef HAVE_BROKEN_MBSTOWCS
    3640             :         assert(wlen2 == wlen);
    3641             : #endif
    3642           4 :         unicode = PyUnicode_FromWideChar(wstr, wlen2);
    3643           4 :         if (wstr != smallbuf)
    3644           0 :             PyMem_Free(wstr);
    3645             :     }
    3646         342 :     return unicode;
    3647             : 
    3648             : decode_error:
    3649           0 :     errmsg = strerror(errno);
    3650             :     assert(errmsg != NULL);
    3651             : 
    3652           0 :     error_pos = mbstowcs_errorpos(str, len);
    3653           0 :     if (errmsg != NULL) {
    3654             :         size_t errlen;
    3655           0 :         wstr = _Py_char2wchar(errmsg, &errlen);
    3656           0 :         if (wstr != NULL) {
    3657           0 :             reason = PyUnicode_FromWideChar(wstr, errlen);
    3658           0 :             PyMem_Free(wstr);
    3659             :         } else
    3660           0 :             errmsg = NULL;
    3661             :     }
    3662           0 :     if (errmsg == NULL)
    3663           0 :         reason = PyUnicode_FromString(
    3664             :             "mbstowcs() encountered an invalid multibyte sequence");
    3665           0 :     if (reason == NULL)
    3666           0 :         return NULL;
    3667             : 
    3668           0 :     exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
    3669             :                                 "locale", str, len,
    3670             :                                 (Py_ssize_t)error_pos,
    3671           0 :                                 (Py_ssize_t)(error_pos+1),
    3672             :                                 reason);
    3673           0 :     Py_DECREF(reason);
    3674           0 :     if (exc != NULL) {
    3675           0 :         PyCodec_StrictErrors(exc);
    3676           0 :         Py_XDECREF(exc);
    3677             :     }
    3678           0 :     return NULL;
    3679             : }
    3680             : 
    3681             : PyObject*
    3682          33 : PyUnicode_DecodeLocale(const char *str, const char *errors)
    3683             : {
    3684          33 :     Py_ssize_t size = (Py_ssize_t)strlen(str);
    3685          33 :     return PyUnicode_DecodeLocaleAndSize(str, size, errors);
    3686             : }
    3687             : 
    3688             : 
    3689             : PyObject*
    3690          31 : PyUnicode_DecodeFSDefault(const char *s) {
    3691          31 :     Py_ssize_t size = (Py_ssize_t)strlen(s);
    3692          31 :     return PyUnicode_DecodeFSDefaultAndSize(s, size);
    3693             : }
    3694             : 
    3695             : PyObject*
    3696         941 : PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
    3697             : {
    3698             : #ifdef HAVE_MBCS
    3699             :     return PyUnicode_DecodeMBCS(s, size, NULL);
    3700             : #elif defined(__APPLE__)
    3701             :     return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
    3702             : #else
    3703         941 :     PyInterpreterState *interp = PyThreadState_GET()->interp;
    3704             :     /* Bootstrap check: if the filesystem codec is implemented in Python, we
    3705             :        cannot use it to encode and decode filenames before it is loaded. Load
    3706             :        the Python codec requires to encode at least its own filename. Use the C
    3707             :        version of the locale codec until the codec registry is initialized and
    3708             :        the Python codec is loaded.
    3709             : 
    3710             :        Py_FileSystemDefaultEncoding is shared between all interpreters, we
    3711             :        cannot only rely on it: check also interp->fscodec_initialized for
    3712             :        subinterpreters. */
    3713         941 :     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
    3714         632 :         return PyUnicode_Decode(s, size,
    3715             :                                 Py_FileSystemDefaultEncoding,
    3716             :                                 "surrogateescape");
    3717             :     }
    3718             :     else {
    3719         309 :         return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
    3720             :     }
    3721             : #endif
    3722             : }
    3723             : 
    3724             : 
    3725             : int
    3726           0 : _PyUnicode_HasNULChars(PyObject* s)
    3727             : {
    3728             :     static PyObject *nul = NULL;
    3729             : 
    3730           0 :     if (nul == NULL)
    3731           0 :         nul = PyUnicode_FromStringAndSize("\0", 1);
    3732           0 :     if (nul == NULL)
    3733           0 :         return -1;
    3734           0 :     return PyUnicode_Contains(s, nul);
    3735             : }
    3736             : 
    3737             : 
    3738             : int
    3739         405 : PyUnicode_FSConverter(PyObject* arg, void* addr)
    3740             : {
    3741         405 :     PyObject *output = NULL;
    3742             :     Py_ssize_t size;
    3743             :     void *data;
    3744         405 :     if (arg == NULL) {
    3745           0 :         Py_DECREF(*(PyObject**)addr);
    3746           0 :         return 1;
    3747             :     }
    3748         405 :     if (PyBytes_Check(arg)) {
    3749           1 :         output = arg;
    3750           1 :         Py_INCREF(output);
    3751             :     }
    3752             :     else {
    3753         404 :         arg = PyUnicode_FromObject(arg);
    3754         404 :         if (!arg)
    3755           0 :             return 0;
    3756         404 :         output = PyUnicode_EncodeFSDefault(arg);
    3757         404 :         Py_DECREF(arg);
    3758         404 :         if (!output)
    3759           0 :             return 0;
    3760         404 :         if (!PyBytes_Check(output)) {
    3761           0 :             Py_DECREF(output);
    3762           0 :             PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
    3763           0 :             return 0;
    3764             :         }
    3765             :     }
    3766         405 :     size = PyBytes_GET_SIZE(output);
    3767         405 :     data = PyBytes_AS_STRING(output);
    3768         405 :     if (size != strlen(data)) {
    3769           0 :         PyErr_SetString(PyExc_TypeError, "embedded NUL character");
    3770           0 :         Py_DECREF(output);
    3771           0 :         return 0;
    3772             :     }
    3773         405 :     *(PyObject**)addr = output;
    3774         405 :     return Py_CLEANUP_SUPPORTED;
    3775             : }
    3776             : 
    3777             : 
    3778             : int
    3779          20 : PyUnicode_FSDecoder(PyObject* arg, void* addr)
    3780             : {
    3781          20 :     PyObject *output = NULL;
    3782          20 :     if (arg == NULL) {
    3783           0 :         Py_DECREF(*(PyObject**)addr);
    3784           0 :         return 1;
    3785             :     }
    3786          20 :     if (PyUnicode_Check(arg)) {
    3787          20 :         if (PyUnicode_READY(arg) == -1)
    3788           0 :             return 0;
    3789          20 :         output = arg;
    3790          20 :         Py_INCREF(output);
    3791             :     }
    3792             :     else {
    3793           0 :         arg = PyBytes_FromObject(arg);
    3794           0 :         if (!arg)
    3795           0 :             return 0;
    3796           0 :         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
    3797             :                                                   PyBytes_GET_SIZE(arg));
    3798           0 :         Py_DECREF(arg);
    3799           0 :         if (!output)
    3800           0 :             return 0;
    3801           0 :         if (!PyUnicode_Check(output)) {
    3802           0 :             Py_DECREF(output);
    3803           0 :             PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
    3804           0 :             return 0;
    3805             :         }
    3806             :     }
    3807          20 :     if (PyUnicode_READY(output) == -1) {
    3808           0 :         Py_DECREF(output);
    3809           0 :         return 0;
    3810             :     }
    3811          20 :     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
    3812             :                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
    3813           0 :         PyErr_SetString(PyExc_TypeError, "embedded NUL character");
    3814           0 :         Py_DECREF(output);
    3815           0 :         return 0;
    3816             :     }
    3817          20 :     *(PyObject**)addr = output;
    3818          20 :     return Py_CLEANUP_SUPPORTED;
    3819             : }
    3820             : 
    3821             : 
    3822             : char*
    3823        1795 : PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
    3824             : {
    3825             :     PyObject *bytes;
    3826             : 
    3827        1795 :     if (!PyUnicode_Check(unicode)) {
    3828           0 :         PyErr_BadArgument();
    3829           0 :         return NULL;
    3830             :     }
    3831        1795 :     if (PyUnicode_READY(unicode) == -1)
    3832           0 :         return NULL;
    3833             : 
    3834        1795 :     if (PyUnicode_UTF8(unicode) == NULL) {
    3835             :         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
    3836           1 :         bytes = _PyUnicode_AsUTF8String(unicode, "strict");
    3837           1 :         if (bytes == NULL)
    3838           0 :             return NULL;
    3839           1 :         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
    3840           1 :         if (_PyUnicode_UTF8(unicode) == NULL) {
    3841           0 :             Py_DECREF(bytes);
    3842           0 :             return NULL;
    3843             :         }
    3844           1 :         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
    3845           2 :         Py_MEMCPY(_PyUnicode_UTF8(unicode),
    3846           1 :                   PyBytes_AS_STRING(bytes),
    3847           1 :                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
    3848           1 :         Py_DECREF(bytes);
    3849             :     }
    3850             : 
    3851        1795 :     if (psize)
    3852         502 :         *psize = PyUnicode_UTF8_LENGTH(unicode);
    3853        1795 :     return PyUnicode_UTF8(unicode);
    3854             : }
    3855             : 
    3856             : char*
    3857        1293 : PyUnicode_AsUTF8(PyObject *unicode)
    3858             : {
    3859        1293 :     return PyUnicode_AsUTF8AndSize(unicode, NULL);
    3860             : }
    3861             : 
    3862             : Py_UNICODE *
    3863          52 : PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
    3864             : {
    3865             :     const unsigned char *one_byte;
    3866             : #if SIZEOF_WCHAR_T == 4
    3867             :     const Py_UCS2 *two_bytes;
    3868             : #else
    3869             :     const Py_UCS4 *four_bytes;
    3870             :     const Py_UCS4 *ucs4_end;
    3871             :     Py_ssize_t num_surrogates;
    3872             : #endif
    3873             :     wchar_t *w;
    3874             :     wchar_t *wchar_end;
    3875             : 
    3876          52 :     if (!PyUnicode_Check(unicode)) {
    3877           0 :         PyErr_BadArgument();
    3878           0 :         return NULL;
    3879             :     }
    3880          52 :     if (_PyUnicode_WSTR(unicode) == NULL) {
    3881             :         /* Non-ASCII compact unicode object */
    3882             :         assert(_PyUnicode_KIND(unicode) != 0);
    3883             :         assert(PyUnicode_IS_READY(unicode));
    3884             : 
    3885          14 :         if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
    3886             : #if SIZEOF_WCHAR_T == 2
    3887             :             four_bytes = PyUnicode_4BYTE_DATA(unicode);
    3888             :             ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
    3889             :             num_surrogates = 0;
    3890             : 
    3891             :             for (; four_bytes < ucs4_end; ++four_bytes) {
    3892             :                 if (*four_bytes > 0xFFFF)
    3893             :                     ++num_surrogates;
    3894             :             }
    3895             : 
    3896             :             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
    3897             :                     sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
    3898             :             if (!_PyUnicode_WSTR(unicode)) {
    3899             :                 PyErr_NoMemory();
    3900             :                 return NULL;
    3901             :             }
    3902             :             _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
    3903             : 
    3904             :             w = _PyUnicode_WSTR(unicode);
    3905             :             wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
    3906             :             four_bytes = PyUnicode_4BYTE_DATA(unicode);
    3907             :             for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
    3908             :                 if (*four_bytes > 0xFFFF) {
    3909             :                     assert(*four_bytes <= MAX_UNICODE);
    3910             :                     /* encode surrogate pair in this case */
    3911             :                     *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
    3912             :                     *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
    3913             :                 }
    3914             :                 else
    3915             :                     *w = *four_bytes;
    3916             : 
    3917             :                 if (w > wchar_end) {
    3918             :                     assert(0 && "Miscalculated string end");
    3919             :                 }
    3920             :             }
    3921             :             *w = 0;
    3922             : #else
    3923             :             /* sizeof(wchar_t) == 4 */
    3924           0 :             Py_FatalError("Impossible unicode object state, wstr and str "
    3925             :                           "should share memory already.");
    3926           0 :             return NULL;
    3927             : #endif
    3928             :         }
    3929             :         else {
    3930          14 :             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
    3931          14 :                                                   (_PyUnicode_LENGTH(unicode) + 1));
    3932          14 :             if (!_PyUnicode_WSTR(unicode)) {
    3933           0 :                 PyErr_NoMemory();
    3934           0 :                 return NULL;
    3935             :             }
    3936          14 :             if (!PyUnicode_IS_COMPACT_ASCII(unicode))
    3937           0 :                 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
    3938          14 :             w = _PyUnicode_WSTR(unicode);
    3939          14 :             wchar_end = w + _PyUnicode_LENGTH(unicode);
    3940             : 
    3941          14 :             if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
    3942          14 :                 one_byte = PyUnicode_1BYTE_DATA(unicode);
    3943        1813 :                 for (; w < wchar_end; ++one_byte, ++w)
    3944        1799 :                     *w = *one_byte;
    3945             :                 /* null-terminate the wstr */
    3946          14 :                 *w = 0;
    3947             :             }
    3948           0 :             else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
    3949             : #if SIZEOF_WCHAR_T == 4
    3950           0 :                 two_bytes = PyUnicode_2BYTE_DATA(unicode);
    3951           0 :                 for (; w < wchar_end; ++two_bytes, ++w)
    3952           0 :                     *w = *two_bytes;
    3953             :                 /* null-terminate the wstr */
    3954           0 :                 *w = 0;
    3955             : #else
    3956             :                 /* sizeof(wchar_t) == 2 */
    3957             :                 PyObject_FREE(_PyUnicode_WSTR(unicode));
    3958             :                 _PyUnicode_WSTR(unicode) = NULL;
    3959             :                 Py_FatalError("Impossible unicode object state, wstr "
    3960             :                               "and str should share memory already.");
    3961             :                 return NULL;
    3962             : #endif
    3963             :             }
    3964             :             else {
    3965             :                 assert(0 && "This should never happen.");
    3966             :             }
    3967             :         }
    3968             :     }
    3969          52 :     if (size != NULL)
    3970          52 :         *size = PyUnicode_WSTR_LENGTH(unicode);
    3971          52 :     return _PyUnicode_WSTR(unicode);
    3972             : }
    3973             : 
    3974             : Py_UNICODE *
    3975           0 : PyUnicode_AsUnicode(PyObject *unicode)
    3976             : {
    3977           0 :     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
    3978             : }
    3979             : 
    3980             : 
    3981             : Py_ssize_t
    3982           0 : PyUnicode_GetSize(PyObject *unicode)
    3983             : {
    3984           0 :     if (!PyUnicode_Check(unicode)) {
    3985           0 :         PyErr_BadArgument();
    3986           0 :         goto onError;
    3987             :     }
    3988           0 :     return PyUnicode_GET_SIZE(unicode);
    3989             : 
    3990             :   onError:
    3991           0 :     return -1;
    3992             : }
    3993             : 
    3994             : Py_ssize_t
    3995           1 : PyUnicode_GetLength(PyObject *unicode)
    3996             : {
    3997           1 :     if (!PyUnicode_Check(unicode)) {
    3998           0 :         PyErr_BadArgument();
    3999           0 :         return -1;
    4000             :     }
    4001           1 :     if (PyUnicode_READY(unicode) == -1)
    4002           0 :         return -1;
    4003           1 :     return PyUnicode_GET_LENGTH(unicode);
    4004             : }
    4005             : 
    4006             : Py_UCS4
    4007           0 : PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
    4008             : {
    4009           0 :     if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
    4010           0 :         PyErr_BadArgument();
    4011           0 :         return (Py_UCS4)-1;
    4012             :     }
    4013           0 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
    4014           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    4015           0 :         return (Py_UCS4)-1;
    4016             :     }
    4017           0 :     return PyUnicode_READ_CHAR(unicode, index);
    4018             : }
    4019             : 
    4020             : int
    4021           0 : PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
    4022             : {
    4023           0 :     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
    4024           0 :         PyErr_BadArgument();
    4025           0 :         return -1;
    4026             :     }
    4027             :     assert(PyUnicode_IS_READY(unicode));
    4028           0 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
    4029           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    4030           0 :         return -1;
    4031             :     }
    4032           0 :     if (unicode_check_modifiable(unicode))
    4033           0 :         return -1;
    4034           0 :     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
    4035           0 :         PyErr_SetString(PyExc_ValueError, "character out of range");
    4036           0 :         return -1;
    4037             :     }
    4038           0 :     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
    4039             :                     index, ch);
    4040           0 :     return 0;
    4041             : }
    4042             : 
    4043             : const char *
    4044           0 : PyUnicode_GetDefaultEncoding(void)
    4045             : {
    4046           0 :     return "utf-8";
    4047             : }
    4048             : 
    4049             : /* create or adjust a UnicodeDecodeError */
    4050             : static void
    4051           0 : make_decode_exception(PyObject **exceptionObject,
    4052             :                       const char *encoding,
    4053             :                       const char *input, Py_ssize_t length,
    4054             :                       Py_ssize_t startpos, Py_ssize_t endpos,
    4055             :                       const char *reason)
    4056             : {
    4057           0 :     if (*exceptionObject == NULL) {
    4058           0 :         *exceptionObject = PyUnicodeDecodeError_Create(
    4059             :             encoding, input, length, startpos, endpos, reason);
    4060             :     }
    4061             :     else {
    4062           0 :         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
    4063           0 :             goto onError;
    4064           0 :         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
    4065           0 :             goto onError;
    4066           0 :         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
    4067           0 :             goto onError;
    4068             :     }
    4069           0 :     return;
    4070             : 
    4071             : onError:
    4072           0 :     Py_DECREF(*exceptionObject);
    4073           0 :     *exceptionObject = NULL;
    4074             : }
    4075             : 
    4076             : /* error handling callback helper:
    4077             :    build arguments, call the callback and check the arguments,
    4078             :    if no exception occurred, copy the replacement to the output
    4079             :    and adjust various state variables.
    4080             :    return 0 on success, -1 on error
    4081             : */
    4082             : 
    4083             : static int
    4084           0 : unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
    4085             :                                  const char *encoding, const char *reason,
    4086             :                                  const char **input, const char **inend, Py_ssize_t *startinpos,
    4087             :                                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    4088             :                                  PyObject **output, Py_ssize_t *outpos)
    4089             : {
    4090             :     static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
    4091             : 
    4092           0 :     PyObject *restuple = NULL;
    4093           0 :     PyObject *repunicode = NULL;
    4094             :     Py_ssize_t outsize;
    4095             :     Py_ssize_t insize;
    4096             :     Py_ssize_t requiredsize;
    4097             :     Py_ssize_t newpos;
    4098           0 :     PyObject *inputobj = NULL;
    4099           0 :     int res = -1;
    4100             : 
    4101           0 :     if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
    4102           0 :         outsize = PyUnicode_GET_LENGTH(*output);
    4103             :     else
    4104           0 :         outsize = _PyUnicode_WSTR_LENGTH(*output);
    4105             : 
    4106           0 :     if (*errorHandler == NULL) {
    4107           0 :         *errorHandler = PyCodec_LookupError(errors);
    4108           0 :         if (*errorHandler == NULL)
    4109           0 :             goto onError;
    4110             :     }
    4111             : 
    4112           0 :     make_decode_exception(exceptionObject,
    4113             :         encoding,
    4114           0 :         *input, *inend - *input,
    4115             :         *startinpos, *endinpos,
    4116             :         reason);
    4117           0 :     if (*exceptionObject == NULL)
    4118           0 :         goto onError;
    4119             : 
    4120           0 :     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    4121           0 :     if (restuple == NULL)
    4122           0 :         goto onError;
    4123           0 :     if (!PyTuple_Check(restuple)) {
    4124           0 :         PyErr_SetString(PyExc_TypeError, &argparse[4]);
    4125           0 :         goto onError;
    4126             :     }
    4127           0 :     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
    4128           0 :         goto onError;
    4129           0 :     if (PyUnicode_READY(repunicode) == -1)
    4130           0 :         goto onError;
    4131             : 
    4132             :     /* Copy back the bytes variables, which might have been modified by the
    4133             :        callback */
    4134           0 :     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    4135           0 :     if (!inputobj)
    4136           0 :         goto onError;
    4137           0 :     if (!PyBytes_Check(inputobj)) {
    4138           0 :         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
    4139             :     }
    4140           0 :     *input = PyBytes_AS_STRING(inputobj);
    4141           0 :     insize = PyBytes_GET_SIZE(inputobj);
    4142           0 :     *inend = *input + insize;
    4143             :     /* we can DECREF safely, as the exception has another reference,
    4144             :        so the object won't go away. */
    4145           0 :     Py_DECREF(inputobj);
    4146             : 
    4147           0 :     if (newpos<0)
    4148           0 :         newpos = insize+newpos;
    4149           0 :     if (newpos<0 || newpos>insize) {
    4150           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
    4151           0 :         goto onError;
    4152             :     }
    4153             : 
    4154           0 :     if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
    4155             :         /* need more space? (at least enough for what we
    4156             :            have+the replacement+the rest of the string (starting
    4157             :            at the new input position), so we won't have to check space
    4158             :            when there are no errors in the rest of the string) */
    4159           0 :         Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
    4160           0 :         requiredsize = *outpos + replen + insize-newpos;
    4161           0 :         if (requiredsize > outsize) {
    4162           0 :             if (requiredsize<2*outsize)
    4163           0 :                 requiredsize = 2*outsize;
    4164           0 :             if (unicode_resize(output, requiredsize) < 0)
    4165           0 :                 goto onError;
    4166             :         }
    4167           0 :         if (unicode_widen(output, *outpos,
    4168           0 :                           PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
    4169           0 :             goto onError;
    4170           0 :         _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
    4171           0 :         *outpos += replen;
    4172             :     }
    4173             :     else {
    4174             :         wchar_t *repwstr;
    4175             :         Py_ssize_t repwlen;
    4176           0 :         repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    4177           0 :         if (repwstr == NULL)
    4178             :             goto onError;
    4179             :         /* need more space? (at least enough for what we
    4180             :            have+the replacement+the rest of the string (starting
    4181             :            at the new input position), so we won't have to check space
    4182             :            when there are no errors in the rest of the string) */
    4183           0 :         requiredsize = *outpos + repwlen + insize-newpos;
    4184           0 :         if (requiredsize > outsize) {
    4185           0 :             if (requiredsize < 2*outsize)
    4186           0 :                 requiredsize = 2*outsize;
    4187           0 :             if (unicode_resize(output, requiredsize) < 0)
    4188             :                 goto onError;
    4189             :         }
    4190           0 :         wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    4191           0 :         *outpos += repwlen;
    4192             :     }
    4193           0 :     *endinpos = newpos;
    4194           0 :     *inptr = *input + newpos;
    4195             : 
    4196             :     /* we made it! */
    4197           0 :     res = 0;
    4198             : 
    4199             :   onError:
    4200           0 :     Py_XDECREF(restuple);
    4201           0 :     return res;
    4202             : }
    4203             : 
    4204             : /* --- UTF-7 Codec -------------------------------------------------------- */
    4205             : 
    4206             : /* See RFC2152 for details.  We encode conservatively and decode liberally. */
    4207             : 
    4208             : /* Three simple macros defining base-64. */
    4209             : 
    4210             : /* Is c a base-64 character? */
    4211             : 
    4212             : #define IS_BASE64(c) \
    4213             :     (((c) >= 'A' && (c) <= 'Z') ||     \
    4214             :      ((c) >= 'a' && (c) <= 'z') ||     \
    4215             :      ((c) >= '0' && (c) <= '9') ||     \
    4216             :      (c) == '+' || (c) == '/')
    4217             : 
    4218             : /* given that c is a base-64 character, what is its base-64 value? */
    4219             : 
    4220             : #define FROM_BASE64(c)                                                  \
    4221             :     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
    4222             :      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
    4223             :      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
    4224             :      (c) == '+' ? 62 : 63)
    4225             : 
    4226             : /* What is the base-64 character of the bottom 6 bits of n? */
    4227             : 
    4228             : #define TO_BASE64(n)  \
    4229             :     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
    4230             : 
    4231             : /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
    4232             :  * decoded as itself.  We are permissive on decoding; the only ASCII
    4233             :  * byte not decoding to itself is the + which begins a base64
    4234             :  * string. */
    4235             : 
    4236             : #define DECODE_DIRECT(c)                                \
    4237             :     ((c) <= 127 && (c) != '+')
    4238             : 
    4239             : /* The UTF-7 encoder treats ASCII characters differently according to
    4240             :  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
    4241             :  * the above).  See RFC2152.  This array identifies these different
    4242             :  * sets:
    4243             :  * 0 : "Set D"
    4244             :  *     alphanumeric and '(),-./:?
    4245             :  * 1 : "Set O"
    4246             :  *     !"#$%&*;<=>@[]^_`{|}
    4247             :  * 2 : "whitespace"
    4248             :  *     ht nl cr sp
    4249             :  * 3 : special (must be base64 encoded)
    4250             :  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
    4251             :  */
    4252             : 
    4253             : static
    4254             : char utf7_category[128] = {
    4255             : /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    4256             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    4257             : /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    4258             :     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    4259             : /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    4260             :     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    4261             : /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    4262             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    4263             : /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    4264             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    4265             : /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    4266             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    4267             : /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    4268             :     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    4269             : /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    4270             :     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
    4271             : };
    4272             : 
    4273             : /* ENCODE_DIRECT: this character should be encoded as itself.  The
    4274             :  * answer depends on whether we are encoding set O as itself, and also
    4275             :  * on whether we are encoding whitespace as itself.  RFC2152 makes it
    4276             :  * clear that the answers to these questions vary between
    4277             :  * applications, so this code needs to be flexible.  */
    4278             : 
    4279             : #define ENCODE_DIRECT(c, directO, directWS)             \
    4280             :     ((c) < 128 && (c) > 0 &&                            \
    4281             :      ((utf7_category[(c)] == 0) ||                      \
    4282             :       (directWS && (utf7_category[(c)] == 2)) ||        \
    4283             :       (directO && (utf7_category[(c)] == 1))))
    4284             : 
    4285             : PyObject *
    4286           0 : PyUnicode_DecodeUTF7(const char *s,
    4287             :                      Py_ssize_t size,
    4288             :                      const char *errors)
    4289             : {
    4290           0 :     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
    4291             : }
    4292             : 
    4293             : /* The decoder.  The only state we preserve is our read position,
    4294             :  * i.e. how many characters we have consumed.  So if we end in the
    4295             :  * middle of a shift sequence we have to back off the read position
    4296             :  * and the output to the beginning of the sequence, otherwise we lose
    4297             :  * all the shift state (seen bits, number of bits seen, high
    4298             :  * surrogate). */
    4299             : 
    4300             : PyObject *
    4301           0 : PyUnicode_DecodeUTF7Stateful(const char *s,
    4302             :                              Py_ssize_t size,
    4303             :                              const char *errors,
    4304             :                              Py_ssize_t *consumed)
    4305             : {
    4306           0 :     const char *starts = s;
    4307             :     Py_ssize_t startinpos;
    4308             :     Py_ssize_t endinpos;
    4309             :     Py_ssize_t outpos;
    4310             :     const char *e;
    4311             :     PyObject *unicode;
    4312           0 :     const char *errmsg = "";
    4313           0 :     int inShift = 0;
    4314             :     Py_ssize_t shiftOutStart;
    4315           0 :     unsigned int base64bits = 0;
    4316           0 :     unsigned long base64buffer = 0;
    4317           0 :     Py_UCS4 surrogate = 0;
    4318           0 :     PyObject *errorHandler = NULL;
    4319           0 :     PyObject *exc = NULL;
    4320             : 
    4321             :     /* Start off assuming it's all ASCII. Widen later as necessary. */
    4322           0 :     unicode = PyUnicode_New(size, 127);
    4323           0 :     if (!unicode)
    4324           0 :         return NULL;
    4325           0 :     if (size == 0) {
    4326           0 :         if (consumed)
    4327           0 :             *consumed = 0;
    4328           0 :         return unicode;
    4329             :     }
    4330             : 
    4331           0 :     shiftOutStart = outpos = 0;
    4332           0 :     e = s + size;
    4333             : 
    4334           0 :     while (s < e) {
    4335             :         Py_UCS4 ch;
    4336             :       restart:
    4337           0 :         ch = (unsigned char) *s;
    4338             : 
    4339           0 :         if (inShift) { /* in a base-64 section */
    4340           0 :             if (IS_BASE64(ch)) { /* consume a base-64 character */
    4341           0 :                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
    4342           0 :                 base64bits += 6;
    4343           0 :                 s++;
    4344           0 :                 if (base64bits >= 16) {
    4345             :                     /* we have enough bits for a UTF-16 value */
    4346           0 :                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
    4347           0 :                     base64bits -= 16;
    4348           0 :                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
    4349           0 :                     if (surrogate) {
    4350             :                         /* expecting a second surrogate */
    4351           0 :                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
    4352           0 :                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
    4353           0 :                             if (unicode_putchar(&unicode, &outpos, ch2) < 0)
    4354           0 :                                 goto onError;
    4355           0 :                             surrogate = 0;
    4356           0 :                             continue;
    4357             :                         }
    4358             :                         else {
    4359           0 :                             if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
    4360           0 :                                 goto onError;
    4361           0 :                             surrogate = 0;
    4362             :                         }
    4363             :                     }
    4364           0 :                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
    4365             :                         /* first surrogate */
    4366           0 :                         surrogate = outCh;
    4367             :                     }
    4368             :                     else {
    4369           0 :                         if (unicode_putchar(&unicode, &outpos, outCh) < 0)
    4370           0 :                             goto onError;
    4371             :                     }
    4372             :                 }
    4373             :             }
    4374             :             else { /* now leaving a base-64 section */
    4375           0 :                 inShift = 0;
    4376           0 :                 s++;
    4377           0 :                 if (surrogate) {
    4378           0 :                     if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
    4379           0 :                         goto onError;
    4380           0 :                     surrogate = 0;
    4381             :                 }
    4382           0 :                 if (base64bits > 0) { /* left-over bits */
    4383           0 :                     if (base64bits >= 6) {
    4384             :                         /* We've seen at least one base-64 character */
    4385           0 :                         errmsg = "partial character in shift sequence";
    4386           0 :                         goto utf7Error;
    4387             :                     }
    4388             :                     else {
    4389             :                         /* Some bits remain; they should be zero */
    4390           0 :                         if (base64buffer != 0) {
    4391           0 :                             errmsg = "non-zero padding bits in shift sequence";
    4392           0 :                             goto utf7Error;
    4393             :                         }
    4394             :                     }
    4395             :                 }
    4396           0 :                 if (ch != '-') {
    4397             :                     /* '-' is absorbed; other terminating
    4398             :                        characters are preserved */
    4399           0 :                     if (unicode_putchar(&unicode, &outpos, ch) < 0)
    4400           0 :                         goto onError;
    4401             :                 }
    4402             :             }
    4403             :         }
    4404           0 :         else if ( ch == '+' ) {
    4405           0 :             startinpos = s-starts;
    4406           0 :             s++; /* consume '+' */
    4407           0 :             if (s < e && *s == '-') { /* '+-' encodes '+' */
    4408           0 :                 s++;
    4409           0 :                 if (unicode_putchar(&unicode, &outpos, '+') < 0)
    4410           0 :                     goto onError;
    4411             :             }
    4412             :             else { /* begin base64-encoded section */
    4413           0 :                 inShift = 1;
    4414           0 :                 shiftOutStart = outpos;
    4415           0 :                 base64bits = 0;
    4416             :             }
    4417             :         }
    4418           0 :         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
    4419           0 :             if (unicode_putchar(&unicode, &outpos, ch) < 0)
    4420           0 :                 goto onError;
    4421           0 :             s++;
    4422             :         }
    4423             :         else {
    4424           0 :             startinpos = s-starts;
    4425           0 :             s++;
    4426           0 :             errmsg = "unexpected special character";
    4427           0 :             goto utf7Error;
    4428             :         }
    4429           0 :         continue;
    4430             : utf7Error:
    4431           0 :         endinpos = s-starts;
    4432           0 :         if (unicode_decode_call_errorhandler(
    4433             :                 errors, &errorHandler,
    4434             :                 "utf7", errmsg,
    4435             :                 &starts, &e, &startinpos, &endinpos, &exc, &s,
    4436             :                 &unicode, &outpos))
    4437           0 :             goto onError;
    4438             :     }
    4439             : 
    4440             :     /* end of string */
    4441             : 
    4442           0 :     if (inShift && !consumed) { /* in shift sequence, no more to follow */
    4443             :         /* if we're in an inconsistent state, that's an error */
    4444           0 :         if (surrogate ||
    4445           0 :                 (base64bits >= 6) ||
    4446           0 :                 (base64bits > 0 && base64buffer != 0)) {
    4447           0 :             endinpos = size;
    4448           0 :             if (unicode_decode_call_errorhandler(
    4449             :                     errors, &errorHandler,
    4450             :                     "utf7", "unterminated shift sequence",
    4451             :                     &starts, &e, &startinpos, &endinpos, &exc, &s,
    4452             :                     &unicode, &outpos))
    4453           0 :                 goto onError;
    4454           0 :             if (s < e)
    4455           0 :                 goto restart;
    4456             :         }
    4457             :     }
    4458             : 
    4459             :     /* return state */
    4460           0 :     if (consumed) {
    4461           0 :         if (inShift) {
    4462           0 :             outpos = shiftOutStart; /* back off output */
    4463           0 :             *consumed = startinpos;
    4464             :         }
    4465             :         else {
    4466           0 :             *consumed = s-starts;
    4467             :         }
    4468             :     }
    4469             : 
    4470           0 :     if (unicode_resize(&unicode, outpos) < 0)
    4471           0 :         goto onError;
    4472             : 
    4473           0 :     Py_XDECREF(errorHandler);
    4474           0 :     Py_XDECREF(exc);
    4475           0 :     return unicode_result(unicode);
    4476             : 
    4477             :   onError:
    4478           0 :     Py_XDECREF(errorHandler);
    4479           0 :     Py_XDECREF(exc);
    4480           0 :     Py_DECREF(unicode);
    4481           0 :     return NULL;
    4482             : }
    4483             : 
    4484             : 
    4485             : PyObject *
    4486           0 : _PyUnicode_EncodeUTF7(PyObject *str,
    4487             :                       int base64SetO,
    4488             :                       int base64WhiteSpace,
    4489             :                       const char *errors)
    4490             : {
    4491             :     int kind;
    4492             :     void *data;
    4493             :     Py_ssize_t len;
    4494             :     PyObject *v;
    4495             :     Py_ssize_t allocated;
    4496           0 :     int inShift = 0;
    4497             :     Py_ssize_t i;
    4498           0 :     unsigned int base64bits = 0;
    4499           0 :     unsigned long base64buffer = 0;
    4500             :     char * out;
    4501             :     char * start;
    4502             : 
    4503           0 :     if (PyUnicode_READY(str) == -1)
    4504           0 :         return NULL;
    4505           0 :     kind = PyUnicode_KIND(str);
    4506           0 :     data = PyUnicode_DATA(str);
    4507           0 :     len = PyUnicode_GET_LENGTH(str);
    4508             : 
    4509           0 :     if (len == 0)
    4510           0 :         return PyBytes_FromStringAndSize(NULL, 0);
    4511             : 
    4512             :     /* It might be possible to tighten this worst case */
    4513           0 :     allocated = 8 * len;
    4514           0 :     if (allocated / 8 != len)
    4515           0 :         return PyErr_NoMemory();
    4516             : 
    4517           0 :     v = PyBytes_FromStringAndSize(NULL, allocated);
    4518           0 :     if (v == NULL)
    4519           0 :         return NULL;
    4520             : 
    4521           0 :     start = out = PyBytes_AS_STRING(v);
    4522           0 :     for (i = 0; i < len; ++i) {
    4523           0 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    4524             : 
    4525           0 :         if (inShift) {
    4526           0 :             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    4527             :                 /* shifting out */
    4528           0 :                 if (base64bits) { /* output remaining bits */
    4529           0 :                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
    4530           0 :                     base64buffer = 0;
    4531           0 :                     base64bits = 0;
    4532             :                 }
    4533           0 :                 inShift = 0;
    4534             :                 /* Characters not in the BASE64 set implicitly unshift the sequence
    4535             :                    so no '-' is required, except if the character is itself a '-' */
    4536           0 :                 if (IS_BASE64(ch) || ch == '-') {
    4537           0 :                     *out++ = '-';
    4538             :                 }
    4539           0 :                 *out++ = (char) ch;
    4540             :             }
    4541             :             else {
    4542             :                 goto encode_char;
    4543             :             }
    4544             :         }
    4545             :         else { /* not in a shift sequence */
    4546           0 :             if (ch == '+') {
    4547           0 :                 *out++ = '+';
    4548           0 :                         *out++ = '-';
    4549             :             }
    4550           0 :             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
    4551           0 :                 *out++ = (char) ch;
    4552             :             }
    4553             :             else {
    4554           0 :                 *out++ = '+';
    4555           0 :                 inShift = 1;
    4556           0 :                 goto encode_char;
    4557             :             }
    4558             :         }
    4559           0 :         continue;
    4560             : encode_char:
    4561           0 :         if (ch >= 0x10000) {
    4562             :             assert(ch <= MAX_UNICODE);
    4563             : 
    4564             :             /* code first surrogate */
    4565           0 :             base64bits += 16;
    4566           0 :             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
    4567           0 :             while (base64bits >= 6) {
    4568           0 :                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    4569           0 :                 base64bits -= 6;
    4570             :             }
    4571             :             /* prepare second surrogate */
    4572           0 :             ch = Py_UNICODE_LOW_SURROGATE(ch);
    4573             :         }
    4574           0 :         base64bits += 16;
    4575           0 :         base64buffer = (base64buffer << 16) | ch;
    4576           0 :         while (base64bits >= 6) {
    4577           0 :             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
    4578           0 :             base64bits -= 6;
    4579             :         }
    4580             :     }
    4581           0 :     if (base64bits)
    4582           0 :         *out++= TO_BASE64(base64buffer << (6-base64bits) );
    4583           0 :     if (inShift)
    4584           0 :         *out++ = '-';
    4585           0 :     if (_PyBytes_Resize(&v, out - start) < 0)
    4586           0 :         return NULL;
    4587           0 :     return v;
    4588             : }
    4589             : PyObject *
    4590           0 : PyUnicode_EncodeUTF7(const Py_UNICODE *s,
    4591             :                      Py_ssize_t size,
    4592             :                      int base64SetO,
    4593             :                      int base64WhiteSpace,
    4594             :                      const char *errors)
    4595             : {
    4596             :     PyObject *result;
    4597           0 :     PyObject *tmp = PyUnicode_FromUnicode(s, size);
    4598           0 :     if (tmp == NULL)
    4599           0 :         return NULL;
    4600           0 :     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
    4601             :                                    base64WhiteSpace, errors);
    4602           0 :     Py_DECREF(tmp);
    4603           0 :     return result;
    4604             : }
    4605             : 
    4606             : #undef IS_BASE64
    4607             : #undef FROM_BASE64
    4608             : #undef TO_BASE64
    4609             : #undef DECODE_DIRECT
    4610             : #undef ENCODE_DIRECT
    4611             : 
    4612             : /* --- UTF-8 Codec -------------------------------------------------------- */
    4613             : 
    4614             : PyObject *
    4615       22855 : PyUnicode_DecodeUTF8(const char *s,
    4616             :                      Py_ssize_t size,
    4617             :                      const char *errors)
    4618             : {
    4619       22855 :     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    4620             : }
    4621             : 
    4622             : #include "stringlib/asciilib.h"
    4623             : #include "stringlib/codecs.h"
    4624             : #include "stringlib/undef.h"
    4625             : 
    4626             : #include "stringlib/ucs1lib.h"
    4627             : #include "stringlib/codecs.h"
    4628             : #include "stringlib/undef.h"
    4629             : 
    4630             : #include "stringlib/ucs2lib.h"
    4631             : #include "stringlib/codecs.h"
    4632             : #include "stringlib/undef.h"
    4633             : 
    4634             : #include "stringlib/ucs4lib.h"
    4635             : #include "stringlib/codecs.h"
    4636             : #include "stringlib/undef.h"
    4637             : 
    4638             : /* Mask to check or force alignment of a pointer to C 'long' boundaries */
    4639             : #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
    4640             : 
    4641             : /* Mask to quickly check whether a C 'long' contains a
    4642             :    non-ASCII, UTF8-encoded char. */
    4643             : #if (SIZEOF_LONG == 8)
    4644             : # define ASCII_CHAR_MASK 0x8080808080808080UL
    4645             : #elif (SIZEOF_LONG == 4)
    4646             : # define ASCII_CHAR_MASK 0x80808080UL
    4647             : #else
    4648             : # error C 'long' size should be either 4 or 8!
    4649             : #endif
    4650             : 
    4651             : static Py_ssize_t
    4652       42718 : ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
    4653             : {
    4654       42718 :     const char *p = start;
    4655       42718 :     const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
    4656             : 
    4657             : #if SIZEOF_LONG <= SIZEOF_VOID_P
    4658             :     assert(!((size_t) dest & LONG_PTR_MASK));
    4659       42718 :     if (!((size_t) p & LONG_PTR_MASK)) {
    4660             :         /* Fast path, see in STRINGLIB(utf8_decode) for
    4661             :            an explanation. */
    4662             :         /* Help register allocation */
    4663       27461 :         register const char *_p = p;
    4664       27461 :         register Py_UCS1 * q = dest;
    4665      179058 :         while (_p < aligned_end) {
    4666      124213 :             unsigned long value = *(const unsigned long *) _p;
    4667      124213 :             if (value & ASCII_CHAR_MASK)
    4668          77 :                 break;
    4669      124136 :             *((unsigned long *)q) = value;
    4670      124136 :             _p += SIZEOF_LONG;
    4671      124136 :             q += SIZEOF_LONG;
    4672             :         }
    4673       27461 :         p = _p;
    4674       90796 :         while (p < end) {
    4675       35961 :             if ((unsigned char)*p & 0x80)
    4676          87 :                 break;
    4677       35874 :             *q++ = *p++;
    4678             :         }
    4679       27461 :         return p - start;
    4680             :     }
    4681             : #endif
    4682       88396 :     while (p < end) {
    4683             :         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
    4684             :            for an explanation. */
    4685       62184 :         if (!((size_t) p & LONG_PTR_MASK)) {
    4686             :             /* Help register allocation */
    4687       15134 :             register const char *_p = p;
    4688       52174 :             while (_p < aligned_end) {
    4689       21906 :                 unsigned long value = *(unsigned long *) _p;
    4690       21906 :                 if (value & ASCII_CHAR_MASK)
    4691           0 :                     break;
    4692       21906 :                 _p += SIZEOF_LONG;
    4693             :             }
    4694       15134 :             p = _p;
    4695       15134 :             if (_p == end)
    4696        4302 :                 break;
    4697             :         }
    4698       57882 :         if ((unsigned char)*p & 0x80)
    4699           0 :             break;
    4700       57882 :         ++p;
    4701             :     }
    4702       15257 :     memcpy(dest, start, p - start);
    4703       15257 :     return p - start;
    4704             : }
    4705             : 
                       /* Decode the `size` bytes of UTF-8 data at `s` into a unicode object.
                          errors   - error-handling scheme name, forwarded unchanged to
                                     unicode_decode_call_errorhandler() when bad input is found.
                          consumed - if not NULL, receives the number of input bytes processed;
                                     in that mode a decoder result of 0 before the end of the
                                     input ends decoding instead of raising "unexpected end of
                                     data".
                          Returns the decoded object, or NULL on failure after dropping the
                          partial result, the error handler and the exception object. */
    4706             : PyObject *
    4707       43682 : PyUnicode_DecodeUTF8Stateful(const char *s,
    4708             :                              Py_ssize_t size,
    4709             :                              const char *errors,
    4710             :                              Py_ssize_t *consumed)
    4711             : {
    4712             :     PyObject *unicode;
    4713       43682 :     const char *starts = s;
    4714       43682 :     const char *end = s + size;
    4715             :     Py_ssize_t outpos;
    4716             : 
    4717             :     Py_ssize_t startinpos;
    4718             :     Py_ssize_t endinpos;
    4719       43682 :     const char *errmsg = "";
    4720       43682 :     PyObject *errorHandler = NULL;
    4721       43682 :     PyObject *exc = NULL;
    4722             : 
                           /* Empty input: hand out the shared empty string. */
    4723       43682 :     if (size == 0) {
    4724         136 :         if (consumed)
    4725           0 :             *consumed = 0;
    4726         136 :         Py_INCREF(unicode_empty);
    4727         136 :         return unicode_empty;
    4728             :     }
    4729             : 
    4730             :     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    4731       43546 :     if (size == 1 && (unsigned char)s[0] < 128) {
    4732         856 :         if (consumed)
    4733           0 :             *consumed = 1;
    4734         856 :         return get_latin1_char((unsigned char)s[0]);
    4735             :     }
    4736             : 
                           /* Allocate room for `size` characters with a maximum character of
                              127; the object is trimmed to the real length (outpos) at End. */
    4737       42690 :     unicode = PyUnicode_New(size, 127);
    4738       42690 :     if (!unicode)
    4739           0 :         return NULL;
    4740             : 
                           /* Handle the leading ASCII run first.  The returned count is used
                              both as the output position and as the number of input bytes to
                              skip. */
    4741       42690 :     outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
    4742       42690 :     s += outpos;
    4743       85467 :     while (s < end) {
    4744             :         Py_UCS4 ch;
                               /* The storage kind is re-read on every pass because
                                  unicode_putchar() receives &unicode; presumably it can replace
                                  the object with a wider one - confirm against its definition. */
    4745         163 :         int kind = PyUnicode_KIND(unicode);
    4746         163 :         if (kind == PyUnicode_1BYTE_KIND) {
    4747          96 :             if (PyUnicode_IS_ASCII(unicode))
    4748         261 :                 ch = asciilib_utf8_decode(&s, end,
    4749         261 :                         PyUnicode_1BYTE_DATA(unicode), &outpos);
    4750             :             else
    4751          27 :                 ch = ucs1lib_utf8_decode(&s, end,
    4752          27 :                         PyUnicode_1BYTE_DATA(unicode), &outpos);
    4753          67 :         } else if (kind == PyUnicode_2BYTE_KIND) {
    4754         201 :             ch = ucs2lib_utf8_decode(&s, end,
    4755         201 :                     PyUnicode_2BYTE_DATA(unicode), &outpos);
    4756             :         } else {
    4757             :             assert(kind == PyUnicode_4BYTE_KIND);
    4758           0 :             ch = ucs4lib_utf8_decode(&s, end,
    4759           0 :                     PyUnicode_4BYTE_DATA(unicode), &outpos);
    4760             :         }
    4761             : 
                               /* 0, 1 and 2 are status codes from the decoder; any other value
                                  is a character that still has to be appended. */
    4762         163 :         switch (ch) {
    4763             :         case 0:
                                   /* Finished if all input was used, or if the caller accepts
                                      partial input (consumed != NULL). */
    4764          76 :             if (s == end || consumed)
    4765             :                 goto End;
    4766           0 :             errmsg = "unexpected end of data";
    4767           0 :             startinpos = s - starts;
    4768           0 :             endinpos = startinpos + 1;
                                   /* Extend the reported range over the continuation bytes
                                      (10xxxxxx) that follow. */
    4769           0 :             while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
    4770           0 :                 endinpos++;
    4771           0 :             break;
    4772             :         case 1:
    4773           0 :             errmsg = "invalid start byte";
    4774           0 :             startinpos = s - starts;
    4775           0 :             endinpos = startinpos + 1;
    4776           0 :             break;
    4777             :         case 2:
    4778           0 :             errmsg = "invalid continuation byte";
    4779           0 :             startinpos = s - starts;
    4780           0 :             endinpos = startinpos + 1;
    4781           0 :             while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
    4782           0 :                 endinpos++;
    4783           0 :             break;
    4784             :         default:
    4785          87 :             if (unicode_putchar(&unicode, &outpos, ch) < 0)
    4786           0 :                 goto onError;
    4787          87 :             continue;
    4788             :         }
    4789             : 
                               /* Only the three error cases reach this point.  A non-zero
                                  result from the handler aborts the decode. */
    4790           0 :         if (unicode_decode_call_errorhandler(
    4791             :                 errors, &errorHandler,
    4792             :                 "utf-8", errmsg,
    4793             :                 &starts, &end, &startinpos, &endinpos, &exc, &s,
    4794             :                 &unicode, &outpos))
    4795           0 :             goto onError;
    4796             :     }
    4797             : 
    4798             : End:
                           /* Trim the result to the number of characters actually written. */
    4799       42690 :     if (unicode_resize(&unicode, outpos) < 0)
    4800           0 :         goto onError;
    4801             : 
    4802       42690 :     if (consumed)
    4803           0 :         *consumed = s - starts;
    4804             : 
    4805       42690 :     Py_XDECREF(errorHandler);
    4806       42690 :     Py_XDECREF(exc);
    4807             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    4808       42690 :     return unicode;
    4809             : 
    4810             : onError:
    4811           0 :     Py_XDECREF(errorHandler);
    4812           0 :     Py_XDECREF(exc);
    4813           0 :     Py_XDECREF(unicode);
    4814           0 :     return NULL;
    4815             : }
    4816             : 
    4817             : #ifdef __APPLE__
    4818             : 
    4819             : /* Simplified UTF-8 decoder using surrogateescape error handler,
    4820             :    used to decode the command line arguments on Mac OS X. */
                       /* Decodes the `size` bytes at `s` into a NUL-terminated wchar_t buffer
                          of size + 1 elements obtained from PyMem_Malloc().  Bytes that cannot
                          be decoded are stored as 0xDC00 + byte.  Returns NULL if the buffer
                          size would overflow (PyErr_NoMemory() is called) or if the allocation
                          fails (nothing further is reported in that case). */
    4821             : 
    4822             : wchar_t*
    4823             : _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
    4824             : {
    4825             :     const char *e;
    4826             :     wchar_t *unicode;
    4827             :     Py_ssize_t outpos;
    4828             : 
    4829             :     /* Note: size will always be longer than the resulting Unicode
    4830             :        character count */
    4831             :     if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
    4832             :         PyErr_NoMemory();
    4833             :         return NULL;
    4834             :     }
    4835             :     unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    4836             :     if (!unicode)
    4837             :         return NULL;
    4838             : 
    4839             :     /* Unpack UTF-8 encoded data */
    4840             :     e = s + size;
    4841             :     outpos = 0;
    4842             :     while (s < e) {
    4843             :         Py_UCS4 ch;
                               /* Use the decoder whose element width matches wchar_t so it can
                                  write straight into the output buffer. */
    4844             : #if SIZEOF_WCHAR_T == 4
    4845             :         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
    4846             : #else
    4847             :         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
    4848             : #endif
    4849             :         if (ch > 0xFF) {
    4850             : #if SIZEOF_WCHAR_T == 4
    4851             :             assert(0);
    4852             : #else
                                   /* NOTE(review): the value is asserted to be a surrogate yet
                                      is then split with Py_UNICODE_HIGH/LOW_SURROGATE - confirm
                                      the assertion matches what ucs2lib_utf8_decode() returns
                                      here. */
    4853             :             assert(Py_UNICODE_IS_SURROGATE(ch));
    4854             :             /*  compute and append the two surrogates: */
    4855             :             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
    4856             :             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
    4857             : #endif
    4858             :         }
    4859             :         else {
                                   /* A zero result with all input used means we are done;
                                      every other result <= 0xFF escapes the current byte. */
    4860             :             if (!ch && s == e)
    4861             :                 break;
    4862             :             /* surrogateescape */
    4863             :             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
    4864             :         }
    4865             :     }
    4866             :     unicode[outpos] = L'\0';
    4867             :     return unicode;
    4868             : }
    4869             : 
    4870             : #endif /* __APPLE__ */
    4871             : 
    4872             : /* Primary internal function which creates utf8 encoded bytes objects.
    4873             : 
    4874             :    Allocation strategy:  if the string is short, convert into a stack buffer
    4875             :    and allocate exactly as much space needed at the end.  Else allocate the
    4876             :    maximum possible needed (4 result bytes per Unicode character), and return
    4877             :    the excess memory at the end.
    4878             : */
                       /* unicode - object to encode; anything that fails PyUnicode_Check()
                                    is rejected with PyErr_BadArgument().
                          errors  - error-handling scheme name, passed through to the
                                    kind-specific encoder.
                          Returns the bytes object produced, or NULL on failure. */
    4879             : PyObject *
    4880         414 : _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
    4881             : {
    4882             :     enum PyUnicode_Kind kind;
    4883             :     void *data;
    4884             :     Py_ssize_t size;
    4885             : 
    4886         414 :     if (!PyUnicode_Check(unicode)) {
    4887           0 :         PyErr_BadArgument();
    4888           0 :         return NULL;
    4889             :     }
    4890             : 
    4891         414 :     if (PyUnicode_READY(unicode) == -1)
    4892           0 :         return NULL;
    4893             : 
                           /* Fast path: a UTF-8 representation is already attached to the
                              object, so just copy it into a new bytes object. */
    4894         414 :     if (PyUnicode_UTF8(unicode))
    4895         826 :         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
    4896         826 :                                          PyUnicode_UTF8_LENGTH(unicode));
    4897             : 
    4898           1 :     kind = PyUnicode_KIND(unicode);
    4899           1 :     data = PyUnicode_DATA(unicode);
    4900           1 :     size = PyUnicode_GET_LENGTH(unicode);
    4901             : 
                           /* Dispatch to the encoder specialised for the storage width. */
    4902           1 :     switch (kind) {
    4903             :     default:
                               /* Not expected; when assertions are compiled out, control
                                  falls through to the 1-byte case. */
    4904             :         assert(0);
    4905             :     case PyUnicode_1BYTE_KIND:
    4906             :         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
    4907             :         assert(!PyUnicode_IS_ASCII(unicode));
    4908           1 :         return ucs1lib_utf8_encoder(unicode, data, size, errors);
    4909             :     case PyUnicode_2BYTE_KIND:
    4910           0 :         return ucs2lib_utf8_encoder(unicode, data, size, errors);
    4911             :     case PyUnicode_4BYTE_KIND:
    4912           0 :         return ucs4lib_utf8_encoder(unicode, data, size, errors);
    4913             :     }
    4914             : }
    4915             : 
                       /* Encode the `size` Py_UNICODE units at `s` as UTF-8.  The data is
                          first wrapped in a temporary unicode object, which is encoded with
                          _PyUnicode_AsUTF8String() and then released.  Returns the encoder's
                          result, or NULL if the temporary object could not be created. */
    4916             : PyObject *
    4917           0 : PyUnicode_EncodeUTF8(const Py_UNICODE *s,
    4918             :                      Py_ssize_t size,
    4919             :                      const char *errors)
    4920             : {
    4921             :     PyObject *v, *unicode;
    4922             : 
    4923           0 :     unicode = PyUnicode_FromUnicode(s, size);
    4924           0 :     if (unicode == NULL)
    4925           0 :         return NULL;
    4926           0 :     v = _PyUnicode_AsUTF8String(unicode, errors);
                           /* The temporary is no longer needed whether or not encoding worked. */
    4927           0 :     Py_DECREF(unicode);
    4928           0 :     return v;
    4929             : }
    4930             : 
                       /* Encode `unicode` as UTF-8 by calling _PyUnicode_AsUTF8String() with
                          a NULL `errors` argument; returns whatever that call returns. */
    4931             : PyObject *
    4932           0 : PyUnicode_AsUTF8String(PyObject *unicode)
    4933             : {
    4934           0 :     return _PyUnicode_AsUTF8String(unicode, NULL);
    4935             : }
    4936             : 
    4937             : /* --- UTF-32 Codec ------------------------------------------------------- */
    4938             : 
                       /* Decode a complete UTF-32 buffer: forwards to
                          PyUnicode_DecodeUTF32Stateful() with `consumed` set to NULL, so
                          trailing bytes that do not form a full 4-byte unit are reported as
                          "truncated data" rather than left for a later call. */
    4939             : PyObject *
    4940           0 : PyUnicode_DecodeUTF32(const char *s,
    4941             :                       Py_ssize_t size,
    4942             :                       const char *errors,
    4943             :                       int *byteorder)
    4944             : {
    4945           0 :     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
    4946             : }
    4947             : 
                       /* Decode the `size` bytes of UTF-32 data at `s` into a unicode object.
                          errors    - error-handling scheme name, forwarded unchanged to
                                      unicode_decode_call_errorhandler().
                          byteorder - if not NULL, supplies the initial byte order (-1 little
                                      endian, 1 big endian, 0 detect from a leading BOM and
                                      otherwise use native order) and receives the order that
                                      was finally used.
                          consumed  - if not NULL, receives the number of input bytes
                                      processed; in that mode a trailing group of fewer than
                                      4 bytes is left undecoded instead of being an error.
                          Returns the decoded object, or NULL on failure after releasing the
                          partial result, the error handler and the exception object. */
    4948             : PyObject *
    4949           0 : PyUnicode_DecodeUTF32Stateful(const char *s,
    4950             :                               Py_ssize_t size,
    4951             :                               const char *errors,
    4952             :                               int *byteorder,
    4953             :                               Py_ssize_t *consumed)
    4954             : {
    4955           0 :     const char *starts = s;
    4956             :     Py_ssize_t startinpos;
    4957             :     Py_ssize_t endinpos;
    4958             :     Py_ssize_t outpos;
    4959             :     PyObject *unicode;
    4960             :     const unsigned char *q, *e;
    4961           0 :     int bo = 0;       /* assume native ordering by default */
    4962           0 :     const char *errmsg = "";
    4963             :     /* Offsets from q for retrieving bytes in the right order. */
    4964             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    4965           0 :     int iorder[] = {0, 1, 2, 3};
    4966             : #else
    4967             :     int iorder[] = {3, 2, 1, 0};
    4968             : #endif
    4969           0 :     PyObject *errorHandler = NULL;
    4970           0 :     PyObject *exc = NULL;
    4971             : 
    4972           0 :     q = (unsigned char *)s;
    4973           0 :     e = q + size;
    4974             : 
    4975           0 :     if (byteorder)
    4976           0 :         bo = *byteorder;
    4977             : 
    4978             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    4979             :        byte order setting accordingly. In native mode, the leading BOM
    4980             :        mark is skipped, in all other modes, it is copied to the output
    4981             :        stream as-is (giving a ZWNBSP character). */
    4982           0 :     if (bo == 0) {
    4983           0 :         if (size >= 4) {
    4984           0 :             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
    4985           0 :                 (q[iorder[1]] << 8) | q[iorder[0]];
    4986             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    4987           0 :             if (bom == 0x0000FEFF) {
    4988           0 :                 q += 4;
    4989           0 :                 bo = -1;
    4990             :             }
    4991           0 :             else if (bom == 0xFFFE0000) {
    4992           0 :                 q += 4;
    4993           0 :                 bo = 1;
    4994             :             }
    4995             : #else
    4996             :             if (bom == 0x0000FEFF) {
    4997             :                 q += 4;
    4998             :                 bo = 1;
    4999             :             }
    5000             :             else if (bom == 0xFFFE0000) {
    5001             :                 q += 4;
    5002             :                 bo = -1;
    5003             :             }
    5004             : #endif
    5005             :         }
    5006             :     }
    5007             : 
    5008           0 :     if (bo == -1) {
    5009             :         /* force LE */
    5010           0 :         iorder[0] = 0;
    5011           0 :         iorder[1] = 1;
    5012           0 :         iorder[2] = 2;
    5013           0 :         iorder[3] = 3;
    5014             :     }
    5015           0 :     else if (bo == 1) {
    5016             :         /* force BE */
    5017           0 :         iorder[0] = 3;
    5018           0 :         iorder[1] = 2;
    5019           0 :         iorder[2] = 1;
    5020           0 :         iorder[3] = 0;
    5021             :     }
    5022             : 
    5023             :     /* This might be one too many, because of a BOM */
    5024           0 :     unicode = PyUnicode_New((size+3)/4, 127);
    5025           0 :     if (!unicode)
    5026           0 :         return NULL;
                           /* Empty input returns early, before the code below that stores
                              *consumed; report zero bytes here so the caller never reads a
                              value this function did not write. */
                           if (size == 0 && consumed)
                               *consumed = 0;
    5027           0 :     if (size == 0)
    5028           0 :         return unicode;
    5029           0 :     outpos = 0;
    5030             : 
    5031           0 :     while (q < e) {
    5032             :         Py_UCS4 ch;
    5033             :         /* remaining bytes at the end? (size should be divisible by 4) */
    5034           0 :         if (e-q<4) {
    5035           0 :             if (consumed)
    5036           0 :                 break;
    5037           0 :             errmsg = "truncated data";
    5038           0 :             startinpos = ((const char *)q)-starts;
    5039           0 :             endinpos = ((const char *)e)-starts;
    5040           0 :             goto utf32Error;
    5041             :             /* The remaining input chars are ignored if the callback
    5042             :                chooses to skip the input */
    5043             :         }
                               /* Assemble one code point from four bytes in the selected order. */
    5044           0 :         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
    5045           0 :             (q[iorder[1]] << 8) | q[iorder[0]];
    5046             : 
    5047           0 :         if (ch >= 0x110000)
    5048             :         {
    5049           0 :             errmsg = "codepoint not in range(0x110000)";
    5050           0 :             startinpos = ((const char *)q)-starts;
    5051           0 :             endinpos = startinpos+4;
    5052           0 :             goto utf32Error;
    5053             :         }
    5054           0 :         if (unicode_putchar(&unicode, &outpos, ch) < 0)
    5055           0 :             goto onError;
    5056           0 :         q += 4;
    5057           0 :         continue;
    5058             :       utf32Error:
                               /* A non-zero result from the handler aborts the decode. */
    5059           0 :         if (unicode_decode_call_errorhandler(
    5060             :                 errors, &errorHandler,
    5061             :                 "utf32", errmsg,
    5062             :                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
    5063             :                 &unicode, &outpos))
    5064           0 :             goto onError;
    5065             :     }
    5066             : 
    5067           0 :     if (byteorder)
    5068           0 :         *byteorder = bo;
    5069             : 
    5070           0 :     if (consumed)
    5071           0 :         *consumed = (const char *)q-starts;
    5072             : 
    5073             :     /* Adjust length */
    5074           0 :     if (unicode_resize(&unicode, outpos) < 0)
    5075           0 :         goto onError;
    5076             : 
    5077           0 :     Py_XDECREF(errorHandler);
    5078           0 :     Py_XDECREF(exc);
    5079           0 :     return unicode_result(unicode);
    5080             : 
    5081             :   onError:
    5082           0 :     Py_DECREF(unicode);
    5083           0 :     Py_XDECREF(errorHandler);
    5084           0 :     Py_XDECREF(exc);
    5085           0 :     return NULL;
    5086             : }
    5087             : 
                       /* Encode the unicode object `str` as UTF-32 into a new bytes object.
                          byteorder -  0: native byte order, output starts with a BOM (U+FEFF);
                                      -1: little endian;  1: big endian.  No BOM is written for
                                      any non-zero value.
                          errors    - not used by this function.
                          Returns NULL if `str` is not a unicode object (PyErr_BadArgument),
                          if it cannot be made ready, if the output size would not fit in
                          Py_ssize_t (PyErr_NoMemory), or if the bytes object cannot be
                          created. */
    5088             : PyObject *
    5089           0 : _PyUnicode_EncodeUTF32(PyObject *str,
    5090             :                        const char *errors,
    5091             :                        int byteorder)
    5092             : {
    5093             :     int kind;
    5094             :     void *data;
    5095             :     Py_ssize_t len;
    5096             :     PyObject *v;
    5097             :     unsigned char *p;
    5098             :     Py_ssize_t nsize, bytesize, i;
    5099             :     /* Offsets from p for storing bytes in the right order. */
    5100             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    5101           0 :     int iorder[] = {0, 1, 2, 3};
    5102             : #else
    5103             :     int iorder[] = {3, 2, 1, 0};
    5104             : #endif
    5105             : 
                       /* Store CH as four bytes at p in the order given by iorder, then
                          advance p past them. */
    5106             : #define STORECHAR(CH)                           \
    5107             :     do {                                        \
    5108             :         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
    5109             :         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
    5110             :         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
    5111             :         p[iorder[0]] = (CH) & 0xff;             \
    5112             :         p += 4;                                 \
    5113             :     } while(0)
    5114             : 
    5115           0 :     if (!PyUnicode_Check(str)) {
    5116           0 :         PyErr_BadArgument();
    5117           0 :         return NULL;
    5118             :     }
    5119           0 :     if (PyUnicode_READY(str) == -1)
    5120           0 :         return NULL;
    5121           0 :     kind = PyUnicode_KIND(str);
    5122           0 :     data = PyUnicode_DATA(str);
    5123           0 :     len = PyUnicode_GET_LENGTH(str);
    5124             : 
                           /* Reject lengths whose byte count (4 per character, plus the BOM)
                              cannot be represented.  The test is done before multiplying
                              because signed overflow is undefined, so it cannot be detected
                              reliably after the fact. */
                           if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
                               return PyErr_NoMemory();
    5125           0 :     nsize = len + (byteorder == 0);
    5126           0 :     bytesize = nsize * 4;
    5129           0 :     v = PyBytes_FromStringAndSize(NULL, bytesize);
    5130           0 :     if (v == NULL)
    5131           0 :         return NULL;
    5132             : 
    5133           0 :     p = (unsigned char *)PyBytes_AS_STRING(v);
                           /* The BOM is written while iorder still holds the native order. */
    5134           0 :     if (byteorder == 0)
    5135           0 :         STORECHAR(0xFEFF);
    5136           0 :     if (len == 0)
    5137           0 :         goto done;
    5138             : 
    5139           0 :     if (byteorder == -1) {
    5140             :         /* force LE */
    5141           0 :         iorder[0] = 0;
    5142           0 :         iorder[1] = 1;
    5143           0 :         iorder[2] = 2;
    5144           0 :         iorder[3] = 3;
    5145             :     }
    5146           0 :     else if (byteorder == 1) {
    5147             :         /* force BE */
    5148           0 :         iorder[0] = 3;
    5149           0 :         iorder[1] = 2;
    5150           0 :         iorder[2] = 1;
    5151           0 :         iorder[3] = 0;
    5152             :     }
    5153             : 
    5154           0 :     for (i = 0; i < len; i++)
    5155           0 :         STORECHAR(PyUnicode_READ(kind, data, i));
    5156             : 
    5157             :   done:
    5158           0 :     return v;
    5159             : #undef STORECHAR
    5160             : }
    5161             : 
                       /* Encode the `size` Py_UNICODE units at `s` as UTF-32.  The data is
                          first wrapped in a temporary unicode object, which is encoded with
                          _PyUnicode_EncodeUTF32() and then released.  Returns the encoder's
                          result, or NULL if the temporary object could not be created. */
    5162             : PyObject *
    5163           0 : PyUnicode_EncodeUTF32(const Py_UNICODE *s,
    5164             :                       Py_ssize_t size,
    5165             :                       const char *errors,
    5166             :                       int byteorder)
    5167             : {
    5168             :     PyObject *result;
    5169           0 :     PyObject *tmp = PyUnicode_FromUnicode(s, size);
    5170           0 :     if (tmp == NULL)
    5171           0 :         return NULL;
    5172           0 :     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
                           /* The temporary is no longer needed whether or not encoding worked. */
    5173           0 :     Py_DECREF(tmp);
    5174           0 :     return result;
    5175             : }
    5176             : 
                       /* Encode `unicode` as UTF-32 by calling _PyUnicode_EncodeUTF32() with
                          a NULL `errors` argument and byteorder 0 (the setting for which that
                          function writes a leading BOM). */
    5177             : PyObject *
    5178           0 : PyUnicode_AsUTF32String(PyObject *unicode)
    5179             : {
    5180           0 :     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
    5181             : }
    5182             : 
    5183             : /* --- UTF-16 Codec ------------------------------------------------------- */
    5184             : 
                       /* Decode a complete UTF-16 buffer: forwards to
                          PyUnicode_DecodeUTF16Stateful() with `consumed` set to NULL, so
                          incomplete data at the end of the input is reported as an error
                          rather than left for a later call. */
    5185             : PyObject *
    5186           0 : PyUnicode_DecodeUTF16(const char *s,
    5187             :                       Py_ssize_t size,
    5188             :                       const char *errors,
    5189             :                       int *byteorder)
    5190             : {
    5191           0 :     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
    5192             : }
    5193             : 
                       /* Decode the `size` bytes of UTF-16 data at `s` into a unicode object.
                          errors    - error-handling scheme name, forwarded unchanged to
                                      unicode_decode_call_errorhandler().
                          byteorder - if not NULL, supplies the initial byte order (-1 little
                                      endian, 1 big endian, 0 detect from a leading BOM and
                                      otherwise use native order); it is written back only
                                      after the BOM check has run.
                          consumed  - if not NULL, receives the number of input bytes
                                      processed; in that mode a decoder result of 0 ends
                                      decoding instead of raising "truncated data".
                          Returns the decoded object, or NULL on failure after releasing the
                          partial result, the error handler and the exception object. */
    5194             : PyObject *
    5195           0 : PyUnicode_DecodeUTF16Stateful(const char *s,
    5196             :                               Py_ssize_t size,
    5197             :                               const char *errors,
    5198             :                               int *byteorder,
    5199             :                               Py_ssize_t *consumed)
    5200             : {
    5201           0 :     const char *starts = s;
    5202             :     Py_ssize_t startinpos;
    5203             :     Py_ssize_t endinpos;
    5204             :     Py_ssize_t outpos;
    5205             :     PyObject *unicode;
    5206             :     const unsigned char *q, *e;
    5207           0 :     int bo = 0;       /* assume native ordering by default */
    5208             :     int native_ordering;
    5209           0 :     const char *errmsg = "";
    5210           0 :     PyObject *errorHandler = NULL;
    5211           0 :     PyObject *exc = NULL;
    5212             : 
    5213           0 :     q = (unsigned char *)s;
    5214           0 :     e = q + size;
    5215             : 
    5216           0 :     if (byteorder)
    5217           0 :         bo = *byteorder;
    5218             : 
    5219             :     /* Check for BOM marks (U+FEFF) in the input and adjust current
    5220             :        byte order setting accordingly. In native mode, the leading BOM
    5221             :        mark is skipped, in all other modes, it is copied to the output
    5222             :        stream as-is (giving a ZWNBSP character). */
    5223           0 :     if (bo == 0 && size >= 2) {
                               /* The first two bytes are always combined low byte first, so
                                  0xFEFF means little-endian data and 0xFFFE big-endian data. */
    5224           0 :         const Py_UCS4 bom = (q[1] << 8) | q[0];
    5225           0 :         if (bom == 0xFEFF) {
    5226           0 :             q += 2;
    5227           0 :             bo = -1;
    5228             :         }
    5229           0 :         else if (bom == 0xFFFE) {
    5230           0 :             q += 2;
    5231           0 :             bo = 1;
    5232             :         }
    5233           0 :         if (byteorder)
    5234           0 :             *byteorder = bo;
    5235             :     }
    5236             : 
                           /* Nothing left after the optional BOM: return the shared empty
                              string and report the whole input as consumed. */
    5237           0 :     if (q == e) {
    5238           0 :         if (consumed)
    5239           0 :             *consumed = size;
    5240           0 :         Py_INCREF(unicode_empty);
    5241           0 :         return unicode_empty;
    5242             :     }
    5243             : 
                           /* bo == 0 at this point means no BOM was found, which counts as
                              native order on either kind of host. */
    5244             : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    5245           0 :     native_ordering = bo <= 0;
    5246             : #else
    5247             :     native_ordering = bo >= 0;
    5248             : #endif
    5249             : 
    5250             :     /* Note: size will always be longer than the resulting Unicode
    5251             :        character count */
    5252           0 :     unicode = PyUnicode_New((e - q + 1) / 2, 127);
    5253           0 :     if (!unicode)
    5254           0 :         return NULL;
    5255             : 
    5256           0 :     outpos = 0;
    5257             :     while (1) {
                               /* ch stays 0 when fewer than two bytes remain, which sends
                                  control to the end-of-input handling in case 0 below. */
    5258           0 :         Py_UCS4 ch = 0;
    5259           0 :         if (e - q >= 2) {
    5260           0 :             int kind = PyUnicode_KIND(unicode);
    5261           0 :             if (kind == PyUnicode_1BYTE_KIND) {
    5262           0 :                 if (PyUnicode_IS_ASCII(unicode))
    5263           0 :                     ch = asciilib_utf16_decode(&q, e,
    5264           0 :                             PyUnicode_1BYTE_DATA(unicode), &outpos,
    5265             :                             native_ordering);
    5266             :                 else
    5267           0 :                     ch = ucs1lib_utf16_decode(&q, e,
    5268           0 :                             PyUnicode_1BYTE_DATA(unicode), &outpos,
    5269             :                             native_ordering);
    5270           0 :             } else if (kind == PyUnicode_2BYTE_KIND) {
    5271           0 :                 ch = ucs2lib_utf16_decode(&q, e,
    5272           0 :                         PyUnicode_2BYTE_DATA(unicode), &outpos,
    5273             :                         native_ordering);
    5274             :             } else {
    5275             :                 assert(kind == PyUnicode_4BYTE_KIND);
    5276           0 :                 ch = ucs4lib_utf16_decode(&q, e,
    5277           0 :                         PyUnicode_4BYTE_DATA(unicode), &outpos,
    5278             :                         native_ordering);
    5279             :             }
    5280             :         }
    5281             : 
                               /* 0 to 3 are status codes; any other value is a character that
                                  still has to be appended. */
    5282           0 :         switch (ch)
    5283             :         {
    5284             :         case 0:
    5285             :             /* remaining byte at the end? (size should be even) */
    5286           0 :             if (q == e || consumed)
    5287             :                 goto End;
    5288           0 :             errmsg = "truncated data";
    5289           0 :             startinpos = ((const char *)q) - starts;
    5290           0 :             endinpos = ((const char *)e) - starts;
    5291           0 :             break;
    5292             :             /* The remaining input chars are ignored if the callback
    5293             :                chooses to skip the input */
    5294             :         case 1:
    5295           0 :             errmsg = "unexpected end of data";
                                   /* The error range starts two bytes before q and runs to the
                                      end of the input. */
    5296           0 :             startinpos = ((const char *)q) - 2 - starts;
    5297           0 :             endinpos = ((const char *)e) - starts;
    5298           0 :             break;
    5299             :         case 2:
    5300           0 :             errmsg = "illegal encoding";
    5301           0 :             startinpos = ((const char *)q) - 2 - starts;
    5302           0 :             endinpos = startinpos + 2;
    5303           0 :             break;
    5304             :         case 3:
    5305           0 :             errmsg = "illegal UTF-16 surrogate";
                                   /* Here the two-byte error range starts four bytes before q. */
    5306           0 :             startinpos = ((const char *)q) - 4 - starts;
    5307           0 :             endinpos = startinpos + 2;
    5308           0 :             break;
    5309             :         default:
    5310           0 :             if (unicode_putchar(&unicode, &outpos, ch) < 0)
    5311           0 :                 goto onError;
    5312           0 :             continue;
    5313             :         }
    5314             : 
                               /* Only the error cases reach this point.  A non-zero result
                                  from the handler aborts the decode. */
    5315           0 :         if (unicode_decode_call_errorhandler(
    5316             :                 errors,
    5317             :                 &errorHandler,
    5318             :                 "utf16", errmsg,
    5319             :                 &starts,
    5320             :                 (const char **)&e,
    5321             :                 &startinpos,
    5322             :                 &endinpos,
    5323             :                 &exc,
    5324             :                 (const char **)&q,
    5325             :                 &unicode,
    5326             :                 &outpos))
    5327           0 :             goto onError;
    5328           0 :     }
    5329             : 
    5330             : End:
    5331           0 :     if (consumed)
    5332           0 :         *consumed = (const char *)q-starts;
    5333             : 
    5334             :     /* Adjust length */
    5335           0 :     if (unicode_resize(&unicode, outpos) < 0)
    5336           0 :         goto onError;
    5337             : 
    5338           0 :     Py_XDECREF(errorHandler);
    5339           0 :     Py_XDECREF(exc);
    5340           0 :     return unicode_result(unicode);
    5341             : 
    5342             :   onError:
    5343           0 :     Py_DECREF(unicode);
    5344           0 :     Py_XDECREF(errorHandler);
    5345           0 :     Py_XDECREF(exc);
    5346           0 :     return NULL;
    5347             : }
    5348             : 
    5349             : PyObject *
    5350           0 : _PyUnicode_EncodeUTF16(PyObject *str,
    5351             :                        const char *errors,
    5352             :                        int byteorder)
    5353             : {
    5354             :     enum PyUnicode_Kind kind;
    5355             :     const void *data;
    5356             :     Py_ssize_t len;
    5357             :     PyObject *v;
    5358             :     unsigned short *out;
    5359             :     Py_ssize_t bytesize;
    5360             :     Py_ssize_t pairs;
    5361             : #ifdef WORDS_BIGENDIAN
    5362             :     int native_ordering = byteorder >= 0;
    5363             : #else
    5364           0 :     int native_ordering = byteorder <= 0;
    5365             : #endif
    5366             : 
    5367           0 :     if (!PyUnicode_Check(str)) {
    5368           0 :         PyErr_BadArgument();
    5369           0 :         return NULL;
    5370             :     }
    5371           0 :     if (PyUnicode_READY(str) == -1)
    5372           0 :         return NULL;
    5373           0 :     kind = PyUnicode_KIND(str);
    5374           0 :     data = PyUnicode_DATA(str);
    5375           0 :     len = PyUnicode_GET_LENGTH(str);
    5376             : 
    5377           0 :     pairs = 0;
    5378           0 :     if (kind == PyUnicode_4BYTE_KIND) {
    5379           0 :         const Py_UCS4 *in = (const Py_UCS4 *)data;
    5380           0 :         const Py_UCS4 *end = in + len;
    5381           0 :         while (in < end)
    5382           0 :             if (*in++ >= 0x10000)
    5383           0 :                 pairs++;
    5384             :     }
    5385           0 :     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
    5386           0 :         return PyErr_NoMemory();
    5387           0 :     bytesize = (len + pairs + (byteorder == 0)) * 2;
    5388           0 :     v = PyBytes_FromStringAndSize(NULL, bytesize);
    5389           0 :     if (v == NULL)
    5390           0 :         return NULL;
    5391             : 
    5392             :     /* output buffer is 2-bytes aligned */
    5393             :     assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
    5394           0 :     out = (unsigned short *)PyBytes_AS_STRING(v);
    5395           0 :     if (byteorder == 0)
    5396           0 :         *out++ = 0xFEFF;
    5397           0 :     if (len == 0)
    5398           0 :         goto done;
    5399             : 
    5400           0 :     switch (kind) {
    5401             :     case PyUnicode_1BYTE_KIND: {
    5402           0 :         ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
    5403           0 :         break;
    5404             :     }
    5405             :     case PyUnicode_2BYTE_KIND: {
    5406           0 :         ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
    5407           0 :         break;
    5408             :     }
    5409             :     case PyUnicode_4BYTE_KIND: {
    5410           0 :         ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
    5411           0 :         break;
    5412             :     }
    5413             :     default:
    5414             :         assert(0);
    5415             :     }
    5416             : 
    5417             :   done:
    5418           0 :     return v;
    5419             : }
    5420             : 
    5421             : PyObject *
    5422           0 : PyUnicode_EncodeUTF16(const Py_UNICODE *s,
    5423             :                       Py_ssize_t size,
    5424             :                       const char *errors,
    5425             :                       int byteorder)
    5426             : {
    5427             :     PyObject *result;
    5428           0 :     PyObject *tmp = PyUnicode_FromUnicode(s, size);
    5429           0 :     if (tmp == NULL)
    5430           0 :         return NULL;
    5431           0 :     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
    5432           0 :     Py_DECREF(tmp);
    5433           0 :     return result;
    5434             : }
    5435             : 
    5436             : PyObject *
    5437           0 : PyUnicode_AsUTF16String(PyObject *unicode)
    5438             : {
    5439           0 :     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
    5440             : }
    5441             : 
    5442             : /* --- Unicode Escape Codec ----------------------------------------------- */
    5443             : 
    5444             : /* Helper function for PyUnicode_DecodeUnicodeEscape, determines
    5445             :    if all the escapes in the string make it still a valid ASCII string.
    5446             :    Returns -1 if any escapes were found which cause the string to
    5447             :    pop out of ASCII range.  Otherwise returns the length of the
    5448             :    required buffer to hold the string.
    5449             :    */
    5450             : static Py_ssize_t
    5451          77 : length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
    5452             : {
    5453          77 :     const unsigned char *p = (const unsigned char *)s;
    5454          77 :     const unsigned char *end = p + size;
    5455          77 :     Py_ssize_t length = 0;
    5456             : 
    5457          77 :     if (size < 0)
    5458           0 :         return -1;
    5459             : 
    5460        1798 :     for (; p < end; ++p) {
    5461        1721 :         if (*p > 127) {
    5462             :             /* Non-ASCII */
    5463           0 :             return -1;
    5464             :         }
    5465        1721 :         else if (*p != '\\') {
    5466             :             /* Normal character */
    5467        1718 :             ++length;
    5468             :         }
    5469             :         else {
    5470             :             /* Backslash-escape, check next char */
    5471           3 :             ++p;
    5472             :             /* Escape sequence reaches till end of string or
    5473             :                non-ASCII follow-up. */
    5474           3 :             if (p >= end || *p > 127)
    5475           0 :                 return -1;
    5476           3 :             switch (*p) {
    5477             :             case '\n':
    5478             :                 /* backslash + \n result in zero characters */
    5479           0 :                 break;
    5480             :             case '\\': case '\'': case '\"':
    5481             :             case 'b': case 'f': case 't':
    5482             :             case 'n': case 'r': case 'v': case 'a':
    5483           3 :                 ++length;
    5484           3 :                 break;
    5485             :             case '0': case '1': case '2': case '3':
    5486             :             case '4': case '5': case '6': case '7':
    5487             :             case 'x': case 'u': case 'U': case 'N':
    5488             :                 /* these do not guarantee ASCII characters */
    5489           0 :                 return -1;
    5490             :             default:
    5491             :                 /* count the backslash + the other character */
    5492           0 :                 length += 2;
    5493             :             }
    5494             :         }
    5495             :     }
    5496          77 :     return length;
    5497             : }
    5498             : 
    5499             : static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
    5500             : 
    5501             : PyObject *
    5502          77 : PyUnicode_DecodeUnicodeEscape(const char *s,
    5503             :                               Py_ssize_t size,
    5504             :                               const char *errors)
    5505             : {
    5506          77 :     const char *starts = s;
    5507             :     Py_ssize_t startinpos;
    5508             :     Py_ssize_t endinpos;
    5509             :     int j;
    5510             :     PyObject *v;
    5511             :     const char *end;
    5512             :     char* message;
    5513          77 :     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
    5514          77 :     PyObject *errorHandler = NULL;
    5515          77 :     PyObject *exc = NULL;
    5516             :     Py_ssize_t len;
    5517             :     Py_ssize_t i;
    5518             : 
    5519          77 :     len = length_of_escaped_ascii_string(s, size);
    5520             : 
    5521             :     /* After length_of_escaped_ascii_string() there are two alternatives,
    5522             :        either the string is pure ASCII with named escapes like \n, etc.
    5523             :        and we determined it's exact size (common case)
    5524             :        or it contains \x, \u, ... escape sequences.  then we create a
    5525             :        legacy wchar string and resize it at the end of this function. */
    5526          77 :     if (len >= 0) {
    5527          77 :         v = PyUnicode_New(len, 127);
    5528          77 :         if (!v)
    5529           0 :             goto onError;
    5530             :         assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
    5531             :     }
    5532             :     else {
    5533             :         /* Escaped strings will always be longer than the resulting
    5534             :            Unicode string, so we start with size here and then reduce the
    5535             :            length after conversion to the true value.
    5536             :            (but if the error callback returns a long replacement string
    5537             :            we'll have to allocate more space) */
    5538           0 :         v = PyUnicode_New(size, 127);
    5539           0 :         if (!v)
    5540           0 :             goto onError;
    5541           0 :         len = size;
    5542             :     }
    5543             : 
    5544          77 :     if (size == 0)
    5545           5 :         return v;
    5546          72 :     i = 0;
    5547          72 :     end = s + size;
    5548             : 
    5549        1865 :     while (s < end) {
    5550             :         unsigned char c;
    5551             :         Py_UCS4 x;
    5552             :         int digits;
    5553             : 
    5554             :         /* The only case in which i == ascii_length is a backslash
    5555             :            followed by a newline. */
    5556             :         assert(i <= len);
    5557             : 
    5558             :         /* Non-escape characters are interpreted as Unicode ordinals */
    5559        1721 :         if (*s != '\\') {
    5560        1718 :             if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
    5561           0 :                 goto onError;
    5562        1718 :             continue;
    5563             :         }
    5564             : 
    5565           3 :         startinpos = s-starts;
    5566             :         /* \ - Escapes */
    5567           3 :         s++;
    5568           3 :         c = *s++;
    5569           3 :         if (s > end)
    5570           0 :             c = '\0'; /* Invalid after \ */
    5571             : 
    5572             :         /* The only case in which i == ascii_length is a backslash
    5573             :            followed by a newline. */
    5574             :         assert(i < len || (i == len && c == '\n'));
    5575             : 
    5576           3 :         switch (c) {
    5577             : 
    5578             :             /* \x escapes */
    5579             : #define WRITECHAR(ch)                                   \
    5580             :             do {                                        \
    5581             :                 if (unicode_putchar(&v, &i, ch) < 0)    \
    5582             :                     goto onError;                       \
    5583             :             }while(0)
    5584             : 
    5585           0 :         case '\n': break;
    5586           0 :         case '\\': WRITECHAR('\\'); break;
    5587           0 :         case '\'': WRITECHAR('\''); break;
    5588           0 :         case '\"': WRITECHAR('\"'); break;
    5589           0 :         case 'b': WRITECHAR('\b'); break;
    5590             :         /* FF */
    5591           0 :         case 'f': WRITECHAR('\014'); break;
    5592           0 :         case 't': WRITECHAR('\t'); break;
    5593           3 :         case 'n': WRITECHAR('\n'); break;
    5594           0 :         case 'r': WRITECHAR('\r'); break;
    5595             :         /* VT */
    5596           0 :         case 'v': WRITECHAR('\013'); break;
    5597             :         /* BEL, not classic C */
    5598           0 :         case 'a': WRITECHAR('\007'); break;
    5599             : 
    5600             :             /* \OOO (octal) escapes */
    5601             :         case '0': case '1': case '2': case '3':
    5602             :         case '4': case '5': case '6': case '7':
    5603           0 :             x = s[-1] - '0';
    5604           0 :             if (s < end && '0' <= *s && *s <= '7') {
    5605           0 :                 x = (x<<3) + *s++ - '0';
    5606           0 :                 if (s < end && '0' <= *s && *s <= '7')
    5607           0 :                     x = (x<<3) + *s++ - '0';
    5608             :             }
    5609           0 :             WRITECHAR(x);
    5610           0 :             break;
    5611             : 
    5612             :             /* hex escapes */
    5613             :             /* \xXX */
    5614             :         case 'x':
    5615           0 :             digits = 2;
    5616           0 :             message = "truncated \\xXX escape";
    5617           0 :             goto hexescape;
    5618             : 
    5619             :             /* \uXXXX */
    5620             :         case 'u':
    5621           0 :             digits = 4;
    5622           0 :             message = "truncated \\uXXXX escape";
    5623           0 :             goto hexescape;
    5624             : 
    5625             :             /* \UXXXXXXXX */
    5626             :         case 'U':
    5627           0 :             digits = 8;
    5628           0 :             message = "truncated \\UXXXXXXXX escape";
    5629             :         hexescape:
    5630           0 :             chr = 0;
    5631           0 :             if (s+digits>end) {
    5632           0 :                 endinpos = size;
    5633           0 :                 if (unicode_decode_call_errorhandler(
    5634             :                         errors, &errorHandler,
    5635             :                         "unicodeescape", "end of string in escape sequence",
    5636             :                         &starts, &end, &startinpos, &endinpos, &exc, &s,
    5637             :                         &v, &i))
    5638           0 :                     goto onError;
    5639           0 :                 goto nextByte;
    5640             :             }
    5641           0 :             for (j = 0; j < digits; ++j) {
    5642           0 :                 c = (unsigned char) s[j];
    5643           0 :                 if (!Py_ISXDIGIT(c)) {
    5644           0 :                     endinpos = (s+j+1)-starts;
    5645           0 :                     if (unicode_decode_call_errorhandler(
    5646             :                             errors, &errorHandler,
    5647             :                             "unicodeescape", message,
    5648             :                             &starts, &end, &startinpos, &endinpos, &exc, &s,
    5649             :                             &v, &i))
    5650           0 :                         goto onError;
    5651           0 :                     len = PyUnicode_GET_LENGTH(v);
    5652           0 :                     goto nextByte;
    5653             :                 }
    5654           0 :                 chr = (chr<<4) & ~0xF;
    5655           0 :                 if (c >= '0' && c <= '9')
    5656           0 :                     chr += c - '0';
    5657           0 :                 else if (c >= 'a' && c <= 'f')
    5658           0 :                     chr += 10 + c - 'a';
    5659             :                 else
    5660           0 :                     chr += 10 + c - 'A';
    5661             :             }
    5662           0 :             s += j;
    5663           0 :             if (chr == 0xffffffff && PyErr_Occurred())
    5664             :                 /* _decoding_error will have already written into the
    5665             :                    target buffer. */
    5666           0 :                 break;
    5667             :         store:
    5668             :             /* when we get here, chr is a 32-bit unicode character */
    5669           0 :             if (chr <= MAX_UNICODE) {
    5670           0 :                 WRITECHAR(chr);
    5671             :             } else {
    5672           0 :                 endinpos = s-starts;
    5673           0 :                 if (unicode_decode_call_errorhandler(
    5674             :                         errors, &errorHandler,
    5675             :                         "unicodeescape", "illegal Unicode character",
    5676             :                         &starts, &end, &startinpos, &endinpos, &exc, &s,
    5677             :                         &v, &i))
    5678           0 :                     goto onError;
    5679             :             }
    5680           0 :             break;
    5681             : 
    5682             :             /* \N{name} */
    5683             :         case 'N':
    5684           0 :             message = "malformed \\N character escape";
    5685           0 :             if (ucnhash_CAPI == NULL) {
    5686             :                 /* load the unicode data module */
    5687           0 :                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
    5688             :                                                 PyUnicodeData_CAPSULE_NAME, 1);
    5689           0 :                 if (ucnhash_CAPI == NULL)
    5690           0 :                     goto ucnhashError;
    5691             :             }
    5692           0 :             if (*s == '{') {
    5693           0 :                 const char *start = s+1;
    5694             :                 /* look for the closing brace */
    5695           0 :                 while (*s != '}' && s < end)
    5696           0 :                     s++;
    5697           0 :                 if (s > start && s < end && *s == '}') {
    5698             :                     /* found a name.  look it up in the unicode database */
    5699           0 :                     message = "unknown Unicode character name";
    5700           0 :                     s++;
    5701           0 :                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
    5702             :                                               &chr, 0))
    5703           0 :                         goto store;
    5704             :                 }
    5705             :             }
    5706           0 :             endinpos = s-starts;
    5707           0 :             if (unicode_decode_call_errorhandler(
    5708             :                     errors, &errorHandler,
    5709             :                     "unicodeescape", message,
    5710             :                     &starts, &end, &startinpos, &endinpos, &exc, &s,
    5711             :                     &v, &i))
    5712           0 :                 goto onError;
    5713           0 :             break;
    5714             : 
    5715             :         default:
    5716           0 :             if (s > end) {
    5717           0 :                 message = "\\ at end of string";
    5718           0 :                 s--;
    5719           0 :                 endinpos = s-starts;
    5720           0 :                 if (unicode_decode_call_errorhandler(
    5721             :                         errors, &errorHandler,
    5722             :                         "unicodeescape", message,
    5723             :                         &starts, &end, &startinpos, &endinpos, &exc, &s,
    5724             :                         &v, &i))
    5725           0 :                     goto onError;
    5726             :             }
    5727             :             else {
    5728           0 :                 WRITECHAR('\\');
    5729           0 :                 WRITECHAR(s[-1]);
    5730             :             }
    5731           0 :             break;
    5732             :         }
    5733             :       nextByte:
    5734             :         ;
    5735             :     }
    5736             : #undef WRITECHAR
    5737             : 
    5738          72 :     if (unicode_resize(&v, i) < 0)
    5739           0 :         goto onError;
    5740          72 :     Py_XDECREF(errorHandler);
    5741          72 :     Py_XDECREF(exc);
    5742          72 :     return unicode_result(v);
    5743             : 
    5744             :   ucnhashError:
    5745           0 :     PyErr_SetString(
    5746             :         PyExc_UnicodeError,
    5747             :         "\\N escapes not supported (can't load unicodedata module)"
    5748             :         );
    5749           0 :     Py_XDECREF(v);
    5750           0 :     Py_XDECREF(errorHandler);
    5751           0 :     Py_XDECREF(exc);
    5752           0 :     return NULL;
    5753             : 
    5754             :   onError:
    5755           0 :     Py_XDECREF(v);
    5756           0 :     Py_XDECREF(errorHandler);
    5757           0 :     Py_XDECREF(exc);
    5758           0 :     return NULL;
    5759             : }
    5760             : 
    5761             : /* Return a Unicode-Escape string version of the Unicode object.
    5762             : 
    5763             :    If quotes is true, the string is enclosed in u"" or u'' quotes as
    5764             :    appropriate.
    5765             : 
    5766             : */
    5767             : 
    5768             : PyObject *
    5769           0 : PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
    5770             : {
    5771             :     Py_ssize_t i, len;
    5772             :     PyObject *repr;
    5773             :     char *p;
    5774             :     int kind;
    5775             :     void *data;
    5776           0 :     Py_ssize_t expandsize = 0;
    5777             : 
    5778             :     /* Initial allocation is based on the longest-possible unichr
    5779             :        escape.
    5780             : 
    5781             :        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
    5782             :        unichr, so in this case it's the longest unichr escape. In
    5783             :        narrow (UTF-16) builds this is five chars per source unichr
    5784             :        since there are two unichrs in the surrogate pair, so in narrow
    5785             :        (UTF-16) builds it's not the longest unichr escape.
    5786             : 
    5787             :        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
    5788             :        so in the narrow (UTF-16) build case it's the longest unichr
    5789             :        escape.
    5790             :     */
    5791             : 
    5792           0 :     if (!PyUnicode_Check(unicode)) {
    5793           0 :         PyErr_BadArgument();
    5794           0 :         return NULL;
    5795             :     }
    5796           0 :     if (PyUnicode_READY(unicode) == -1)
    5797           0 :         return NULL;
    5798           0 :     len = PyUnicode_GET_LENGTH(unicode);
    5799           0 :     kind = PyUnicode_KIND(unicode);
    5800           0 :     data = PyUnicode_DATA(unicode);
    5801           0 :     switch (kind) {
    5802           0 :     case PyUnicode_1BYTE_KIND: expandsize = 4; break;
    5803           0 :     case PyUnicode_2BYTE_KIND: expandsize = 6; break;
    5804           0 :     case PyUnicode_4BYTE_KIND: expandsize = 10; break;
    5805             :     }
    5806             : 
    5807           0 :     if (len == 0)
    5808           0 :         return PyBytes_FromStringAndSize(NULL, 0);
    5809             : 
    5810           0 :     if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
    5811           0 :         return PyErr_NoMemory();
    5812             : 
    5813           0 :     repr = PyBytes_FromStringAndSize(NULL,
    5814             :                                      2
    5815           0 :                                      + expandsize*len
    5816             :                                      + 1);
    5817           0 :     if (repr == NULL)
    5818           0 :         return NULL;
    5819             : 
    5820           0 :     p = PyBytes_AS_STRING(repr);
    5821             : 
    5822           0 :     for (i = 0; i < len; i++) {
    5823           0 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    5824             : 
    5825             :         /* Escape backslashes */
    5826           0 :         if (ch == '\\') {
    5827           0 :             *p++ = '\\';
    5828           0 :             *p++ = (char) ch;
    5829           0 :             continue;
    5830             :         }
    5831             : 
    5832             :         /* Map 21-bit characters to '\U00xxxxxx' */
    5833           0 :         else if (ch >= 0x10000) {
    5834             :             assert(ch <= MAX_UNICODE);
    5835           0 :             *p++ = '\\';
    5836           0 :             *p++ = 'U';
    5837           0 :             *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
    5838           0 :             *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
    5839           0 :             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
    5840           0 :             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
    5841           0 :             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
    5842           0 :             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
    5843           0 :             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
    5844           0 :             *p++ = Py_hexdigits[ch & 0x0000000F];
    5845           0 :             continue;
    5846             :         }
    5847             : 
    5848             :         /* Map 16-bit characters to '\uxxxx' */
    5849           0 :         if (ch >= 256) {
    5850           0 :             *p++ = '\\';
    5851           0 :             *p++ = 'u';
    5852           0 :             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
    5853           0 :             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
    5854           0 :             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
    5855           0 :             *p++ = Py_hexdigits[ch & 0x000F];
    5856             :         }
    5857             : 
    5858             :         /* Map special whitespace to '\t', \n', '\r' */
    5859           0 :         else if (ch == '\t') {
    5860           0 :             *p++ = '\\';
    5861           0 :             *p++ = 't';
    5862             :         }
    5863           0 :         else if (ch == '\n') {
    5864           0 :             *p++ = '\\';
    5865           0 :             *p++ = 'n';
    5866             :         }
    5867           0 :         else if (ch == '\r') {
    5868           0 :             *p++ = '\\';
    5869           0 :             *p++ = 'r';
    5870             :         }
    5871             : 
    5872             :         /* Map non-printable US ASCII to '\xhh' */
    5873           0 :         else if (ch < ' ' || ch >= 0x7F) {
    5874           0 :             *p++ = '\\';
    5875           0 :             *p++ = 'x';
    5876           0 :             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
    5877           0 :             *p++ = Py_hexdigits[ch & 0x000F];
    5878             :         }
    5879             : 
    5880             :         /* Copy everything else as-is */
    5881             :         else
    5882           0 :             *p++ = (char) ch;
    5883             :     }
    5884             : 
    5885             :     assert(p - PyBytes_AS_STRING(repr) > 0);
    5886           0 :     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
    5887           0 :         return NULL;
    5888           0 :     return repr;
    5889             : }
    5890             : 
    5891             : PyObject *
    5892           0 : PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
    5893             :                               Py_ssize_t size)
    5894             : {
    5895             :     PyObject *result;
    5896           0 :     PyObject *tmp = PyUnicode_FromUnicode(s, size);
    5897           0 :     if (tmp == NULL)
    5898           0 :         return NULL;
    5899           0 :     result = PyUnicode_AsUnicodeEscapeString(tmp);
    5900           0 :     Py_DECREF(tmp);
    5901           0 :     return result;
    5902             : }
    5903             : 
    5904             : /* --- Raw Unicode Escape Codec ------------------------------------------- */
    5905             : 
    5906             : PyObject *
    5907           0 : PyUnicode_DecodeRawUnicodeEscape(const char *s,
    5908             :                                  Py_ssize_t size,
    5909             :                                  const char *errors)
    5910             : {
    5911           0 :     const char *starts = s;
    5912             :     Py_ssize_t startinpos;
    5913             :     Py_ssize_t endinpos;
    5914             :     Py_ssize_t outpos;
    5915             :     PyObject *v;
    5916             :     const char *end;
    5917             :     const char *bs;
    5918           0 :     PyObject *errorHandler = NULL;
    5919           0 :     PyObject *exc = NULL;
    5920             : 
    5921             :     /* Escaped strings will always be longer than the resulting
    5922             :        Unicode string, so we start with size here and then reduce the
    5923             :        length after conversion to the true value. (But decoding error
    5924             :        handler might have to resize the string) */
    5925           0 :     v = PyUnicode_New(size, 127);
    5926           0 :     if (v == NULL)
    5927           0 :         goto onError;
    5928           0 :     if (size == 0)
    5929           0 :         return v;
    5930           0 :     outpos = 0;
    5931           0 :     end = s + size;
    5932           0 :     while (s < end) {
    5933             :         unsigned char c;
    5934             :         Py_UCS4 x;
    5935             :         int i;
    5936             :         int count;
    5937             : 
    5938             :         /* Non-escape characters are interpreted as Unicode ordinals */
    5939           0 :         if (*s != '\\') {
    5940           0 :             if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
    5941           0 :                 goto onError;
    5942           0 :             continue;
    5943             :         }
    5944           0 :         startinpos = s-starts;
    5945             : 
    5946             :         /* \u-escapes are only interpreted iff the number of leading
    5947             :            backslashes if odd */
    5948           0 :         bs = s;
    5949           0 :         for (;s < end;) {
    5950           0 :             if (*s != '\\')
    5951           0 :                 break;
    5952           0 :             if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
    5953           0 :                 goto onError;
    5954             :         }
    5955           0 :         if (((s - bs) & 1) == 0 ||
    5956           0 :             s >= end ||
    5957           0 :             (*s != 'u' && *s != 'U')) {
    5958           0 :             continue;
    5959             :         }
    5960           0 :         outpos--;
    5961           0 :         count = *s=='u' ? 4 : 8;
    5962           0 :         s++;
    5963             : 
    5964             :         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
    5965           0 :         for (x = 0, i = 0; i < count; ++i, ++s) {
    5966           0 :             c = (unsigned char)*s;
    5967           0 :             if (!Py_ISXDIGIT(c)) {
    5968           0 :                 endinpos = s-starts;
    5969           0 :                 if (unicode_decode_call_errorhandler(
    5970             :                         errors, &errorHandler,
    5971             :                         "rawunicodeescape", "truncated \\uXXXX",
    5972             :                         &starts, &end, &startinpos, &endinpos, &exc, &s,
    5973             :                         &v, &outpos))
    5974           0 :                     goto onError;
    5975           0 :                 goto nextByte;
    5976             :             }
    5977           0 :             x = (x<<4) & ~0xF;
    5978           0 :             if (c >= '0' && c <= '9')
    5979           0 :                 x += c - '0';
    5980           0 :             else if (c >= 'a' && c <= 'f')
    5981           0 :                 x += 10 + c - 'a';
    5982             :             else
    5983           0 :                 x += 10 + c - 'A';
    5984             :         }
    5985           0 :         if (x <= MAX_UNICODE) {
    5986           0 :             if (unicode_putchar(&v, &outpos, x) < 0)
    5987           0 :                 goto onError;
    5988             :         } else {
    5989           0 :             endinpos = s-starts;
    5990           0 :             if (unicode_decode_call_errorhandler(
    5991             :                     errors, &errorHandler,
    5992             :                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
    5993             :                     &starts, &end, &startinpos, &endinpos, &exc, &s,
    5994             :                     &v, &outpos))
    5995           0 :                 goto onError;
    5996             :         }
    5997             :       nextByte:
    5998             :         ;
    5999             :     }
    6000           0 :     if (unicode_resize(&v, outpos) < 0)
    6001           0 :         goto onError;
    6002           0 :     Py_XDECREF(errorHandler);
    6003           0 :     Py_XDECREF(exc);
    6004           0 :     return unicode_result(v);
    6005             : 
    6006             :   onError:
    6007           0 :     Py_XDECREF(v);
    6008           0 :     Py_XDECREF(errorHandler);
    6009           0 :     Py_XDECREF(exc);
    6010           0 :     return NULL;
    6011             : }
    6012             : 
    6013             : 
    6014             : PyObject *
    6015           0 : PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
    6016             : {
    6017             :     PyObject *repr;
    6018             :     char *p;
    6019             :     char *q;
    6020             :     Py_ssize_t expandsize, pos;
    6021             :     int kind;
    6022             :     void *data;
    6023             :     Py_ssize_t len;
    6024             : 
    6025           0 :     if (!PyUnicode_Check(unicode)) {
    6026           0 :         PyErr_BadArgument();
    6027           0 :         return NULL;
    6028             :     }
    6029           0 :     if (PyUnicode_READY(unicode) == -1)
    6030           0 :         return NULL;
    6031           0 :     kind = PyUnicode_KIND(unicode);
    6032           0 :     data = PyUnicode_DATA(unicode);
    6033           0 :     len = PyUnicode_GET_LENGTH(unicode);
    6034             :     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
    6035             :        bytes, and 1 byte characters 4. */
    6036           0 :     expandsize = kind * 2 + 2;
    6037             : 
    6038           0 :     if (len > PY_SSIZE_T_MAX / expandsize)
    6039           0 :         return PyErr_NoMemory();
    6040             : 
    6041           0 :     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    6042           0 :     if (repr == NULL)
    6043           0 :         return NULL;
    6044           0 :     if (len == 0)
    6045           0 :         return repr;
    6046             : 
    6047           0 :     p = q = PyBytes_AS_STRING(repr);
    6048           0 :     for (pos = 0; pos < len; pos++) {
    6049           0 :         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
    6050             :         /* Map 32-bit characters to '\Uxxxxxxxx' */
    6051           0 :         if (ch >= 0x10000) {
    6052             :             assert(ch <= MAX_UNICODE);
    6053           0 :             *p++ = '\\';
    6054           0 :             *p++ = 'U';
    6055           0 :             *p++ = Py_hexdigits[(ch >> 28) & 0xf];
    6056           0 :             *p++ = Py_hexdigits[(ch >> 24) & 0xf];
    6057           0 :             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
    6058           0 :             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
    6059           0 :             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
    6060           0 :             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
    6061           0 :             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
    6062           0 :             *p++ = Py_hexdigits[ch & 15];
    6063             :         }
    6064             :         /* Map 16-bit characters to '\uxxxx' */
    6065           0 :         else if (ch >= 256) {
    6066           0 :             *p++ = '\\';
    6067           0 :             *p++ = 'u';
    6068           0 :             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
    6069           0 :             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
    6070           0 :             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
    6071           0 :             *p++ = Py_hexdigits[ch & 15];
    6072             :         }
    6073             :         /* Copy everything else as-is */
    6074             :         else
    6075           0 :             *p++ = (char) ch;
    6076             :     }
    6077             : 
    6078             :     assert(p > q);
    6079           0 :     if (_PyBytes_Resize(&repr, p - q) < 0)
    6080           0 :         return NULL;
    6081           0 :     return repr;
    6082             : }
    6083             : 
    6084             : PyObject *
    6085           0 : PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
    6086             :                                  Py_ssize_t size)
    6087             : {
    6088             :     PyObject *result;
    6089           0 :     PyObject *tmp = PyUnicode_FromUnicode(s, size);
    6090           0 :     if (tmp == NULL)
    6091           0 :         return NULL;
    6092           0 :     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
    6093           0 :     Py_DECREF(tmp);
    6094           0 :     return result;
    6095             : }
    6096             : 
    6097             : /* --- Unicode Internal Codec ------------------------------------------- */
    6098             : 
    6099             : PyObject *
    6100           0 : _PyUnicode_DecodeUnicodeInternal(const char *s,
    6101             :                                  Py_ssize_t size,
    6102             :                                  const char *errors)
    6103             : {
    6104           0 :     const char *starts = s;
    6105             :     Py_ssize_t startinpos;
    6106             :     Py_ssize_t endinpos;
    6107             :     Py_ssize_t outpos;
    6108             :     PyObject *v;
    6109             :     const char *end;
    6110             :     const char *reason;
    6111           0 :     PyObject *errorHandler = NULL;
    6112           0 :     PyObject *exc = NULL;
    6113             : 
    6114           0 :     if (PyErr_WarnEx(PyExc_DeprecationWarning,
    6115             :                      "unicode_internal codec has been deprecated",
    6116             :                      1))
    6117           0 :         return NULL;
    6118             : 
    6119             :     /* XXX overflow detection missing */
    6120           0 :     v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
    6121           0 :     if (v == NULL)
    6122           0 :         goto onError;
    6123           0 :     if (PyUnicode_GET_LENGTH(v) == 0)
    6124           0 :         return v;
    6125           0 :     outpos = 0;
    6126           0 :     end = s + size;
    6127             : 
    6128           0 :     while (s < end) {
    6129             :         Py_UNICODE uch;
    6130             :         Py_UCS4 ch;
    6131             :         /* We copy the raw representation one byte at a time because the
    6132             :            pointer may be unaligned (see test_codeccallbacks). */
    6133           0 :         ((char *) &uch)[0] = s[0];
    6134           0 :         ((char *) &uch)[1] = s[1];
    6135             : #ifdef Py_UNICODE_WIDE
    6136           0 :         ((char *) &uch)[2] = s[2];
    6137           0 :         ((char *) &uch)[3] = s[3];
    6138             : #endif
    6139           0 :         ch = uch;
    6140             : 
    6141             :         /* We have to sanity check the raw data, otherwise doom looms for
    6142             :            some malformed UCS-4 data. */
    6143           0 :         if (
    6144             : #ifdef Py_UNICODE_WIDE
    6145           0 :             ch > 0x10ffff ||
    6146             : #endif
    6147           0 :             end-s < Py_UNICODE_SIZE
    6148             :             )
    6149             :         {
    6150           0 :             startinpos = s - starts;
    6151           0 :             if (end-s < Py_UNICODE_SIZE) {
    6152           0 :                 endinpos = end-starts;
    6153           0 :                 reason = "truncated input";
    6154             :             }
    6155             :             else {
    6156           0 :                 endinpos = s - starts + Py_UNICODE_SIZE;
    6157           0 :                 reason = "illegal code point (> 0x10FFFF)";
    6158             :             }
    6159           0 :             if (unicode_decode_call_errorhandler(
    6160             :                     errors, &errorHandler,
    6161             :                     "unicode_internal", reason,
    6162             :                     &starts, &end, &startinpos, &endinpos, &exc, &s,
    6163             :                     &v, &outpos))
    6164             :                 goto onError;
    6165           0 :             continue;
    6166             :         }
    6167             : 
    6168           0 :         s += Py_UNICODE_SIZE;
    6169             : #ifndef Py_UNICODE_WIDE
    6170             :         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
    6171             :         {
    6172             :             Py_UNICODE uch2;
    6173             :             ((char *) &uch2)[0] = s[0];
    6174             :             ((char *) &uch2)[1] = s[1];
    6175             :             if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
    6176             :             {
    6177             :                 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
    6178             :                 s += Py_UNICODE_SIZE;
    6179             :             }
    6180             :         }
    6181             : #endif
    6182             : 
    6183           0 :         if (unicode_putchar(&v, &outpos, ch) < 0)
    6184             :             goto onError;
    6185             :     }
    6186             : 
    6187           0 :     if (unicode_resize(&v, outpos) < 0)
    6188           0 :         goto onError;
    6189           0 :     Py_XDECREF(errorHandler);
    6190           0 :     Py_XDECREF(exc);
    6191           0 :     return unicode_result(v);
    6192             : 
    6193             :   onError:
    6194           0 :     Py_XDECREF(v);
    6195           0 :     Py_XDECREF(errorHandler);
    6196           0 :     Py_XDECREF(exc);
    6197           0 :     return NULL;
    6198             : }
    6199             : 
    6200             : /* --- Latin-1 Codec ------------------------------------------------------ */
    6201             : 
    6202             : PyObject *
    6203           0 : PyUnicode_DecodeLatin1(const char *s,
    6204             :                        Py_ssize_t size,
    6205             :                        const char *errors)
    6206             : {
    6207             :     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
    6208           0 :     return _PyUnicode_FromUCS1((unsigned char*)s, size);
    6209             : }
    6210             : 
    6211             : /* create or adjust a UnicodeEncodeError */
    6212             : static void
    6213           0 : make_encode_exception(PyObject **exceptionObject,
    6214             :                       const char *encoding,
    6215             :                       PyObject *unicode,
    6216             :                       Py_ssize_t startpos, Py_ssize_t endpos,
    6217             :                       const char *reason)
    6218             : {
    6219           0 :     if (*exceptionObject == NULL) {
    6220           0 :         *exceptionObject = PyObject_CallFunction(
    6221             :             PyExc_UnicodeEncodeError, "sOnns",
    6222             :             encoding, unicode, startpos, endpos, reason);
    6223             :     }
    6224             :     else {
    6225           0 :         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
    6226           0 :             goto onError;
    6227           0 :         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
    6228           0 :             goto onError;
    6229           0 :         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
    6230           0 :             goto onError;
    6231           0 :         return;
    6232             :       onError:
    6233           0 :         Py_DECREF(*exceptionObject);
    6234           0 :         *exceptionObject = NULL;
    6235             :     }
    6236             : }
    6237             : 
    6238             : /* raises a UnicodeEncodeError */
    6239             : static void
    6240           0 : raise_encode_exception(PyObject **exceptionObject,
    6241             :                        const char *encoding,
    6242             :                        PyObject *unicode,
    6243             :                        Py_ssize_t startpos, Py_ssize_t endpos,
    6244             :                        const char *reason)
    6245             : {
    6246           0 :     make_encode_exception(exceptionObject,
    6247             :                           encoding, unicode, startpos, endpos, reason);
    6248           0 :     if (*exceptionObject != NULL)
    6249           0 :         PyCodec_StrictErrors(*exceptionObject);
    6250           0 : }
    6251             : 
    6252             : /* error handling callback helper:
    6253             :    build arguments, call the callback and check the arguments,
    6254             :    put the result into newpos and return the replacement string, which
    6255             :    has to be freed by the caller */
    6256             : static PyObject *
    6257           0 : unicode_encode_call_errorhandler(const char *errors,
    6258             :                                  PyObject **errorHandler,
    6259             :                                  const char *encoding, const char *reason,
    6260             :                                  PyObject *unicode, PyObject **exceptionObject,
    6261             :                                  Py_ssize_t startpos, Py_ssize_t endpos,
    6262             :                                  Py_ssize_t *newpos)
    6263             : {
    6264             :     static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
    6265             :     Py_ssize_t len;
    6266             :     PyObject *restuple;
    6267             :     PyObject *resunicode;
    6268             : 
    6269           0 :     if (*errorHandler == NULL) {
    6270           0 :         *errorHandler = PyCodec_LookupError(errors);
    6271           0 :         if (*errorHandler == NULL)
    6272           0 :             return NULL;
    6273             :     }
    6274             : 
    6275           0 :     if (PyUnicode_READY(unicode) == -1)
    6276           0 :         return NULL;
    6277           0 :     len = PyUnicode_GET_LENGTH(unicode);
    6278             : 
    6279           0 :     make_encode_exception(exceptionObject,
    6280             :                           encoding, unicode, startpos, endpos, reason);
    6281           0 :     if (*exceptionObject == NULL)
    6282           0 :         return NULL;
    6283             : 
    6284           0 :     restuple = PyObject_CallFunctionObjArgs(
    6285             :         *errorHandler, *exceptionObject, NULL);
    6286           0 :     if (restuple == NULL)
    6287           0 :         return NULL;
    6288           0 :     if (!PyTuple_Check(restuple)) {
    6289           0 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    6290           0 :         Py_DECREF(restuple);
    6291           0 :         return NULL;
    6292             :     }
    6293           0 :     if (!PyArg_ParseTuple(restuple, argparse,
    6294             :                           &resunicode, newpos)) {
    6295           0 :         Py_DECREF(restuple);
    6296           0 :         return NULL;
    6297             :     }
    6298           0 :     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
    6299           0 :         PyErr_SetString(PyExc_TypeError, &argparse[3]);
    6300           0 :         Py_DECREF(restuple);
    6301           0 :         return NULL;
    6302             :     }
    6303           0 :     if (*newpos<0)
    6304           0 :         *newpos = len + *newpos;
    6305           0 :     if (*newpos<0 || *newpos>len) {
    6306           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    6307           0 :         Py_DECREF(restuple);
    6308           0 :         return NULL;
    6309             :     }
    6310           0 :     Py_INCREF(resunicode);
    6311           0 :     Py_DECREF(restuple);
    6312           0 :     return resunicode;
    6313             : }
    6314             : 
    6315             : static PyObject *
    6316           0 : unicode_encode_ucs1(PyObject *unicode,
    6317             :                     const char *errors,
    6318             :                     unsigned int limit)
    6319             : {
    6320             :     /* input state */
    6321           0 :     Py_ssize_t pos=0, size;
    6322             :     int kind;
    6323             :     void *data;
    6324             :     /* output object */
    6325             :     PyObject *res;
    6326             :     /* pointer into the output */
    6327             :     char *str;
    6328             :     /* current output position */
    6329             :     Py_ssize_t ressize;
    6330           0 :     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    6331           0 :     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    6332           0 :     PyObject *errorHandler = NULL;
    6333           0 :     PyObject *exc = NULL;
    6334             :     /* the following variable is used for caching string comparisons
    6335             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    6336           0 :     int known_errorHandler = -1;
    6337             : 
    6338           0 :     if (PyUnicode_READY(unicode) == -1)
    6339           0 :         return NULL;
    6340           0 :     size = PyUnicode_GET_LENGTH(unicode);
    6341           0 :     kind = PyUnicode_KIND(unicode);
    6342           0 :     data = PyUnicode_DATA(unicode);
    6343             :     /* allocate enough for a simple encoding without
    6344             :        replacements, if we need more, we'll resize */
    6345           0 :     if (size == 0)
    6346           0 :         return PyBytes_FromStringAndSize(NULL, 0);
    6347           0 :     res = PyBytes_FromStringAndSize(NULL, size);
    6348           0 :     if (res == NULL)
    6349           0 :         return NULL;
    6350           0 :     str = PyBytes_AS_STRING(res);
    6351           0 :     ressize = size;
    6352             : 
    6353           0 :     while (pos < size) {
    6354           0 :         Py_UCS4 c = PyUnicode_READ(kind, data, pos);
    6355             : 
    6356             :         /* can we encode this? */
    6357           0 :         if (c<limit) {
    6358             :             /* no overflow check, because we know that the space is enough */
    6359           0 :             *str++ = (char)c;
    6360           0 :             ++pos;
    6361             :         }
    6362             :         else {
    6363             :             Py_ssize_t requiredsize;
    6364             :             PyObject *repunicode;
    6365             :             Py_ssize_t repsize, newpos, respos, i;
    6366             :             /* startpos for collecting unencodable chars */
    6367           0 :             Py_ssize_t collstart = pos;
    6368           0 :             Py_ssize_t collend = pos;
    6369             :             /* find all unecodable characters */
    6370           0 :             while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
    6371           0 :                 ++collend;
    6372             :             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
    6373           0 :             if (known_errorHandler==-1) {
    6374           0 :                 if ((errors==NULL) || (!strcmp(errors, "strict")))
    6375           0 :                     known_errorHandler = 1;
    6376           0 :                 else if (!strcmp(errors, "replace"))
    6377           0 :                     known_errorHandler = 2;
    6378           0 :                 else if (!strcmp(errors, "ignore"))
    6379           0 :                     known_errorHandler = 3;
    6380           0 :                 else if (!strcmp(errors, "xmlcharrefreplace"))
    6381           0 :                     known_errorHandler = 4;
    6382             :                 else
    6383           0 :                     known_errorHandler = 0;
    6384             :             }
    6385           0 :             switch (known_errorHandler) {
    6386             :             case 1: /* strict */
    6387           0 :                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
    6388             :                 goto onError;
    6389             :             case 2: /* replace */
    6390           0 :                 while (collstart++<collend)
    6391           0 :                     *str++ = '?'; /* fall through */
    6392             :             case 3: /* ignore */
    6393           0 :                 pos = collend;
    6394           0 :                 break;
    6395             :             case 4: /* xmlcharrefreplace */
    6396           0 :                 respos = str - PyBytes_AS_STRING(res);
    6397             :                 /* determine replacement size */
    6398           0 :                 for (i = collstart, repsize = 0; i < collend; ++i) {
    6399           0 :                     Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    6400           0 :                     if (ch < 10)
    6401           0 :                         repsize += 2+1+1;
    6402           0 :                     else if (ch < 100)
    6403           0 :                         repsize += 2+2+1;
    6404           0 :                     else if (ch < 1000)
    6405           0 :                         repsize += 2+3+1;
    6406           0 :                     else if (ch < 10000)
    6407           0 :                         repsize += 2+4+1;
    6408           0 :                     else if (ch < 100000)
    6409           0 :                         repsize += 2+5+1;
    6410           0 :                     else if (ch < 1000000)
    6411           0 :                         repsize += 2+6+1;
    6412             :                     else {
    6413             :                         assert(ch <= MAX_UNICODE);
    6414           0 :                         repsize += 2+7+1;
    6415             :                     }
    6416             :                 }
    6417           0 :                 requiredsize = respos+repsize+(size-collend);
    6418           0 :                 if (requiredsize > ressize) {
    6419           0 :                     if (requiredsize<2*ressize)
    6420           0 :                         requiredsize = 2*ressize;
    6421           0 :                     if (_PyBytes_Resize(&res, requiredsize))
    6422             :                         goto onError;
    6423           0 :                     str = PyBytes_AS_STRING(res) + respos;
    6424           0 :                     ressize = requiredsize;
    6425             :                 }
    6426             :                 /* generate replacement */
    6427           0 :                 for (i = collstart; i < collend; ++i) {
    6428           0 :                     str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
    6429             :                 }
    6430           0 :                 pos = collend;
    6431           0 :                 break;
    6432             :             default:
    6433           0 :                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
    6434             :                                                               encoding, reason, unicode, &exc,
    6435             :                                                               collstart, collend, &newpos);
    6436           0 :                 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
    6437           0 :                                            PyUnicode_READY(repunicode) == -1))
    6438             :                     goto onError;
    6439           0 :                 if (PyBytes_Check(repunicode)) {
    6440             :                     /* Directly copy bytes result to output. */
    6441           0 :                     repsize = PyBytes_Size(repunicode);
    6442           0 :                     if (repsize > 1) {
    6443             :                         /* Make room for all additional bytes. */
    6444           0 :                         respos = str - PyBytes_AS_STRING(res);
    6445           0 :                         if (_PyBytes_Resize(&res, ressize+repsize-1)) {
    6446           0 :                             Py_DECREF(repunicode);
    6447             :                             goto onError;
    6448             :                         }
    6449           0 :                         str = PyBytes_AS_STRING(res) + respos;
    6450           0 :                         ressize += repsize-1;
    6451             :                     }
    6452           0 :                     memcpy(str, PyBytes_AsString(repunicode), repsize);
    6453           0 :                     str += repsize;
    6454           0 :                     pos = newpos;
    6455           0 :                     Py_DECREF(repunicode);
    6456           0 :                     break;
    6457             :                 }
    6458             :                 /* need more space? (at least enough for what we
    6459             :                    have+the replacement+the rest of the string, so
    6460             :                    we won't have to check space for encodable characters) */
    6461           0 :                 respos = str - PyBytes_AS_STRING(res);
    6462           0 :                 repsize = PyUnicode_GET_LENGTH(repunicode);
    6463           0 :                 requiredsize = respos+repsize+(size-collend);
    6464           0 :                 if (requiredsize > ressize) {
    6465           0 :                     if (requiredsize<2*ressize)
    6466           0 :                         requiredsize = 2*ressize;
    6467           0 :                     if (_PyBytes_Resize(&res, requiredsize)) {
    6468           0 :                         Py_DECREF(repunicode);
    6469             :                         goto onError;
    6470             :                     }
    6471           0 :                     str = PyBytes_AS_STRING(res) + respos;
    6472           0 :                     ressize = requiredsize;
    6473             :                 }
    6474             :                 /* check if there is anything unencodable in the replacement
    6475             :                    and copy it to the output */
    6476           0 :                 for (i = 0; repsize-->0; ++i, ++str) {
    6477           0 :                     c = PyUnicode_READ_CHAR(repunicode, i);
    6478           0 :                     if (c >= limit) {
    6479           0 :                         raise_encode_exception(&exc, encoding, unicode,
    6480             :                                                pos, pos+1, reason);
    6481           0 :                         Py_DECREF(repunicode);
    6482             :                         goto onError;
    6483             :                     }
    6484           0 :                     *str = (char)c;
    6485             :                 }
    6486           0 :                 pos = newpos;
    6487           0 :                 Py_DECREF(repunicode);
    6488             :             }
    6489             :         }
    6490             :     }
    6491             :     /* Resize if we allocated to much */
    6492           0 :     size = str - PyBytes_AS_STRING(res);
    6493           0 :     if (size < ressize) { /* If this falls res will be NULL */
    6494             :         assert(size >= 0);
    6495           0 :         if (_PyBytes_Resize(&res, size) < 0)
    6496           0 :             goto onError;
    6497             :     }
    6498             : 
    6499           0 :     Py_XDECREF(errorHandler);
    6500           0 :     Py_XDECREF(exc);
    6501           0 :     return res;
    6502             : 
    6503             :   onError:
    6504           0 :     Py_XDECREF(res);
    6505           0 :     Py_XDECREF(errorHandler);
    6506           0 :     Py_XDECREF(exc);
    6507           0 :     return NULL;
    6508             : }
    6509             : 
    6510             : /* Deprecated */
    6511             : PyObject *
    6512           0 : PyUnicode_EncodeLatin1(const Py_UNICODE *p,
    6513             :                        Py_ssize_t size,
    6514             :                        const char *errors)
    6515             : {
    6516             :     PyObject *result;
    6517           0 :     PyObject *unicode = PyUnicode_FromUnicode(p, size);
    6518           0 :     if (unicode == NULL)
    6519           0 :         return NULL;
    6520           0 :     result = unicode_encode_ucs1(unicode, errors, 256);
    6521           0 :     Py_DECREF(unicode);
    6522           0 :     return result;
    6523             : }
    6524             : 
    6525             : PyObject *
    6526           0 : _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
    6527             : {
    6528           0 :     if (!PyUnicode_Check(unicode)) {
    6529           0 :         PyErr_BadArgument();
    6530           0 :         return NULL;
    6531             :     }
    6532           0 :     if (PyUnicode_READY(unicode) == -1)
    6533           0 :         return NULL;
    6534             :     /* Fast path: if it is a one-byte string, construct
    6535             :        bytes object directly. */
    6536           0 :     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
    6537           0 :         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
    6538             :                                          PyUnicode_GET_LENGTH(unicode));
    6539             :     /* Non-Latin-1 characters present. Defer to above function to
    6540             :        raise the exception. */
    6541           0 :     return unicode_encode_ucs1(unicode, errors, 256);
    6542             : }
    6543             : 
    6544             : PyObject*
    6545           0 : PyUnicode_AsLatin1String(PyObject *unicode)
    6546             : {
    6547           0 :     return _PyUnicode_AsLatin1String(unicode, NULL);
    6548             : }
    6549             : 
    6550             : /* --- 7-bit ASCII Codec -------------------------------------------------- */
    6551             : 
    6552             : PyObject *
    6553          28 : PyUnicode_DecodeASCII(const char *s,
    6554             :                       Py_ssize_t size,
    6555             :                       const char *errors)
    6556             : {
    6557          28 :     const char *starts = s;
    6558             :     PyObject *unicode;
    6559             :     int kind;
    6560             :     void *data;
    6561             :     Py_ssize_t startinpos;
    6562             :     Py_ssize_t endinpos;
    6563             :     Py_ssize_t outpos;
    6564             :     const char *e;
    6565          28 :     PyObject *errorHandler = NULL;
    6566          28 :     PyObject *exc = NULL;
    6567             : 
    6568          28 :     if (size == 0) {
    6569           0 :         Py_INCREF(unicode_empty);
    6570           0 :         return unicode_empty;
    6571             :     }
    6572             : 
    6573             :     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    6574          28 :     if (size == 1 && (unsigned char)s[0] < 128)
    6575           0 :         return get_latin1_char((unsigned char)s[0]);
    6576             : 
    6577          28 :     unicode = PyUnicode_New(size, 127);
    6578          28 :     if (unicode == NULL)
    6579           0 :         goto onError;
    6580             : 
    6581          28 :     e = s + size;
    6582          28 :     data = PyUnicode_1BYTE_DATA(unicode);
    6583          28 :     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
    6584          28 :     if (outpos == size)
    6585          28 :         return unicode;
    6586             : 
    6587           0 :     s += outpos;
    6588           0 :     kind = PyUnicode_1BYTE_KIND;
    6589           0 :     while (s < e) {
    6590           0 :         register unsigned char c = (unsigned char)*s;
    6591           0 :         if (c < 128) {
    6592           0 :             PyUnicode_WRITE(kind, data, outpos++, c);
    6593           0 :             ++s;
    6594             :         }
    6595             :         else {
    6596           0 :             startinpos = s-starts;
    6597           0 :             endinpos = startinpos + 1;
    6598           0 :             if (unicode_decode_call_errorhandler(
    6599             :                     errors, &errorHandler,
    6600             :                     "ascii", "ordinal not in range(128)",
    6601             :                     &starts, &e, &startinpos, &endinpos, &exc, &s,
    6602             :                     &unicode, &outpos))
    6603           0 :                 goto onError;
    6604           0 :             kind = PyUnicode_KIND(unicode);
    6605           0 :             data = PyUnicode_DATA(unicode);
    6606             :         }
    6607             :     }
    6608           0 :     if (unicode_resize(&unicode, outpos) < 0)
    6609           0 :         goto onError;
    6610           0 :     Py_XDECREF(errorHandler);
    6611           0 :     Py_XDECREF(exc);
    6612             :     assert(_PyUnicode_CheckConsistency(unicode, 1));
    6613           0 :     return unicode;
    6614             : 
    6615             :   onError:
    6616           0 :     Py_XDECREF(unicode);
    6617           0 :     Py_XDECREF(errorHandler);
    6618           0 :     Py_XDECREF(exc);
    6619           0 :     return NULL;
    6620             : }
    6621             : 
    6622             : /* Deprecated */
    6623             : PyObject *
    6624           0 : PyUnicode_EncodeASCII(const Py_UNICODE *p,
    6625             :                       Py_ssize_t size,
    6626             :                       const char *errors)
    6627             : {
    6628             :     PyObject *result;
    6629           0 :     PyObject *unicode = PyUnicode_FromUnicode(p, size);
    6630           0 :     if (unicode == NULL)
    6631           0 :         return NULL;
    6632           0 :     result = unicode_encode_ucs1(unicode, errors, 128);
    6633           0 :     Py_DECREF(unicode);
    6634           0 :     return result;
    6635             : }
    6636             : 
    6637             : PyObject *
    6638           4 : _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
    6639             : {
    6640           4 :     if (!PyUnicode_Check(unicode)) {
    6641           0 :         PyErr_BadArgument();
    6642           0 :         return NULL;
    6643             :     }
    6644           4 :     if (PyUnicode_READY(unicode) == -1)
    6645           0 :         return NULL;
    6646             :     /* Fast path: if it is an ASCII-only string, construct bytes object
    6647             :        directly. Else defer to above function to raise the exception. */
    6648           4 :     if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
    6649           4 :         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
    6650             :                                          PyUnicode_GET_LENGTH(unicode));
    6651           0 :     return unicode_encode_ucs1(unicode, errors, 128);
    6652             : }
    6653             : 
    6654             : PyObject *
    6655           0 : PyUnicode_AsASCIIString(PyObject *unicode)
    6656             : {
    6657           0 :     return _PyUnicode_AsASCIIString(unicode, NULL);
    6658             : }
    6659             : 
    6660             : #ifdef HAVE_MBCS
    6661             : 
    6662             : /* --- MBCS codecs for Windows -------------------------------------------- */
    6663             : 
    6664             : #if SIZEOF_INT < SIZEOF_SIZE_T
    6665             : #define NEED_RETRY
    6666             : #endif
    6667             : 
    6668             : #ifndef WC_ERR_INVALID_CHARS
    6669             : #  define WC_ERR_INVALID_CHARS 0x0080
    6670             : #endif
    6671             : 
    6672             : static char*
    6673             : code_page_name(UINT code_page, PyObject **obj)
    6674             : {
    6675             :     *obj = NULL;
    6676             :     if (code_page == CP_ACP)
    6677             :         return "mbcs";
    6678             :     if (code_page == CP_UTF7)
    6679             :         return "CP_UTF7";
    6680             :     if (code_page == CP_UTF8)
    6681             :         return "CP_UTF8";
    6682             : 
    6683             :     *obj = PyBytes_FromFormat("cp%u", code_page);
    6684             :     if (*obj == NULL)
    6685             :         return NULL;
    6686             :     return PyBytes_AS_STRING(*obj);
    6687             : }
    6688             : 
    6689             : static int
    6690             : is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
    6691             : {
    6692             :     const char *curr = s + offset;
    6693             :     const char *prev;
    6694             : 
    6695             :     if (!IsDBCSLeadByteEx(code_page, *curr))
    6696             :         return 0;
    6697             : 
    6698             :     prev = CharPrevExA(code_page, s, curr, 0);
    6699             :     if (prev == curr)
    6700             :         return 1;
    6701             :     /* FIXME: This code is limited to "true" double-byte encodings,
    6702             :        as it assumes an incomplete character consists of a single
    6703             :        byte. */
    6704             :     if (curr - prev == 2)
    6705             :         return 1;
    6706             :     if (!IsDBCSLeadByteEx(code_page, *prev))
    6707             :         return 1;
    6708             :     return 0;
    6709             : }
    6710             : 
    6711             : static DWORD
    6712             : decode_code_page_flags(UINT code_page)
    6713             : {
    6714             :     if (code_page == CP_UTF7) {
    6715             :         /* The CP_UTF7 decoder only supports flags=0 */
    6716             :         return 0;
    6717             :     }
    6718             :     else
    6719             :         return MB_ERR_INVALID_CHARS;
    6720             : }
    6721             : 
    6722             : /*
    6723             :  * Decode a byte string from a Windows code page into unicode object in strict
    6724             :  * mode.
    6725             :  *
    6726             :  * Returns consumed size if succeed, returns -2 on decode error, or raise a
    6727             :  * WindowsError and returns -1 on other error.
    6728             :  */
    6729             : static int
    6730             : decode_code_page_strict(UINT code_page,
    6731             :                         PyObject **v,
    6732             :                         const char *in,
    6733             :                         int insize)
    6734             : {
    6735             :     const DWORD flags = decode_code_page_flags(code_page);
    6736             :     wchar_t *out;
    6737             :     DWORD outsize;
    6738             : 
    6739             :     /* First get the size of the result */
    6740             :     assert(insize > 0);
    6741             :     outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
    6742             :     if (outsize <= 0)
    6743             :         goto error;
    6744             : 
    6745             :     if (*v == NULL) {
    6746             :         /* Create unicode object */
    6747             :         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
    6748             :         *v = (PyObject*)_PyUnicode_New(outsize);
    6749             :         if (*v == NULL)
    6750             :             return -1;
    6751             :         out = PyUnicode_AS_UNICODE(*v);
    6752             :     }
    6753             :     else {
    6754             :         /* Extend unicode object */
    6755             :         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
    6756             :         if (unicode_resize(v, n + outsize) < 0)
    6757             :             return -1;
    6758             :         out = PyUnicode_AS_UNICODE(*v) + n;
    6759             :     }
    6760             : 
    6761             :     /* Do the conversion */
    6762             :     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    6763             :     if (outsize <= 0)
    6764             :         goto error;
    6765             :     return insize;
    6766             : 
    6767             : error:
    6768             :     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    6769             :         return -2;
    6770             :     PyErr_SetFromWindowsErr(0);
    6771             :     return -1;
    6772             : }
    6773             : 
    6774             : /*
    6775             :  * Decode a byte string from a code page into unicode object with an error
    6776             :  * handler.
    6777             :  *
    6778             :  * Returns consumed size if succeed, or raise a WindowsError or
    6779             :  * UnicodeDecodeError exception and returns -1 on error.
    6780             :  */
    6781             : static int
    6782             : decode_code_page_errors(UINT code_page,
    6783             :                         PyObject **v,
    6784             :                         const char *in, const int size,
    6785             :                         const char *errors)
    6786             : {
    6787             :     const char *startin = in;
    6788             :     const char *endin = in + size;
    6789             :     const DWORD flags = decode_code_page_flags(code_page);
    6790             :     /* Ideally, we should get reason from FormatMessage. This is the Windows
    6791             :        2000 English version of the message. */
    6792             :     const char *reason = "No mapping for the Unicode character exists "
    6793             :                          "in the target code page.";
    6794             :     /* each step cannot decode more than 1 character, but a character can be
    6795             :        represented as a surrogate pair */
    6796             :     wchar_t buffer[2], *startout, *out;
    6797             :     int insize, outsize;
    6798             :     PyObject *errorHandler = NULL;
    6799             :     PyObject *exc = NULL;
    6800             :     PyObject *encoding_obj = NULL;
    6801             :     char *encoding;
    6802             :     DWORD err;
    6803             :     int ret = -1;
    6804             : 
    6805             :     assert(size > 0);
    6806             : 
    6807             :     encoding = code_page_name(code_page, &encoding_obj);
    6808             :     if (encoding == NULL)
    6809             :         return -1;
    6810             : 
    6811             :     if (errors == NULL || strcmp(errors, "strict") == 0) {
    6812             :         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
    6813             :            UnicodeDecodeError. */
    6814             :         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
    6815             :         if (exc != NULL) {
    6816             :             PyCodec_StrictErrors(exc);
    6817             :             Py_CLEAR(exc);
    6818             :         }
    6819             :         goto error;
    6820             :     }
    6821             : 
    6822             :     if (*v == NULL) {
    6823             :         /* Create unicode object */
    6824             :         if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
    6825             :             PyErr_NoMemory();
    6826             :             goto error;
    6827             :         }
    6828             :         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
    6829             :         *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
    6830             :         if (*v == NULL)
    6831             :             goto error;
    6832             :         startout = PyUnicode_AS_UNICODE(*v);
    6833             :     }
    6834             :     else {
    6835             :         /* Extend unicode object */
    6836             :         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
    6837             :         if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
    6838             :             PyErr_NoMemory();
    6839             :             goto error;
    6840             :         }
    6841             :         if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
    6842             :             goto error;
    6843             :         startout = PyUnicode_AS_UNICODE(*v) + n;
    6844             :     }
    6845             : 
    6846             :     /* Decode the byte string character per character */
    6847             :     out = startout;
    6848             :     while (in < endin)
    6849             :     {
    6850             :         /* Decode a character */
    6851             :         insize = 1;
    6852             :         do
    6853             :         {
    6854             :             outsize = MultiByteToWideChar(code_page, flags,
    6855             :                                           in, insize,
    6856             :                                           buffer, Py_ARRAY_LENGTH(buffer));
    6857             :             if (outsize > 0)
    6858             :                 break;
    6859             :             err = GetLastError();
    6860             :             if (err != ERROR_NO_UNICODE_TRANSLATION
    6861             :                 && err != ERROR_INSUFFICIENT_BUFFER)
    6862             :             {
    6863             :                 PyErr_SetFromWindowsErr(0);
    6864             :                 goto error;
    6865             :             }
    6866             :             insize++;
    6867             :         }
    6868             :         /* 4=maximum length of a UTF-8 sequence */
    6869             :         while (insize <= 4 && (in + insize) <= endin);
    6870             : 
    6871             :         if (outsize <= 0) {
    6872             :             Py_ssize_t startinpos, endinpos, outpos;
    6873             : 
    6874             :             startinpos = in - startin;
    6875             :             endinpos = startinpos + 1;
    6876             :             outpos = out - PyUnicode_AS_UNICODE(*v);
    6877             :             if (unicode_decode_call_errorhandler(
    6878             :                     errors, &errorHandler,
    6879             :                     encoding, reason,
    6880             :                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
    6881             :                     v, &outpos))
    6882             :             {
    6883             :                 goto error;
    6884             :             }
    6885             :             out = PyUnicode_AS_UNICODE(*v) + outpos;
    6886             :         }
    6887             :         else {
    6888             :             in += insize;
    6889             :             memcpy(out, buffer, outsize * sizeof(wchar_t));
    6890             :             out += outsize;
    6891             :         }
    6892             :     }
    6893             : 
    6894             :     /* write a NUL character at the end */
    6895             :     *out = 0;
    6896             : 
    6897             :     /* Extend unicode object */
    6898             :     outsize = out - startout;
    6899             :     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
    6900             :     if (unicode_resize(v, outsize) < 0)
    6901             :         goto error;
    6902             :     ret = size;
    6903             : 
    6904             : error:
    6905             :     Py_XDECREF(encoding_obj);
    6906             :     Py_XDECREF(errorHandler);
    6907             :     Py_XDECREF(exc);
    6908             :     return ret;
    6909             : }
    6910             : 
    6911             : static PyObject *
    6912             : decode_code_page_stateful(int code_page,
    6913             :                           const char *s, Py_ssize_t size,
    6914             :                           const char *errors, Py_ssize_t *consumed)
    6915             : {
    6916             :     PyObject *v = NULL;
    6917             :     int chunk_size, final, converted, done;
    6918             : 
    6919             :     if (code_page < 0) {
    6920             :         PyErr_SetString(PyExc_ValueError, "invalid code page number");
    6921             :         return NULL;
    6922             :     }
    6923             : 
    6924             :     if (consumed)
    6925             :         *consumed = 0;
    6926             : 
    6927             :     do
    6928             :     {
    6929             : #ifdef NEED_RETRY
    6930             :         if (size > INT_MAX) {
    6931             :             chunk_size = INT_MAX;
    6932             :             final = 0;
    6933             :             done = 0;
    6934             :         }
    6935             :         else
    6936             : #endif
    6937             :         {
    6938             :             chunk_size = (int)size;
    6939             :             final = (consumed == NULL);
    6940             :             done = 1;
    6941             :         }
    6942             : 
    6943             :         /* Skip trailing lead-byte unless 'final' is set */
    6944             :         if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
    6945             :             --chunk_size;
    6946             : 
    6947             :         if (chunk_size == 0 && done) {
    6948             :             if (v != NULL)
    6949             :                 break;
    6950             :             Py_INCREF(unicode_empty);
    6951             :             return unicode_empty;
    6952             :         }
    6953             : 
    6954             : 
    6955             :         converted = decode_code_page_strict(code_page, &v,
    6956             :                                             s, chunk_size);
    6957             :         if (converted == -2)
    6958             :             converted = decode_code_page_errors(code_page, &v,
    6959             :                                                 s, chunk_size,
    6960             :                                                 errors);
    6961             :         assert(converted != 0);
    6962             : 
    6963             :         if (converted < 0) {
    6964             :             Py_XDECREF(v);
    6965             :             return NULL;
    6966             :         }
    6967             : 
    6968             :         if (consumed)
    6969             :             *consumed += converted;
    6970             : 
    6971             :         s += converted;
    6972             :         size -= converted;
    6973             :     } while (!done);
    6974             : 
    6975             :     return unicode_result(v);
    6976             : }
    6977             : 
    6978             : PyObject *
    6979             : PyUnicode_DecodeCodePageStateful(int code_page,
    6980             :                                  const char *s,
    6981             :                                  Py_ssize_t size,
    6982             :                                  const char *errors,
    6983             :                                  Py_ssize_t *consumed)
    6984             : {
    6985             :     return decode_code_page_stateful(code_page, s, size, errors, consumed);
    6986             : }
    6987             : 
    6988             : PyObject *
    6989             : PyUnicode_DecodeMBCSStateful(const char *s,
    6990             :                              Py_ssize_t size,
    6991             :                              const char *errors,
    6992             :                              Py_ssize_t *consumed)
    6993             : {
    6994             :     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
    6995             : }
    6996             : 
    6997             : PyObject *
    6998             : PyUnicode_DecodeMBCS(const char *s,
    6999             :                      Py_ssize_t size,
    7000             :                      const char *errors)
    7001             : {
    7002             :     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
    7003             : }
    7004             : 
    7005             : static DWORD
    7006             : encode_code_page_flags(UINT code_page, const char *errors)
    7007             : {
    7008             :     if (code_page == CP_UTF8) {
    7009             :         if (winver.dwMajorVersion >= 6)
    7010             :             /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
    7011             :                and later */
    7012             :             return WC_ERR_INVALID_CHARS;
    7013             :         else
    7014             :             /* CP_UTF8 only supports flags=0 on Windows older than Vista */
    7015             :             return 0;
    7016             :     }
    7017             :     else if (code_page == CP_UTF7) {
    7018             :         /* CP_UTF7 only supports flags=0 */
    7019             :         return 0;
    7020             :     }
    7021             :     else {
    7022             :         if (errors != NULL && strcmp(errors, "replace") == 0)
    7023             :             return 0;
    7024             :         else
    7025             :             return WC_NO_BEST_FIT_CHARS;
    7026             :     }
    7027             : }
    7028             : 
    7029             : /*
    7030             :  * Encode a Unicode string to a Windows code page into a byte string in strict
    7031             :  * mode.
    7032             :  *
    7033             :  * Returns consumed characters if succeed, returns -2 on encode error, or raise
    7034             :  * a WindowsError and returns -1 on other error.
    7035             :  */
    7036             : static int
    7037             : encode_code_page_strict(UINT code_page, PyObject **outbytes,
    7038             :                         PyObject *unicode, Py_ssize_t offset, int len,
    7039             :                         const char* errors)
    7040             : {
    7041             :     BOOL usedDefaultChar = FALSE;
    7042             :     BOOL *pusedDefaultChar = &usedDefaultChar;
    7043             :     int outsize;
    7044             :     PyObject *exc = NULL;
    7045             :     wchar_t *p;
    7046             :     Py_ssize_t size;
    7047             :     const DWORD flags = encode_code_page_flags(code_page, NULL);
    7048             :     char *out;
    7049             :     /* Create a substring so that we can get the UTF-16 representation
    7050             :        of just the slice under consideration. */
    7051             :     PyObject *substring;
    7052             : 
    7053             :     assert(len > 0);
    7054             : 
    7055             :     if (code_page != CP_UTF8 && code_page != CP_UTF7)
    7056             :         pusedDefaultChar = &usedDefaultChar;
    7057             :     else
    7058             :         pusedDefaultChar = NULL;
    7059             : 
    7060             :     substring = PyUnicode_Substring(unicode, offset, offset+len);
    7061             :     if (substring == NULL)
    7062             :         return -1;
    7063             :     p = PyUnicode_AsUnicodeAndSize(substring, &size);
    7064             :     if (p == NULL) {
    7065             :         Py_DECREF(substring);
    7066             :         return -1;
    7067             :     }
    7068             : 
    7069             :     /* First get the size of the result */
    7070             :     outsize = WideCharToMultiByte(code_page, flags,
    7071             :                                   p, size,
    7072             :                                   NULL, 0,
    7073             :                                   NULL, pusedDefaultChar);
    7074             :     if (outsize <= 0)
    7075             :         goto error;
    7076             :     /* If we used a default char, then we failed! */
    7077             :     if (pusedDefaultChar && *pusedDefaultChar) {
    7078             :         Py_DECREF(substring);
    7079             :         return -2;
    7080             :     }
    7081             : 
    7082             :     if (*outbytes == NULL) {
    7083             :         /* Create string object */
    7084             :         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
    7085             :         if (*outbytes == NULL) {
    7086             :             Py_DECREF(substring);
    7087             :             return -1;
    7088             :         }
    7089             :         out = PyBytes_AS_STRING(*outbytes);
    7090             :     }
    7091             :     else {
    7092             :         /* Extend string object */
    7093             :         const Py_ssize_t n = PyBytes_Size(*outbytes);
    7094             :         if (outsize > PY_SSIZE_T_MAX - n) {
    7095             :             PyErr_NoMemory();
    7096             :             Py_DECREF(substring);
    7097             :             return -1;
    7098             :         }
    7099             :         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
    7100             :             Py_DECREF(substring);
    7101             :             return -1;
    7102             :         }
    7103             :         out = PyBytes_AS_STRING(*outbytes) + n;
    7104             :     }
    7105             : 
    7106             :     /* Do the conversion */
    7107             :     outsize = WideCharToMultiByte(code_page, flags,
    7108             :                                   p, size,
    7109             :                                   out, outsize,
    7110             :                                   NULL, pusedDefaultChar);
    7111             :     Py_CLEAR(substring);
    7112             :     if (outsize <= 0)
    7113             :         goto error;
    7114             :     if (pusedDefaultChar && *pusedDefaultChar)
    7115             :         return -2;
    7116             :     return 0;
    7117             : 
    7118             : error:
    7119             :     Py_XDECREF(substring);
    7120             :     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
    7121             :         return -2;
    7122             :     PyErr_SetFromWindowsErr(0);
    7123             :     return -1;
    7124             : }
    7125             : 
    7126             : /*
    7127             :  * Encode a Unicode string to a Windows code page into a byte string using an
    7128             :  * error handler.
    7129             :  *
    7130             :  * Returns consumed characters if succeed, or raise a WindowsError and returns
    7131             :  * -1 on other error.
    7132             :  */
    7133             : static int
    7134             : encode_code_page_errors(UINT code_page, PyObject **outbytes,
    7135             :                         PyObject *unicode, Py_ssize_t unicode_offset,
    7136             :                         Py_ssize_t insize, const char* errors)
    7137             : {
    7138             :     const DWORD flags = encode_code_page_flags(code_page, errors);
    7139             :     Py_ssize_t pos = unicode_offset;
    7140             :     Py_ssize_t endin = unicode_offset + insize;
    7141             :     /* Ideally, we should get reason from FormatMessage. This is the Windows
    7142             :        2000 English version of the message. */
    7143             :     const char *reason = "invalid character";
    7144             :     /* 4=maximum length of a UTF-8 sequence */
    7145             :     char buffer[4];
    7146             :     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    7147             :     Py_ssize_t outsize;
    7148             :     char *out;
    7149             :     PyObject *errorHandler = NULL;
    7150             :     PyObject *exc = NULL;
    7151             :     PyObject *encoding_obj = NULL;
    7152             :     char *encoding;
    7153             :     Py_ssize_t newpos, newoutsize;
    7154             :     PyObject *rep;
    7155             :     int ret = -1;
    7156             : 
    7157             :     assert(insize > 0);
    7158             : 
    7159             :     encoding = code_page_name(code_page, &encoding_obj);
    7160             :     if (encoding == NULL)
    7161             :         return -1;
    7162             : 
    7163             :     if (errors == NULL || strcmp(errors, "strict") == 0) {
    7164             :         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
    7165             :            then we raise a UnicodeEncodeError. */
    7166             :         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
    7167             :         if (exc != NULL) {
    7168             :             PyCodec_StrictErrors(exc);
    7169             :             Py_DECREF(exc);
    7170             :         }
    7171             :         Py_XDECREF(encoding_obj);
    7172             :         return -1;
    7173             :     }
    7174             : 
    7175             :     if (code_page != CP_UTF8 && code_page != CP_UTF7)
    7176             :         pusedDefaultChar = &usedDefaultChar;
    7177             :     else
    7178             :         pusedDefaultChar = NULL;
    7179             : 
    7180             :     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
    7181             :         PyErr_NoMemory();
    7182             :         goto error;
    7183             :     }
    7184             :     outsize = insize * Py_ARRAY_LENGTH(buffer);
    7185             : 
    7186             :     if (*outbytes == NULL) {
    7187             :         /* Create string object */
    7188             :         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
    7189             :         if (*outbytes == NULL)
    7190             :             goto error;
    7191             :         out = PyBytes_AS_STRING(*outbytes);
    7192             :     }
    7193             :     else {
    7194             :         /* Extend string object */
    7195             :         Py_ssize_t n = PyBytes_Size(*outbytes);
    7196             :         if (n > PY_SSIZE_T_MAX - outsize) {
    7197             :             PyErr_NoMemory();
    7198             :             goto error;
    7199             :         }
    7200             :         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
    7201             :             goto error;
    7202             :         out = PyBytes_AS_STRING(*outbytes) + n;
    7203             :     }
    7204             : 
    7205             :     /* Encode the string character per character */
    7206             :     while (pos < endin)
    7207             :     {
    7208             :         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
    7209             :         wchar_t chars[2];
    7210             :         int charsize;
    7211             :         if (ch < 0x10000) {
    7212             :             chars[0] = (wchar_t)ch;
    7213             :             charsize = 1;
    7214             :         }
    7215             :         else {
    7216             :             ch -= 0x10000;
    7217             :             chars[0] = 0xd800 + (ch >> 10);
    7218             :             chars[1] = 0xdc00 + (ch & 0x3ff);
    7219             :             charsize = 2;
    7220             :         }
    7221             : 
    7222             :         outsize = WideCharToMultiByte(code_page, flags,
    7223             :                                       chars, charsize,
    7224             :                                       buffer, Py_ARRAY_LENGTH(buffer),
    7225             :                                       NULL, pusedDefaultChar);
    7226             :         if (outsize > 0) {
    7227             :             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
    7228             :             {
    7229             :                 pos++;
    7230             :                 memcpy(out, buffer, outsize);
    7231             :                 out += outsize;
    7232             :                 continue;
    7233             :             }
    7234             :         }
    7235             :         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
    7236             :             PyErr_SetFromWindowsErr(0);
    7237             :             goto error;
    7238             :         }
    7239             : 
    7240             :         rep = unicode_encode_call_errorhandler(
    7241             :                   errors, &errorHandler, encoding, reason,
    7242             :                   unicode, &exc,
    7243             :                   pos, pos + 1, &newpos);
    7244             :         if (rep == NULL)
    7245             :             goto error;
    7246             :         pos = newpos;
    7247             : 
    7248             :         if (PyBytes_Check(rep)) {
    7249             :             outsize = PyBytes_GET_SIZE(rep);
    7250             :             if (outsize != 1) {
    7251             :                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
    7252             :                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
    7253             :                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
    7254             :                     Py_DECREF(rep);
    7255             :                     goto error;
    7256             :                 }
    7257             :                 out = PyBytes_AS_STRING(*outbytes) + offset;
    7258             :             }
    7259             :             memcpy(out, PyBytes_AS_STRING(rep), outsize);
    7260             :             out += outsize;
    7261             :         }
    7262             :         else {
    7263             :             Py_ssize_t i;
    7264             :             enum PyUnicode_Kind kind;
    7265             :             void *data;
    7266             : 
    7267             :             if (PyUnicode_READY(rep) == -1) {
    7268             :                 Py_DECREF(rep);
    7269             :                 goto error;
    7270             :             }
    7271             : 
    7272             :             outsize = PyUnicode_GET_LENGTH(rep);
    7273             :             if (outsize != 1) {
    7274             :                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
    7275             :                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
    7276             :                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
    7277             :                     Py_DECREF(rep);
    7278             :                     goto error;
    7279             :                 }
    7280             :                 out = PyBytes_AS_STRING(*outbytes) + offset;
    7281             :             }
    7282             :             kind = PyUnicode_KIND(rep);
    7283             :             data = PyUnicode_DATA(rep);
    7284             :             for (i=0; i < outsize; i++) {
    7285             :                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    7286             :                 if (ch > 127) {
    7287             :                     raise_encode_exception(&exc,
    7288             :                         encoding, unicode,
    7289             :                         pos, pos + 1,
    7290             :                         "unable to encode error handler result to ASCII");
    7291             :                     Py_DECREF(rep);
    7292             :                     goto error;
    7293             :                 }
    7294             :                 *out = (unsigned char)ch;
    7295             :                 out++;
    7296             :             }
    7297             :         }
    7298             :         Py_DECREF(rep);
    7299             :     }
    7300             :     /* write a NUL byte */
    7301             :     *out = 0;
    7302             :     outsize = out - PyBytes_AS_STRING(*outbytes);
    7303             :     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    7304             :     if (_PyBytes_Resize(outbytes, outsize) < 0)
    7305             :         goto error;
    7306             :     ret = 0;
    7307             : 
    7308             : error:
    7309             :     Py_XDECREF(encoding_obj);
    7310             :     Py_XDECREF(errorHandler);
    7311             :     Py_XDECREF(exc);
    7312             :     return ret;
    7313             : }
    7314             : /* Encode unicode to the given Windows code page; returns a new bytes object, or NULL on error. */
    7315             : static PyObject *
    7316             : encode_code_page(int code_page,
    7317             :                  PyObject *unicode,
    7318             :                  const char *errors)
    7319             : {
    7320             :     Py_ssize_t len;
    7321             :     PyObject *outbytes = NULL;
    7322             :     Py_ssize_t offset;
    7323             :     int chunk_len, ret, done;
    7324             : 
    7325             :     if (PyUnicode_READY(unicode) == -1)
    7326             :         return NULL;
    7327             :     len = PyUnicode_GET_LENGTH(unicode);
    7328             : 
    7329             :     if (code_page < 0) {
    7330             :         PyErr_SetString(PyExc_ValueError, "invalid code page number");
    7331             :         return NULL;
    7332             :     }
    7333             : 
    7334             :     if (len == 0)  /* empty input: return an empty bytes object */
    7335             :         return PyBytes_FromStringAndSize(NULL, 0);
    7336             : 
    7337             :     offset = 0;
    7338             :     do
    7339             :     {
    7340             : #ifdef NEED_RETRY
    7341             :         /* UTF-16 encoding may double the size, so use only INT_MAX/2
    7342             :            chunks. */
    7343             :         if (len > INT_MAX/2) {
    7344             :             chunk_len = INT_MAX/2;
    7345             :             done = 0;
    7346             :         }
    7347             :         else
    7348             : #endif
    7349             :         {
    7350             :             chunk_len = (int)len;  /* the remainder fits in one chunk: last iteration */
    7351             :             done = 1;
    7352             :         }
    7353             : 
    7354             :         ret = encode_code_page_strict(code_page, &outbytes,
    7355             :                                       unicode, offset, chunk_len,
    7356             :                                       errors);
    7357             :         if (ret == -2)  /* -2: the strict pass met an unencodable character; redo the chunk with the error handler */
    7358             :             ret = encode_code_page_errors(code_page, &outbytes,
    7359             :                                           unicode, offset,
    7360             :                                           chunk_len, errors);
    7361             :         if (ret < 0) {  /* failure: discard any partial output */
    7362             :             Py_XDECREF(outbytes);
    7363             :             return NULL;
    7364             :         }
    7365             : 
    7366             :         offset += chunk_len;
    7367             :         len -= chunk_len;
    7368             :     } while (!done);
    7369             : 
    7370             :     return outbytes;
    7371             : }
    7372             : /* Encode a Py_UNICODE buffer with code page CP_ACP by way of a temporary str object. */
    7373             : PyObject *
    7374             : PyUnicode_EncodeMBCS(const Py_UNICODE *p,
    7375             :                      Py_ssize_t size,
    7376             :                      const char *errors)
    7377             : {
    7378             :     PyObject *unicode, *res;
    7379             :     unicode = PyUnicode_FromUnicode(p, size);
    7380             :     if (unicode == NULL)
    7381             :         return NULL;
    7382             :     res = encode_code_page(CP_ACP, unicode, errors);
    7383             :     Py_DECREF(unicode);  /* the temporary is released whether or not encoding succeeded */
    7384             :     return res;
    7385             : }
    7386             : /* Public wrapper: forwards its arguments unchanged to encode_code_page(). */
    7387             : PyObject *
    7388             : PyUnicode_EncodeCodePage(int code_page,
    7389             :                          PyObject *unicode,
    7390             :                          const char *errors)
    7391             : {
    7392             :     return encode_code_page(code_page, unicode, errors);
    7393             : }
    7394             : /* Encode a str object with code page CP_ACP and errors == NULL; a non-str argument is rejected. */
    7395             : PyObject *
    7396             : PyUnicode_AsMBCSString(PyObject *unicode)
    7397             : {
    7398             :     if (!PyUnicode_Check(unicode)) {
    7399             :         PyErr_BadArgument();
    7400             :         return NULL;
    7401             :     }
    7402             :     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    7403             : }
    7404             : 
    7405             : #undef NEED_RETRY
    7406             : 
    7407             : #endif /* HAVE_MBCS */
    7408             : 
    7409             : /* --- Character Mapping Codec -------------------------------------------- */
    7410             : 
    7411             : PyObject *
    7412           0 : PyUnicode_DecodeCharmap(const char *s,
    7413             :                         Py_ssize_t size,
    7414             :                         PyObject *mapping,
    7415             :                         const char *errors)
    7416             : {
    7417           0 :     const char *starts = s;
    7418             :     Py_ssize_t startinpos;
    7419             :     Py_ssize_t endinpos;
    7420             :     Py_ssize_t outpos;
    7421             :     const char *e;
    7422             :     PyObject *v;
    7423           0 :     Py_ssize_t extrachars = 0;
    7424           0 :     PyObject *errorHandler = NULL;
    7425           0 :     PyObject *exc = NULL;
    7426             : 
    7427             :     /* Default to Latin-1 */
    7428           0 :     if (mapping == NULL)
    7429           0 :         return PyUnicode_DecodeLatin1(s, size, errors);
    7430             : 
    7431           0 :     v = PyUnicode_New(size, 127);
    7432           0 :     if (v == NULL)
    7433           0 :         goto onError;
    7434           0 :     if (size == 0)
    7435           0 :         return v;
    7436           0 :     outpos = 0;
    7437           0 :     e = s + size;
    7438           0 :     if (PyUnicode_CheckExact(mapping)) {
    7439             :         Py_ssize_t maplen;
    7440             :         enum PyUnicode_Kind mapkind;
    7441             :         void *mapdata;
    7442             :         Py_UCS4 x;
    7443             : 
    7444           0 :         if (PyUnicode_READY(mapping) == -1)
    7445           0 :             return NULL;
    7446             : 
    7447           0 :         maplen = PyUnicode_GET_LENGTH(mapping);
    7448           0 :         mapdata = PyUnicode_DATA(mapping);
    7449           0 :         mapkind = PyUnicode_KIND(mapping);
    7450           0 :         while (s < e) {
    7451             :             unsigned char ch;
    7452           0 :             if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
    7453           0 :                 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
    7454           0 :                 if (outkind == PyUnicode_1BYTE_KIND) {
    7455           0 :                     void *outdata = PyUnicode_DATA(v);
    7456           0 :                     Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
    7457           0 :                     while (s < e) {
    7458           0 :                         unsigned char ch = *s;
    7459           0 :                         x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
    7460           0 :                         if (x > maxchar)
    7461           0 :                             goto Error;
    7462           0 :                         PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
    7463           0 :                         ++s;
    7464             :                     }
    7465           0 :                     break;
    7466             :                 }
    7467           0 :                 else if (outkind == PyUnicode_2BYTE_KIND) {
    7468           0 :                     void *outdata = PyUnicode_DATA(v);
    7469           0 :                     while (s < e) {
    7470           0 :                         unsigned char ch = *s;
    7471           0 :                         x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
    7472           0 :                         if (x == 0xFFFE)
    7473           0 :                             goto Error;
    7474           0 :                         PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
    7475           0 :                         ++s;
    7476             :                     }
    7477           0 :                     break;
    7478             :                 }
    7479             :             }
    7480           0 :             ch = *s;
    7481             : 
    7482           0 :             if (ch < maplen)
    7483           0 :                 x = PyUnicode_READ(mapkind, mapdata, ch);
    7484             :             else
    7485           0 :                 x = 0xfffe; /* invalid value */
    7486             : Error:
    7487           0 :             if (x == 0xfffe)
    7488             :             {
    7489             :                 /* undefined mapping */
    7490           0 :                 startinpos = s-starts;
    7491           0 :                 endinpos = startinpos+1;
    7492           0 :                 if (unicode_decode_call_errorhandler(
    7493             :                         errors, &errorHandler,
    7494             :                         "charmap", "character maps to <undefined>",
    7495             :                         &starts, &e, &startinpos, &endinpos, &exc, &s,
    7496             :                         &v, &outpos)) {
    7497           0 :                     goto onError;
    7498             :                 }
    7499           0 :                 continue;
    7500             :             }
    7501             : 
    7502           0 :             if (unicode_putchar(&v, &outpos, x) < 0)
    7503           0 :                 goto onError;
    7504           0 :             ++s;
    7505             :         }
    7506             :     }
    7507             :     else {
    7508           0 :         while (s < e) {
    7509           0 :             unsigned char ch = *s;
    7510             :             PyObject *w, *x;
    7511             : 
    7512             :             /* Get mapping (char ordinal -> integer, Unicode char or None) */
    7513           0 :             w = PyLong_FromLong((long)ch);
    7514           0 :             if (w == NULL)
    7515           0 :                 goto onError;
    7516           0 :             x = PyObject_GetItem(mapping, w);
    7517           0 :             Py_DECREF(w);
    7518           0 :             if (x == NULL) {
    7519           0 :                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    7520             :                     /* No mapping found means: mapping is undefined. */
    7521           0 :                     PyErr_Clear();
    7522           0 :                     x = Py_None;
    7523           0 :                     Py_INCREF(x);
    7524             :                 } else
    7525           0 :                     goto onError;
    7526             :             }
    7527             : 
    7528             :             /* Apply mapping */
    7529           0 :             if (PyLong_Check(x)) {
    7530           0 :                 long value = PyLong_AS_LONG(x);
    7531           0 :                 if (value < 0 || value > 65535) {
    7532           0 :                     PyErr_SetString(PyExc_TypeError,
    7533             :                                     "character mapping must be in range(65536)");
    7534           0 :                     Py_DECREF(x);
    7535           0 :                     goto onError;
    7536             :                 }
    7537           0 :                 if (unicode_putchar(&v, &outpos, value) < 0)
    7538           0 :                     goto onError;
    7539             :             }
    7540           0 :             else if (x == Py_None) {
    7541             :                 /* undefined mapping */
    7542           0 :                 startinpos = s-starts;
    7543           0 :                 endinpos = startinpos+1;
    7544           0 :                 if (unicode_decode_call_errorhandler(
    7545             :                         errors, &errorHandler,
    7546             :                         "charmap", "character maps to <undefined>",
    7547             :                         &starts, &e, &startinpos, &endinpos, &exc, &s,
    7548             :                         &v, &outpos)) {
    7549           0 :                     Py_DECREF(x);
    7550           0 :                     goto onError;
    7551             :                 }
    7552           0 :                 Py_DECREF(x);
    7553           0 :                 continue;
    7554             :             }
    7555           0 :             else if (PyUnicode_Check(x)) {
    7556             :                 Py_ssize_t targetsize;
    7557             : 
    7558           0 :                 if (PyUnicode_READY(x) == -1)
    7559           0 :                     goto onError;
    7560           0 :                 targetsize = PyUnicode_GET_LENGTH(x);
    7561             : 
    7562           0 :                 if (targetsize == 1) {
    7563             :                     /* 1-1 mapping */
    7564           0 :                     if (unicode_putchar(&v, &outpos,
    7565           0 :                                         PyUnicode_READ_CHAR(x, 0)) < 0)
    7566           0 :                         goto onError;
    7567             :                 }
    7568           0 :                 else if (targetsize > 1) {
    7569             :                     /* 1-n mapping */
    7570           0 :                     if (targetsize > extrachars) {
    7571             :                         /* resize first */
    7572           0 :                         Py_ssize_t needed = (targetsize - extrachars) + \
    7573           0 :                             (targetsize << 2);
    7574           0 :                         extrachars += needed;
    7575             :                         /* XXX overflow detection missing */
    7576           0 :                         if (unicode_resize(&v,
    7577           0 :                                            PyUnicode_GET_LENGTH(v) + needed) < 0)
    7578             :                         {
    7579           0 :                             Py_DECREF(x);
    7580           0 :                             goto onError;
    7581             :                         }
    7582             :                     }
    7583           0 :                     if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
    7584           0 :                         goto onError;
    7585           0 :                     PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
    7586           0 :                     outpos += targetsize;
    7587           0 :                     extrachars -= targetsize;
    7588             :                 }
    7589             :                 /* 1-0 mapping: skip the character */
    7590             :             }
    7591             :             else {
    7592             :                 /* wrong return value */
    7593           0 :                 PyErr_SetString(PyExc_TypeError,
    7594             :                                 "character mapping must return integer, None or str");
    7595           0 :                 Py_DECREF(x);
    7596           0 :                 goto onError;
    7597             :             }
    7598           0 :             Py_DECREF(x);
    7599           0 :             ++s;
    7600             :         }
    7601             :     }
    7602           0 :     if (unicode_resize(&v, outpos) < 0)
    7603           0 :         goto onError;
    7604           0 :     Py_XDECREF(errorHandler);
    7605           0 :     Py_XDECREF(exc);
    7606           0 :     return unicode_result(v);
    7607             : 
    7608             :   onError:
    7609           0 :     Py_XDECREF(errorHandler);
    7610           0 :     Py_XDECREF(exc);
    7611           0 :     Py_XDECREF(v);
    7612           0 :     return NULL;
    7613             : }
    7614             : 
    7615             : /* Charmap encoding: the lookup table */
    7616             : 
    7617             : struct encoding_map {
    7618             :     PyObject_HEAD
    7619             :     unsigned char level1[32];  /* first-level table of the trie */
    7620             :     int count2, count3;  /* number of 16-byte level-2 blocks and 128-byte level-3 blocks stored in level23 */
    7621             :     unsigned char level23[1];  /* variable-length tail: level-2 data first, level-3 data from offset 16*count2 */
    7622             : };
    7623             : /* "size" method: returns the map's size in bytes (header plus level-2 and level-3 blocks) as an int; args is unused. */
    7624             : static PyObject*
    7625           0 : encoding_map_size(PyObject *obj, PyObject* args)
    7626             : {
    7627           0 :     struct encoding_map *map = (struct encoding_map*)obj;
    7628           0 :     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
    7629           0 :                            128*map->count3);
    7630             : }
    7631             : /* Method table installed as tp_methods of EncodingMapType. */
    7632             : static PyMethodDef encoding_map_methods[] = {
    7633             :     {"size", encoding_map_size, METH_NOARGS,
    7634             :      PyDoc_STR("Return the size (in bytes) of this object") },
    7635             :     { 0 }  /* sentinel entry terminating the table */
    7636             : };
    7637             : /* tp_dealloc slot of EncodingMapType: releases the object's memory with PyObject_FREE. */
    7638             : static void
    7639           0 : encoding_map_dealloc(PyObject* o)
    7640             : {
    7641           0 :     PyObject_FREE(o);
    7642           0 : }
    7643             : /* Type object for the trie-based encoding map created by PyUnicode_BuildEncodingMap(); only tp_dealloc and tp_methods are filled in. */
    7644             : static PyTypeObject EncodingMapType = {
    7645             :     PyVarObject_HEAD_INIT(NULL, 0)
    7646             :     "EncodingMap",          /*tp_name*/
    7647             :     sizeof(struct encoding_map),   /*tp_basicsize*/
    7648             :     0,                      /*tp_itemsize*/
    7649             :     /* methods */
    7650             :     encoding_map_dealloc,   /*tp_dealloc*/
    7651             :     0,                      /*tp_print*/
    7652             :     0,                      /*tp_getattr*/
    7653             :     0,                      /*tp_setattr*/
    7654             :     0,                      /*tp_reserved*/
    7655             :     0,                      /*tp_repr*/
    7656             :     0,                      /*tp_as_number*/
    7657             :     0,                      /*tp_as_sequence*/
    7658             :     0,                      /*tp_as_mapping*/
    7659             :     0,                      /*tp_hash*/
    7660             :     0,                      /*tp_call*/
    7661             :     0,                      /*tp_str*/
    7662             :     0,                      /*tp_getattro*/
    7663             :     0,                      /*tp_setattro*/
    7664             :     0,                      /*tp_as_buffer*/
    7665             :     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    7666             :     0,                      /*tp_doc*/
    7667             :     0,                      /*tp_traverse*/
    7668             :     0,                      /*tp_clear*/
    7669             :     0,                      /*tp_richcompare*/
    7670             :     0,                      /*tp_weaklistoffset*/
    7671             :     0,                      /*tp_iter*/
    7672             :     0,                      /*tp_iternext*/
    7673             :     encoding_map_methods,   /*tp_methods*/
    7674             :     0,                      /*tp_members*/
    7675             :     0,                      /*tp_getset*/
    7676             :     0,                      /*tp_base*/
    7677             :     0,                      /*tp_dict*/
    7678             :     0,                      /*tp_descr_get*/
    7679             :     0,                      /*tp_descr_set*/
    7680             :     0,                      /*tp_dictoffset*/
    7681             :     0,                      /*tp_init*/
    7682             :     0,                      /*tp_alloc*/
    7683             :     0,                      /*tp_new*/
    7684             :     0,                      /*tp_free*/
    7685             :     0,                      /*tp_is_gc*/
    7686             : };
    7687             : 
    7688             : PyObject*
    7689           0 : PyUnicode_BuildEncodingMap(PyObject* string)
    7690             : {
    7691             :     PyObject *result;
    7692             :     struct encoding_map *mresult;
    7693             :     int i;
    7694           0 :     int need_dict = 0;
    7695             :     unsigned char level1[32];
    7696             :     unsigned char level2[512];
    7697             :     unsigned char *mlevel1, *mlevel2, *mlevel3;
    7698           0 :     int count2 = 0, count3 = 0;
    7699             :     int kind;
    7700             :     void *data;
    7701             :     Py_ssize_t length;
    7702             :     Py_UCS4 ch;
    7703             : 
    7704           0 :     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
    7705           0 :         PyErr_BadArgument();
    7706           0 :         return NULL;
    7707             :     }
    7708           0 :     kind = PyUnicode_KIND(string);
    7709           0 :     data = PyUnicode_DATA(string);
    7710           0 :     length = PyUnicode_GET_LENGTH(string);
    7711           0 :     length = Py_MIN(length, 256);
    7712           0 :     memset(level1, 0xFF, sizeof level1);
    7713           0 :     memset(level2, 0xFF, sizeof level2);
    7714             : 
    7715             :     /* If there isn't a one-to-one mapping of NULL to \0,
    7716             :        or if there are non-BMP characters, we need to use
    7717             :        a mapping dictionary. */
    7718           0 :     if (PyUnicode_READ(kind, data, 0) != 0)
    7719           0 :         need_dict = 1;
    7720           0 :     for (i = 1; i < length; i++) {
    7721             :         int l1, l2;
    7722           0 :         ch = PyUnicode_READ(kind, data, i);
    7723           0 :         if (ch == 0 || ch > 0xFFFF) {
    7724           0 :             need_dict = 1;
    7725           0 :             break;
    7726             :         }
    7727           0 :         if (ch == 0xFFFE)
    7728             :             /* unmapped character */
    7729           0 :             continue;
    7730           0 :         l1 = ch >> 11;
    7731           0 :         l2 = ch >> 7;
    7732           0 :         if (level1[l1] == 0xFF)
    7733           0 :             level1[l1] = count2++;
    7734           0 :         if (level2[l2] == 0xFF)
    7735           0 :             level2[l2] = count3++;
    7736             :     }
    7737             : 
    7738           0 :     if (count2 >= 0xFF || count3 >= 0xFF)
    7739           0 :         need_dict = 1;
    7740             : 
    7741           0 :     if (need_dict) {
    7742           0 :         PyObject *result = PyDict_New();
    7743             :         PyObject *key, *value;
    7744           0 :         if (!result)
    7745           0 :             return NULL;
    7746           0 :         for (i = 0; i < length; i++) {
    7747           0 :             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
    7748           0 :             value = PyLong_FromLong(i);
    7749           0 :             if (!key || !value)
    7750             :                 goto failed1;
    7751           0 :             if (PyDict_SetItem(result, key, value) == -1)
    7752           0 :                 goto failed1;
    7753           0 :             Py_DECREF(key);
    7754           0 :             Py_DECREF(value);
    7755             :         }
    7756           0 :         return result;
    7757             :       failed1:
    7758           0 :         Py_XDECREF(key);
    7759           0 :         Py_XDECREF(value);
    7760           0 :         Py_DECREF(result);
    7761           0 :         return NULL;
    7762             :     }
    7763             : 
    7764             :     /* Create a three-level trie */
    7765           0 :     result = PyObject_MALLOC(sizeof(struct encoding_map) +
    7766           0 :                              16*count2 + 128*count3 - 1);
    7767           0 :     if (!result)
    7768           0 :         return PyErr_NoMemory();
    7769           0 :     PyObject_Init(result, &EncodingMapType);
    7770           0 :     mresult = (struct encoding_map*)result;
    7771           0 :     mresult->count2 = count2;
    7772           0 :     mresult->count3 = count3;
    7773           0 :     mlevel1 = mresult->level1;
    7774           0 :     mlevel2 = mresult->level23;
    7775           0 :     mlevel3 = mresult->level23 + 16*count2;
    7776           0 :     memcpy(mlevel1, level1, 32);
    7777           0 :     memset(mlevel2, 0xFF, 16*count2);
    7778           0 :     memset(mlevel3, 0, 128*count3);
    7779           0 :     count3 = 0;
    7780           0 :     for (i = 1; i < length; i++) {
    7781             :         int o1, o2, o3, i2, i3;
    7782           0 :         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
    7783           0 :         if (ch == 0xFFFE)
    7784             :             /* unmapped character */
    7785           0 :             continue;
    7786           0 :         o1 = ch>>11;
    7787           0 :         o2 = (ch>>7) & 0xF;
    7788           0 :         i2 = 16*mlevel1[o1] + o2;
    7789           0 :         if (mlevel2[i2] == 0xFF)
    7790           0 :             mlevel2[i2] = count3++;
    7791           0 :         o3 = ch & 0x7F;
    7792           0 :         i3 = 128*mlevel2[i2] + o3;
    7793           0 :         mlevel3[i3] = i;
    7794             :     }
    7795           0 :     return result;
    7796             : }
    7797             : 
    7798             : static int
    7799           0 : encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
    7800             : {
    7801           0 :     struct encoding_map *map = (struct encoding_map*)mapping;
    7802           0 :     int l1 = c>>11;
    7803           0 :     int l2 = (c>>7) & 0xF;
    7804           0 :     int l3 = c & 0x7F;
    7805             :     int i;
    7806             : 
    7807           0 :     if (c > 0xFFFF)
    7808           0 :         return -1;
    7809           0 :     if (c == 0)
    7810           0 :         return 0;
    7811             :     /* level 1*/
    7812           0 :     i = map->level1[l1];
    7813           0 :     if (i == 0xFF) {
    7814           0 :         return -1;
    7815             :     }
    7816             :     /* level 2*/
    7817           0 :     i = map->level23[16*i+l2];
    7818           0 :     if (i == 0xFF) {
    7819           0 :         return -1;
    7820             :     }
    7821             :     /* level 3 */
    7822           0 :     i = map->level23[16*map->count2 + 128*i + l3];
    7823           0 :     if (i == 0) {
    7824           0 :         return -1;
    7825             :     }
    7826           0 :     return i;
    7827             : }
    7828             : 
    7829             : /* Lookup the character ch in the mapping. If the character
    7830             :    can't be found, Py_None is returned (or NULL, if another
    7831             :    error occurred). */
    7832             : static PyObject *
    7833           0 : charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
    7834             : {
    7835           0 :     PyObject *w = PyLong_FromLong((long)c);
    7836             :     PyObject *x;
    7837             : 
    7838           0 :     if (w == NULL)
    7839           0 :         return NULL;
    7840           0 :     x = PyObject_GetItem(mapping, w);
    7841           0 :     Py_DECREF(w);
    7842           0 :     if (x == NULL) {
    7843           0 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    7844             :             /* No mapping found means: mapping is undefined. */
    7845           0 :             PyErr_Clear();
    7846           0 :             x = Py_None;
    7847           0 :             Py_INCREF(x);
    7848           0 :             return x;
    7849             :         } else
    7850           0 :             return NULL;
    7851             :     }
    7852           0 :     else if (x == Py_None)
    7853           0 :         return x;
    7854           0 :     else if (PyLong_Check(x)) {
    7855           0 :         long value = PyLong_AS_LONG(x);
    7856           0 :         if (value < 0 || value > 255) {
    7857           0 :             PyErr_SetString(PyExc_TypeError,
    7858             :                             "character mapping must be in range(256)");
    7859           0 :             Py_DECREF(x);
    7860           0 :             return NULL;
    7861             :         }
    7862           0 :         return x;
    7863             :     }
    7864           0 :     else if (PyBytes_Check(x))
    7865           0 :         return x;
    7866             :     else {
    7867             :         /* wrong return value */
    7868           0 :         PyErr_Format(PyExc_TypeError,
    7869             :                      "character mapping must return integer, bytes or None, not %.400s",
    7870           0 :                      x->ob_type->tp_name);
    7871           0 :         Py_DECREF(x);
    7872           0 :         return NULL;
    7873             :     }
    7874             : }
    7875             : 
    7876             : static int
    7877           0 : charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
    7878             : {
    7879           0 :     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    7880             :     /* exponentially overallocate to minimize reallocations */
    7881           0 :     if (requiredsize < 2*outsize)
    7882           0 :         requiredsize = 2*outsize;
    7883           0 :     if (_PyBytes_Resize(outobj, requiredsize))
    7884           0 :         return -1;
    7885           0 :     return 0;
    7886             : }
    7887             : 
    7888             : typedef enum charmapencode_result {
    7889             :     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
    7890             : } charmapencode_result;
    7891             : /* lookup the character, put the result in the output string and adjust
    7892             :    various state variables. Resize the output bytes object if not enough
    7893             :    space is available. Return a new reference to the object that
    7894             :    was put in the output buffer, or Py_None, if the mapping was undefined
    7895             :    (in which case no character was written) or NULL, if a
    7896             :    reallocation error occurred. The caller must decref the result */
    7897             : static charmapencode_result
    7898           0 : charmapencode_output(Py_UCS4 c, PyObject *mapping,
    7899             :                      PyObject **outobj, Py_ssize_t *outpos)
    7900             : {
    7901             :     PyObject *rep;
    7902             :     char *outstart;
    7903           0 :     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    7904             : 
    7905           0 :     if (Py_TYPE(mapping) == &EncodingMapType) {
    7906           0 :         int res = encoding_map_lookup(c, mapping);
    7907           0 :         Py_ssize_t requiredsize = *outpos+1;
    7908           0 :         if (res == -1)
    7909           0 :             return enc_FAILED;
    7910           0 :         if (outsize<requiredsize)
    7911           0 :             if (charmapencode_resize(outobj, outpos, requiredsize))
    7912           0 :                 return enc_EXCEPTION;
    7913           0 :         outstart = PyBytes_AS_STRING(*outobj);
    7914           0 :         outstart[(*outpos)++] = (char)res;
    7915           0 :         return enc_SUCCESS;
    7916             :     }
    7917             : 
    7918           0 :     rep = charmapencode_lookup(c, mapping);
    7919           0 :     if (rep==NULL)
    7920           0 :         return enc_EXCEPTION;
    7921           0 :     else if (rep==Py_None) {
    7922           0 :         Py_DECREF(rep);
    7923           0 :         return enc_FAILED;
    7924             :     } else {
    7925           0 :         if (PyLong_Check(rep)) {
    7926           0 :             Py_ssize_t requiredsize = *outpos+1;
    7927           0 :             if (outsize<requiredsize)
    7928           0 :                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
    7929           0 :                     Py_DECREF(rep);
    7930           0 :                     return enc_EXCEPTION;
    7931             :                 }
    7932           0 :             outstart = PyBytes_AS_STRING(*outobj);
    7933           0 :             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
    7934             :         }
    7935             :         else {
    7936           0 :             const char *repchars = PyBytes_AS_STRING(rep);
    7937           0 :             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
    7938           0 :             Py_ssize_t requiredsize = *outpos+repsize;
    7939           0 :             if (outsize<requiredsize)
    7940           0 :                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
    7941           0 :                     Py_DECREF(rep);
    7942           0 :                     return enc_EXCEPTION;
    7943             :                 }
    7944           0 :             outstart = PyBytes_AS_STRING(*outobj);
    7945           0 :             memcpy(outstart + *outpos, repchars, repsize);
    7946           0 :             *outpos += repsize;
    7947             :         }
    7948             :     }
    7949           0 :     Py_DECREF(rep);
    7950           0 :     return enc_SUCCESS;
    7951             : }
    7952             : 
    7953             : /* handle an error in PyUnicode_EncodeCharmap
    7954             :    Return 0 on success, -1 on error */
    7955             : static int
    7956           0 : charmap_encoding_error(
    7957             :     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    7958             :     PyObject **exceptionObject,
    7959             :     int *known_errorHandler, PyObject **errorHandler, const char *errors,
    7960             :     PyObject **res, Py_ssize_t *respos)
    7961             : {
    7962           0 :     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    7963             :     Py_ssize_t size, repsize;
    7964             :     Py_ssize_t newpos;
    7965             :     enum PyUnicode_Kind kind;
    7966             :     void *data;
    7967             :     Py_ssize_t index;
    7968             :     /* startpos for collecting unencodable chars */
    7969           0 :     Py_ssize_t collstartpos = *inpos;
    7970           0 :     Py_ssize_t collendpos = *inpos+1;
    7971             :     Py_ssize_t collpos;
    7972           0 :     char *encoding = "charmap";
    7973           0 :     char *reason = "character maps to <undefined>";
    7974             :     charmapencode_result x;
    7975             :     Py_UCS4 ch;
    7976             :     int val;
    7977             : 
    7978           0 :     if (PyUnicode_READY(unicode) == -1)
    7979           0 :         return -1;
    7980           0 :     size = PyUnicode_GET_LENGTH(unicode);
    7981             :     /* find all unencodable characters */
    7982           0 :     while (collendpos < size) {
    7983             :         PyObject *rep;
    7984           0 :         if (Py_TYPE(mapping) == &EncodingMapType) {
    7985           0 :             ch = PyUnicode_READ_CHAR(unicode, collendpos);
    7986           0 :             val = encoding_map_lookup(ch, mapping);
    7987           0 :             if (val != -1)
    7988           0 :                 break;
    7989           0 :             ++collendpos;
    7990           0 :             continue;
    7991             :         }
    7992             : 
    7993           0 :         ch = PyUnicode_READ_CHAR(unicode, collendpos);
    7994           0 :         rep = charmapencode_lookup(ch, mapping);
    7995           0 :         if (rep==NULL)
    7996           0 :             return -1;
    7997           0 :         else if (rep!=Py_None) {
    7998           0 :             Py_DECREF(rep);
    7999           0 :             break;
    8000             :         }
    8001           0 :         Py_DECREF(rep);
    8002           0 :         ++collendpos;
    8003             :     }
    8004             :     /* cache callback name lookup
    8005             :      * (if not done yet, i.e. it's the first error) */
    8006           0 :     if (*known_errorHandler==-1) {
    8007           0 :         if ((errors==NULL) || (!strcmp(errors, "strict")))
    8008           0 :             *known_errorHandler = 1;
    8009           0 :         else if (!strcmp(errors, "replace"))
    8010           0 :             *known_errorHandler = 2;
    8011           0 :         else if (!strcmp(errors, "ignore"))
    8012           0 :             *known_errorHandler = 3;
    8013           0 :         else if (!strcmp(errors, "xmlcharrefreplace"))
    8014           0 :             *known_errorHandler = 4;
    8015             :         else
    8016           0 :             *known_errorHandler = 0;
    8017             :     }
    8018           0 :     switch (*known_errorHandler) {
    8019             :     case 1: /* strict */
    8020           0 :         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8021           0 :         return -1;
    8022             :     case 2: /* replace */
    8023           0 :         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
    8024           0 :             x = charmapencode_output('?', mapping, res, respos);
    8025           0 :             if (x==enc_EXCEPTION) {
    8026           0 :                 return -1;
    8027             :             }
    8028           0 :             else if (x==enc_FAILED) {
    8029           0 :                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8030           0 :                 return -1;
    8031             :             }
    8032             :         }
    8033             :         /* fall through */
    8034             :     case 3: /* ignore */
    8035           0 :         *inpos = collendpos;
    8036           0 :         break;
    8037             :     case 4: /* xmlcharrefreplace */
    8038             :         /* generate replacement (temporarily (mis)uses p) */
    8039           0 :         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
    8040             :             char buffer[2+29+1+1];
    8041             :             char *cp;
    8042           0 :             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
    8043           0 :             for (cp = buffer; *cp; ++cp) {
    8044           0 :                 x = charmapencode_output(*cp, mapping, res, respos);
    8045           0 :                 if (x==enc_EXCEPTION)
    8046           0 :                     return -1;
    8047           0 :                 else if (x==enc_FAILED) {
    8048           0 :                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8049           0 :                     return -1;
    8050             :                 }
    8051             :             }
    8052             :         }
    8053           0 :         *inpos = collendpos;
    8054           0 :         break;
    8055             :     default:
    8056           0 :         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
    8057             :                                                       encoding, reason, unicode, exceptionObject,
    8058             :                                                       collstartpos, collendpos, &newpos);
    8059           0 :         if (repunicode == NULL)
    8060           0 :             return -1;
    8061           0 :         if (PyBytes_Check(repunicode)) {
    8062             :             /* Directly copy bytes result to output. */
    8063           0 :             Py_ssize_t outsize = PyBytes_Size(*res);
    8064             :             Py_ssize_t requiredsize;
    8065           0 :             repsize = PyBytes_Size(repunicode);
    8066           0 :             requiredsize = *respos + repsize;
    8067           0 :             if (requiredsize > outsize)
    8068             :                 /* Make room for all additional bytes. */
    8069           0 :                 if (charmapencode_resize(res, respos, requiredsize)) {
    8070           0 :                     Py_DECREF(repunicode);
    8071           0 :                     return -1;
    8072             :                 }
    8073           0 :             memcpy(PyBytes_AsString(*res) + *respos,
    8074           0 :                    PyBytes_AsString(repunicode),  repsize);
    8075           0 :             *respos += repsize;
    8076           0 :             *inpos = newpos;
    8077           0 :             Py_DECREF(repunicode);
    8078           0 :             break;
    8079             :         }
    8080             :         /* generate replacement  */
    8081           0 :         if (PyUnicode_READY(repunicode) == -1) {
    8082           0 :             Py_DECREF(repunicode);
    8083           0 :             return -1;
    8084             :         }
    8085           0 :         repsize = PyUnicode_GET_LENGTH(repunicode);
    8086           0 :         data = PyUnicode_DATA(repunicode);
    8087           0 :         kind = PyUnicode_KIND(repunicode);
    8088           0 :         for (index = 0; index < repsize; index++) {
    8089           0 :             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
    8090           0 :             x = charmapencode_output(repch, mapping, res, respos);
    8091           0 :             if (x==enc_EXCEPTION) {
    8092           0 :                 Py_DECREF(repunicode);
    8093           0 :                 return -1;
    8094             :             }
    8095           0 :             else if (x==enc_FAILED) {
    8096           0 :                 Py_DECREF(repunicode);
    8097           0 :                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
    8098           0 :                 return -1;
    8099             :             }
    8100             :         }
    8101           0 :         *inpos = newpos;
    8102           0 :         Py_DECREF(repunicode);
    8103             :     }
    8104           0 :     return 0;
    8105             : }
    8106             : 
    8107             : PyObject *
    8108           0 : _PyUnicode_EncodeCharmap(PyObject *unicode,
    8109             :                          PyObject *mapping,
    8110             :                          const char *errors)
    8111             : {
    8112             :     /* output object */
    8113           0 :     PyObject *res = NULL;
    8114             :     /* current input position */
    8115           0 :     Py_ssize_t inpos = 0;
    8116             :     Py_ssize_t size;
    8117             :     /* current output position */
    8118           0 :     Py_ssize_t respos = 0;
    8119           0 :     PyObject *errorHandler = NULL;
    8120           0 :     PyObject *exc = NULL;
    8121             :     /* the following variable is used for caching string comparisons
    8122             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
    8123             :      * 3=ignore, 4=xmlcharrefreplace */
    8124           0 :     int known_errorHandler = -1;
    8125             : 
    8126           0 :     if (PyUnicode_READY(unicode) == -1)
    8127           0 :         return NULL;
    8128           0 :     size = PyUnicode_GET_LENGTH(unicode);
    8129             : 
    8130             :     /* Default to Latin-1 */
    8131           0 :     if (mapping == NULL)
    8132           0 :         return unicode_encode_ucs1(unicode, errors, 256);
    8133             : 
    8134             :     /* allocate enough for a simple encoding without
    8135             :        replacements, if we need more, we'll resize */
    8136           0 :     res = PyBytes_FromStringAndSize(NULL, size);
    8137           0 :     if (res == NULL)
    8138           0 :         goto onError;
    8139           0 :     if (size == 0)
    8140           0 :         return res;
    8141             : 
    8142           0 :     while (inpos<size) {
    8143           0 :         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
    8144             :         /* try to encode it */
    8145           0 :         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
    8146           0 :         if (x==enc_EXCEPTION) /* error */
    8147           0 :             goto onError;
    8148           0 :         if (x==enc_FAILED) { /* unencodable character */
    8149           0 :             if (charmap_encoding_error(unicode, &inpos, mapping,
    8150             :                                        &exc,
    8151             :                                        &known_errorHandler, &errorHandler, errors,
    8152             :                                        &res, &respos)) {
    8153           0 :                 goto onError;
    8154             :             }
    8155             :         }
    8156             :         else
    8157             :             /* done with this character => adjust input position */
    8158           0 :             ++inpos;
    8159             :     }
    8160             : 
    8161             :     /* Resize if we allocated to much */
    8162           0 :     if (respos<PyBytes_GET_SIZE(res))
    8163           0 :         if (_PyBytes_Resize(&res, respos) < 0)
    8164           0 :             goto onError;
    8165             : 
    8166           0 :     Py_XDECREF(exc);
    8167           0 :     Py_XDECREF(errorHandler);
    8168           0 :     return res;
    8169             : 
    8170             :   onError:
    8171           0 :     Py_XDECREF(res);
    8172           0 :     Py_XDECREF(exc);
    8173           0 :     Py_XDECREF(errorHandler);
    8174           0 :     return NULL;
    8175             : }
    8176             : 
    8177             : /* Deprecated */
    8178             : PyObject *
    8179           0 : PyUnicode_EncodeCharmap(const Py_UNICODE *p,
    8180             :                         Py_ssize_t size,
    8181             :                         PyObject *mapping,
    8182             :                         const char *errors)
    8183             : {
    8184             :     PyObject *result;
    8185           0 :     PyObject *unicode = PyUnicode_FromUnicode(p, size);
    8186           0 :     if (unicode == NULL)
    8187           0 :         return NULL;
    8188           0 :     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
    8189           0 :     Py_DECREF(unicode);
    8190           0 :     return result;
    8191             : }
    8192             : 
    8193             : PyObject *
    8194           0 : PyUnicode_AsCharmapString(PyObject *unicode,
    8195             :                           PyObject *mapping)
    8196             : {
    8197           0 :     if (!PyUnicode_Check(unicode) || mapping == NULL) {
    8198           0 :         PyErr_BadArgument();
    8199           0 :         return NULL;
    8200             :     }
    8201           0 :     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    8202             : }
    8203             : 
    8204             : /* create or adjust a UnicodeTranslateError */
    8205             : static void
    8206           0 : make_translate_exception(PyObject **exceptionObject,
    8207             :                          PyObject *unicode,
    8208             :                          Py_ssize_t startpos, Py_ssize_t endpos,
    8209             :                          const char *reason)
    8210             : {
    8211           0 :     if (*exceptionObject == NULL) {
    8212           0 :         *exceptionObject = _PyUnicodeTranslateError_Create(
    8213             :             unicode, startpos, endpos, reason);
    8214             :     }
    8215             :     else {
    8216           0 :         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
    8217           0 :             goto onError;
    8218           0 :         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
    8219           0 :             goto onError;
    8220           0 :         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
    8221           0 :             goto onError;
    8222           0 :         return;
    8223             :       onError:
    8224           0 :         Py_DECREF(*exceptionObject);
    8225           0 :         *exceptionObject = NULL;
    8226             :     }
    8227             : }
    8228             : 
    8229             : /* raises a UnicodeTranslateError */
    8230             : static void
    8231           0 : raise_translate_exception(PyObject **exceptionObject,
    8232             :                           PyObject *unicode,
    8233             :                           Py_ssize_t startpos, Py_ssize_t endpos,
    8234             :                           const char *reason)
    8235             : {
    8236           0 :     make_translate_exception(exceptionObject,
    8237             :                              unicode, startpos, endpos, reason);
    8238           0 :     if (*exceptionObject != NULL)
    8239           0 :         PyCodec_StrictErrors(*exceptionObject);
    8240           0 : }
    8241             : 
    8242             : /* error handling callback helper:
    8243             :    build arguments, call the callback and check the arguments,
    8244             :    put the result into newpos and return the replacement string, which
    8245             :    has to be freed by the caller */
    8246             : static PyObject *
    8247           0 : unicode_translate_call_errorhandler(const char *errors,
    8248             :                                     PyObject **errorHandler,
    8249             :                                     const char *reason,
    8250             :                                     PyObject *unicode, PyObject **exceptionObject,
    8251             :                                     Py_ssize_t startpos, Py_ssize_t endpos,
    8252             :                                     Py_ssize_t *newpos)
    8253             : {
    8254             :     static char *argparse = "O!n;translating error handler must return (str, int) tuple";
    8255             : 
    8256             :     Py_ssize_t i_newpos;
    8257             :     PyObject *restuple;
    8258             :     PyObject *resunicode;
    8259             : 
    8260           0 :     if (*errorHandler == NULL) {
    8261           0 :         *errorHandler = PyCodec_LookupError(errors);
    8262           0 :         if (*errorHandler == NULL)
    8263           0 :             return NULL;
    8264             :     }
    8265             : 
    8266           0 :     make_translate_exception(exceptionObject,
    8267             :                              unicode, startpos, endpos, reason);
    8268           0 :     if (*exceptionObject == NULL)
    8269           0 :         return NULL;
    8270             : 
    8271           0 :     restuple = PyObject_CallFunctionObjArgs(
    8272             :         *errorHandler, *exceptionObject, NULL);
    8273           0 :     if (restuple == NULL)
    8274           0 :         return NULL;
    8275           0 :     if (!PyTuple_Check(restuple)) {
    8276           0 :         PyErr_SetString(PyExc_TypeError, &argparse[4]);
    8277           0 :         Py_DECREF(restuple);
    8278           0 :         return NULL;
    8279             :     }
    8280           0 :     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
    8281             :                           &resunicode, &i_newpos)) {
    8282           0 :         Py_DECREF(restuple);
    8283           0 :         return NULL;
    8284             :     }
    8285           0 :     if (i_newpos<0)
    8286           0 :         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
    8287             :     else
    8288           0 :         *newpos = i_newpos;
    8289           0 :     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
    8290           0 :         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
    8291           0 :         Py_DECREF(restuple);
    8292           0 :         return NULL;
    8293             :     }
    8294           0 :     Py_INCREF(resunicode);
    8295           0 :     Py_DECREF(restuple);
    8296           0 :     return resunicode;
    8297             : }
    8298             : 
    8299             : /* Lookup the character ch in the mapping and put the result in result,
    8300             :    which must be decrefed by the caller.
    8301             :    Return 0 on success, -1 on error */
    8302             : static int
    8303           0 : charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
    8304             : {
    8305           0 :     PyObject *w = PyLong_FromLong((long)c);
    8306             :     PyObject *x;
    8307             : 
    8308           0 :     if (w == NULL)
    8309           0 :         return -1;
    8310           0 :     x = PyObject_GetItem(mapping, w);
    8311           0 :     Py_DECREF(w);
    8312           0 :     if (x == NULL) {
    8313           0 :         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
    8314             :             /* No mapping found means: use 1:1 mapping. */
    8315           0 :             PyErr_Clear();
    8316           0 :             *result = NULL;
    8317           0 :             return 0;
    8318             :         } else
    8319           0 :             return -1;
    8320             :     }
    8321           0 :     else if (x == Py_None) {
    8322           0 :         *result = x;
    8323           0 :         return 0;
    8324             :     }
    8325           0 :     else if (PyLong_Check(x)) {
    8326           0 :         long value = PyLong_AS_LONG(x);
    8327           0 :         long max = PyUnicode_GetMax();
    8328           0 :         if (value < 0 || value > max) {
    8329           0 :             PyErr_Format(PyExc_TypeError,
    8330             :                          "character mapping must be in range(0x%x)", max+1);
    8331           0 :             Py_DECREF(x);
    8332           0 :             return -1;
    8333             :         }
    8334           0 :         *result = x;
    8335           0 :         return 0;
    8336             :     }
    8337           0 :     else if (PyUnicode_Check(x)) {
    8338           0 :         *result = x;
    8339           0 :         return 0;
    8340             :     }
    8341             :     else {
    8342             :         /* wrong return value */
    8343           0 :         PyErr_SetString(PyExc_TypeError,
    8344             :                         "character mapping must return integer, None or str");
    8345           0 :         Py_DECREF(x);
    8346           0 :         return -1;
    8347             :     }
    8348             : }
    8349             : /* Ensure that *outobj has room for at least requiredsize characters;
    8350             :    if not, reallocate it and update *outobj and *psize.
    8351             :    Return 0 on success, -1 on error */
    8352             : static int
    8353           0 : charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
    8354             :                                Py_ssize_t requiredsize)
    8355             : {
    8356           0 :     Py_ssize_t oldsize = *psize;
    8357             :     Py_UCS4 *new_outobj;
    8358           0 :     if (requiredsize > oldsize) {
    8359             :         /* exponentially overallocate to minimize reallocations */
    8360           0 :         if (requiredsize < 2 * oldsize)
    8361           0 :             requiredsize = 2 * oldsize;
    8362           0 :         new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
    8363           0 :         if (new_outobj == 0)
    8364           0 :             return -1;
    8365           0 :         *outobj = new_outobj;
    8366           0 :         *psize = requiredsize;
    8367             :     }
    8368           0 :     return 0;
    8369             : }
    8370             : /* Look up the character, put the result in the output string and adjust
    8371             :    various state variables. Return a new reference to the object that
    8372             :    was put in the output buffer in *res, or Py_None, if the mapping was
    8373             :    undefined (in which case no character was written).
    8374             :    The caller must decref *res.
    8375             :    Return 0 on success, -1 on error. */
    8376             : static int
    8377           0 : charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
    8378             :                         PyObject *mapping, Py_UCS4 **output,
    8379             :                         Py_ssize_t *osize, Py_ssize_t *opos,
    8380             :                         PyObject **res)
    8381             : {
    8382           0 :     Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
    8383           0 :     if (charmaptranslate_lookup(curinp, mapping, res))
    8384           0 :         return -1;
    8385           0 :     if (*res==NULL) {
    8386             :         /* not found => default to 1:1 mapping */
    8387           0 :         (*output)[(*opos)++] = curinp;
    8388             :     }
    8389           0 :     else if (*res==Py_None)
    8390             :         ;
    8391           0 :     else if (PyLong_Check(*res)) {
    8392             :         /* no overflow check, because we know that the space is enough */
    8393           0 :         (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
    8394             :     }
    8395           0 :     else if (PyUnicode_Check(*res)) {
    8396             :         Py_ssize_t repsize;
    8397           0 :         if (PyUnicode_READY(*res) == -1)
    8398           0 :             return -1;
    8399           0 :         repsize = PyUnicode_GET_LENGTH(*res);
    8400           0 :         if (repsize==1) {
    8401             :             /* no overflow check, because we know that the space is enough */
    8402           0 :             (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
    8403             :         }
    8404           0 :         else if (repsize!=0) {
    8405             :             /* more than one character */
    8406           0 :             Py_ssize_t requiredsize = *opos +
    8407           0 :                 (PyUnicode_GET_LENGTH(input) - ipos) +
    8408             :                 repsize - 1;
    8409             :             Py_ssize_t i;
    8410           0 :             if (charmaptranslate_makespace(output, osize, requiredsize))
    8411           0 :                 return -1;
    8412           0 :             for(i = 0; i < repsize; i++)
    8413           0 :                 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
    8414             :         }
    8415             :     }
    8416             :     else
    8417           0 :         return -1;
    8418           0 :     return 0;
    8419             : }
    8420             : 
    8421             : PyObject *
    8422           0 : _PyUnicode_TranslateCharmap(PyObject *input,
    8423             :                             PyObject *mapping,
    8424             :                             const char *errors)
    8425             : {
    8426             :     /* input object */
    8427             :     char *idata;
    8428             :     Py_ssize_t size, i;
    8429             :     int kind;
    8430             :     /* output buffer */
    8431           0 :     Py_UCS4 *output = NULL;
    8432             :     Py_ssize_t osize;
    8433             :     PyObject *res;
    8434             :     /* current output position */
    8435             :     Py_ssize_t opos;
    8436           0 :     char *reason = "character maps to <undefined>";
    8437           0 :     PyObject *errorHandler = NULL;
    8438           0 :     PyObject *exc = NULL;
    8439             :     /* the following variable is used for caching string comparisons
    8440             :      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
    8441             :      * 3=ignore, 4=xmlcharrefreplace */
    8442           0 :     int known_errorHandler = -1;
    8443             : 
    8444           0 :     if (mapping == NULL) {
    8445           0 :         PyErr_BadArgument();
    8446           0 :         return NULL;
    8447             :     }
    8448             : 
    8449           0 :     if (PyUnicode_READY(input) == -1)
    8450           0 :         return NULL;
    8451           0 :     idata = (char*)PyUnicode_DATA(input);
    8452           0 :     kind = PyUnicode_KIND(input);
    8453           0 :     size = PyUnicode_GET_LENGTH(input);
    8454           0 :     i = 0;
    8455             : 
    8456           0 :     if (size == 0) {
    8457           0 :         Py_INCREF(input);
    8458           0 :         return input;
    8459             :     }
    8460             : 
    8461             :     /* allocate enough for a simple 1:1 translation without
    8462             :        replacements, if we need more, we'll resize */
    8463           0 :     osize = size;
    8464           0 :     output = PyMem_Malloc(osize * sizeof(Py_UCS4));
    8465           0 :     opos = 0;
    8466           0 :     if (output == NULL) {
    8467           0 :         PyErr_NoMemory();
    8468           0 :         goto onError;
    8469             :     }
    8470             : 
    8471           0 :     while (i<size) {
    8472             :         /* try to encode it */
    8473           0 :         PyObject *x = NULL;
    8474           0 :         if (charmaptranslate_output(input, i, mapping,
    8475             :                                     &output, &osize, &opos, &x)) {
    8476           0 :             Py_XDECREF(x);
    8477             :             goto onError;
    8478             :         }
    8479           0 :         Py_XDECREF(x);
    8480           0 :         if (x!=Py_None) /* it worked => adjust input pointer */
    8481           0 :             ++i;
    8482             :         else { /* untranslatable character */
    8483           0 :             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    8484             :             Py_ssize_t repsize;
    8485             :             Py_ssize_t newpos;
    8486             :             Py_ssize_t uni2;
    8487             :             /* startpos for collecting untranslatable chars */
    8488           0 :             Py_ssize_t collstart = i;
    8489           0 :             Py_ssize_t collend = i+1;
    8490             :             Py_ssize_t coll;
    8491             : 
    8492             :             /* find all untranslatable characters */
    8493           0 :             while (collend < size) {
    8494           0 :                 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
    8495             :                     goto onError;
    8496           0 :                 Py_XDECREF(x);
    8497           0 :                 if (x!=Py_None)
    8498           0 :                     break;
    8499           0 :                 ++collend;
    8500             :             }
    8501             :             /* cache callback name lookup
    8502             :              * (if not done yet, i.e. it's the first error) */
    8503           0 :             if (known_errorHandler==-1) {
    8504           0 :                 if ((errors==NULL) || (!strcmp(errors, "strict")))
    8505           0 :                     known_errorHandler = 1;
    8506           0 :                 else if (!strcmp(errors, "replace"))
    8507           0 :                     known_errorHandler = 2;
    8508           0 :                 else if (!strcmp(errors, "ignore"))
    8509           0 :                     known_errorHandler = 3;
    8510           0 :                 else if (!strcmp(errors, "xmlcharrefreplace"))
    8511           0 :                     known_errorHandler = 4;
    8512             :                 else
    8513           0 :                     known_errorHandler = 0;
    8514             :             }
    8515           0 :             switch (known_errorHandler) {
    8516             :             case 1: /* strict */
    8517           0 :                 raise_translate_exception(&exc, input, collstart,
    8518             :                                           collend, reason);
    8519             :                 goto onError;
    8520             :             case 2: /* replace */
    8521             :                 /* No need to check for space, this is a 1:1 replacement */
    8522           0 :                 for (coll = collstart; coll<collend; coll++)
    8523           0 :                     output[opos++] = '?';
    8524             :                 /* fall through */
    8525             :             case 3: /* ignore */
    8526           0 :                 i = collend;
    8527           0 :                 break;
    8528             :             case 4: /* xmlcharrefreplace */
    8529             :                 /* generate replacement (temporarily (mis)uses i) */
    8530           0 :                 for (i = collstart; i < collend; ++i) {
    8531             :                     char buffer[2+29+1+1];
    8532             :                     char *cp;
    8533           0 :                     sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
    8534           0 :                     if (charmaptranslate_makespace(&output, &osize,
    8535           0 :                                                    opos+strlen(buffer)+(size-collend)))
    8536             :                         goto onError;
    8537           0 :                     for (cp = buffer; *cp; ++cp)
    8538           0 :                         output[opos++] = *cp;
    8539             :                 }
    8540           0 :                 i = collend;
    8541           0 :                 break;
    8542             :             default:
    8543           0 :                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
    8544             :                                                                  reason, input, &exc,
    8545             :                                                                  collstart, collend, &newpos);
    8546           0 :                 if (repunicode == NULL)
    8547             :                     goto onError;
    8548           0 :                 if (PyUnicode_READY(repunicode) == -1) {
    8549           0 :                     Py_DECREF(repunicode);
    8550             :                     goto onError;
    8551             :                 }
    8552             :                 /* generate replacement  */
    8553           0 :                 repsize = PyUnicode_GET_LENGTH(repunicode);
    8554           0 :                 if (charmaptranslate_makespace(&output, &osize,
    8555           0 :                                                opos+repsize+(size-collend))) {
    8556           0 :                     Py_DECREF(repunicode);
    8557             :                     goto onError;
    8558             :                 }
    8559           0 :                 for (uni2 = 0; repsize-->0; ++uni2)
    8560           0 :                     output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
    8561           0 :                 i = newpos;
    8562           0 :                 Py_DECREF(repunicode);
    8563             :             }
    8564             :         }
    8565             :     }
    8566           0 :     res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
    8567           0 :     if (!res)
    8568           0 :         goto onError;
    8569           0 :     PyMem_Free(output);
    8570           0 :     Py_XDECREF(exc);
    8571           0 :     Py_XDECREF(errorHandler);
    8572           0 :     return res;
    8573             : 
    8574             :   onError:
    8575           0 :     PyMem_Free(output);
    8576           0 :     Py_XDECREF(exc);
    8577           0 :     Py_XDECREF(errorHandler);
    8578           0 :     return NULL;
    8579             : }
    8580             : 
    8581             : /* Deprecated. Use PyUnicode_Translate instead. */
    8582             : PyObject *
    8583           0 : PyUnicode_TranslateCharmap(const Py_UNICODE *p,
    8584             :                            Py_ssize_t size,
    8585             :                            PyObject *mapping,
    8586             :                            const char *errors)
    8587             : {
    8588             :     PyObject *result;
    8589           0 :     PyObject *unicode = PyUnicode_FromUnicode(p, size);
    8590           0 :     if (!unicode)
    8591           0 :         return NULL;
    8592           0 :     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
    8593           0 :     Py_DECREF(unicode);
    8594           0 :     return result;
    8595             : }
    8596             : 
    8597             : PyObject *
    8598           0 : PyUnicode_Translate(PyObject *str,
    8599             :                     PyObject *mapping,
    8600             :                     const char *errors)
    8601             : {
    8602             :     PyObject *result;
    8603             : 
    8604           0 :     str = PyUnicode_FromObject(str);
    8605           0 :     if (str == NULL)
    8606           0 :         return NULL;
    8607           0 :     result = _PyUnicode_TranslateCharmap(str, mapping, errors);
    8608           0 :     Py_DECREF(str);
    8609           0 :     return result;
    8610             : }
    8611             : 
    8612             : static Py_UCS4
    8613           0 : fix_decimal_and_space_to_ascii(PyObject *self)
    8614             : {
    8615             :     /* No need to call PyUnicode_READY(self) because this function is only
    8616             :        called as a callback from fixup() which does it already. */
    8617           0 :     const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    8618           0 :     const int kind = PyUnicode_KIND(self);
    8619           0 :     void *data = PyUnicode_DATA(self);
    8620           0 :     Py_UCS4 maxchar = 127, ch, fixed;
    8621           0 :     int modified = 0;
    8622             :     Py_ssize_t i;
    8623             : 
    8624           0 :     for (i = 0; i < len; ++i) {
    8625           0 :         ch = PyUnicode_READ(kind, data, i);
    8626           0 :         fixed = 0;
    8627           0 :         if (ch > 127) {
    8628           0 :             if (Py_UNICODE_ISSPACE(ch))
    8629           0 :                 fixed = ' ';
    8630             :             else {
    8631           0 :                 const int decimal = Py_UNICODE_TODECIMAL(ch);
    8632           0 :                 if (decimal >= 0)
    8633           0 :                     fixed = '0' + decimal;
    8634             :             }
    8635           0 :             if (fixed != 0) {
    8636           0 :                 modified = 1;
    8637           0 :                 maxchar = MAX_MAXCHAR(maxchar, fixed);
    8638           0 :                 PyUnicode_WRITE(kind, data, i, fixed);
    8639             :             }
    8640             :             else
    8641           0 :                 maxchar = MAX_MAXCHAR(maxchar, ch);
    8642             :         }
    8643             :     }
    8644             : 
    8645           0 :     return (modified) ? maxchar : 0;
    8646             : }
    8647             : 
    8648             : PyObject *
    8649          12 : _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
    8650             : {
    8651          12 :     if (!PyUnicode_Check(unicode)) {
    8652           0 :         PyErr_BadInternalCall();
    8653           0 :         return NULL;
    8654             :     }
    8655          12 :     if (PyUnicode_READY(unicode) == -1)
    8656           0 :         return NULL;
    8657          12 :     if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
    8658             :         /* If the string is already ASCII, just return the same string */
    8659          12 :         Py_INCREF(unicode);
    8660          12 :         return unicode;
    8661             :     }
    8662           0 :     return fixup(unicode, fix_decimal_and_space_to_ascii);
    8663             : }
    8664             : 
    8665             : PyObject *
    8666           0 : PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
    8667             :                                   Py_ssize_t length)
    8668             : {
    8669             :     PyObject *decimal;
    8670             :     Py_ssize_t i;
    8671             :     Py_UCS4 maxchar;
    8672             :     enum PyUnicode_Kind kind;
    8673             :     void *data;
    8674             : 
    8675           0 :     maxchar = 127;
    8676           0 :     for (i = 0; i < length; i++) {
    8677           0 :         Py_UNICODE ch = s[i];
    8678           0 :         if (ch > 127) {
    8679           0 :             int decimal = Py_UNICODE_TODECIMAL(ch);
    8680           0 :             if (decimal >= 0)
    8681           0 :                 ch = '0' + decimal;
    8682           0 :             maxchar = MAX_MAXCHAR(maxchar, ch);
    8683             :         }
    8684             :     }
    8685             : 
    8686             :     /* Copy to a new string */
    8687           0 :     decimal = PyUnicode_New(length, maxchar);
    8688           0 :     if (decimal == NULL)
    8689           0 :         return decimal;
    8690           0 :     kind = PyUnicode_KIND(decimal);
    8691           0 :     data = PyUnicode_DATA(decimal);
    8692             :     /* Iterate over code points */
    8693           0 :     for (i = 0; i < length; i++) {
    8694           0 :         Py_UNICODE ch = s[i];
    8695           0 :         if (ch > 127) {
    8696           0 :             int decimal = Py_UNICODE_TODECIMAL(ch);
    8697           0 :             if (decimal >= 0)
    8698           0 :                 ch = '0' + decimal;
    8699             :         }
    8700           0 :         PyUnicode_WRITE(kind, data, i, ch);
    8701             :     }
    8702           0 :     return unicode_result(decimal);
    8703             : }
    8704             : /* --- Decimal Encoder ---------------------------------------------------- */
    8705             : 
    8706             : int
    8707           0 : PyUnicode_EncodeDecimal(Py_UNICODE *s,
    8708             :                         Py_ssize_t length,
    8709             :                         char *output,
    8710             :                         const char *errors)
    8711             : {
    8712             :     PyObject *unicode;
    8713             :     Py_ssize_t i;
    8714             :     enum PyUnicode_Kind kind;
    8715             :     void *data;
    8716             : 
    8717           0 :     if (output == NULL) {
    8718           0 :         PyErr_BadArgument();
    8719           0 :         return -1;
    8720             :     }
    8721             : 
    8722           0 :     unicode = PyUnicode_FromUnicode(s, length);
    8723           0 :     if (unicode == NULL)
    8724           0 :         return -1;
    8725             : 
    8726           0 :     if (PyUnicode_READY(unicode) == -1) {
    8727           0 :         Py_DECREF(unicode);
    8728           0 :         return -1;
    8729             :     }
    8730           0 :     kind = PyUnicode_KIND(unicode);
    8731           0 :     data = PyUnicode_DATA(unicode);
    8732             : 
    8733           0 :     for (i=0; i < length; ) {
    8734             :         PyObject *exc;
    8735             :         Py_UCS4 ch;
    8736             :         int decimal;
    8737             :         Py_ssize_t startpos;
    8738             : 
    8739           0 :         ch = PyUnicode_READ(kind, data, i);
    8740             : 
    8741           0 :         if (Py_UNICODE_ISSPACE(ch)) {
    8742           0 :             *output++ = ' ';
    8743           0 :             i++;
    8744           0 :             continue;
    8745             :         }
    8746           0 :         decimal = Py_UNICODE_TODECIMAL(ch);
    8747           0 :         if (decimal >= 0) {
    8748           0 :             *output++ = '0' + decimal;
    8749           0 :             i++;
    8750           0 :             continue;
    8751             :         }
    8752           0 :         if (0 < ch && ch < 256) {
    8753           0 :             *output++ = (char)ch;
    8754           0 :             i++;
    8755           0 :             continue;
    8756             :         }
    8757             : 
    8758           0 :         startpos = i;
    8759           0 :         exc = NULL;
    8760           0 :         raise_encode_exception(&exc, "decimal", unicode,
    8761             :                                startpos, startpos+1,
    8762             :                                "invalid decimal Unicode string");
    8763           0 :         Py_XDECREF(exc);
    8764           0 :         Py_DECREF(unicode);
    8765           0 :         return -1;
    8766             :     }
    8767             :     /* 0-terminate the output string */
    8768           0 :     *output++ = '\0';
    8769           0 :     Py_DECREF(unicode);
    8770           0 :     return 0;
    8771             : }
    8772             : 
    8773             : /* --- Helpers ------------------------------------------------------------ */
    8774             : 
    8775             : static Py_ssize_t
    8776           7 : any_find_slice(int direction, PyObject* s1, PyObject* s2,
    8777             :                Py_ssize_t start,
    8778             :                Py_ssize_t end)
    8779             : {
    8780             :     int kind1, kind2, kind;
    8781             :     void *buf1, *buf2;
    8782             :     Py_ssize_t len1, len2, result;
    8783             : 
    8784           7 :     kind1 = PyUnicode_KIND(s1);
    8785           7 :     kind2 = PyUnicode_KIND(s2);
    8786           7 :     kind = kind1 > kind2 ? kind1 : kind2;
    8787           7 :     buf1 = PyUnicode_DATA(s1);
    8788           7 :     buf2 = PyUnicode_DATA(s2);
    8789           7 :     if (kind1 != kind)
    8790           0 :         buf1 = _PyUnicode_AsKind(s1, kind);
    8791           7 :     if (!buf1)
    8792           0 :         return -2;
    8793           7 :     if (kind2 != kind)
    8794           0 :         buf2 = _PyUnicode_AsKind(s2, kind);
    8795           7 :     if (!buf2) {
    8796           0 :         if (kind1 != kind) PyMem_Free(buf1);
    8797           0 :         return -2;
    8798             :     }
    8799           7 :     len1 = PyUnicode_GET_LENGTH(s1);
    8800           7 :     len2 = PyUnicode_GET_LENGTH(s2);
    8801             : 
    8802           7 :     if (direction > 0) {
    8803           2 :         switch (kind) {
    8804             :         case PyUnicode_1BYTE_KIND:
    8805           2 :             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
    8806           2 :                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
    8807             :             else
    8808           0 :                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
    8809           2 :             break;
    8810             :         case PyUnicode_2BYTE_KIND:
    8811           0 :             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
    8812           0 :             break;
    8813             :         case PyUnicode_4BYTE_KIND:
    8814           0 :             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
    8815           0 :             break;
    8816             :         default:
    8817           0 :             assert(0); result = -2;
    8818             :         }
    8819             :     }
    8820             :     else {
    8821           5 :         switch (kind) {
    8822             :         case PyUnicode_1BYTE_KIND:
    8823           5 :             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
    8824           5 :                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8825             :             else
    8826           0 :                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8827           5 :             break;
    8828             :         case PyUnicode_2BYTE_KIND:
    8829           0 :             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8830           0 :             break;
    8831             :         case PyUnicode_4BYTE_KIND:
    8832           0 :             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
    8833           0 :             break;
    8834             :         default:
    8835           0 :             assert(0); result = -2;
    8836             :         }
    8837             :     }
    8838             : 
    8839           7 :     if (kind1 != kind)
    8840           0 :         PyMem_Free(buf1);
    8841           7 :     if (kind2 != kind)
    8842           0 :         PyMem_Free(buf2);
    8843             : 
    8844           7 :     return result;
    8845             : }
    8846             : 
    8847             : Py_ssize_t
    8848           0 : _PyUnicode_InsertThousandsGrouping(
    8849             :     PyObject *unicode, Py_ssize_t index,
    8850             :     Py_ssize_t n_buffer,
    8851             :     void *digits, Py_ssize_t n_digits,
    8852             :     Py_ssize_t min_width,
    8853             :     const char *grouping, PyObject *thousands_sep,
    8854             :     Py_UCS4 *maxchar)
    8855             : {
    8856             :     unsigned int kind, thousands_sep_kind;
    8857             :     char *data, *thousands_sep_data;
    8858             :     Py_ssize_t thousands_sep_len;
    8859             :     Py_ssize_t len;
    8860             : 
    8861           0 :     if (unicode != NULL) {
    8862           0 :         kind = PyUnicode_KIND(unicode);
    8863           0 :         data = (char *) PyUnicode_DATA(unicode) + index * kind;
    8864             :     }
    8865             :     else {
    8866           0 :         kind = PyUnicode_1BYTE_KIND;
    8867           0 :         data = NULL;
    8868             :     }
    8869           0 :     thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    8870           0 :     thousands_sep_data = PyUnicode_DATA(thousands_sep);
    8871           0 :     thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    8872           0 :     if (unicode != NULL && thousands_sep_kind != kind) {
    8873           0 :         if (thousands_sep_kind < kind) {
    8874           0 :             thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
    8875           0 :             if (!thousands_sep_data)
    8876           0 :                 return -1;
    8877             :         }
    8878             :         else {
    8879           0 :             data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
    8880           0 :             if (!data)
    8881           0 :                 return -1;
    8882             :         }
    8883             :     }
    8884             : 
    8885           0 :     switch (kind) {
    8886             :     case PyUnicode_1BYTE_KIND:
    8887           0 :         if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
    8888           0 :             len = asciilib_InsertThousandsGrouping(
    8889             :                 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
    8890             :                 min_width, grouping,
    8891             :                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
    8892             :         else
    8893           0 :             len = ucs1lib_InsertThousandsGrouping(
    8894             :                 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
    8895             :                 min_width, grouping,
    8896             :                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
    8897           0 :         break;
    8898             :     case PyUnicode_2BYTE_KIND:
    8899           0 :         len = ucs2lib_InsertThousandsGrouping(
    8900             :             (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
    8901             :             min_width, grouping,
    8902             :             (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
    8903           0 :         break;
    8904             :     case PyUnicode_4BYTE_KIND:
    8905           0 :         len = ucs4lib_InsertThousandsGrouping(
    8906             :             (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
    8907             :             min_width, grouping,
    8908             :             (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
    8909           0 :         break;
    8910             :     default:
    8911             :         assert(0);
    8912           0 :         return -1;
    8913             :     }
    8914           0 :     if (unicode != NULL && thousands_sep_kind != kind) {
    8915           0 :         if (thousands_sep_kind < kind)
    8916           0 :             PyMem_Free(thousands_sep_data);
    8917             :         else
    8918           0 :             PyMem_Free(data);
    8919             :     }
    8920           0 :     if (unicode == NULL) {
    8921           0 :         *maxchar = 127;
    8922           0 :         if (len != n_digits) {
    8923           0 :             *maxchar = MAX_MAXCHAR(*maxchar,
    8924             :                                    PyUnicode_MAX_CHAR_VALUE(thousands_sep));
    8925             :         }
    8926             :     }
    8927           0 :     return len;
    8928             : }
    8929             : 
    8930             : 
    8931             : /* helper macro to fixup start/end slice values
                          Clamps `end` to at most `len`; a negative `end` or `start`
                          has `len` added to it and is then clamped to 0 if it is
                          still negative.  `start` is not clamped against `len`.
                          `start` and `end` are assigned to, so they must be
                          modifiable lvalues; every argument is expanded more than
                          once and without parentheses, so pass simple,
                          side-effect-free expressions.
                          NOTE(review): the expansion is two consecutive bare `if`
                          statements rather than a single statement, so it must not
                          be used as the body of an unbraced if/else. */
    8932             : #define ADJUST_INDICES(start, end, len)         \
    8933             :     if (end > len)                              \
    8934             :         end = len;                              \
    8935             :     else if (end < 0) {                         \
    8936             :         end += len;                             \
    8937             :         if (end < 0)                            \
    8938             :             end = 0;                            \
    8939             :     }                                           \
    8940             :     if (start < 0) {                            \
    8941             :         start += len;                           \
    8942             :         if (start < 0)                          \
    8943             :             start = 0;                          \
    8944             :     }
    8945             : 
    8946             : Py_ssize_t
    8947           0 : PyUnicode_Count(PyObject *str,
    8948             :                 PyObject *substr,
    8949             :                 Py_ssize_t start,
    8950             :                 Py_ssize_t end)
    8951             : {
    8952             :     Py_ssize_t result;
    8953             :     PyObject* str_obj;
    8954             :     PyObject* sub_obj;
    8955             :     int kind1, kind2, kind;
    8956           0 :     void *buf1 = NULL, *buf2 = NULL;
    8957             :     Py_ssize_t len1, len2;
    8958             : 
    8959           0 :     str_obj = PyUnicode_FromObject(str);
    8960           0 :     if (!str_obj)
    8961           0 :         return -1;
    8962           0 :     sub_obj = PyUnicode_FromObject(substr);
    8963           0 :     if (!sub_obj) {
    8964           0 :         Py_DECREF(str_obj);
    8965           0 :         return -1;
    8966             :     }
    8967           0 :     if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
    8968           0 :         Py_DECREF(sub_obj);
    8969           0 :         Py_DECREF(str_obj);
    8970           0 :         return -1;
    8971             :     }
    8972             : 
    8973           0 :     kind1 = PyUnicode_KIND(str_obj);
    8974           0 :     kind2 = PyUnicode_KIND(sub_obj);
    8975           0 :     kind = kind1;
    8976           0 :     buf1 = PyUnicode_DATA(str_obj);
    8977           0 :     buf2 = PyUnicode_DATA(sub_obj);
    8978           0 :     if (kind2 != kind) {
    8979           0 :         if (kind2 > kind) {
    8980           0 :             Py_DECREF(sub_obj);
    8981           0 :             Py_DECREF(str_obj);
    8982           0 :             return 0;
    8983             :         }
    8984           0 :         buf2 = _PyUnicode_AsKind(sub_obj, kind);
    8985             :     }
    8986           0 :     if (!buf2)
    8987           0 :         goto onError;
    8988           0 :     len1 = PyUnicode_GET_LENGTH(str_obj);
    8989           0 :     len2 = PyUnicode_GET_LENGTH(sub_obj);
    8990             : 
    8991           0 :     ADJUST_INDICES(start, end, len1);
    8992           0 :     switch (kind) {
    8993             :     case PyUnicode_1BYTE_KIND:
    8994           0 :         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
    8995           0 :             result = asciilib_count(
    8996             :                 ((Py_UCS1*)buf1) + start, end - start,
    8997             :                 buf2, len2, PY_SSIZE_T_MAX
    8998             :                 );
    8999             :         else
    9000           0 :             result = ucs1lib_count(
    9001             :                 ((Py_UCS1*)buf1) + start, end - start,
    9002             :                 buf2, len2, PY_SSIZE_T_MAX
    9003             :                 );
    9004           0 :         break;
    9005             :     case PyUnicode_2BYTE_KIND:
    9006           0 :         result = ucs2lib_count(
    9007           0 :             ((Py_UCS2*)buf1) + start, end - start,
    9008             :             buf2, len2, PY_SSIZE_T_MAX
    9009             :             );
    9010           0 :         break;
    9011             :     case PyUnicode_4BYTE_KIND:
    9012           0 :         result = ucs4lib_count(
    9013           0 :             ((Py_UCS4*)buf1) + start, end - start,
    9014             :             buf2, len2, PY_SSIZE_T_MAX
    9015             :             );
    9016           0 :         break;
    9017             :     default:
    9018           0 :         assert(0); result = 0;
    9019             :     }
    9020             : 
    9021           0 :     Py_DECREF(sub_obj);
    9022           0 :     Py_DECREF(str_obj);
    9023             : 
    9024           0 :     if (kind2 != kind)
    9025           0 :         PyMem_Free(buf2);
    9026             : 
    9027           0 :     return result;
    9028             :   onError:
    9029           0 :     Py_DECREF(sub_obj);
    9030           0 :     Py_DECREF(str_obj);
    9031           0 :     if (kind2 != kind && buf2)
    9032           0 :         PyMem_Free(buf2);
    9033           0 :     return -1;
    9034             : }
    9035             : 
    9036             : Py_ssize_t
    9037           0 : PyUnicode_Find(PyObject *str,
    9038             :                PyObject *sub,
    9039             :                Py_ssize_t start,
    9040             :                Py_ssize_t end,
    9041             :                int direction)
    9042             : {
    9043             :     Py_ssize_t result;
    9044             : 
    9045           0 :     str = PyUnicode_FromObject(str);
    9046           0 :     if (!str)
    9047           0 :         return -2;
    9048           0 :     sub = PyUnicode_FromObject(sub);
    9049           0 :     if (!sub) {
    9050           0 :         Py_DECREF(str);
    9051           0 :         return -2;
    9052             :     }
    9053           0 :     if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
    9054           0 :         Py_DECREF(sub);
    9055           0 :         Py_DECREF(str);
    9056           0 :         return -2;
    9057             :     }
    9058             : 
    9059           0 :     result = any_find_slice(direction,
    9060             :         str, sub, start, end
    9061             :         );
    9062             : 
    9063           0 :     Py_DECREF(str);
    9064           0 :     Py_DECREF(sub);
    9065             : 
    9066           0 :     return result;
    9067             : }
    9068             : 
    9069             : Py_ssize_t
    9070          44 : PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
    9071             :                    Py_ssize_t start, Py_ssize_t end,
    9072             :                    int direction)
    9073             : {
    9074             :     int kind;
    9075             :     Py_ssize_t result;
    9076          44 :     if (PyUnicode_READY(str) == -1)
    9077           0 :         return -2;
    9078          44 :     if (start < 0 || end < 0) {
    9079           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    9080           0 :         return -2;
    9081             :     }
    9082          44 :     if (end > PyUnicode_GET_LENGTH(str))
    9083           0 :         end = PyUnicode_GET_LENGTH(str);
    9084          44 :     kind = PyUnicode_KIND(str);
    9085          44 :     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
    9086             :                       kind, end-start, ch, direction);
    9087          44 :     if (result == -1)
    9088          35 :         return -1;
    9089             :     else
    9090           9 :         return start + result;
    9091             : }
    9092             : 
    9093             : static int
    9094         713 : tailmatch(PyObject *self,
    9095             :           PyObject *substring,
    9096             :           Py_ssize_t start,
    9097             :           Py_ssize_t end,
    9098             :           int direction)
    9099             : {
    9100             :     int kind_self;
    9101             :     int kind_sub;
    9102             :     void *data_self;
    9103             :     void *data_sub;
    9104             :     Py_ssize_t offset;
    9105             :     Py_ssize_t i;
    9106             :     Py_ssize_t end_sub;
    9107             : 
    9108        1426 :     if (PyUnicode_READY(self) == -1 ||
    9109         713 :         PyUnicode_READY(substring) == -1)
    9110           0 :         return 0;
    9111             : 
    9112         713 :     if (PyUnicode_GET_LENGTH(substring) == 0)
    9113           0 :         return 1;
    9114             : 
    9115         713 :     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
    9116         713 :     end -= PyUnicode_GET_LENGTH(substring);
    9117         713 :     if (end < start)
    9118          32 :         return 0;
    9119             : 
    9120         681 :     kind_self = PyUnicode_KIND(self);
    9121         681 :     data_self = PyUnicode_DATA(self);
    9122         681 :     kind_sub = PyUnicode_KIND(substring);
    9123         681 :     data_sub = PyUnicode_DATA(substring);
    9124         681 :     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
    9125             : 
    9126         681 :     if (direction > 0)
    9127         151 :         offset = end;
    9128             :     else
    9129         530 :         offset = start;
    9130             : 
    9131        1362 :     if (PyUnicode_READ(kind_self, data_self, offset) ==
    9132         949 :         PyUnicode_READ(kind_sub, data_sub, 0) &&
    9133         268 :         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
    9134         268 :         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
    9135             :         /* If both are of the same kind, memcmp is sufficient */
    9136         181 :         if (kind_self == kind_sub) {
    9137         362 :             return ! memcmp((char *)data_self +
    9138         181 :                                 (offset * PyUnicode_KIND(substring)),
    9139             :                             data_sub,
    9140         362 :                             PyUnicode_GET_LENGTH(substring) *
    9141         181 :                                 PyUnicode_KIND(substring));
    9142             :         }
    9143             :         /* otherwise we have to compare each character by first accesing it */
    9144             :         else {
    9145             :             /* We do not need to compare 0 and len(substring)-1 because
    9146             :                the if statement above ensured already that they are equal
    9147             :                when we end up here. */
    9148             :             // TODO: honor direction and do a forward or backwards search
    9149           0 :             for (i = 1; i < end_sub; ++i) {
    9150           0 :                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
    9151           0 :                     PyUnicode_READ(kind_sub, data_sub, i))
    9152           0 :                     return 0;
    9153             :             }
    9154           0 :             return 1;
    9155             :         }
    9156             :     }
    9157             : 
    9158         500 :     return 0;
    9159             : }
    9160             : 
    9161             : Py_ssize_t
    9162           0 : PyUnicode_Tailmatch(PyObject *str,
    9163             :                     PyObject *substr,
    9164             :                     Py_ssize_t start,
    9165             :                     Py_ssize_t end,
    9166             :                     int direction)
    9167             : {
    9168             :     Py_ssize_t result;
    9169             : 
    9170           0 :     str = PyUnicode_FromObject(str);
    9171           0 :     if (str == NULL)
    9172           0 :         return -1;
    9173           0 :     substr = PyUnicode_FromObject(substr);
    9174           0 :     if (substr == NULL) {
    9175           0 :         Py_DECREF(str);
    9176           0 :         return -1;
    9177             :     }
    9178             : 
    9179           0 :     result = tailmatch(str, substr,
    9180             :                        start, end, direction);
    9181           0 :     Py_DECREF(str);
    9182           0 :     Py_DECREF(substr);
    9183           0 :     return result;
    9184             : }
    9185             : 
    9186             : /* Apply fixfct filter to the Unicode object self and return a
    9187             :    reference to the modified object */
    9188             : 
    9189             : static PyObject *
    9190           0 : fixup(PyObject *self,
    9191             :       Py_UCS4 (*fixfct)(PyObject *s))
    9192             : {
    9193             :     PyObject *u;
    9194           0 :     Py_UCS4 maxchar_old, maxchar_new = 0;
    9195             :     PyObject *v;
    9196             : 
    9197           0 :     u = _PyUnicode_Copy(self);
    9198           0 :     if (u == NULL)
    9199           0 :         return NULL;
    9200           0 :     maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
    9201             : 
    9202             :     /* fix functions return the new maximum character in a string,
    9203             :        if the kind of the resulting unicode object does not change,
    9204             :        everything is fine.  Otherwise we need to change the string kind
    9205             :        and re-run the fix function. */
    9206           0 :     maxchar_new = fixfct(u);
    9207             : 
    9208           0 :     if (maxchar_new == 0) {
    9209             :         /* no changes */;
    9210           0 :         if (PyUnicode_CheckExact(self)) {
    9211           0 :             Py_DECREF(u);
    9212           0 :             Py_INCREF(self);
    9213           0 :             return self;
    9214             :         }
    9215             :         else
    9216           0 :             return u;
    9217             :     }
    9218             : 
    9219           0 :     maxchar_new = align_maxchar(maxchar_new);
    9220             : 
    9221           0 :     if (maxchar_new == maxchar_old)
    9222           0 :         return u;
    9223             : 
    9224             :     /* In case the maximum character changed, we need to
    9225             :        convert the string to the new category. */
    9226           0 :     v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
    9227           0 :     if (v == NULL) {
    9228           0 :         Py_DECREF(u);
    9229           0 :         return NULL;
    9230             :     }
    9231           0 :     if (maxchar_new > maxchar_old) {
    9232             :         /* If the maxchar increased so that the kind changed, not all
    9233             :            characters are representable anymore and we need to fix the
    9234             :            string again. This only happens in very few cases. */
    9235           0 :         _PyUnicode_FastCopyCharacters(v, 0,
    9236             :                                       self, 0, PyUnicode_GET_LENGTH(self));
    9237           0 :         maxchar_old = fixfct(v);
    9238             :         assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
    9239             :     }
    9240             :     else {
    9241           0 :         _PyUnicode_FastCopyCharacters(v, 0,
    9242             :                                       u, 0, PyUnicode_GET_LENGTH(self));
    9243             :     }
    9244           0 :     Py_DECREF(u);
    9245             :     assert(_PyUnicode_CheckConsistency(v, 1));
    9246           0 :     return v;
    9247             : }
    9248             : 
    9249             : static PyObject *
    9250           1 : ascii_upper_or_lower(PyObject *self, int lower)
    9251             : {
    9252           1 :     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    9253           1 :     char *resdata, *data = PyUnicode_DATA(self);
    9254             :     PyObject *res;
    9255             : 
    9256           1 :     res = PyUnicode_New(len, 127);
    9257           1 :     if (res == NULL)
    9258           0 :         return NULL;
    9259           1 :     resdata = PyUnicode_DATA(res);
    9260           1 :     if (lower)
    9261           1 :         _Py_bytes_lower(resdata, data, len);
    9262             :     else
    9263           0 :         _Py_bytes_upper(resdata, data, len);
    9264           1 :     return res;
    9265             : }
    9266             : 
    9267             : static Py_UCS4
    9268           0 : handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
    9269             : {
    9270             :     Py_ssize_t j;
    9271             :     int final_sigma;
    9272             :     Py_UCS4 c;
    9273             :     /* U+03A3 is in the Final_Sigma context when, it is found like this:
    9274             : 
    9275             :      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
    9276             : 
    9277             :     where ! is a negation and \p{xxx} is a character with property xxx.
    9278             :     */
    9279           0 :     for (j = i - 1; j >= 0; j--) {
    9280           0 :         c = PyUnicode_READ(kind, data, j);
    9281           0 :         if (!_PyUnicode_IsCaseIgnorable(c))
    9282           0 :             break;
    9283             :     }
    9284           0 :     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
    9285           0 :     if (final_sigma) {
    9286           0 :         for (j = i + 1; j < length; j++) {
    9287           0 :             c = PyUnicode_READ(kind, data, j);
    9288           0 :             if (!_PyUnicode_IsCaseIgnorable(c))
    9289           0 :                 break;
    9290             :         }
    9291           0 :         final_sigma = j == length || !_PyUnicode_IsCased(c);
    9292             :     }
    9293           0 :     return (final_sigma) ? 0x3C2 : 0x3C3;
    9294             : }
    9295             : 
    9296             : static int
    9297           0 : lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
    9298             :            Py_UCS4 c, Py_UCS4 *mapped)
    9299             : {
    9300             :     /* Obscure special case. */
    9301           0 :     if (c == 0x3A3) {
    9302           0 :         mapped[0] = handle_capital_sigma(kind, data, length, i);
    9303           0 :         return 1;
    9304             :     }
    9305           0 :     return _PyUnicode_ToLowerFull(c, mapped);
    9306             : }
    9307             : 
    9308             : static Py_ssize_t
    9309           0 : do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9310             : {
    9311           0 :     Py_ssize_t i, k = 0;
    9312             :     int n_res, j;
    9313             :     Py_UCS4 c, mapped[3];
    9314             : 
    9315           0 :     c = PyUnicode_READ(kind, data, 0);
    9316           0 :     n_res = _PyUnicode_ToUpperFull(c, mapped);
    9317           0 :     for (j = 0; j < n_res; j++) {
    9318           0 :         *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9319           0 :         res[k++] = mapped[j];
    9320             :     }
    9321           0 :     for (i = 1; i < length; i++) {
    9322           0 :         c = PyUnicode_READ(kind, data, i);
    9323           0 :         n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9324           0 :         for (j = 0; j < n_res; j++) {
    9325           0 :             *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9326           0 :             res[k++] = mapped[j];
    9327             :         }
    9328             :     }
    9329           0 :     return k;
    9330             : }
    9331             : 
    9332             : static Py_ssize_t
    9333           0 : do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
    9334           0 :     Py_ssize_t i, k = 0;
    9335             : 
    9336           0 :     for (i = 0; i < length; i++) {
    9337           0 :         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
    9338             :         int n_res, j;
    9339           0 :         if (Py_UNICODE_ISUPPER(c)) {
    9340           0 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9341             :         }
    9342           0 :         else if (Py_UNICODE_ISLOWER(c)) {
    9343           0 :             n_res = _PyUnicode_ToUpperFull(c, mapped);
    9344             :         }
    9345             :         else {
    9346           0 :             n_res = 1;
    9347           0 :             mapped[0] = c;
    9348             :         }
    9349           0 :         for (j = 0; j < n_res; j++) {
    9350           0 :             *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9351           0 :             res[k++] = mapped[j];
    9352             :         }
    9353             :     }
    9354           0 :     return k;
    9355             : }
    9356             : 
    9357             : static Py_ssize_t
    9358           0 : do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
    9359             :                   Py_UCS4 *maxchar, int lower)
    9360             : {
    9361           0 :     Py_ssize_t i, k = 0;
    9362             : 
    9363           0 :     for (i = 0; i < length; i++) {
    9364           0 :         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
    9365             :         int n_res, j;
    9366           0 :         if (lower)
    9367           0 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9368             :         else
    9369           0 :             n_res = _PyUnicode_ToUpperFull(c, mapped);
    9370           0 :         for (j = 0; j < n_res; j++) {
    9371           0 :             *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9372           0 :             res[k++] = mapped[j];
    9373             :         }
    9374             :     }
    9375           0 :     return k;
    9376             : }
    9377             : 
    9378             : static Py_ssize_t
    9379           0 : do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9380             : {
    9381           0 :     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
    9382             : }
    9383             : 
    9384             : static Py_ssize_t
    9385           0 : do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9386             : {
    9387           0 :     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
    9388             : }
    9389             : 
    9390             : static Py_ssize_t
    9391           0 : do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9392             : {
    9393           0 :     Py_ssize_t i, k = 0;
    9394             : 
    9395           0 :     for (i = 0; i < length; i++) {
    9396           0 :         Py_UCS4 c = PyUnicode_READ(kind, data, i);
    9397             :         Py_UCS4 mapped[3];
    9398           0 :         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
    9399           0 :         for (j = 0; j < n_res; j++) {
    9400           0 :             *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9401           0 :             res[k++] = mapped[j];
    9402             :         }
    9403             :     }
    9404           0 :     return k;
    9405             : }
    9406             : 
    9407             : static Py_ssize_t
    9408           0 : do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
    9409             : {
    9410           0 :     Py_ssize_t i, k = 0;
    9411             :     int previous_is_cased;
    9412             : 
    9413           0 :     previous_is_cased = 0;
    9414           0 :     for (i = 0; i < length; i++) {
    9415           0 :         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
    9416             :         Py_UCS4 mapped[3];
    9417             :         int n_res, j;
    9418             : 
    9419           0 :         if (previous_is_cased)
    9420           0 :             n_res = lower_ucs4(kind, data, length, i, c, mapped);
    9421             :         else
    9422           0 :             n_res = _PyUnicode_ToTitleFull(c, mapped);
    9423             : 
    9424           0 :         for (j = 0; j < n_res; j++) {
    9425           0 :             *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
    9426           0 :             res[k++] = mapped[j];
    9427             :         }
    9428             : 
    9429           0 :         previous_is_cased = _PyUnicode_IsCased(c);
    9430             :     }
    9431           0 :     return k;
    9432             : }
    9433             : 
    9434             : static PyObject *
    9435           0 : case_operation(PyObject *self,
    9436             :                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
    9437             : {
    9438           0 :     PyObject *res = NULL;
    9439           0 :     Py_ssize_t length, newlength = 0;
    9440             :     int kind, outkind;
    9441             :     void *data, *outdata;
    9442           0 :     Py_UCS4 maxchar = 0, *tmp, *tmpend;
    9443             : 
    9444             :     assert(PyUnicode_IS_READY(self));
    9445             : 
    9446           0 :     kind = PyUnicode_KIND(self);
    9447           0 :     data = PyUnicode_DATA(self);
    9448           0 :     length = PyUnicode_GET_LENGTH(self);
    9449           0 :     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
    9450           0 :     if (tmp == NULL)
    9451           0 :         return PyErr_NoMemory();
    9452           0 :     newlength = perform(kind, data, length, tmp, &maxchar);
    9453           0 :     res = PyUnicode_New(newlength, maxchar);
    9454           0 :     if (res == NULL)
    9455           0 :         goto leave;
    9456           0 :     tmpend = tmp + newlength;
    9457           0 :     outdata = PyUnicode_DATA(res);
    9458           0 :     outkind = PyUnicode_KIND(res);
    9459           0 :     switch (outkind) {
    9460             :     case PyUnicode_1BYTE_KIND:
    9461           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
    9462           0 :         break;
    9463             :     case PyUnicode_2BYTE_KIND:
    9464           0 :         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
    9465           0 :         break;
    9466             :     case PyUnicode_4BYTE_KIND:
    9467           0 :         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
    9468           0 :         break;
    9469             :     default:
    9470             :         assert(0);
    9471           0 :         break;
    9472             :     }
    9473             :   leave:
    9474           0 :     PyMem_FREE(tmp);
    9475           0 :     return res;
    9476             : }
    9477             : 
    9478             : PyObject *
    9479         404 : PyUnicode_Join(PyObject *separator, PyObject *seq)
    9480             : {
    9481         404 :     PyObject *sep = NULL;
    9482             :     Py_ssize_t seplen;
    9483         404 :     PyObject *res = NULL; /* the result */
    9484             :     PyObject *fseq;          /* PySequence_Fast(seq) */
    9485             :     Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
    9486             :     PyObject **items;
    9487             :     PyObject *item;
    9488             :     Py_ssize_t sz, i, res_offset;
    9489             :     Py_UCS4 maxchar;
    9490             :     Py_UCS4 item_maxchar;
    9491             :     int use_memcpy;
    9492         404 :     unsigned char *res_data = NULL, *sep_data = NULL;
    9493             :     PyObject *last_obj;
    9494         404 :     unsigned int kind = 0;
    9495             : 
    9496         404 :     fseq = PySequence_Fast(seq, "");
    9497         404 :     if (fseq == NULL) {
    9498           0 :         return NULL;
    9499             :     }
    9500             : 
    9501             :     /* NOTE: the following code can't call back into Python code,
    9502             :      * so we are sure that fseq won't be mutated.
    9503             :      */
    9504             : 
    9505         404 :     seqlen = PySequence_Fast_GET_SIZE(fseq);
    9506             :     /* If empty sequence, return u"". */
    9507         404 :     if (seqlen == 0) {
    9508           0 :         Py_DECREF(fseq);
    9509           0 :         Py_INCREF(unicode_empty);
    9510           0 :         res = unicode_empty;
    9511           0 :         return res;
    9512             :     }
    9513             : 
    9514             :     /* If singleton sequence with an exact Unicode, return that. */
    9515         404 :     last_obj = NULL;
    9516         404 :     items = PySequence_Fast_ITEMS(fseq);
    9517         404 :     if (seqlen == 1) {
    9518           6 :         if (PyUnicode_CheckExact(items[0])) {
    9519           6 :             res = items[0];
    9520           6 :             Py_INCREF(res);
    9521           6 :             Py_DECREF(fseq);
    9522           6 :             return res;
    9523             :         }
    9524           0 :         seplen = 0;
    9525           0 :         maxchar = 0;
    9526             :     }
    9527             :     else {
    9528             :         /* Set up sep and seplen */
    9529         398 :         if (separator == NULL) {
    9530             :             /* fall back to a blank space separator */
    9531           0 :             sep = PyUnicode_FromOrdinal(' ');
    9532           0 :             if (!sep)
    9533           0 :                 goto onError;
    9534           0 :             seplen = 1;
    9535           0 :             maxchar = 32;
    9536             :         }
    9537             :         else {
    9538         398 :             if (!PyUnicode_Check(separator)) {
    9539           0 :                 PyErr_Format(PyExc_TypeError,
    9540             :                              "separator: expected str instance,"
    9541             :                              " %.80s found",
    9542           0 :                              Py_TYPE(separator)->tp_name);
    9543           0 :                 goto onError;
    9544             :             }
    9545         398 :             if (PyUnicode_READY(separator))
    9546           0 :                 goto onError;
    9547         398 :             sep = separator;
    9548         398 :             seplen = PyUnicode_GET_LENGTH(separator);
    9549         398 :             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
    9550             :             /* inc refcount to keep this code path symmetric with the
    9551             :                above case of a blank separator */
    9552         398 :             Py_INCREF(sep);
    9553             :         }
    9554         398 :         last_obj = sep;
    9555             :     }
    9556             : 
    9557             :     /* There are at least two things to join, or else we have a subclass
    9558             :      * of str in the sequence.
    9559             :      * Do a pre-pass to figure out the total amount of space we'll
    9560             :      * need (sz), and see whether all argument are strings.
    9561             :      */
    9562         398 :     sz = 0;
    9563             : #ifdef Py_DEBUG
    9564             :     use_memcpy = 0;
    9565             : #else
    9566         398 :     use_memcpy = 1;
    9567             : #endif
    9568        2621 :     for (i = 0; i < seqlen; i++) {
    9569        2223 :         const Py_ssize_t old_sz = sz;
    9570        2223 :         item = items[i];
    9571        2223 :         if (!PyUnicode_Check(item)) {
    9572           0 :             PyErr_Format(PyExc_TypeError,
    9573             :                          "sequence item %zd: expected str instance,"
    9574             :                          " %.80s found",
    9575           0 :                          i, Py_TYPE(item)->tp_name);
    9576           0 :             goto onError;
    9577             :         }
    9578        2223 :         if (PyUnicode_READY(item) == -1)
    9579           0 :             goto onError;
    9580        2223 :         sz += PyUnicode_GET_LENGTH(item);
    9581        2223 :         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
    9582        2223 :         maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
    9583        2223 :         if (i != 0)
    9584        1825 :             sz += seplen;
    9585        2223 :         if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
    9586           0 :             PyErr_SetString(PyExc_OverflowError,
    9587             :                             "join() result is too long for a Python string");
    9588           0 :             goto onError;
    9589             :         }
    9590        2223 :         if (use_memcpy && last_obj != NULL) {
    9591        2223 :             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
    9592           7 :                 use_memcpy = 0;
    9593             :         }
    9594        2223 :         last_obj = item;
    9595             :     }
    9596             : 
    9597         398 :     res = PyUnicode_New(sz, maxchar);
    9598         398 :     if (res == NULL)
    9599           0 :         goto onError;
    9600             : 
    9601             :     /* Catenate everything. */
    9602             : #ifdef Py_DEBUG
    9603             :     use_memcpy = 0;
    9604             : #else
    9605         398 :     if (use_memcpy) {
    9606         391 :         res_data = PyUnicode_1BYTE_DATA(res);
    9607         391 :         kind = PyUnicode_KIND(res);
    9608         391 :         if (seplen != 0)
    9609         141 :             sep_data = PyUnicode_1BYTE_DATA(sep);
    9610             :     }
    9611             : #endif
    9612        2621 :     for (i = 0, res_offset = 0; i < seqlen; ++i) {
    9613             :         Py_ssize_t itemlen;
    9614        2223 :         item = items[i];
    9615             :         /* Copy item, and maybe the separator. */
    9616        2223 :         if (i && seplen != 0) {
    9617        1006 :             if (use_memcpy) {
    9618        1006 :                 Py_MEMCPY(res_data,
    9619             :                           sep_data,
    9620             :                           kind * seplen);
    9621        1006 :                 res_data += kind * seplen;
    9622             :             }
    9623             :             else {
    9624           0 :                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
    9625           0 :                 res_offset += seplen;
    9626             :             }
    9627             :         }
    9628        2223 :         itemlen = PyUnicode_GET_LENGTH(item);
    9629        2223 :         if (itemlen != 0) {
    9630        2223 :             if (use_memcpy) {
    9631        6627 :                 Py_MEMCPY(res_data,
    9632        4418 :                           PyUnicode_DATA(item),
    9633             :                           kind * itemlen);
    9634        2209 :                 res_data += kind * itemlen;
    9635             :             }
    9636             :             else {
    9637          14 :                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
    9638          14 :                 res_offset += itemlen;
    9639             :             }
    9640             :         }
    9641             :     }
    9642             :     if (use_memcpy)
    9643             :         assert(res_data == PyUnicode_1BYTE_DATA(res)
    9644             :                            + kind * PyUnicode_GET_LENGTH(res));
    9645             :     else
    9646             :         assert(res_offset == PyUnicode_GET_LENGTH(res));
    9647             : 
    9648         398 :     Py_DECREF(fseq);
    9649         398 :     Py_XDECREF(sep);
    9650             :     assert(_PyUnicode_CheckConsistency(res, 1));
    9651         398 :     return res;
    9652             : 
    9653             :   onError:
    9654           0 :     Py_DECREF(fseq);
    9655           0 :     Py_XDECREF(sep);
    9656           0 :     Py_XDECREF(res);
    9657           0 :     return NULL;
    9658             : }
    9659             : 
    9660             : #define FILL(kind, data, value, start, length) \
    9661             :     do { \
    9662             :         Py_ssize_t i_ = 0; \
    9663             :         assert(kind != PyUnicode_WCHAR_KIND); \
    9664             :         switch ((kind)) { \
    9665             :         case PyUnicode_1BYTE_KIND: { \
    9666             :             unsigned char * to_ = (unsigned char *)((data)) + (start); \
    9667             :             memset(to_, (unsigned char)value, (length)); \
    9668             :             break; \
    9669             :         } \
    9670             :         case PyUnicode_2BYTE_KIND: { \
    9671             :             Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
    9672             :             for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
    9673             :             break; \
    9674             :         } \
    9675             :         case PyUnicode_4BYTE_KIND: { \
    9676             :             Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
    9677             :             for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
    9678             :             break; \
    9679             :         default: assert(0); \
    9680             :         } \
    9681             :         } \
    9682             :     } while (0)
    9683             : 
    9684             : void
    9685           0 : _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
    9686             :                     Py_UCS4 fill_char)
    9687             : {
    9688           0 :     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
    9689           0 :     const void *data = PyUnicode_DATA(unicode);
    9690             :     assert(PyUnicode_IS_READY(unicode));
    9691             :     assert(unicode_modifiable(unicode));
    9692             :     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
    9693             :     assert(start >= 0);
    9694             :     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
    9695           0 :     FILL(kind, data, fill_char, start, length);
    9696           0 : }
    9697             : 
    9698             : Py_ssize_t
    9699           0 : PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
    9700             :                Py_UCS4 fill_char)
    9701             : {
    9702             :     Py_ssize_t maxlen;
    9703             : 
    9704           0 :     if (!PyUnicode_Check(unicode)) {
    9705           0 :         PyErr_BadInternalCall();
    9706           0 :         return -1;
    9707             :     }
    9708           0 :     if (PyUnicode_READY(unicode) == -1)
    9709           0 :         return -1;
    9710           0 :     if (unicode_check_modifiable(unicode))
    9711           0 :         return -1;
    9712             : 
    9713           0 :     if (start < 0) {
    9714           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
    9715           0 :         return -1;
    9716             :     }
    9717           0 :     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
    9718           0 :         PyErr_SetString(PyExc_ValueError,
    9719             :                          "fill character is bigger than "
    9720             :                          "the string maximum character");
    9721           0 :         return -1;
    9722             :     }
    9723             : 
    9724           0 :     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
    9725           0 :     length = Py_MIN(maxlen, length);
    9726           0 :     if (length <= 0)
    9727           0 :         return 0;
    9728             : 
    9729           0 :     _PyUnicode_FastFill(unicode, start, length, fill_char);
    9730           0 :     return length;
    9731             : }
    9732             : 
    9733             : static PyObject *
    9734           0 : pad(PyObject *self,
    9735             :     Py_ssize_t left,
    9736             :     Py_ssize_t right,
    9737             :     Py_UCS4 fill)
    9738             : {
    9739             :     PyObject *u;
    9740             :     Py_UCS4 maxchar;
    9741             :     int kind;
    9742             :     void *data;
    9743             : 
    9744           0 :     if (left < 0)
    9745           0 :         left = 0;
    9746           0 :     if (right < 0)
    9747           0 :         right = 0;
    9748             : 
    9749           0 :     if (left == 0 && right == 0)
    9750           0 :         return unicode_result_unchanged(self);
    9751             : 
    9752           0 :     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
    9753           0 :         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
    9754           0 :         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
    9755           0 :         return NULL;
    9756             :     }
    9757           0 :     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    9758           0 :     maxchar = MAX_MAXCHAR(maxchar, fill);
    9759           0 :     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
    9760           0 :     if (!u)
    9761           0 :         return NULL;
    9762             : 
    9763           0 :     kind = PyUnicode_KIND(u);
    9764           0 :     data = PyUnicode_DATA(u);
    9765           0 :     if (left)
    9766           0 :         FILL(kind, data, fill, 0, left);
    9767           0 :     if (right)
    9768           0 :         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
    9769           0 :     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
    9770             :     assert(_PyUnicode_CheckConsistency(u, 1));
    9771           0 :     return u;
    9772             : }
    9773             : 
    9774             : PyObject *
    9775           0 : PyUnicode_Splitlines(PyObject *string, int keepends)
    9776             : {
    9777             :     PyObject *list;
    9778             : 
    9779           0 :     string = PyUnicode_FromObject(string);
    9780           0 :     if (string == NULL)
    9781           0 :         return NULL;
    9782           0 :     if (PyUnicode_READY(string) == -1) {
    9783           0 :         Py_DECREF(string);
    9784           0 :         return NULL;
    9785             :     }
    9786             : 
    9787           0 :     switch (PyUnicode_KIND(string)) {
    9788             :     case PyUnicode_1BYTE_KIND:
    9789           0 :         if (PyUnicode_IS_ASCII(string))
    9790           0 :             list = asciilib_splitlines(
    9791           0 :                 string, PyUnicode_1BYTE_DATA(string),
    9792             :                 PyUnicode_GET_LENGTH(string), keepends);
    9793             :         else
    9794           0 :             list = ucs1lib_splitlines(
    9795           0 :                 string, PyUnicode_1BYTE_DATA(string),
    9796             :                 PyUnicode_GET_LENGTH(string), keepends);
    9797           0 :         break;
    9798             :     case PyUnicode_2BYTE_KIND:
    9799           0 :         list = ucs2lib_splitlines(
    9800           0 :             string, PyUnicode_2BYTE_DATA(string),
    9801             :             PyUnicode_GET_LENGTH(string), keepends);
    9802           0 :         break;
    9803             :     case PyUnicode_4BYTE_KIND:
    9804           0 :         list = ucs4lib_splitlines(
    9805           0 :             string, PyUnicode_4BYTE_DATA(string),
    9806             :             PyUnicode_GET_LENGTH(string), keepends);
    9807           0 :         break;
    9808             :     default:
    9809             :         assert(0);
    9810           0 :         list = 0;
    9811             :     }
    9812           0 :     Py_DECREF(string);
    9813           0 :     return list;
    9814             : }
    9815             : 
    9816             : static PyObject *
    9817         109 : split(PyObject *self,
    9818             :       PyObject *substring,
    9819             :       Py_ssize_t maxcount)
    9820             : {
    9821             :     int kind1, kind2, kind;
    9822             :     void *buf1, *buf2;
    9823             :     Py_ssize_t len1, len2;
    9824             :     PyObject* out;
    9825             : 
    9826         109 :     if (maxcount < 0)
    9827         109 :         maxcount = PY_SSIZE_T_MAX;
    9828             : 
    9829         109 :     if (PyUnicode_READY(self) == -1)
    9830           0 :         return NULL;
    9831             : 
    9832         109 :     if (substring == NULL)
    9833           2 :         switch (PyUnicode_KIND(self)) {
    9834             :         case PyUnicode_1BYTE_KIND:
    9835           2 :             if (PyUnicode_IS_ASCII(self))
    9836           6 :                 return asciilib_split_whitespace(
    9837           4 :                     self,  PyUnicode_1BYTE_DATA(self),
    9838             :                     PyUnicode_GET_LENGTH(self), maxcount
    9839             :                     );
    9840             :             else
    9841           0 :                 return ucs1lib_split_whitespace(
    9842           0 :                     self,  PyUnicode_1BYTE_DATA(self),
    9843             :                     PyUnicode_GET_LENGTH(self), maxcount
    9844             :                     );
    9845             :         case PyUnicode_2BYTE_KIND:
    9846           0 :             return ucs2lib_split_whitespace(
    9847           0 :                 self,  PyUnicode_2BYTE_DATA(self),
    9848             :                 PyUnicode_GET_LENGTH(self), maxcount
    9849             :                 );
    9850             :         case PyUnicode_4BYTE_KIND:
    9851           0 :             return ucs4lib_split_whitespace(
    9852           0 :                 self,  PyUnicode_4BYTE_DATA(self),
    9853             :                 PyUnicode_GET_LENGTH(self), maxcount
    9854             :                 );
    9855             :         default:
    9856             :             assert(0);
    9857           0 :             return NULL;
    9858             :         }
    9859             : 
    9860         107 :     if (PyUnicode_READY(substring) == -1)
    9861           0 :         return NULL;
    9862             : 
    9863         107 :     kind1 = PyUnicode_KIND(self);
    9864         107 :     kind2 = PyUnicode_KIND(substring);
    9865         107 :     kind = kind1 > kind2 ? kind1 : kind2;
    9866         107 :     buf1 = PyUnicode_DATA(self);
    9867         107 :     buf2 = PyUnicode_DATA(substring);
    9868         107 :     if (kind1 != kind)
    9869           0 :         buf1 = _PyUnicode_AsKind(self, kind);
    9870         107 :     if (!buf1)
    9871           0 :         return NULL;
    9872         107 :     if (kind2 != kind)
    9873           0 :         buf2 = _PyUnicode_AsKind(substring, kind);
    9874         107 :     if (!buf2) {
    9875           0 :         if (kind1 != kind) PyMem_Free(buf1);
    9876           0 :         return NULL;
    9877             :     }
    9878         107 :     len1 = PyUnicode_GET_LENGTH(self);
    9879         107 :     len2 = PyUnicode_GET_LENGTH(substring);
    9880             : 
    9881         107 :     switch (kind) {
    9882             :     case PyUnicode_1BYTE_KIND:
    9883         107 :         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
    9884         107 :             out = asciilib_split(
    9885             :                 self,  buf1, len1, buf2, len2, maxcount);
    9886             :         else
    9887           0 :             out = ucs1lib_split(
    9888             :                 self,  buf1, len1, buf2, len2, maxcount);
    9889         107 :         break;
    9890             :     case PyUnicode_2BYTE_KIND:
    9891           0 :         out = ucs2lib_split(
    9892             :             self,  buf1, len1, buf2, len2, maxcount);
    9893           0 :         break;
    9894             :     case PyUnicode_4BYTE_KIND:
    9895           0 :         out = ucs4lib_split(
    9896             :             self,  buf1, len1, buf2, len2, maxcount);
    9897           0 :         break;
    9898             :     default:
    9899           0 :         out = NULL;
    9900             :     }
    9901         107 :     if (kind1 != kind)
    9902           0 :         PyMem_Free(buf1);
    9903         107 :     if (kind2 != kind)
    9904           0 :         PyMem_Free(buf2);
    9905         107 :     return out;
    9906             : }
    9907             : 
    9908             : static PyObject *
    9909          90 : rsplit(PyObject *self,
    9910             :        PyObject *substring,
    9911             :        Py_ssize_t maxcount)
    9912             : {
    9913             :     int kind1, kind2, kind;
    9914             :     void *buf1, *buf2;
    9915             :     Py_ssize_t len1, len2;
    9916             :     PyObject* out;
    9917             : 
    9918          90 :     if (maxcount < 0)
    9919           0 :         maxcount = PY_SSIZE_T_MAX;
    9920             : 
    9921          90 :     if (PyUnicode_READY(self) == -1)
    9922           0 :         return NULL;
    9923             : 
    9924          90 :     if (substring == NULL)
    9925           0 :         switch (PyUnicode_KIND(self)) {
    9926             :         case PyUnicode_1BYTE_KIND:
    9927           0 :             if (PyUnicode_IS_ASCII(self))
    9928           0 :                 return asciilib_rsplit_whitespace(
    9929           0 :                     self,  PyUnicode_1BYTE_DATA(self),
    9930             :                     PyUnicode_GET_LENGTH(self), maxcount
    9931             :                     );
    9932             :             else
    9933           0 :                 return ucs1lib_rsplit_whitespace(
    9934           0 :                     self,  PyUnicode_1BYTE_DATA(self),
    9935             :                     PyUnicode_GET_LENGTH(self), maxcount
    9936             :                     );
    9937             :         case PyUnicode_2BYTE_KIND:
    9938           0 :             return ucs2lib_rsplit_whitespace(
    9939           0 :                 self,  PyUnicode_2BYTE_DATA(self),
    9940             :                 PyUnicode_GET_LENGTH(self), maxcount
    9941             :                 );
    9942             :         case PyUnicode_4BYTE_KIND:
    9943           0 :             return ucs4lib_rsplit_whitespace(
    9944           0 :                 self,  PyUnicode_4BYTE_DATA(self),
    9945             :                 PyUnicode_GET_LENGTH(self), maxcount
    9946             :                 );
    9947             :         default:
    9948             :             assert(0);
    9949           0 :             return NULL;
    9950             :         }
    9951             : 
    9952          90 :     if (PyUnicode_READY(substring) == -1)
    9953           0 :         return NULL;
    9954             : 
    9955          90 :     kind1 = PyUnicode_KIND(self);
    9956          90 :     kind2 = PyUnicode_KIND(substring);
    9957          90 :     kind = kind1 > kind2 ? kind1 : kind2;
    9958          90 :     buf1 = PyUnicode_DATA(self);
    9959          90 :     buf2 = PyUnicode_DATA(substring);
    9960          90 :     if (kind1 != kind)
    9961           0 :         buf1 = _PyUnicode_AsKind(self, kind);
    9962          90 :     if (!buf1)
    9963           0 :         return NULL;
    9964          90 :     if (kind2 != kind)
    9965           0 :         buf2 = _PyUnicode_AsKind(substring, kind);
    9966          90 :     if (!buf2) {
    9967           0 :         if (kind1 != kind) PyMem_Free(buf1);
    9968           0 :         return NULL;
    9969             :     }
    9970          90 :     len1 = PyUnicode_GET_LENGTH(self);
    9971          90 :     len2 = PyUnicode_GET_LENGTH(substring);
    9972             : 
    9973          90 :     switch (kind) {
    9974             :     case PyUnicode_1BYTE_KIND:
    9975          90 :         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
    9976          90 :             out = asciilib_rsplit(
    9977             :                 self,  buf1, len1, buf2, len2, maxcount);
    9978             :         else
    9979           0 :             out = ucs1lib_rsplit(
    9980             :                 self,  buf1, len1, buf2, len2, maxcount);
    9981          90 :         break;
    9982             :     case PyUnicode_2BYTE_KIND:
    9983           0 :         out = ucs2lib_rsplit(
    9984             :             self,  buf1, len1, buf2, len2, maxcount);
    9985           0 :         break;
    9986             :     case PyUnicode_4BYTE_KIND:
    9987           0 :         out = ucs4lib_rsplit(
    9988             :             self,  buf1, len1, buf2, len2, maxcount);
    9989           0 :         break;
    9990             :     default:
    9991           0 :         out = NULL;
    9992             :     }
    9993          90 :     if (kind1 != kind)
    9994           0 :         PyMem_Free(buf1);
    9995          90 :     if (kind2 != kind)
    9996           0 :         PyMem_Free(buf2);
    9997          90 :     return out;
    9998             : }
    9999             : 
   10000             : static Py_ssize_t
   10001          19 : anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
   10002             :             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
   10003             : {
   10004          19 :     switch (kind) {
   10005             :     case PyUnicode_1BYTE_KIND:
   10006          19 :         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
   10007          19 :             return asciilib_find(buf1, len1, buf2, len2, offset);
   10008             :         else
   10009           0 :             return ucs1lib_find(buf1, len1, buf2, len2, offset);
   10010             :     case PyUnicode_2BYTE_KIND:
   10011           0 :         return ucs2lib_find(buf1, len1, buf2, len2, offset);
   10012             :     case PyUnicode_4BYTE_KIND:
   10013           0 :         return ucs4lib_find(buf1, len1, buf2, len2, offset);
   10014             :     }
   10015             :     assert(0);
   10016           0 :     return -1;
   10017             : }
   10018             : 
   10019             : static Py_ssize_t
   10020           4 : anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
   10021             :              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
   10022             : {
   10023           4 :     switch (kind) {
   10024             :     case PyUnicode_1BYTE_KIND:
   10025           4 :         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
   10026           3 :             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
   10027             :         else
   10028           1 :             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
   10029             :     case PyUnicode_2BYTE_KIND:
   10030           0 :         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
   10031             :     case PyUnicode_4BYTE_KIND:
   10032           0 :         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
   10033             :     }
   10034             :     assert(0);
   10035           0 :     return 0;
   10036             : }
   10037             : 
   10038             : static PyObject *
   10039           6 : replace(PyObject *self, PyObject *str1,
   10040             :         PyObject *str2, Py_ssize_t maxcount)
   10041             : {
   10042             :     PyObject *u;
   10043           6 :     char *sbuf = PyUnicode_DATA(self);
   10044           6 :     char *buf1 = PyUnicode_DATA(str1);
   10045           6 :     char *buf2 = PyUnicode_DATA(str2);
   10046           6 :     int srelease = 0, release1 = 0, release2 = 0;
   10047           6 :     int skind = PyUnicode_KIND(self);
   10048           6 :     int kind1 = PyUnicode_KIND(str1);
   10049           6 :     int kind2 = PyUnicode_KIND(str2);
   10050           6 :     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
   10051           6 :     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
   10052           6 :     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
   10053             :     int mayshrink;
   10054             :     Py_UCS4 maxchar, maxchar_str2;
   10055             : 
   10056           6 :     if (maxcount < 0)
   10057           6 :         maxcount = PY_SSIZE_T_MAX;
   10058           0 :     else if (maxcount == 0 || slen == 0)
   10059             :         goto nothing;
   10060             : 
   10061           6 :     if (str1 == str2)
   10062           0 :         goto nothing;
   10063           6 :     if (skind < kind1)
   10064             :         /* substring too wide to be present */
   10065           0 :         goto nothing;
   10066             : 
   10067           6 :     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
   10068           6 :     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
   10069             :     /* Replacing str1 with str2 may cause a maxchar reduction in the
   10070             :        result string. */
   10071           6 :     mayshrink = (maxchar_str2 < maxchar);
   10072           6 :     maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
   10073             : 
   10074           6 :     if (len1 == len2) {
   10075             :         /* same length */
   10076           2 :         if (len1 == 0)
   10077           0 :             goto nothing;
   10078           2 :         if (len1 == 1) {
   10079             :             /* replace characters */
   10080             :             Py_UCS4 u1, u2;
   10081             :             int rkind;
   10082             :             Py_ssize_t index, pos;
   10083             :             char *src;
   10084             : 
   10085           2 :             u1 = PyUnicode_READ_CHAR(str1, 0);
   10086           2 :             pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
   10087           2 :             if (pos < 0)
   10088           2 :                 goto nothing;
   10089           0 :             u2 = PyUnicode_READ_CHAR(str2, 0);
   10090           0 :             u = PyUnicode_New(slen, maxchar);
   10091           0 :             if (!u)
   10092           0 :                 goto error;
   10093           0 :             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
   10094           0 :             rkind = PyUnicode_KIND(u);
   10095             : 
   10096           0 :             PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
   10097           0 :             index = 0;
   10098           0 :             src = sbuf;
   10099           0 :             while (--maxcount)
   10100             :             {
   10101           0 :                 pos++;
   10102           0 :                 src += pos * PyUnicode_KIND(self);
   10103           0 :                 slen -= pos;
   10104           0 :                 index += pos;
   10105           0 :                 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
   10106           0 :                 if (pos < 0)
   10107           0 :                     break;
   10108           0 :                 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
   10109             :             }
   10110             :         }
   10111             :         else {
   10112           0 :             int rkind = skind;
   10113             :             char *res;
   10114             :             Py_ssize_t i;
   10115             : 
   10116           0 :             if (kind1 < rkind) {
   10117             :                 /* widen substring */
   10118           0 :                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10119           0 :                 if (!buf1) goto error;
   10120           0 :                 release1 = 1;
   10121             :             }
   10122           0 :             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
   10123           0 :             if (i < 0)
   10124           0 :                 goto nothing;
   10125           0 :             if (rkind > kind2) {
   10126             :                 /* widen replacement */
   10127           0 :                 buf2 = _PyUnicode_AsKind(str2, rkind);
   10128           0 :                 if (!buf2) goto error;
   10129           0 :                 release2 = 1;
   10130             :             }
   10131           0 :             else if (rkind < kind2) {
   10132             :                 /* widen self and buf1 */
   10133           0 :                 rkind = kind2;
   10134           0 :                 if (release1) PyMem_Free(buf1);
   10135           0 :                 sbuf = _PyUnicode_AsKind(self, rkind);
   10136           0 :                 if (!sbuf) goto error;
   10137           0 :                 srelease = 1;
   10138           0 :                 buf1 = _PyUnicode_AsKind(str1, rkind);
   10139           0 :                 if (!buf1) goto error;
   10140           0 :                 release1 = 1;
   10141             :             }
   10142           0 :             u = PyUnicode_New(slen, maxchar);
   10143           0 :             if (!u)
   10144           0 :                 goto error;
   10145             :             assert(PyUnicode_KIND(u) == rkind);
   10146           0 :             res = PyUnicode_DATA(u);
   10147             : 
   10148           0 :             memcpy(res, sbuf, rkind * slen);
   10149             :             /* change everything in-place, starting with this one */
   10150           0 :             memcpy(res + rkind * i,
   10151             :                    buf2,
   10152           0 :                    rkind * len2);
   10153           0 :             i += len1;
   10154             : 
   10155           0 :             while ( --maxcount > 0) {
   10156           0 :                 i = anylib_find(rkind, self,
   10157           0 :                                 sbuf+rkind*i, slen-i,
   10158             :                                 str1, buf1, len1, i);
   10159           0 :                 if (i == -1)
   10160           0 :                     break;
   10161           0 :                 memcpy(res + rkind * i,
   10162             :                        buf2,
   10163           0 :                        rkind * len2);
   10164           0 :                 i += len1;
   10165             :             }
   10166             :         }
   10167             :     }
   10168             :     else {
   10169             :         Py_ssize_t n, i, j, ires;
   10170             :         Py_ssize_t product, new_size;
   10171           4 :         int rkind = skind;
   10172             :         char *res;
   10173             : 
   10174           4 :         if (kind1 < rkind) {
   10175             :             /* widen substring */
   10176           0 :             buf1 = _PyUnicode_AsKind(str1, rkind);
   10177           0 :             if (!buf1) goto error;
   10178           0 :             release1 = 1;
   10179             :         }
   10180           4 :         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
   10181           4 :         if (n == 0)
   10182           1 :             goto nothing;
   10183           3 :         if (kind2 < rkind) {
   10184             :             /* widen replacement */
   10185           0 :             buf2 = _PyUnicode_AsKind(str2, rkind);
   10186           0 :             if (!buf2) goto error;
   10187           0 :             release2 = 1;
   10188             :         }
   10189           3 :         else if (kind2 > rkind) {
   10190             :             /* widen self and buf1 */
   10191           0 :             rkind = kind2;
   10192           0 :             sbuf = _PyUnicode_AsKind(self, rkind);
   10193           0 :             if (!sbuf) goto error;
   10194           0 :             srelease = 1;
   10195           0 :             if (release1) PyMem_Free(buf1);
   10196           0 :             buf1 = _PyUnicode_AsKind(str1, rkind);
   10197           0 :             if (!buf1) goto error;
   10198           0 :             release1 = 1;
   10199             :         }
   10200             :         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
   10201             :            PyUnicode_GET_LENGTH(str1))); */
   10202           3 :         product = n * (len2-len1);
   10203           3 :         if ((product / (len2-len1)) != n) {
   10204           0 :                 PyErr_SetString(PyExc_OverflowError,
   10205             :                                 "replace string is too long");
   10206           0 :                 goto error;
   10207             :         }
   10208           3 :         new_size = slen + product;
   10209           3 :         if (new_size == 0) {
   10210           0 :             Py_INCREF(unicode_empty);
   10211           0 :             u = unicode_empty;
   10212           0 :             goto done;
   10213             :         }
   10214           3 :         if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
   10215           0 :             PyErr_SetString(PyExc_OverflowError,
   10216             :                             "replace string is too long");
   10217           0 :             goto error;
   10218             :         }
   10219           3 :         u = PyUnicode_New(new_size, maxchar);
   10220           3 :         if (!u)
   10221           0 :             goto error;
   10222             :         assert(PyUnicode_KIND(u) == rkind);
   10223           3 :         res = PyUnicode_DATA(u);
   10224           3 :         ires = i = 0;
   10225           3 :         if (len1 > 0) {
   10226          25 :             while (n-- > 0) {
   10227             :                 /* look for next match */
   10228          38 :                 j = anylib_find(rkind, self,
   10229          19 :                                 sbuf + rkind * i, slen-i,
   10230             :                                 str1, buf1, len1, i);
   10231          19 :                 if (j == -1)
   10232           0 :                     break;
   10233          19 :                 else if (j > i) {
   10234             :                     /* copy unchanged part [i:j] */
   10235          57 :                     memcpy(res + rkind * ires,
   10236          38 :                            sbuf + rkind * i,
   10237          19 :                            rkind * (j-i));
   10238          19 :                     ires += j - i;
   10239             :                 }
   10240             :                 /* copy substitution string */
   10241          19 :                 if (len2 > 0) {
   10242           1 :                     memcpy(res + rkind * ires,
   10243             :                            buf2,
   10244           1 :                            rkind * len2);
   10245           1 :                     ires += len2;
   10246             :                 }
   10247          19 :                 i = j + len1;
   10248             :             }
   10249           3 :             if (i < slen)
   10250             :                 /* copy tail [i:] */
   10251           6 :                 memcpy(res + rkind * ires,
   10252           4 :                        sbuf + rkind * i,
   10253           2 :                        rkind * (slen-i));
   10254             :         }
   10255             :         else {
   10256             :             /* interleave */
   10257           0 :             while (n > 0) {
   10258           0 :                 memcpy(res + rkind * ires,
   10259             :                        buf2,
   10260           0 :                        rkind * len2);
   10261           0 :                 ires += len2;
   10262           0 :                 if (--n <= 0)
   10263           0 :                     break;
   10264           0 :                 memcpy(res + rkind * ires,
   10265           0 :                        sbuf + rkind * i,
   10266             :                        rkind);
   10267           0 :                 ires++;
   10268           0 :                 i++;
   10269             :             }
   10270           0 :             memcpy(res + rkind * ires,
   10271           0 :                    sbuf + rkind * i,
   10272           0 :                    rkind * (slen-i));
   10273             :         }
   10274             :     }
   10275             : 
   10276           3 :     if (mayshrink) {
   10277           0 :         unicode_adjust_maxchar(&u);
   10278           0 :         if (u == NULL)
   10279           0 :             goto error;
   10280             :     }
   10281             : 
   10282             :   done:
   10283           3 :     if (srelease)
   10284           0 :         PyMem_FREE(sbuf);
   10285           3 :     if (release1)
   10286           0 :         PyMem_FREE(buf1);
   10287           3 :     if (release2)
   10288           0 :         PyMem_FREE(buf2);
   10289             :     assert(_PyUnicode_CheckConsistency(u, 1));
   10290           3 :     return u;
   10291             : 
   10292             :   nothing:
   10293             :     /* nothing to replace; return original string (when possible) */
   10294           3 :     if (srelease)
   10295           0 :         PyMem_FREE(sbuf);
   10296           3 :     if (release1)
   10297           0 :         PyMem_FREE(buf1);
   10298           3 :     if (release2)
   10299           0 :         PyMem_FREE(buf2);
   10300           3 :     return unicode_result_unchanged(self);
   10301             : 
   10302             :   error:
   10303           0 :     if (srelease && sbuf)
   10304           0 :         PyMem_FREE(sbuf);
   10305           0 :     if (release1 && buf1)
   10306           0 :         PyMem_FREE(buf1);
   10307           0 :     if (release2 && buf2)
   10308           0 :         PyMem_FREE(buf2);
   10309           0 :     return NULL;
   10310             : }
   10311             : 
   10312             : /* --- Unicode Object Methods --------------------------------------------- */
   10313             : 
   10314             : PyDoc_STRVAR(title__doc__,
   10315             :              "S.title() -> str\n\
   10316             : \n\
   10317             : Return a titlecased version of S, i.e. words start with title case\n\
   10318             : characters, all remaining cased characters have lower case.");
   10319             : 
   10320             : static PyObject*
   10321           0 : unicode_title(PyObject *self)
   10322             : {
   10323           0 :     if (PyUnicode_READY(self) == -1)
   10324           0 :         return NULL;
   10325           0 :     return case_operation(self, do_title);
   10326             : }
   10327             : 
   10328             : PyDoc_STRVAR(capitalize__doc__,
   10329             :              "S.capitalize() -> str\n\
   10330             : \n\
   10331             : Return a capitalized version of S, i.e. make the first character\n\
   10332             : have upper case and the rest lower case.");
   10333             : 
   10334             : static PyObject*
   10335           0 : unicode_capitalize(PyObject *self)
   10336             : {
   10337           0 :     if (PyUnicode_READY(self) == -1)
   10338           0 :         return NULL;
   10339           0 :     if (PyUnicode_GET_LENGTH(self) == 0)
   10340           0 :         return unicode_result_unchanged(self);
   10341           0 :     return case_operation(self, do_capitalize);
   10342             : }
   10343             : 
   10344             : PyDoc_STRVAR(casefold__doc__,
   10345             :              "S.casefold() -> str\n\
   10346             : \n\
   10347             : Return a version of S suitable for caseless comparisons.");
   10348             : 
   10349             : static PyObject *
   10350           0 : unicode_casefold(PyObject *self)
   10351             : {
   10352           0 :     if (PyUnicode_READY(self) == -1)
   10353           0 :         return NULL;
   10354           0 :     if (PyUnicode_IS_ASCII(self))
   10355           0 :         return ascii_upper_or_lower(self, 1);
   10356           0 :     return case_operation(self, do_casefold);
   10357             : }
   10358             : 
   10359             : 
   10360             : /* Argument converter.  Coerces to a single unicode character */
   10361             : 
   10362             : static int
   10363           0 : convert_uc(PyObject *obj, void *addr)
   10364             : {
   10365           0 :     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
   10366             :     PyObject *uniobj;
   10367             : 
   10368           0 :     uniobj = PyUnicode_FromObject(obj);
   10369           0 :     if (uniobj == NULL) {
   10370           0 :         PyErr_SetString(PyExc_TypeError,
   10371             :                         "The fill character cannot be converted to Unicode");
   10372           0 :         return 0;
   10373             :     }
   10374           0 :     if (PyUnicode_GET_LENGTH(uniobj) != 1) {
   10375           0 :         PyErr_SetString(PyExc_TypeError,
   10376             :                         "The fill character must be exactly one character long");
   10377           0 :         Py_DECREF(uniobj);
   10378           0 :         return 0;
   10379             :     }
   10380           0 :     *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
   10381           0 :     Py_DECREF(uniobj);
   10382           0 :     return 1;
   10383             : }
   10384             : 
   10385             : PyDoc_STRVAR(center__doc__,
   10386             :              "S.center(width[, fillchar]) -> str\n\
   10387             : \n\
   10388             : Return S centered in a string of length width. Padding is\n\
   10389             : done using the specified fill character (default is a space)");
   10390             : 
   10391             : static PyObject *
   10392           0 : unicode_center(PyObject *self, PyObject *args)
   10393             : {
   10394             :     Py_ssize_t marg, left;
   10395             :     Py_ssize_t width;
   10396           0 :     Py_UCS4 fillchar = ' ';
   10397             : 
   10398           0 :     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
   10399           0 :         return NULL;
   10400             : 
   10401           0 :     if (PyUnicode_READY(self) == -1)
   10402           0 :         return NULL;
   10403             : 
   10404           0 :     if (PyUnicode_GET_LENGTH(self) >= width)
   10405           0 :         return unicode_result_unchanged(self);
   10406             : 
   10407           0 :     marg = width - PyUnicode_GET_LENGTH(self);
   10408           0 :     left = marg / 2 + (marg & width & 1);
   10409             : 
   10410           0 :     return pad(self, left, marg - left, fillchar);
   10411             : }
   10412             : 
   10413             : /* This function assumes that str1 and str2 are readied by the caller. */
   10414             : 
   10415             : static int
   10416       14735 : unicode_compare(PyObject *str1, PyObject *str2)
   10417             : {
   10418             :     int kind1, kind2;
   10419             :     void *data1, *data2;
   10420             :     Py_ssize_t len1, len2, i;
   10421             : 
   10422       14735 :     kind1 = PyUnicode_KIND(str1);
   10423       14735 :     kind2 = PyUnicode_KIND(str2);
   10424       14735 :     data1 = PyUnicode_DATA(str1);
   10425       14735 :     data2 = PyUnicode_DATA(str2);
   10426       14735 :     len1 = PyUnicode_GET_LENGTH(str1);
   10427       14735 :     len2 = PyUnicode_GET_LENGTH(str2);
   10428             : 
   10429      158079 :     for (i = 0; i < len1 && i < len2; ++i) {
   10430             :         Py_UCS4 c1, c2;
   10431      156572 :         c1 = PyUnicode_READ(kind1, data1, i);
   10432      156572 :         c2 = PyUnicode_READ(kind2, data2, i);
   10433             : 
   10434      156572 :         if (c1 != c2)
   10435       13228 :             return (c1 < c2) ? -1 : 1;
   10436             :     }
   10437             : 
   10438        1507 :     return (len1 < len2) ? -1 : (len1 != len2);
   10439             : }
   10440             : 
   10441             : int
   10442        1400 : PyUnicode_Compare(PyObject *left, PyObject *right)
   10443             : {
   10444        1400 :     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
   10445        2800 :         if (PyUnicode_READY(left) == -1 ||
   10446        1400 :             PyUnicode_READY(right) == -1)
   10447           0 :             return -1;
   10448        1400 :         return unicode_compare(left, right);
   10449             :     }
   10450           0 :     PyErr_Format(PyExc_TypeError,
   10451             :                  "Can't compare %.100s and %.100s",
   10452           0 :                  left->ob_type->tp_name,
   10453           0 :                  right->ob_type->tp_name);
   10454           0 :     return -1;
   10455             : }
   10456             : 
   10457             : int
   10458        5705 : PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
   10459             : {
   10460             :     Py_ssize_t i;
   10461             :     int kind;
   10462             :     void *data;
   10463             :     Py_UCS4 chr;
   10464             : 
   10465             :     assert(_PyUnicode_CHECK(uni));
   10466        5705 :     if (PyUnicode_READY(uni) == -1)
   10467           0 :         return -1;
   10468        5705 :     kind = PyUnicode_KIND(uni);
   10469        5705 :     data = PyUnicode_DATA(uni);
   10470             :     /* Compare Unicode string and source character set string */
   10471       12812 :     for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
   10472       12537 :         if (chr != str[i])
   10473        5430 :             return (chr < (unsigned char)(str[i])) ? -1 : 1;
   10474             :     /* This check keeps Python strings that end in '\0' from comparing equal
   10475             :      to C strings identical up to that point. */
   10476         275 :     if (PyUnicode_GET_LENGTH(uni) != i || chr)
   10477           3 :         return 1; /* uni is longer */
   10478         272 :     if (str[i])
   10479           0 :         return -1; /* str is longer */
   10480         272 :     return 0;
   10481             : }
   10482             : 
   10483             : 
   10484             : #define TEST_COND(cond)                         \
   10485             :     ((cond) ? Py_True : Py_False)
   10486             : 
   10487             : PyObject *
   10488       27937 : PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
   10489             : {
   10490             :     int result;
   10491             : 
   10492       27937 :     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
   10493             :         PyObject *v;
   10494       55566 :         if (PyUnicode_READY(left) == -1 ||
   10495       27783 :             PyUnicode_READY(right) == -1)
   10496           0 :             return NULL;
   10497       42263 :         if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
   10498       14480 :             PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
   10499       14008 :             if (op == Py_EQ) {
   10500        9982 :                 Py_INCREF(Py_False);
   10501        9982 :                 return Py_False;
   10502             :             }
   10503        4026 :             if (op == Py_NE) {
   10504        1247 :                 Py_INCREF(Py_True);
   10505        1247 :                 return Py_True;
   10506             :             }
   10507             :         }
   10508       16554 :         if (left == right)
   10509        3219 :             result = 0;
   10510             :         else
   10511       13335 :             result = unicode_compare(left, right);
   10512             : 
   10513             :         /* Convert the return value to a Boolean */
   10514       16554 :         switch (op) {
   10515             :         case Py_EQ:
   10516       11968 :             v = TEST_COND(result == 0);
   10517       11968 :             break;
   10518             :         case Py_NE:
   10519         786 :             v = TEST_COND(result != 0);
   10520         786 :             break;
   10521             :         case Py_LE:
   10522         698 :             v = TEST_COND(result <= 0);
   10523         698 :             break;
   10524             :         case Py_GE:
   10525           0 :             v = TEST_COND(result >= 0);
   10526           0 :             break;
   10527             :         case Py_LT:
   10528        3102 :             v = TEST_COND(result == -1);
   10529        3102 :             break;
   10530             :         case Py_GT:
   10531           0 :             v = TEST_COND(result == 1);
   10532           0 :             break;
   10533             :         default:
   10534           0 :             PyErr_BadArgument();
   10535           0 :             return NULL;
   10536             :         }
   10537       16554 :         Py_INCREF(v);
   10538       16554 :         return v;
   10539             :     }
   10540             : 
   10541         154 :     Py_RETURN_NOTIMPLEMENTED;
   10542             : }
   10543             : 
   10544             : int
   10545        2034 : PyUnicode_Contains(PyObject *container, PyObject *element)
   10546             : {
   10547             :     PyObject *str, *sub;
   10548             :     int kind1, kind2, kind;
   10549             :     void *buf1, *buf2;
   10550             :     Py_ssize_t len1, len2;
   10551             :     int result;
   10552             : 
   10553             :     /* Coerce the two arguments */
   10554        2034 :     sub = PyUnicode_FromObject(element);
   10555        2034 :     if (!sub) {
   10556           0 :         PyErr_Format(PyExc_TypeError,
   10557             :                      "'in <string>' requires string as left operand, not %s",
   10558           0 :                      element->ob_type->tp_name);
   10559           0 :         return -1;
   10560             :     }
   10561             : 
   10562        2034 :     str = PyUnicode_FromObject(container);
   10563        2034 :     if (!str) {
   10564           0 :         Py_DECREF(sub);
   10565           0 :         return -1;
   10566             :     }
   10567        2034 :     if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
   10568           0 :         Py_DECREF(sub);
   10569           0 :         Py_DECREF(str);
   10570             :     }
   10571             : 
   10572        2034 :     kind1 = PyUnicode_KIND(str);
   10573        2034 :     kind2 = PyUnicode_KIND(sub);
   10574        2034 :     kind = kind1;
   10575        2034 :     buf1 = PyUnicode_DATA(str);
   10576        2034 :     buf2 = PyUnicode_DATA(sub);
   10577        2034 :     if (kind2 != kind) {
   10578           5 :         if (kind2 > kind) {
   10579           5 :             Py_DECREF(sub);
   10580           5 :             Py_DECREF(str);
   10581           5 :             return 0;
   10582             :         }
   10583           0 :         buf2 = _PyUnicode_AsKind(sub, kind);
   10584             :     }
   10585        2029 :     if (!buf2) {
   10586           0 :         Py_DECREF(sub);
   10587           0 :         Py_DECREF(str);
   10588           0 :         return -1;
   10589             :     }
   10590        2029 :     len1 = PyUnicode_GET_LENGTH(str);
   10591        2029 :     len2 = PyUnicode_GET_LENGTH(sub);
   10592             : 
   10593        2029 :     switch (kind) {
   10594             :     case PyUnicode_1BYTE_KIND:
   10595        2029 :         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
   10596        2029 :         break;
   10597             :     case PyUnicode_2BYTE_KIND:
   10598           0 :         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
   10599           0 :         break;
   10600             :     case PyUnicode_4BYTE_KIND:
   10601           0 :         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
   10602           0 :         break;
   10603             :     default:
   10604           0 :         result = -1;
   10605             :         assert(0);
   10606             :     }
   10607             : 
   10608        2029 :     Py_DECREF(str);
   10609        2029 :     Py_DECREF(sub);
   10610             : 
   10611        2029 :     if (kind2 != kind)
   10612           0 :         PyMem_Free(buf2);
   10613             : 
   10614        2029 :     return result;
   10615             : }
   10616             : 
   10617             : /* Concat to string or Unicode object giving a new Unicode object. */
   10618             : 
   10619             : PyObject *
   10620           0 : PyUnicode_Concat(PyObject *left, PyObject *right)
   10621             : {
   10622           0 :     PyObject *u = NULL, *v = NULL, *w;
   10623             :     Py_UCS4 maxchar, maxchar2;
   10624             :     Py_ssize_t u_len, v_len, new_len;
   10625             : 
   10626             :     /* Coerce the two arguments */
   10627           0 :     u = PyUnicode_FromObject(left);
   10628           0 :     if (u == NULL)
   10629           0 :         goto onError;
   10630           0 :     v = PyUnicode_FromObject(right);
   10631           0 :     if (v == NULL)
   10632           0 :         goto onError;
   10633             : 
   10634             :     /* Shortcuts */
   10635           0 :     if (v == unicode_empty) {
   10636           0 :         Py_DECREF(v);
   10637           0 :         return u;
   10638             :     }
   10639           0 :     if (u == unicode_empty) {
   10640           0 :         Py_DECREF(u);
   10641           0 :         return v;
   10642             :     }
   10643             : 
   10644           0 :     u_len = PyUnicode_GET_LENGTH(u);
   10645           0 :     v_len = PyUnicode_GET_LENGTH(v);
   10646           0 :     if (u_len > PY_SSIZE_T_MAX - v_len) {
   10647           0 :         PyErr_SetString(PyExc_OverflowError,
   10648             :                         "strings are too large to concat");
   10649           0 :         goto onError;
   10650             :     }
   10651           0 :     new_len = u_len + v_len;
   10652             : 
   10653           0 :     maxchar = PyUnicode_MAX_CHAR_VALUE(u);
   10654           0 :     maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
   10655           0 :     maxchar = MAX_MAXCHAR(maxchar, maxchar2);
   10656             : 
   10657             :     /* Concat the two Unicode strings */
   10658           0 :     w = PyUnicode_New(new_len, maxchar);
   10659           0 :     if (w == NULL)
   10660           0 :         goto onError;
   10661           0 :     _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
   10662           0 :     _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
   10663           0 :     Py_DECREF(u);
   10664           0 :     Py_DECREF(v);
   10665             :     assert(_PyUnicode_CheckConsistency(w, 1));
   10666           0 :     return w;
   10667             : 
   10668             :   onError:
   10669           0 :     Py_XDECREF(u);
   10670           0 :     Py_XDECREF(v);
   10671           0 :     return NULL;
   10672             : }
   10673             : 
   10674             : void
   10675        2085 : PyUnicode_Append(PyObject **p_left, PyObject *right)
   10676             : {
   10677             :     PyObject *left, *res;
   10678             :     Py_UCS4 maxchar, maxchar2;
   10679             :     Py_ssize_t left_len, right_len, new_len;
   10680             : 
   10681        2085 :     if (p_left == NULL) {
   10682           0 :         if (!PyErr_Occurred())
   10683           0 :             PyErr_BadInternalCall();
   10684           0 :         return;
   10685             :     }
   10686        2085 :     left = *p_left;
   10687        2085 :     if (right == NULL || !PyUnicode_Check(left)) {
   10688           0 :         if (!PyErr_Occurred())
   10689           0 :             PyErr_BadInternalCall();
   10690           0 :         goto error;
   10691             :     }
   10692             : 
   10693        2085 :     if (PyUnicode_READY(left) == -1)
   10694           0 :         goto error;
   10695        2085 :     if (PyUnicode_READY(right) == -1)
   10696           0 :         goto error;
   10697             : 
   10698             :     /* Shortcuts */
   10699        2085 :     if (left == unicode_empty) {
   10700          76 :         Py_DECREF(left);
   10701          76 :         Py_INCREF(right);
   10702          76 :         *p_left = right;
   10703          76 :         return;
   10704             :     }
   10705        2009 :     if (right == unicode_empty)
   10706           1 :         return;
   10707             : 
   10708        2008 :     left_len = PyUnicode_GET_LENGTH(left);
   10709        2008 :     right_len = PyUnicode_GET_LENGTH(right);
   10710        2008 :     if (left_len > PY_SSIZE_T_MAX - right_len) {
   10711           0 :         PyErr_SetString(PyExc_OverflowError,
   10712             :                         "strings are too large to concat");
   10713           0 :         goto error;
   10714             :     }
   10715        2008 :     new_len = left_len + right_len;
   10716             : 
   10717        2008 :     if (unicode_modifiable(left)
   10718         395 :         && PyUnicode_CheckExact(right)
   10719         395 :         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
   10720             :         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
   10721             :            to change the structure size, but characters are stored just after
   10722             :            the structure, and so it requires to move all characters which is
   10723             :            not so different than duplicating the string. */
   10724         395 :         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
   10725             :     {
   10726             :         /* append inplace */
   10727         395 :         if (unicode_resize(p_left, new_len) != 0) {
   10728             :             /* XXX if _PyUnicode_Resize() fails, 'left' has been
   10729             :              * deallocated so it cannot be put back into
   10730             :              * 'variable'.  The MemoryError is raised when there
   10731             :              * is no value in 'variable', which might (very
   10732             :              * remotely) be a cause of incompatibilities.
   10733             :              */
   10734           0 :             goto error;
   10735             :         }
   10736             :         /* copy 'right' into the newly allocated area of 'left' */
   10737         395 :         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
   10738             :     }
   10739             :     else {
   10740        1613 :         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
   10741        1613 :         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
   10742        1613 :         maxchar = MAX_MAXCHAR(maxchar, maxchar2);
   10743             : 
   10744             :         /* Concat the two Unicode strings */
   10745        1613 :         res = PyUnicode_New(new_len, maxchar);
   10746        1613 :         if (res == NULL)
   10747           0 :             goto error;
   10748        1613 :         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
   10749        1613 :         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
   10750        1613 :         Py_DECREF(left);
   10751        1613 :         *p_left = res;
   10752             :     }
   10753             :     assert(_PyUnicode_CheckConsistency(*p_left, 1));
   10754        2008 :     return;
   10755             : 
   10756             : error:
   10757           0 :     Py_CLEAR(*p_left);
   10758             : }
   10759             : 
   10760             : void
   10761           3 : PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
   10762             : {
   10763           3 :     PyUnicode_Append(pleft, right);
   10764           3 :     Py_XDECREF(right);
   10765           3 : }
   10766             : 
   10767             : PyDoc_STRVAR(count__doc__,
   10768             :              "S.count(sub[, start[, end]]) -> int\n\
   10769             : \n\
   10770             : Return the number of non-overlapping occurrences of substring sub in\n\
   10771             : string S[start:end].  Optional arguments start and end are\n\
   10772             : interpreted as in slice notation.");
   10773             : 
   10774             : static PyObject *
   10775           0 : unicode_count(PyObject *self, PyObject *args)
   10776             : {
   10777             :     PyObject *substring;
   10778           0 :     Py_ssize_t start = 0;
   10779           0 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   10780             :     PyObject *result;
   10781             :     int kind1, kind2, kind;
   10782             :     void *buf1, *buf2;
   10783             :     Py_ssize_t len1, len2, iresult;
   10784             : 
   10785           0 :     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
   10786             :                                             &start, &end))
   10787           0 :         return NULL;
   10788             : 
   10789           0 :     kind1 = PyUnicode_KIND(self);
   10790           0 :     kind2 = PyUnicode_KIND(substring);
   10791           0 :     if (kind2 > kind1)
   10792           0 :         return PyLong_FromLong(0);
   10793           0 :     kind = kind1;
   10794           0 :     buf1 = PyUnicode_DATA(self);
   10795           0 :     buf2 = PyUnicode_DATA(substring);
   10796           0 :     if (kind2 != kind)
   10797           0 :         buf2 = _PyUnicode_AsKind(substring, kind);
   10798           0 :     if (!buf2) {
   10799           0 :         Py_DECREF(substring);
   10800           0 :         return NULL;
   10801             :     }
   10802           0 :     len1 = PyUnicode_GET_LENGTH(self);
   10803           0 :     len2 = PyUnicode_GET_LENGTH(substring);
   10804             : 
   10805           0 :     ADJUST_INDICES(start, end, len1);
   10806           0 :     switch (kind) {
   10807             :     case PyUnicode_1BYTE_KIND:
   10808           0 :         iresult = ucs1lib_count(
   10809             :             ((Py_UCS1*)buf1) + start, end - start,
   10810             :             buf2, len2, PY_SSIZE_T_MAX
   10811             :             );
   10812           0 :         break;
   10813             :     case PyUnicode_2BYTE_KIND:
   10814           0 :         iresult = ucs2lib_count(
   10815           0 :             ((Py_UCS2*)buf1) + start, end - start,
   10816             :             buf2, len2, PY_SSIZE_T_MAX
   10817             :             );
   10818           0 :         break;
   10819             :     case PyUnicode_4BYTE_KIND:
   10820           0 :         iresult = ucs4lib_count(
   10821           0 :             ((Py_UCS4*)buf1) + start, end - start,
   10822             :             buf2, len2, PY_SSIZE_T_MAX
   10823             :             );
   10824           0 :         break;
   10825             :     default:
   10826           0 :         assert(0); iresult = 0;
   10827             :     }
   10828             : 
   10829           0 :     result = PyLong_FromSsize_t(iresult);
   10830             : 
   10831           0 :     if (kind2 != kind)
   10832           0 :         PyMem_Free(buf2);
   10833             : 
   10834           0 :     Py_DECREF(substring);
   10835             : 
   10836           0 :     return result;
   10837             : }
   10838             : 
   10839             : PyDoc_STRVAR(encode__doc__,
   10840             :              "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
   10841             : \n\
   10842             : Encode S using the codec registered for encoding. Default encoding\n\
   10843             : is 'utf-8'. errors may be given to set a different error\n\
   10844             : handling scheme. Default is 'strict' meaning that encoding errors raise\n\
   10845             : a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
   10846             : 'xmlcharrefreplace' as well as any other name registered with\n\
   10847             : codecs.register_error that can handle UnicodeEncodeErrors.");
   10848             : 
   10849             : static PyObject *
   10850           8 : unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
   10851             : {
   10852             :     static char *kwlist[] = {"encoding", "errors", 0};
   10853           8 :     char *encoding = NULL;
   10854           8 :     char *errors = NULL;
   10855             : 
   10856           8 :     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
   10857             :                                      kwlist, &encoding, &errors))
   10858           0 :         return NULL;
   10859           8 :     return PyUnicode_AsEncodedString(self, encoding, errors);
   10860             : }
   10861             : 
   10862             : PyDoc_STRVAR(expandtabs__doc__,
   10863             :              "S.expandtabs([tabsize]) -> str\n\
   10864             : \n\
   10865             : Return a copy of S where all tab characters are expanded using spaces.\n\
   10866             : If tabsize is not given, a tab size of 8 characters is assumed.");
   10867             : 
   10868             : static PyObject*
   10869           0 : unicode_expandtabs(PyObject *self, PyObject *args)
   10870             : {
   10871             :     Py_ssize_t i, j, line_pos, src_len, incr;
   10872             :     Py_UCS4 ch;
   10873             :     PyObject *u;
   10874             :     void *src_data, *dest_data;
   10875           0 :     int tabsize = 8;
   10876             :     int kind;
   10877             :     int found;
   10878             : 
   10879           0 :     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
   10880           0 :         return NULL;
   10881             : 
   10882           0 :     if (PyUnicode_READY(self) == -1)
   10883           0 :         return NULL;
   10884             : 
   10885             :     /* First pass: determine size of output string */
   10886           0 :     src_len = PyUnicode_GET_LENGTH(self);
   10887           0 :     i = j = line_pos = 0;
   10888           0 :     kind = PyUnicode_KIND(self);
   10889           0 :     src_data = PyUnicode_DATA(self);
   10890           0 :     found = 0;
   10891           0 :     for (; i < src_len; i++) {
   10892           0 :         ch = PyUnicode_READ(kind, src_data, i);
   10893           0 :         if (ch == '\t') {
   10894           0 :             found = 1;
   10895           0 :             if (tabsize > 0) {
   10896           0 :                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
   10897           0 :                 if (j > PY_SSIZE_T_MAX - incr)
   10898           0 :                     goto overflow;
   10899           0 :                 line_pos += incr;
   10900           0 :                 j += incr;
   10901             :             }
   10902             :         }
   10903             :         else {
   10904           0 :             if (j > PY_SSIZE_T_MAX - 1)
   10905           0 :                 goto overflow;
   10906           0 :             line_pos++;
   10907           0 :             j++;
   10908           0 :             if (ch == '\n' || ch == '\r')
   10909           0 :                 line_pos = 0;
   10910             :         }
   10911             :     }
   10912           0 :     if (!found)
   10913           0 :         return unicode_result_unchanged(self);
   10914             : 
   10915             :     /* Second pass: create output string and fill it */
   10916           0 :     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
   10917           0 :     if (!u)
   10918           0 :         return NULL;
   10919           0 :     dest_data = PyUnicode_DATA(u);
   10920             : 
   10921           0 :     i = j = line_pos = 0;
   10922             : 
   10923           0 :     for (; i < src_len; i++) {
   10924           0 :         ch = PyUnicode_READ(kind, src_data, i);
   10925           0 :         if (ch == '\t') {
   10926           0 :             if (tabsize > 0) {
   10927           0 :                 incr = tabsize - (line_pos % tabsize);
   10928           0 :                 line_pos += incr;
   10929           0 :                 FILL(kind, dest_data, ' ', j, incr);
   10930           0 :                 j += incr;
   10931             :             }
   10932             :         }
   10933             :         else {
   10934           0 :             line_pos++;
   10935           0 :             PyUnicode_WRITE(kind, dest_data, j, ch);
   10936           0 :             j++;
   10937           0 :             if (ch == '\n' || ch == '\r')
   10938           0 :                 line_pos = 0;
   10939             :         }
   10940             :     }
   10941             :     assert (j == PyUnicode_GET_LENGTH(u));
   10942           0 :     return unicode_result(u);
   10943             : 
   10944             :   overflow:
   10945           0 :     PyErr_SetString(PyExc_OverflowError, "new string is too long");
   10946           0 :     return NULL;
   10947             : }
   10948             : 
   10949             : PyDoc_STRVAR(find__doc__,
   10950             :              "S.find(sub[, start[, end]]) -> int\n\
   10951             : \n\
   10952             : Return the lowest index in S where substring sub is found,\n\
   10953             : such that sub is contained within S[start:end].  Optional\n\
   10954             : arguments start and end are interpreted as in slice notation.\n\
   10955             : \n\
   10956             : Return -1 on failure.");
   10957             : 
   10958             : static PyObject *
   10959           2 : unicode_find(PyObject *self, PyObject *args)
   10960             : {
   10961             :     PyObject *substring;
   10962             :     Py_ssize_t start;
   10963             :     Py_ssize_t end;
   10964             :     Py_ssize_t result;
   10965             : 
   10966           2 :     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
   10967             :                                             &start, &end))
   10968           0 :         return NULL;
   10969             : 
   10970           2 :     if (PyUnicode_READY(self) == -1)
   10971           0 :         return NULL;
   10972           2 :     if (PyUnicode_READY(substring) == -1)
   10973           0 :         return NULL;
   10974             : 
   10975           2 :     result = any_find_slice(1, self, substring, start, end);
   10976             : 
   10977           2 :     Py_DECREF(substring);
   10978             : 
   10979           2 :     if (result == -2)
   10980           0 :         return NULL;
   10981             : 
   10982           2 :     return PyLong_FromSsize_t(result);
   10983             : }
   10984             : 
   10985             : static PyObject *
   10986        6347 : unicode_getitem(PyObject *self, Py_ssize_t index)
   10987             : {
   10988             :     void *data;
   10989             :     enum PyUnicode_Kind kind;
   10990             :     Py_UCS4 ch;
   10991             :     PyObject *res;
   10992             : 
   10993        6347 :     if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
   10994           0 :         PyErr_BadArgument();
   10995           0 :         return NULL;
   10996             :     }
   10997        6347 :     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
   10998           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
   10999           0 :         return NULL;
   11000             :     }
   11001        6347 :     kind = PyUnicode_KIND(self);
   11002        6347 :     data = PyUnicode_DATA(self);
   11003        6347 :     ch = PyUnicode_READ(kind, data, index);
   11004        6347 :     if (ch < 256)
   11005        6167 :         return get_latin1_char(ch);
   11006             : 
   11007         180 :     res = PyUnicode_New(1, ch);
   11008         180 :     if (res == NULL)
   11009           0 :         return NULL;
   11010         180 :     kind = PyUnicode_KIND(res);
   11011         180 :     data = PyUnicode_DATA(res);
   11012         180 :     PyUnicode_WRITE(kind, data, 0, ch);
   11013             :     assert(_PyUnicode_CheckConsistency(res, 1));
   11014         180 :     return res;
   11015             : }
   11016             : 
   11017             : /* Believe it or not, this produces the same value for ASCII strings
   11018             :    as bytes_hash(). */
   11019             : static Py_hash_t
   11020       37793 : unicode_hash(PyObject *self)
   11021             : {
   11022             :     Py_ssize_t len;
   11023             :     Py_uhash_t x;
   11024             : 
   11025             : #ifdef Py_DEBUG
   11026             :     assert(_Py_HashSecret_Initialized);
   11027             : #endif
   11028       37793 :     if (_PyUnicode_HASH(self) != -1)
   11029        2325 :         return _PyUnicode_HASH(self);
   11030       35468 :     if (PyUnicode_READY(self) == -1)
   11031           0 :         return -1;
   11032       35468 :     len = PyUnicode_GET_LENGTH(self);
   11033             :     /*
   11034             :       We make the hash of the empty string be 0, rather than using
   11035             :       (prefix ^ suffix), since this slightly obfuscates the hash secret
   11036             :     */
   11037       35468 :     if (len == 0) {
   11038           1 :         _PyUnicode_HASH(self) = 0;
   11039           1 :         return 0;
   11040             :     }
   11041             : 
   11042             :     /* The hash function as a macro, gets expanded three times below. */
   11043             : #define HASH(P)                                            \
   11044             :     x ^= (Py_uhash_t) *P << 7;                             \
   11045             :     while (--len >= 0)                                     \
   11046             :         x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
   11047             : 
   11048       35467 :     x = (Py_uhash_t) _Py_HashSecret.prefix;
   11049       35467 :     switch (PyUnicode_KIND(self)) {
   11050             :     case PyUnicode_1BYTE_KIND: {
   11051       35405 :         const unsigned char *c = PyUnicode_1BYTE_DATA(self);
   11052       35405 :         HASH(c);
   11053       35405 :         break;
   11054             :     }
   11055             :     case PyUnicode_2BYTE_KIND: {
   11056          62 :         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
   11057          62 :         HASH(s);
   11058          62 :         break;
   11059             :     }
   11060             :     default: {
   11061             :         Py_UCS4 *l;
   11062             :         assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
   11063             :                "Impossible switch case in unicode_hash");
   11064           0 :         l = PyUnicode_4BYTE_DATA(self);
   11065           0 :         HASH(l);
   11066           0 :         break;
   11067             :     }
   11068             :     }
   11069       35467 :     x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
   11070       35467 :     x ^= (Py_uhash_t) _Py_HashSecret.suffix;
   11071             : 
   11072       35467 :     if (x == -1)
   11073           0 :         x = -2;
   11074       35467 :     _PyUnicode_HASH(self) = x;
   11075       35467 :     return x;
   11076             : }
   11077             : #undef HASH
   11078             : 
   11079             : PyDoc_STRVAR(index__doc__,
   11080             :              "S.index(sub[, start[, end]]) -> int\n\
   11081             : \n\
   11082             : Like S.find() but raise ValueError when the substring is not found.");
   11083             : 
   11084             : static PyObject *
   11085           0 : unicode_index(PyObject *self, PyObject *args)
   11086             : {
   11087             :     Py_ssize_t result;
   11088             :     PyObject *substring;
   11089             :     Py_ssize_t start;
   11090             :     Py_ssize_t end;
   11091             : 
   11092           0 :     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
   11093             :                                             &start, &end))
   11094           0 :         return NULL;
   11095             : 
   11096           0 :     if (PyUnicode_READY(self) == -1)
   11097           0 :         return NULL;
   11098           0 :     if (PyUnicode_READY(substring) == -1)
   11099           0 :         return NULL;
   11100             : 
   11101           0 :     result = any_find_slice(1, self, substring, start, end);
   11102             : 
   11103           0 :     Py_DECREF(substring);
   11104             : 
   11105           0 :     if (result == -2)
   11106           0 :         return NULL;
   11107             : 
   11108           0 :     if (result < 0) {
   11109           0 :         PyErr_SetString(PyExc_ValueError, "substring not found");
   11110           0 :         return NULL;
   11111             :     }
   11112             : 
   11113           0 :     return PyLong_FromSsize_t(result);
   11114             : }
   11115             : 
   11116             : PyDoc_STRVAR(islower__doc__,
   11117             :              "S.islower() -> bool\n\
   11118             : \n\
   11119             : Return True if all cased characters in S are lowercase and there is\n\
   11120             : at least one cased character in S, False otherwise.");
   11121             : 
   11122             : static PyObject*
   11123           0 : unicode_islower(PyObject *self)
   11124             : {
   11125             :     Py_ssize_t i, length;
   11126             :     int kind;
   11127             :     void *data;
   11128             :     int cased;
   11129             : 
   11130           0 :     if (PyUnicode_READY(self) == -1)
   11131           0 :         return NULL;
   11132           0 :     length = PyUnicode_GET_LENGTH(self);
   11133           0 :     kind = PyUnicode_KIND(self);
   11134           0 :     data = PyUnicode_DATA(self);
   11135             : 
   11136             :     /* Shortcut for single character strings */
   11137           0 :     if (length == 1)
   11138           0 :         return PyBool_FromLong(
   11139           0 :             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
   11140             : 
   11141             :     /* Special case for empty strings */
   11142           0 :     if (length == 0)
   11143           0 :         return PyBool_FromLong(0);
   11144             : 
   11145           0 :     cased = 0;
   11146           0 :     for (i = 0; i < length; i++) {
   11147           0 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11148             : 
   11149           0 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
   11150           0 :             return PyBool_FromLong(0);
   11151           0 :         else if (!cased && Py_UNICODE_ISLOWER(ch))
   11152           0 :             cased = 1;
   11153             :     }
   11154           0 :     return PyBool_FromLong(cased);
   11155             : }
   11156             : 
   11157             : PyDoc_STRVAR(isupper__doc__,
   11158             :              "S.isupper() -> bool\n\
   11159             : \n\
   11160             : Return True if all cased characters in S are uppercase and there is\n\
   11161             : at least one cased character in S, False otherwise.");
   11162             : 
   11163             : static PyObject*
   11164           0 : unicode_isupper(PyObject *self)
   11165             : {
   11166             :     Py_ssize_t i, length;
   11167             :     int kind;
   11168             :     void *data;
   11169             :     int cased;
   11170             : 
   11171           0 :     if (PyUnicode_READY(self) == -1)
   11172           0 :         return NULL;
   11173           0 :     length = PyUnicode_GET_LENGTH(self);
   11174           0 :     kind = PyUnicode_KIND(self);
   11175           0 :     data = PyUnicode_DATA(self);
   11176             : 
   11177             :     /* Shortcut for single character strings */
   11178           0 :     if (length == 1)
   11179           0 :         return PyBool_FromLong(
   11180           0 :             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
   11181             : 
   11182             :     /* Special case for empty strings */
   11183           0 :     if (length == 0)
   11184           0 :         return PyBool_FromLong(0);
   11185             : 
   11186           0 :     cased = 0;
   11187           0 :     for (i = 0; i < length; i++) {
   11188           0 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11189             : 
   11190           0 :         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
   11191           0 :             return PyBool_FromLong(0);
   11192           0 :         else if (!cased && Py_UNICODE_ISUPPER(ch))
   11193           0 :             cased = 1;
   11194             :     }
   11195           0 :     return PyBool_FromLong(cased);
   11196             : }
   11197             : 
   11198             : PyDoc_STRVAR(istitle__doc__,
   11199             :              "S.istitle() -> bool\n\
   11200             : \n\
   11201             : Return True if S is a titlecased string and there is at least one\n\
   11202             : character in S, i.e. upper- and titlecase characters may only\n\
   11203             : follow uncased characters and lowercase characters only cased ones.\n\
   11204             : Return False otherwise.");
   11205             : 
   11206             : static PyObject*
   11207           0 : unicode_istitle(PyObject *self)
   11208             : {
   11209             :     Py_ssize_t i, length;
   11210             :     int kind;
   11211             :     void *data;
   11212             :     int cased, previous_is_cased;
   11213             : 
   11214           0 :     if (PyUnicode_READY(self) == -1)
   11215           0 :         return NULL;
   11216           0 :     length = PyUnicode_GET_LENGTH(self);
   11217           0 :     kind = PyUnicode_KIND(self);
   11218           0 :     data = PyUnicode_DATA(self);
   11219             : 
   11220             :     /* Shortcut for single character strings */
   11221           0 :     if (length == 1) {
   11222           0 :         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11223           0 :         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
   11224           0 :                                (Py_UNICODE_ISUPPER(ch) != 0));
   11225             :     }
   11226             : 
   11227             :     /* Special case for empty strings */
   11228           0 :     if (length == 0)
   11229           0 :         return PyBool_FromLong(0);
   11230             : 
   11231           0 :     cased = 0;
   11232           0 :     previous_is_cased = 0;
   11233           0 :     for (i = 0; i < length; i++) {
   11234           0 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11235             : 
   11236           0 :         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
   11237           0 :             if (previous_is_cased)
   11238           0 :                 return PyBool_FromLong(0);
   11239           0 :             previous_is_cased = 1;
   11240           0 :             cased = 1;
   11241             :         }
   11242           0 :         else if (Py_UNICODE_ISLOWER(ch)) {
   11243           0 :             if (!previous_is_cased)
   11244           0 :                 return PyBool_FromLong(0);
   11245           0 :             previous_is_cased = 1;
   11246           0 :             cased = 1;
   11247             :         }
   11248             :         else
   11249           0 :             previous_is_cased = 0;
   11250             :     }
   11251           0 :     return PyBool_FromLong(cased);
   11252             : }
   11253             : 
   11254             : PyDoc_STRVAR(isspace__doc__,
   11255             :              "S.isspace() -> bool\n\
   11256             : \n\
   11257             : Return True if all characters in S are whitespace\n\
   11258             : and there is at least one character in S, False otherwise.");
   11259             : 
   11260             : static PyObject*
   11261           0 : unicode_isspace(PyObject *self)
   11262             : {
   11263             :     Py_ssize_t i, length;
   11264             :     int kind;
   11265             :     void *data;
   11266             : 
   11267           0 :     if (PyUnicode_READY(self) == -1)
   11268           0 :         return NULL;
   11269           0 :     length = PyUnicode_GET_LENGTH(self);
   11270           0 :     kind = PyUnicode_KIND(self);
   11271           0 :     data = PyUnicode_DATA(self);
   11272             : 
   11273             :     /* Shortcut for single character strings */
   11274           0 :     if (length == 1)
   11275           0 :         return PyBool_FromLong(
   11276           0 :             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
   11277             : 
   11278             :     /* Special case for empty strings */
   11279           0 :     if (length == 0)
   11280           0 :         return PyBool_FromLong(0);
   11281             : 
   11282           0 :     for (i = 0; i < length; i++) {
   11283           0 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11284           0 :         if (!Py_UNICODE_ISSPACE(ch))
   11285           0 :             return PyBool_FromLong(0);
   11286             :     }
   11287           0 :     return PyBool_FromLong(1);
   11288             : }
   11289             : 
   11290             : PyDoc_STRVAR(isalpha__doc__,
   11291             :              "S.isalpha() -> bool\n\
   11292             : \n\
   11293             : Return True if all characters in S are alphabetic\n\
   11294             : and there is at least one character in S, False otherwise.");
   11295             : 
   11296             : static PyObject*
   11297           0 : unicode_isalpha(PyObject *self)
   11298             : {
   11299             :     Py_ssize_t i, length;
   11300             :     int kind;
   11301             :     void *data;
   11302             : 
   11303           0 :     if (PyUnicode_READY(self) == -1)
   11304           0 :         return NULL;
   11305           0 :     length = PyUnicode_GET_LENGTH(self);
   11306           0 :     kind = PyUnicode_KIND(self);
   11307           0 :     data = PyUnicode_DATA(self);
   11308             : 
   11309             :     /* Shortcut for single character strings */
   11310           0 :     if (length == 1)
   11311           0 :         return PyBool_FromLong(
   11312           0 :             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
   11313             : 
   11314             :     /* Special case for empty strings */
   11315           0 :     if (length == 0)
   11316           0 :         return PyBool_FromLong(0);
   11317             : 
   11318           0 :     for (i = 0; i < length; i++) {
   11319           0 :         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
   11320           0 :             return PyBool_FromLong(0);
   11321             :     }
   11322           0 :     return PyBool_FromLong(1);
   11323             : }
   11324             : 
   11325             : PyDoc_STRVAR(isalnum__doc__,
   11326             :              "S.isalnum() -> bool\n\
   11327             : \n\
   11328             : Return True if all characters in S are alphanumeric\n\
   11329             : and there is at least one character in S, False otherwise.");
   11330             : 
   11331             : static PyObject*
   11332          70 : unicode_isalnum(PyObject *self)
   11333             : {
   11334             :     int kind;
   11335             :     void *data;
   11336             :     Py_ssize_t len, i;
   11337             : 
   11338          70 :     if (PyUnicode_READY(self) == -1)
   11339           0 :         return NULL;
   11340             : 
   11341          70 :     kind = PyUnicode_KIND(self);
   11342          70 :     data = PyUnicode_DATA(self);
   11343          70 :     len = PyUnicode_GET_LENGTH(self);
   11344             : 
   11345             :     /* Shortcut for single character strings */
   11346          70 :     if (len == 1) {
   11347          70 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11348          70 :         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
   11349             :     }
   11350             : 
   11351             :     /* Special case for empty strings */
   11352           0 :     if (len == 0)
   11353           0 :         return PyBool_FromLong(0);
   11354             : 
   11355           0 :     for (i = 0; i < len; i++) {
   11356           0 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
   11357           0 :         if (!Py_UNICODE_ISALNUM(ch))
   11358           0 :             return PyBool_FromLong(0);
   11359             :     }
   11360           0 :     return PyBool_FromLong(1);
   11361             : }
   11362             : 
   11363             : PyDoc_STRVAR(isdecimal__doc__,
   11364             :              "S.isdecimal() -> bool\n\
   11365             : \n\
   11366             : Return True if there are only decimal characters in S,\n\
   11367             : False otherwise.");
   11368             : 
   11369             : static PyObject*
   11370           0 : unicode_isdecimal(PyObject *self)
   11371             : {
   11372             :     Py_ssize_t i, length;
   11373             :     int kind;
   11374             :     void *data;
   11375             : 
   11376           0 :     if (PyUnicode_READY(self) == -1)
   11377           0 :         return NULL;
   11378           0 :     length = PyUnicode_GET_LENGTH(self);
   11379           0 :     kind = PyUnicode_KIND(self);
   11380           0 :     data = PyUnicode_DATA(self);
   11381             : 
   11382             :     /* Shortcut for single character strings */
   11383           0 :     if (length == 1)
   11384           0 :         return PyBool_FromLong(
   11385           0 :             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
   11386             : 
   11387             :     /* Special case for empty strings */
   11388           0 :     if (length == 0)
   11389           0 :         return PyBool_FromLong(0);
   11390             : 
   11391           0 :     for (i = 0; i < length; i++) {
   11392           0 :         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
   11393           0 :             return PyBool_FromLong(0);
   11394             :     }
   11395           0 :     return PyBool_FromLong(1);
   11396             : }
   11397             : 
   11398             : PyDoc_STRVAR(isdigit__doc__,
   11399             :              "S.isdigit() -> bool\n\
   11400             : \n\
   11401             : Return True if all characters in S are digits\n\
   11402             : and there is at least one character in S, False otherwise.");
   11403             : /* Backs S.isdigit(): true only for a non-empty string whose every character passes Py_UNICODE_ISDIGIT. */
   11404             : static PyObject*
   11405          11 : unicode_isdigit(PyObject *self)
   11406             : {
   11407             :     Py_ssize_t i, length;
   11408             :     int kind;
   11409             :     void *data;
   11410             :     /* A PyUnicode_READY() result of -1 is propagated to the caller as NULL. */
   11411          11 :     if (PyUnicode_READY(self) == -1)
   11412           0 :         return NULL;
   11413          11 :     length = PyUnicode_GET_LENGTH(self);
   11414          11 :     kind = PyUnicode_KIND(self);
   11415          11 :     data = PyUnicode_DATA(self);
   11416             : 
   11417             :     /* Shortcut for single character strings */
   11418          11 :     if (length == 1) {
   11419          11 :         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
   11420          11 :         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
   11421             :     }
   11422             : 
   11423             :     /* Special case for empty strings */
   11424           0 :     if (length == 0)
   11425           0 :         return PyBool_FromLong(0); /* matches the docstring: at least one character is required */
   11426             : 
   11427           0 :     for (i = 0; i < length; i++) { /* the first non-digit character ends the scan with False */
   11428           0 :         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
   11429           0 :             return PyBool_FromLong(0);
   11430             :     }
   11431           0 :     return PyBool_FromLong(1); /* every character passed the test */
   11432             : }
   11433             : 
   11434             : PyDoc_STRVAR(isnumeric__doc__,
   11435             :              "S.isnumeric() -> bool\n\
   11436             : \n\
   11437             : Return True if there are only numeric characters in S,\n\
   11438             : False otherwise.");
   11439             : /* Backs S.isnumeric(): true only for a non-empty string whose every character passes Py_UNICODE_ISNUMERIC. */
   11440             : static PyObject*
   11441           0 : unicode_isnumeric(PyObject *self)
   11442             : {
   11443             :     Py_ssize_t i, length;
   11444             :     int kind;
   11445             :     void *data;
   11446             :     /* A PyUnicode_READY() result of -1 is propagated to the caller as NULL. */
   11447           0 :     if (PyUnicode_READY(self) == -1)
   11448           0 :         return NULL;
   11449           0 :     length = PyUnicode_GET_LENGTH(self);
   11450           0 :     kind = PyUnicode_KIND(self);
   11451           0 :     data = PyUnicode_DATA(self);
   11452             : 
   11453             :     /* Shortcut for single character strings */
   11454           0 :     if (length == 1)
   11455           0 :         return PyBool_FromLong(
   11456           0 :             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
   11457             : 
   11458             :     /* Special case for empty strings */
   11459           0 :     if (length == 0)
   11460           0 :         return PyBool_FromLong(0); /* empty string gives False; the docstring does not spell this out */
   11461             : 
   11462           0 :     for (i = 0; i < length; i++) { /* the first non-numeric character ends the scan with False */
   11463           0 :         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
   11464           0 :             return PyBool_FromLong(0);
   11465             :     }
   11466           0 :     return PyBool_FromLong(1); /* every character passed the test */
   11467             : }
   11468             : 
   11469             : int /* 1 if self is a valid identifier, 0 otherwise (including the empty string) */
   11470           9 : PyUnicode_IsIdentifier(PyObject *self)
   11471             : {
   11472             :     int kind;
   11473             :     void *data;
   11474             :     Py_ssize_t i;
   11475             :     Py_UCS4 first;
   11476             :     /* This function has no error return value, so a failed PyUnicode_READY() is reported via Py_FatalError(). */
   11477           9 :     if (PyUnicode_READY(self) == -1) {
   11478           0 :         Py_FatalError("identifier not ready");
   11479           0 :         return 0;
   11480             :     }
   11481             : 
   11482             :     /* Special case for empty strings */
   11483           9 :     if (PyUnicode_GET_LENGTH(self) == 0)
   11484           0 :         return 0;
   11485           9 :     kind = PyUnicode_KIND(self);
   11486           9 :     data = PyUnicode_DATA(self);
   11487             : 
   11488             :     /* PEP 3131 says that the first character must be in
   11489             :        XID_Start and subsequent characters in XID_Continue,
   11490             :        and for the ASCII range, the 2.x rules apply (i.e
   11491             :        start with letters and underscore, continue with
   11492             :        letters, digits, underscore). However, given the current
   11493             :        definition of XID_Start and XID_Continue, it is sufficient
   11494             :        to check just for these, except that _ must be allowed
   11495             :        as starting an identifier.  */
   11496           9 :     first = PyUnicode_READ(kind, data, 0);
   11497           9 :     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
   11498           0 :         return 0;
   11499             :     /* Index 0 was validated above; the remaining characters only need the XID_Continue test. */
   11500          60 :     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
   11501          51 :         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
   11502           0 :             return 0;
   11503           9 :     return 1;
   11504             : }
   11505             : 
   11506             : PyDoc_STRVAR(isidentifier__doc__,
   11507             :              "S.isidentifier() -> bool\n\
   11508             : \n\
   11509             : Return True if S is a valid identifier according\n\
   11510             : to the language definition.");
   11511             : /* Backs S.isidentifier(): wraps the int result of PyUnicode_IsIdentifier() in a bool object. */
   11512             : static PyObject*
   11513           0 : unicode_isidentifier(PyObject *self)
   11514             : {
   11515           0 :     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
   11516             : }
   11517             : 
   11518             : PyDoc_STRVAR(isprintable__doc__,
   11519             :              "S.isprintable() -> bool\n\
   11520             : \n\
   11521             : Return True if all characters in S are considered\n\
   11522             : printable in repr() or S is empty, False otherwise.");
   11523             : /* Backs S.isprintable(): true when every character passes Py_UNICODE_ISPRINTABLE, and also for the empty string. */
   11524             : static PyObject*
   11525           0 : unicode_isprintable(PyObject *self)
   11526             : {
   11527             :     Py_ssize_t i, length;
   11528             :     int kind;
   11529             :     void *data;
   11530             :     /* A PyUnicode_READY() result of -1 is propagated to the caller as NULL. */
   11531           0 :     if (PyUnicode_READY(self) == -1)
   11532           0 :         return NULL;
   11533           0 :     length = PyUnicode_GET_LENGTH(self);
   11534           0 :     kind = PyUnicode_KIND(self);
   11535           0 :     data = PyUnicode_DATA(self);
   11536             : 
   11537             :     /* Shortcut for single character strings */
   11538           0 :     if (length == 1)
   11539           0 :         return PyBool_FromLong(
   11540           0 :             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
   11541             :     /* No empty-string special case here: with length 0 the loop body never runs and True is returned. */
   11542           0 :     for (i = 0; i < length; i++) {
   11543           0 :         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
   11544           0 :             Py_RETURN_FALSE;
   11545             :         }
   11546             :     }
   11547           0 :     Py_RETURN_TRUE;
   11548             : }
   11549             : 
   11550             : PyDoc_STRVAR(join__doc__,
   11551             :              "S.join(iterable) -> str\n\
   11552             : \n\
   11553             : Return a string which is the concatenation of the strings in the\n\
   11554             : iterable.  The separator between elements is S.");
   11555             : /* Backs S.join(): thin wrapper that forwards both arguments unchanged to PyUnicode_Join(). */
   11556             : static PyObject*
   11557         361 : unicode_join(PyObject *self, PyObject *data)
   11558             : {
   11559         361 :     return PyUnicode_Join(self, data); /* self is the separator, data the iterable (per the docstring above) */
   11560             : }
   11561             : 
   11562             : static Py_ssize_t /* length of self as reported by PyUnicode_GET_LENGTH, or -1 if PyUnicode_READY fails */
   11563       17646 : unicode_length(PyObject *self)
   11564             : {
   11565       17646 :     if (PyUnicode_READY(self) == -1)
   11566           0 :         return -1;
   11567       17646 :     return PyUnicode_GET_LENGTH(self);
   11568             : }
   11569             : 
   11570             : PyDoc_STRVAR(ljust__doc__,
   11571             :              "S.ljust(width[, fillchar]) -> str\n\
   11572             : \n\
   11573             : Return S left-justified in a Unicode string of length width. Padding is\n\
   11574             : done using the specified fill character (default is a space).");
   11575             : /* Backs S.ljust(width[, fillchar]); returns NULL when argument parsing or PyUnicode_READY fails. */
   11576             : static PyObject *
   11577           0 : unicode_ljust(PyObject *self, PyObject *args)
   11578             : {
   11579             :     Py_ssize_t width;
   11580           0 :     Py_UCS4 fillchar = ' '; /* default used when the optional fillchar argument is omitted */
   11581             :     /* "n" parses width as Py_ssize_t; "O&" hands the optional fill argument to the convert_uc converter. */
   11582           0 :     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
   11583           0 :         return NULL;
   11584             : 
   11585           0 :     if (PyUnicode_READY(self) == -1)
   11586           0 :         return NULL;
   11587             :     /* Already at least width characters long: nothing to pad. */
   11588           0 :     if (PyUnicode_GET_LENGTH(self) >= width)
   11589           0 :         return unicode_result_unchanged(self);
   11590             :     /* NOTE(review): pad() is defined elsewhere; presumably its 2nd/3rd arguments are left/right fill counts - confirm. */
   11591           0 :     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
   11592             : }
   11593             : 
   11594             : PyDoc_STRVAR(lower__doc__,
   11595             :              "S.lower() -> str\n\
   11596             : \n\
   11597             : Return a copy of the string S converted to lowercase.");
   11598             : /* Backs S.lower(): ASCII strings take a dedicated helper, everything else goes through case_operation(). */
   11599             : static PyObject*
   11600           1 : unicode_lower(PyObject *self)
   11601             : {
   11602           1 :     if (PyUnicode_READY(self) == -1)
   11603           0 :         return NULL;
   11604           1 :     if (PyUnicode_IS_ASCII(self))
   11605           1 :         return ascii_upper_or_lower(self, 1); /* NOTE(review): the flag value 1 presumably selects lowercase - helper is defined elsewhere */
   11606           0 :     return case_operation(self, do_lower);
   11607             : }
   11608             : 
   11609             : #define LEFTSTRIP 0 /* strip the leading end only */
   11610             : #define RIGHTSTRIP 1 /* strip the trailing end only */
   11611             : #define BOTHSTRIP 2 /* strip both ends */
   11612             : 
   11613             : /* PyArg_ParseTuple format strings, indexed by the strip type constants above */
   11614             : static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
   11615             : /* STRIPNAME skips the 3-character "|O:" prefix of the format string, leaving the bare method name. */
   11616             : #define STRIPNAME(i) (stripformat[i]+3)
   11617             : 
   11618             : /* externally visible for str.strip(unicode); strips characters found in sepobj from the end(s) selected by striptype */
   11619             : PyObject *
   11620           3 : _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
   11621             : {
   11622             :     void *data;
   11623             :     int kind;
   11624             :     Py_ssize_t i, j, len;
   11625             :     BLOOM_MASK sepmask;
   11626             :     /* Both strings must be ready; a failure on either one returns NULL. */
   11627           3 :     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
   11628           0 :         return NULL;
   11629             : 
   11630           3 :     kind = PyUnicode_KIND(self);
   11631           3 :     data = PyUnicode_DATA(self);
   11632           3 :     len = PyUnicode_GET_LENGTH(self);
   11633           9 :     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
   11634           6 :                               PyUnicode_DATA(sepobj),
   11635             :                               PyUnicode_GET_LENGTH(sepobj));
   11636             :     /* NOTE(review): BLOOM_MEMBER is defined elsewhere; presumably it tests membership of a character in sepobj using sepmask as a pre-filter. */
   11637           3 :     i = 0;
   11638           3 :     if (striptype != RIGHTSTRIP) { /* LEFTSTRIP or BOTHSTRIP: advance i past leading matches */
   11639           0 :         while (i < len &&
   11640           0 :                BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
   11641           0 :             i++;
   11642             :         }
   11643             :     }
   11644             : 
   11645           3 :     j = len;
   11646           3 :     if (striptype != LEFTSTRIP) { /* RIGHTSTRIP or BOTHSTRIP: walk j back over trailing matches, never below i */
   11647             :         do {
   11648           5 :             j--;
   11649           5 :         } while (j >= i &&
   11650           7 :                  BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
   11651           3 :         j++; /* the loop stops one position early, so step forward to make j an exclusive end */
   11652             :     }
   11653             : 
   11654           3 :     return PyUnicode_Substring(self, i, j);
   11655             : }
   11656             : 
   11657             : PyObject* /* substring self[start:end]; NULL with an exception set on a negative index or a failed PyUnicode_READY */
   11658        4267 : PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
   11659             : {
   11660             :     unsigned char *data;
   11661             :     int kind;
   11662             :     Py_ssize_t length;
   11663             : 
   11664        4267 :     if (PyUnicode_READY(self) == -1)
   11665           0 :         return NULL;
   11666             : 
   11667        4267 :     length = PyUnicode_GET_LENGTH(self);
   11668        4267 :     end = Py_MIN(end, length); /* an end beyond the string is clamped rather than rejected */
   11669             :     /* The requested range covers the whole string: hand self to unicode_result_unchanged() instead of copying here. */
   11670        4267 :     if (start == 0 && end == length)
   11671          12 :         return unicode_result_unchanged(self);
   11672             : 
   11673        4255 :     if (start < 0 || end < 0) {
   11674           0 :         PyErr_SetString(PyExc_IndexError, "string index out of range");
   11675           0 :         return NULL;
   11676             :     }
   11677        4255 :     if (start >= length || end < start) { /* empty or inverted range yields the shared empty string */
   11678           0 :         Py_INCREF(unicode_empty);
   11679           0 :         return unicode_empty;
   11680             :     }
   11681             : 
   11682        4255 :     length = end - start; /* from here on, length is the size of the result */
   11683        4255 :     if (PyUnicode_IS_ASCII(self)) {
   11684        1132 :         data = PyUnicode_1BYTE_DATA(self);
   11685        1132 :         return _PyUnicode_FromASCII((char*)(data + start), length);
   11686             :     }
   11687             :     else {
   11688        3123 :         kind = PyUnicode_KIND(self);
   11689        3123 :         data = PyUnicode_1BYTE_DATA(self);
   11690        3123 :         return PyUnicode_FromKindAndData(kind,
   11691        3123 :                                          data + kind * start, /* byte offset: kind is used as the per-character size */
   11692             :                                          length);
   11693             :     }
   11694             : }
   11695             : 
   11696             : static PyObject * /* strips Py_UNICODE_ISSPACE characters from the end(s) of self selected by striptype */
   11697           0 : do_strip(PyObject *self, int striptype)
   11698             : {
   11699             :     int kind;
   11700             :     void *data;
   11701             :     Py_ssize_t len, i, j;
   11702             : 
   11703           0 :     if (PyUnicode_READY(self) == -1)
   11704           0 :         return NULL;
   11705             : 
   11706           0 :     kind = PyUnicode_KIND(self);
   11707           0 :     data = PyUnicode_DATA(self);
   11708           0 :     len = PyUnicode_GET_LENGTH(self);
   11709             : 
   11710           0 :     i = 0;
   11711           0 :     if (striptype != RIGHTSTRIP) { /* LEFTSTRIP or BOTHSTRIP: advance i past leading whitespace */
   11712           0 :         while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
   11713           0 :             i++;
   11714             :         }
   11715             :     }
   11716             : 
   11717           0 :     j = len;
   11718           0 :     if (striptype != LEFTSTRIP) { /* RIGHTSTRIP or BOTHSTRIP: walk j back over trailing whitespace, never below i */
   11719             :         do {
   11720           0 :             j--;
   11721           0 :         } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
   11722           0 :         j++; /* the loop stops one position early, so step forward to make j an exclusive end */
   11723             :     }
   11724             : 
   11725           0 :     return PyUnicode_Substring(self, i, j);
   11726             : }
   11727             : 
   11728             : 
   11729             : static PyObject * /* strip variant taking the argument tuple; dispatches on the optional chars argument */
   11730           3 : do_argstrip(PyObject *self, int striptype, PyObject *args)
   11731             : {
   11732           3 :     PyObject *sep = NULL; /* stays NULL when the optional argument is omitted */
   11733             :     /* The format string is picked by striptype, so parse errors name the right method. */
   11734           3 :     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
   11735           0 :         return NULL;
   11736             : 
   11737           3 :     if (sep != NULL && sep != Py_None) {
   11738           3 :         if (PyUnicode_Check(sep))
   11739           3 :             return _PyUnicode_XStrip(self, striptype, sep);
   11740             :         else {
   11741           0 :             PyErr_Format(PyExc_TypeError,
   11742             :                          "%s arg must be None or str",
   11743           0 :                          STRIPNAME(striptype));
   11744           0 :             return NULL;
   11745             :         }
   11746             :     }
   11747             :     /* Argument omitted or None: fall back to whitespace stripping. */
   11748           0 :     return do_strip(self, striptype);
   11749             : }
   11750             : 
   11751             : 
   11752             : PyDoc_STRVAR(strip__doc__,
   11753             :              "S.strip([chars]) -> str\n\
   11754             : \n\
   11755             : Return a copy of the string S with leading and trailing\n\
   11756             : whitespace removed.\n\
   11757             : If chars is given and not None, remove characters in chars instead.");
   11758             : /* Backs S.strip([chars]): an empty argument tuple skips argument parsing and goes straight to do_strip(). */
   11759             : static PyObject *
   11760           0 : unicode_strip(PyObject *self, PyObject *args)
   11761             : {
   11762           0 :     if (PyTuple_GET_SIZE(args) == 0)
   11763           0 :         return do_strip(self, BOTHSTRIP); /* Common case */
   11764             :     else
   11765           0 :         return do_argstrip(self, BOTHSTRIP, args);
   11766             : }
   11767             : 
   11768             : 
   11769             : PyDoc_STRVAR(lstrip__doc__,
   11770             :              "S.lstrip([chars]) -> str\n\
   11771             : \n\
   11772             : Return a copy of the string S with leading whitespace removed.\n\
   11773             : If chars is given and not None, remove characters in chars instead.");
   11774             : /* Backs S.lstrip([chars]): an empty argument tuple skips argument parsing and goes straight to do_strip(). */
   11775             : static PyObject *
   11776           0 : unicode_lstrip(PyObject *self, PyObject *args)
   11777             : {
   11778           0 :     if (PyTuple_GET_SIZE(args) == 0)
   11779           0 :         return do_strip(self, LEFTSTRIP); /* Common case */
   11780             :     else
   11781           0 :         return do_argstrip(self, LEFTSTRIP, args);
   11782             : }
   11783             : 
   11784             : 
   11785             : PyDoc_STRVAR(rstrip__doc__,
   11786             :              "S.rstrip([chars]) -> str\n\
   11787             : \n\
   11788             : Return a copy of the string S with trailing whitespace removed.\n\
   11789             : If chars is given and not None, remove characters in chars instead.");
   11790             : /* Backs S.rstrip([chars]): an empty argument tuple skips argument parsing and goes straight to do_strip(). */
   11791             : static PyObject *
   11792           3 : unicode_rstrip(PyObject *self, PyObject *args)
   11793             : {
   11794           3 :     if (PyTuple_GET_SIZE(args) == 0)
   11795           0 :         return do_strip(self, RIGHTSTRIP); /* Common case */
   11796             :     else
   11797           3 :         return do_argstrip(self, RIGHTSTRIP, args);
   11798             : }
   11799             : 
   11800             : 
   11801             : static PyObject* /* returns str repeated len times; NULL on overflow, allocation failure or a failed PyUnicode_READY */
   11802         176 : unicode_repeat(PyObject *str, Py_ssize_t len)
   11803             : {
   11804             :     PyObject *u;
   11805             :     Py_ssize_t nchars, n;
   11806             :     /* A repeat count of zero or less yields the shared empty string. */
   11807         176 :     if (len < 1) {
   11808           0 :         Py_INCREF(unicode_empty);
   11809           0 :         return unicode_empty;
   11810             :     }
   11811             : 
   11812             :     /* no repeat, return original string */
   11813         176 :     if (len == 1)
   11814          87 :         return unicode_result_unchanged(str);
   11815             : 
   11816          89 :     if (PyUnicode_READY(str) == -1)
   11817           0 :         return NULL;
   11818             :     /* Division-based guard: rejects the request before len * length could overflow Py_ssize_t. */
   11819          89 :     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
   11820           0 :         PyErr_SetString(PyExc_OverflowError,
   11821             :                         "repeated string is too long");
   11822           0 :         return NULL;
   11823             :     }
   11824          89 :     nchars = len * PyUnicode_GET_LENGTH(str);
   11825             :     /* Allocate the result using the source's maximum character value; the assert below expects matching kinds. */
   11826          89 :     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
   11827          89 :     if (!u)
   11828           0 :         return NULL;
   11829             :     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
   11830             :     /* Single-character source: fill the buffer directly (nchars == len in this branch). */
   11831          89 :     if (PyUnicode_GET_LENGTH(str) == 1) {
   11832          89 :         const int kind = PyUnicode_KIND(str);
   11833          89 :         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
   11834          89 :         if (kind == PyUnicode_1BYTE_KIND) {
   11835          89 :             void *to = PyUnicode_DATA(u);
   11836          89 :             memset(to, (unsigned char)fill_char, len);
   11837             :         }
   11838           0 :         else if (kind == PyUnicode_2BYTE_KIND) {
   11839           0 :             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
   11840           0 :             for (n = 0; n < len; ++n)
   11841           0 :                 ucs2[n] = fill_char;
   11842             :         } else {
   11843           0 :             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
   11844             :             assert(kind == PyUnicode_4BYTE_KIND);
   11845           0 :             for (n = 0; n < len; ++n)
   11846           0 :                 ucs4[n] = fill_char;
   11847             :         }
   11848             :     }
   11849             :     else {
   11850             :         /* number of characters copied this far */
   11851           0 :         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
   11852           0 :         const Py_ssize_t char_size = PyUnicode_KIND(str); /* kind doubles as the size of one character in bytes */
   11853           0 :         char *to = (char *) PyUnicode_DATA(u);
   11854           0 :         Py_MEMCPY(to, PyUnicode_DATA(str),
   11855           0 :                   PyUnicode_GET_LENGTH(str) * char_size);
   11856           0 :         while (done < nchars) { /* each pass copies the filled prefix onto the tail, so the filled part doubles until the remainder is smaller */
   11857           0 :             n = (done <= nchars-done) ? done : nchars-done;
   11858           0 :             Py_MEMCPY(to + (done * char_size), to, n * char_size);
   11859           0 :             done += n;
   11860             :         }
   11861             :     }
   11862             : 
   11863             :     assert(_PyUnicode_CheckConsistency(u, 1));
   11864          89 :     return u;
   11865             : }
   11866             : 
   11867             : PyObject * /* converts all three operands with PyUnicode_FromObject(), then delegates to replace(); NULL on any failure */
   11868           0 : PyUnicode_Replace(PyObject *obj,
   11869             :                   PyObject *subobj,
   11870             :                   PyObject *replobj,
   11871             :                   Py_ssize_t maxcount)
   11872             : {
   11873             :     PyObject *self;
   11874             :     PyObject *str1;
   11875             :     PyObject *str2;
   11876             :     PyObject *result;
   11877             :     /* Each successful conversion is released on every later exit path, including the failure branches below. */
   11878           0 :     self = PyUnicode_FromObject(obj);
   11879           0 :     if (self == NULL)
   11880           0 :         return NULL;
   11881           0 :     str1 = PyUnicode_FromObject(subobj);
   11882           0 :     if (str1 == NULL) {
   11883           0 :         Py_DECREF(self);
   11884           0 :         return NULL;
   11885             :     }
   11886           0 :     str2 = PyUnicode_FromObject(replobj);
   11887           0 :     if (str2 == NULL) {
   11888           0 :         Py_DECREF(self);
   11889           0 :         Py_DECREF(str1);
   11890           0 :         return NULL;
   11891             :     }
   11892           0 :     if (PyUnicode_READY(self) == -1 ||
   11893           0 :         PyUnicode_READY(str1) == -1 ||
   11894           0 :         PyUnicode_READY(str2) == -1)
   11895           0 :         result = NULL; /* no early return, so the shared cleanup below still runs */
   11896             :     else
   11897           0 :         result = replace(self, str1, str2, maxcount);
   11898           0 :     Py_DECREF(self);
   11899           0 :     Py_DECREF(str1);
   11900           0 :     Py_DECREF(str2);
   11901           0 :     return result;
   11902             : }
   11903             : 
   11904             : PyDoc_STRVAR(replace__doc__,
   11905             :              "S.replace(old, new[, count]) -> str\n\
   11906             : \n\
   11907             : Return a copy of S with all occurrences of substring\n\
   11908             : old replaced by new.  If the optional argument count is\n\
   11909             : given, only the first count occurrences are replaced.");
   11910             : /* Backs S.replace(old, new[, count]); returns NULL when parsing, conversion or a readiness check fails. */
   11911             : static PyObject*
   11912           6 : unicode_replace(PyObject *self, PyObject *args)
   11913             : {
   11914             :     PyObject *str1;
   11915             :     PyObject *str2;
   11916           6 :     Py_ssize_t maxcount = -1; /* value passed to replace() when count is omitted */
   11917             :     PyObject *result;
   11918             : 
   11919           6 :     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
   11920           0 :         return NULL;
   11921           6 :     if (PyUnicode_READY(self) == -1)
   11922           0 :         return NULL;
   11923           6 :     str1 = PyUnicode_FromObject(str1); /* str1/str2 are overwritten with the conversion results, which are released below */
   11924           6 :     if (str1 == NULL)
   11925           0 :         return NULL;
   11926           6 :     str2 = PyUnicode_FromObject(str2);
   11927           6 :     if (str2 == NULL) {
   11928           0 :         Py_DECREF(str1);
   11929           0 :         return NULL;
   11930             :     }
   11931           6 :     if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
   11932           0 :         result = NULL; /* no early return, so the shared cleanup below still runs */
   11933             :     else
   11934           6 :         result = replace(self, str1, str2, maxcount);
   11935             : 
   11936           6 :     Py_DECREF(str1);
   11937           6 :     Py_DECREF(str2);
   11938           6 :     return result;
   11939             : }
   11940             : 
   11941             : static PyObject *
   11942          51 : unicode_repr(PyObject *unicode)
   11943             : {
   11944             :     PyObject *repr;
   11945             :     Py_ssize_t isize;
   11946             :     Py_ssize_t osize, squote, dquote, i, o;
   11947             :     Py_UCS4 max, quote;
   11948             :     int ikind, okind;
   11949             :     void *idata, *odata;
   11950             : 
   11951          51 :     if (PyUnicode_READY(unicode) == -1)
   11952           0 :         return NULL;
   11953             : 
   11954          51 :     isize = PyUnicode_GET_LENGTH(unicode);
   11955          51 :     idata = PyUnicode_DATA(unicode);
   11956             : 
   11957             :     /* Compute length of output, quote characters, and
   11958             :        maximum character */
   11959          51 :     osize = 2; /* quotes */
   11960          51 :     max = 127;
   11961          51 :     squote = dquote = 0;
   11962          51 :     ikind = PyUnicode_KIND(unicode);
   11963         391 :     for (i = 0; i < isize; i++) {
   11964         340 :         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   11965         340 :         switch (ch) {
   11966           0 :         case '\'': squote++; osize++; break;
   11967           0 :         case '"':  dquote++; osize++; break;
   11968             :         case '\\': case '\t': case '\r': case '\n':
   11969           0 :             osize += 2; break;
   11970             :         default:
   11971             :             /* Fast-path ASCII */
   11972         340 :             if (ch < ' ' || ch == 0x7f)
   11973           0 :                 osize += 4; /* \xHH */
   11974         340 :             else if (ch < 0x7f)
   11975         340 :                 osize++;
   11976           0 :             else if (Py_UNICODE_ISPRINTABLE(ch)) {
   11977           0 :                 osize++;
   11978           0 :                 max = ch > max ? ch : max;
   11979             :             }
   11980           0 :             else if (ch < 0x100)
   11981           0 :                 osize += 4; /* \xHH */
   11982           0 :             else if (ch < 0x10000)
   11983           0 :                 osize += 6; /* \uHHHH */
   11984             :             else
   11985           0 :                 osize += 10; /* \uHHHHHHHH */
   11986             :         }
   11987             :     }
   11988             : 
   11989          51 :     quote = '\'';
   11990          51 :     if (squote) {
   11991           0 :         if (dquote)
   11992             :             /* Both squote and dquote present. Use squote,
   11993             :                and escape them */
   11994           0 :             osize += squote;
   11995             :         else
   11996           0 :             quote = '"';
   11997             :     }
   11998             : 
   11999          51 :     repr = PyUnicode_New(osize, max);
   12000          51 :     if (repr == NULL)
   12001           0 :         return NULL;
   12002          51 :     okind = PyUnicode_KIND(repr);
   12003          51 :     odata = PyUnicode_DATA(repr);
   12004             : 
   12005          51 :     PyUnicode_WRITE(okind, odata, 0, quote);
   12006          51 :     PyUnicode_WRITE(okind, odata, osize-1, quote);
   12007             : 
   12008         391 :     for (i = 0, o = 1; i < isize; i++) {
   12009         340 :         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
   12010             : 
   12011             :         /* Escape quotes and backslashes */
   12012         340 :         if ((ch == quote) || (ch == '\\')) {
   12013           0 :             PyUnicode_WRITE(okind, odata, o++, '\\');
   12014           0 :             PyUnicode_WRITE(okind, odata, o++, ch);
   12015           0 :             continue;
   12016             :         }
   12017             : 
   12018             :         /* Map special whitespace to '\t', \n', '\r' */
   12019         340 :         if (ch == '\t') {
   12020           0 :             PyUnicode_WRITE(okind, odata, o++, '\\');
   12021           0 :             PyUnicode_WRITE(okind, odata, o++, 't');
   12022             :         }
   12023         340 :         else if (ch == '\n') {
   12024           0 :             PyUnicode_WRITE(okind, odata, o++, '\\');
   12025           0 :             PyUnicode_WRITE(okind, odata, o++, 'n');
   12026             :         }
   12027         340 :         else if (ch == '\r') {
   12028           0 :             PyUnicode_WRITE(okind, odata, o++, '\\');
   12029           0 :             PyUnicode_WRITE(okind, odata, o++, 'r');
   12030             :         }
   12031             : 
   12032             :         /* Map non-printable US ASCII to '\xhh' */
   12033         340 :         else if (ch < ' ' || ch == 0x7F) {
   12034           0 :             PyUnicode_WRITE(okind, odata, o++, '\\');
   12035           0 :             PyUnicode_WRITE(okind, odata, o++, 'x');
   12036           0 :             PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12037           0 :             PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12038             :         }
   12039             : 
   12040             :         /* Copy ASCII characters as-is */
   12041         340 :         else if (ch < 0x7F) {
   12042         340 :             PyUnicode_WRITE(okind, odata, o++, ch);
   12043             :         }
   12044             : 
   12045             :         /* Non-ASCII characters */
   12046             :         else {
   12047             :             /* Map Unicode whitespace and control characters
   12048             :                (categories Z* and C* except ASCII space)
   12049             :             */
   12050           0 :             if (!Py_UNICODE_ISPRINTABLE(ch)) {
   12051           0 :                 PyUnicode_WRITE(okind, odata, o++, '\\');
   12052             :                 /* Map 8-bit characters to '\xhh' */
   12053           0 :                 if (ch <= 0xff) {
   12054           0 :                     PyUnicode_WRITE(okind, odata, o++, 'x');
   12055           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
   12056           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
   12057             :                 }
   12058             :                 /* Map 16-bit characters to '\uxxxx' */
   12059           0 :                 else if (ch <= 0xffff) {
   12060           0 :                     PyUnicode_WRITE(okind, odata, o++, 'u');
   12061           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12062           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12063           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12064           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12065             :                 }
   12066             :                 /* Map 21-bit characters to '\U00xxxxxx' */
   12067             :                 else {
   12068           0 :                     PyUnicode_WRITE(okind, odata, o++, 'U');
   12069           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
   12070           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
   12071           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
   12072           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
   12073           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
   12074           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
   12075           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
   12076           0 :                     PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
   12077             :                 }
   12078             :             }
   12079             :             /* Copy characters as-is */
   12080             :             else {
   12081           0 :                 PyUnicode_WRITE(okind, odata, o++, ch);
   12082             :             }
   12083             :         }
   12084             :     }
   12085             :     /* Closing quote already added at the beginning */
   12086             :     assert(_PyUnicode_CheckConsistency(repr, 1));
   12087          51 :     return repr;
   12088             : }
   12089             : 
   12090             : PyDoc_STRVAR(rfind__doc__,
   12091             :              "S.rfind(sub[, start[, end]]) -> int\n\
   12092             : \n\
   12093             : Return the highest index in S where substring sub is found,\n\
   12094             : such that sub is contained within S[start:end].  Optional\n\
   12095             : arguments start and end are interpreted as in slice notation.\n\
   12096             : \n\
   12097             : Return -1 on failure.");
   12098             : 
   12099             : static PyObject *
   12100           5 : unicode_rfind(PyObject *self, PyObject *args)
   12101             : {
   12102             :     PyObject *substring;
   12103             :     Py_ssize_t start;
   12104             :     Py_ssize_t end;
   12105             :     Py_ssize_t result;
   12106             : 
   12107           5 :     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
   12108             :                                             &start, &end))
   12109           0 :         return NULL;
   12110             : 
   12111           5 :     if (PyUnicode_READY(self) == -1)
   12112           0 :         return NULL;
   12113           5 :     if (PyUnicode_READY(substring) == -1)
   12114           0 :         return NULL;
   12115             : 
   12116           5 :     result = any_find_slice(-1, self, substring, start, end);
   12117             : 
   12118           5 :     Py_DECREF(substring);
   12119             : 
   12120           5 :     if (result == -2)
   12121           0 :         return NULL;
   12122             : 
   12123           5 :     return PyLong_FromSsize_t(result);
   12124             : }
   12125             : 
   12126             : PyDoc_STRVAR(rindex__doc__,
   12127             :              "S.rindex(sub[, start[, end]]) -> int\n\
   12128             : \n\
   12129             : Like S.rfind() but raise ValueError when the substring is not found.");
   12130             : 
   12131             : static PyObject *
   12132           0 : unicode_rindex(PyObject *self, PyObject *args)
   12133             : {
   12134             :     PyObject *substring;
   12135             :     Py_ssize_t start;
   12136             :     Py_ssize_t end;
   12137             :     Py_ssize_t result;
   12138             : 
   12139           0 :     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
   12140             :                                             &start, &end))
   12141           0 :         return NULL;
   12142             : 
   12143           0 :     if (PyUnicode_READY(self) == -1)
   12144           0 :         return NULL;
   12145           0 :     if (PyUnicode_READY(substring) == -1)
   12146           0 :         return NULL;
   12147             : 
   12148           0 :     result = any_find_slice(-1, self, substring, start, end);
   12149             : 
   12150           0 :     Py_DECREF(substring);
   12151             : 
   12152           0 :     if (result == -2)
   12153           0 :         return NULL;
   12154             : 
   12155           0 :     if (result < 0) {
   12156           0 :         PyErr_SetString(PyExc_ValueError, "substring not found");
   12157           0 :         return NULL;
   12158             :     }
   12159             : 
   12160           0 :     return PyLong_FromSsize_t(result);
   12161             : }
   12162             : 
   12163             : PyDoc_STRVAR(rjust__doc__,
   12164             :              "S.rjust(width[, fillchar]) -> str\n\
   12165             : \n\
   12166             : Return S right-justified in a string of length width. Padding is\n\
   12167             : done using the specified fill character (default is a space).");
   12168             : 
   12169             : static PyObject *
   12170           0 : unicode_rjust(PyObject *self, PyObject *args)
   12171             : {
   12172             :     Py_ssize_t width;
   12173           0 :     Py_UCS4 fillchar = ' ';
   12174             : 
   12175           0 :     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
   12176           0 :         return NULL;
   12177             : 
   12178           0 :     if (PyUnicode_READY(self) == -1)
   12179           0 :         return NULL;
   12180             : 
   12181           0 :     if (PyUnicode_GET_LENGTH(self) >= width)
   12182           0 :         return unicode_result_unchanged(self);
   12183             : 
   12184           0 :     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
   12185             : }
   12186             : 
   12187             : PyObject *
   12188           0 : PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12189             : {
   12190             :     PyObject *result;
   12191             : 
   12192           0 :     s = PyUnicode_FromObject(s);
   12193           0 :     if (s == NULL)
   12194           0 :         return NULL;
   12195           0 :     if (sep != NULL) {
   12196           0 :         sep = PyUnicode_FromObject(sep);
   12197           0 :         if (sep == NULL) {
   12198           0 :             Py_DECREF(s);
   12199           0 :             return NULL;
   12200             :         }
   12201             :     }
   12202             : 
   12203           0 :     result = split(s, sep, maxsplit);
   12204             : 
   12205           0 :     Py_DECREF(s);
   12206           0 :     Py_XDECREF(sep);
   12207           0 :     return result;
   12208             : }
   12209             : 
   12210             : PyDoc_STRVAR(split__doc__,
   12211             :              "S.split(sep=None, maxsplit=-1) -> list of strings\n\
   12212             : \n\
   12213             : Return a list of the words in S, using sep as the\n\
   12214             : delimiter string.  If maxsplit is given, at most maxsplit\n\
   12215             : splits are done. If sep is not specified or is None, any\n\
   12216             : whitespace string is a separator and empty strings are\n\
   12217             : removed from the result.");
   12218             : 
   12219             : static PyObject*
   12220         109 : unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
   12221             : {
   12222             :     static char *kwlist[] = {"sep", "maxsplit", 0};
   12223         109 :     PyObject *substring = Py_None;
   12224         109 :     Py_ssize_t maxcount = -1;
   12225             : 
   12226         109 :     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
   12227             :                                      kwlist, &substring, &maxcount))
   12228           0 :         return NULL;
   12229             : 
   12230         109 :     if (substring == Py_None)
   12231           2 :         return split(self, NULL, maxcount);
   12232         107 :     else if (PyUnicode_Check(substring))
   12233         107 :         return split(self, substring, maxcount);
   12234             :     else
   12235           0 :         return PyUnicode_Split(self, substring, maxcount);
   12236             : }
   12237             : 
   12238             : PyObject *
   12239         276 : PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
   12240             : {
   12241             :     PyObject* str_obj;
   12242             :     PyObject* sep_obj;
   12243             :     PyObject* out;
   12244             :     int kind1, kind2, kind;
   12245         276 :     void *buf1 = NULL, *buf2 = NULL;
   12246             :     Py_ssize_t len1, len2;
   12247             : 
   12248         276 :     str_obj = PyUnicode_FromObject(str_in);
   12249         276 :     if (!str_obj)
   12250           0 :         return NULL;
   12251         276 :     sep_obj = PyUnicode_FromObject(sep_in);
   12252         276 :     if (!sep_obj) {
   12253           0 :         Py_DECREF(str_obj);
   12254           0 :         return NULL;
   12255             :     }
   12256         276 :     if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
   12257           0 :         Py_DECREF(sep_obj);
   12258           0 :         Py_DECREF(str_obj);
   12259           0 :         return NULL;
   12260             :     }
   12261             : 
   12262         276 :     kind1 = PyUnicode_KIND(str_obj);
   12263         276 :     kind2 = PyUnicode_KIND(sep_obj);
   12264         276 :     kind = Py_MAX(kind1, kind2);
   12265         276 :     buf1 = PyUnicode_DATA(str_obj);
   12266         276 :     if (kind1 != kind)
   12267           0 :         buf1 = _PyUnicode_AsKind(str_obj, kind);
   12268         276 :     if (!buf1)
   12269           0 :         goto onError;
   12270         276 :     buf2 = PyUnicode_DATA(sep_obj);
   12271         276 :     if (kind2 != kind)
   12272           0 :         buf2 = _PyUnicode_AsKind(sep_obj, kind);
   12273         276 :     if (!buf2)
   12274           0 :         goto onError;
   12275         276 :     len1 = PyUnicode_GET_LENGTH(str_obj);
   12276         276 :     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12277             : 
   12278         276 :     switch (PyUnicode_KIND(str_obj)) {
   12279             :     case PyUnicode_1BYTE_KIND:
   12280         276 :         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12281         276 :             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12282             :         else
   12283           0 :             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12284         276 :         break;
   12285             :     case PyUnicode_2BYTE_KIND:
   12286           0 :         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12287           0 :         break;
   12288             :     case PyUnicode_4BYTE_KIND:
   12289           0 :         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12290           0 :         break;
   12291             :     default:
   12292             :         assert(0);
   12293           0 :         out = 0;
   12294             :     }
   12295             : 
   12296         276 :     Py_DECREF(sep_obj);
   12297         276 :     Py_DECREF(str_obj);
   12298         276 :     if (kind1 != kind)
   12299           0 :         PyMem_Free(buf1);
   12300         276 :     if (kind2 != kind)
   12301           0 :         PyMem_Free(buf2);
   12302             : 
   12303         276 :     return out;
   12304             :   onError:
   12305           0 :     Py_DECREF(sep_obj);
   12306           0 :     Py_DECREF(str_obj);
   12307           0 :     if (kind1 != kind && buf1)
   12308           0 :         PyMem_Free(buf1);
   12309           0 :     if (kind2 != kind && buf2)
   12310           0 :         PyMem_Free(buf2);
   12311           0 :     return NULL;
   12312             : }
   12313             : 
   12314             : 
   12315             : PyObject *
   12316         719 : PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
   12317             : {
   12318             :     PyObject* str_obj;
   12319             :     PyObject* sep_obj;
   12320             :     PyObject* out;
   12321             :     int kind1, kind2, kind;
   12322         719 :     void *buf1 = NULL, *buf2 = NULL;
   12323             :     Py_ssize_t len1, len2;
   12324             : 
   12325         719 :     str_obj = PyUnicode_FromObject(str_in);
   12326         719 :     if (!str_obj)
   12327           0 :         return NULL;
   12328         719 :     sep_obj = PyUnicode_FromObject(sep_in);
   12329         719 :     if (!sep_obj) {
   12330           0 :         Py_DECREF(str_obj);
   12331           0 :         return NULL;
   12332             :     }
   12333             : 
   12334         719 :     kind1 = PyUnicode_KIND(str_in);
   12335         719 :     kind2 = PyUnicode_KIND(sep_obj);
   12336         719 :     kind = Py_MAX(kind1, kind2);
   12337         719 :     buf1 = PyUnicode_DATA(str_in);
   12338         719 :     if (kind1 != kind)
   12339           0 :         buf1 = _PyUnicode_AsKind(str_in, kind);
   12340         719 :     if (!buf1)
   12341           0 :         goto onError;
   12342         719 :     buf2 = PyUnicode_DATA(sep_obj);
   12343         719 :     if (kind2 != kind)
   12344           0 :         buf2 = _PyUnicode_AsKind(sep_obj, kind);
   12345         719 :     if (!buf2)
   12346           0 :         goto onError;
   12347         719 :     len1 = PyUnicode_GET_LENGTH(str_obj);
   12348         719 :     len2 = PyUnicode_GET_LENGTH(sep_obj);
   12349             : 
   12350         719 :     switch (PyUnicode_KIND(str_in)) {
   12351             :     case PyUnicode_1BYTE_KIND:
   12352         719 :         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
   12353         719 :             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12354             :         else
   12355           0 :             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12356         719 :         break;
   12357             :     case PyUnicode_2BYTE_KIND:
   12358           0 :         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12359           0 :         break;
   12360             :     case PyUnicode_4BYTE_KIND:
   12361           0 :         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
   12362           0 :         break;
   12363             :     default:
   12364             :         assert(0);
   12365           0 :         out = 0;
   12366             :     }
   12367             : 
   12368         719 :     Py_DECREF(sep_obj);
   12369         719 :     Py_DECREF(str_obj);
   12370         719 :     if (kind1 != kind)
   12371           0 :         PyMem_Free(buf1);
   12372         719 :     if (kind2 != kind)
   12373           0 :         PyMem_Free(buf2);
   12374             : 
   12375         719 :     return out;
   12376             :   onError:
   12377           0 :     Py_DECREF(sep_obj);
   12378           0 :     Py_DECREF(str_obj);
   12379           0 :     if (kind1 != kind && buf1)
   12380           0 :         PyMem_Free(buf1);
   12381           0 :     if (kind2 != kind && buf2)
   12382           0 :         PyMem_Free(buf2);
   12383           0 :     return NULL;
   12384             : }
   12385             : 
   12386             : PyDoc_STRVAR(partition__doc__,
   12387             :              "S.partition(sep) -> (head, sep, tail)\n\
   12388             : \n\
   12389             : Search for the separator sep in S, and return the part before it,\n\
   12390             : the separator itself, and the part after it.  If the separator is not\n\
   12391             : found, return S and two empty strings.");
   12392             : 
   12393             : static PyObject*
   12394          90 : unicode_partition(PyObject *self, PyObject *separator)
   12395             : {
   12396          90 :     return PyUnicode_Partition(self, separator);
   12397             : }
   12398             : 
   12399             : PyDoc_STRVAR(rpartition__doc__,
   12400             :              "S.rpartition(sep) -> (head, sep, tail)\n\
   12401             : \n\
   12402             : Search for the separator sep in S, starting at the end of S, and return\n\
   12403             : the part before it, the separator itself, and the part after it.  If the\n\
   12404             : separator is not found, return two empty strings and S.");
   12405             : 
   12406             : static PyObject*
   12407         719 : unicode_rpartition(PyObject *self, PyObject *separator)
   12408             : {
   12409         719 :     return PyUnicode_RPartition(self, separator);
   12410             : }
   12411             : 
   12412             : PyObject *
   12413           0 : PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
   12414             : {
   12415             :     PyObject *result;
   12416             : 
   12417           0 :     s = PyUnicode_FromObject(s);
   12418           0 :     if (s == NULL)
   12419           0 :         return NULL;
   12420           0 :     if (sep != NULL) {
   12421           0 :         sep = PyUnicode_FromObject(sep);
   12422           0 :         if (sep == NULL) {
   12423           0 :             Py_DECREF(s);
   12424           0 :             return NULL;
   12425             :         }
   12426             :     }
   12427             : 
   12428           0 :     result = rsplit(s, sep, maxsplit);
   12429             : 
   12430           0 :     Py_DECREF(s);
   12431           0 :     Py_XDECREF(sep);
   12432           0 :     return result;
   12433             : }
   12434             : 
   12435             : PyDoc_STRVAR(rsplit__doc__,
   12436             :              "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
   12437             : \n\
   12438             : Return a list of the words in S, using sep as the\n\
   12439             : delimiter string, starting at the end of the string and\n\
   12440             : working to the front.  If maxsplit is given, at most maxsplit\n\
   12441             : splits are done. If sep is not specified, any whitespace string\n\
   12442             : is a separator.");
   12443             : 
   12444             : static PyObject*
   12445          90 : unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
   12446             : {
   12447             :     static char *kwlist[] = {"sep", "maxsplit", 0};
   12448          90 :     PyObject *substring = Py_None;
   12449          90 :     Py_ssize_t maxcount = -1;
   12450             : 
   12451          90 :     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
   12452             :                                      kwlist, &substring, &maxcount))
   12453           0 :         return NULL;
   12454             : 
   12455          90 :     if (substring == Py_None)
   12456           0 :         return rsplit(self, NULL, maxcount);
   12457          90 :     else if (PyUnicode_Check(substring))
   12458          90 :         return rsplit(self, substring, maxcount);
   12459             :     else
   12460           0 :         return PyUnicode_RSplit(self, substring, maxcount);
   12461             : }
   12462             : 
   12463             : PyDoc_STRVAR(splitlines__doc__,
   12464             :              "S.splitlines([keepends]) -> list of strings\n\
   12465             : \n\
   12466             : Return a list of the lines in S, breaking at line boundaries.\n\
   12467             : Line breaks are not included in the resulting list unless keepends\n\
   12468             : is given and true.");
   12469             : 
   12470             : static PyObject*
   12471           0 : unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
   12472             : {
   12473             :     static char *kwlist[] = {"keepends", 0};
   12474           0 :     int keepends = 0;
   12475             : 
   12476           0 :     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
   12477             :                                      kwlist, &keepends))
   12478           0 :         return NULL;
   12479             : 
   12480           0 :     return PyUnicode_Splitlines(self, keepends);
   12481             : }
   12482             : 
   12483             : static
   12484           0 : PyObject *unicode_str(PyObject *self)
   12485             : {
   12486           0 :     return unicode_result_unchanged(self);
   12487             : }
   12488             : 
   12489             : PyDoc_STRVAR(swapcase__doc__,
   12490             :              "S.swapcase() -> str\n\
   12491             : \n\
   12492             : Return a copy of S with uppercase characters converted to lowercase\n\
   12493             : and vice versa.");
   12494             : 
   12495             : static PyObject*
   12496           0 : unicode_swapcase(PyObject *self)
   12497             : {
   12498           0 :     if (PyUnicode_READY(self) == -1)
   12499           0 :         return NULL;
   12500           0 :     return case_operation(self, do_swapcase);
   12501             : }
   12502             : 
   12503             : PyDoc_STRVAR(maketrans__doc__,
   12504             :              "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
   12505             : \n\
   12506             : Return a translation table usable for str.translate().\n\
   12507             : If there is only one argument, it must be a dictionary mapping Unicode\n\
   12508             : ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
   12509             : Character keys will be then converted to ordinals.\n\
   12510             : If there are two arguments, they must be strings of equal length, and\n\
   12511             : in the resulting dictionary, each character in x will be mapped to the\n\
   12512             : character at the same position in y. If there is a third argument, it\n\
   12513             : must be a string, whose characters will be mapped to None in the result.");
   12514             : 
   12515             : static PyObject*
   12516           0 : unicode_maketrans(PyObject *null, PyObject *args)
   12517             : {
   12518           0 :     PyObject *x, *y = NULL, *z = NULL;
   12519           0 :     PyObject *new = NULL, *key, *value;
   12520           0 :     Py_ssize_t i = 0;
   12521             :     int res;
   12522             : 
   12523           0 :     if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
   12524           0 :         return NULL;
   12525           0 :     new = PyDict_New();
   12526           0 :     if (!new)
   12527           0 :         return NULL;
   12528           0 :     if (y != NULL) {
   12529             :         int x_kind, y_kind, z_kind;
   12530             :         void *x_data, *y_data, *z_data;
   12531             : 
   12532             :         /* x must be a string too, of equal length */
   12533           0 :         if (!PyUnicode_Check(x)) {
   12534           0 :             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
   12535             :                             "be a string if there is a second argument");
   12536           0 :             goto err;
   12537             :         }
   12538           0 :         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
   12539           0 :             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
   12540             :                             "arguments must have equal length");
   12541           0 :             goto err;
   12542             :         }
   12543             :         /* create entries for translating chars in x to those in y */
   12544           0 :         x_kind = PyUnicode_KIND(x);
   12545           0 :         y_kind = PyUnicode_KIND(y);
   12546           0 :         x_data = PyUnicode_DATA(x);
   12547           0 :         y_data = PyUnicode_DATA(y);
   12548           0 :         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
   12549           0 :             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
   12550           0 :             if (!key)
   12551           0 :                 goto err;
   12552           0 :             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
   12553           0 :             if (!value) {
   12554           0 :                 Py_DECREF(key);
   12555           0 :                 goto err;
   12556             :             }
   12557           0 :             res = PyDict_SetItem(new, key, value);
   12558           0 :             Py_DECREF(key);
   12559           0 :             Py_DECREF(value);
   12560           0 :             if (res < 0)
   12561           0 :                 goto err;
   12562             :         }
   12563             :         /* create entries for deleting chars in z */
   12564           0 :         if (z != NULL) {
   12565           0 :             z_kind = PyUnicode_KIND(z);
   12566           0 :             z_data = PyUnicode_DATA(z);
   12567           0 :             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
   12568           0 :                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
   12569           0 :                 if (!key)
   12570           0 :                     goto err;
   12571           0 :                 res = PyDict_SetItem(new, key, Py_None);
   12572           0 :                 Py_DECREF(key);
   12573           0 :                 if (res < 0)
   12574           0 :                     goto err;
   12575             :             }
   12576             :         }
   12577             :     } else {
   12578             :         int kind;
   12579             :         void *data;
   12580             : 
   12581             :         /* x must be a dict */
   12582           0 :         if (!PyDict_CheckExact(x)) {
   12583           0 :             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
   12584             :                             "to maketrans it must be a dict");
   12585           0 :             goto err;
   12586             :         }
   12587             :         /* copy entries into the new dict, converting string keys to int keys */
   12588           0 :         while (PyDict_Next(x, &i, &key, &value)) {
   12589           0 :             if (PyUnicode_Check(key)) {
   12590             :                 /* convert string keys to integer keys */
   12591             :                 PyObject *newkey;
   12592           0 :                 if (PyUnicode_GET_LENGTH(key) != 1) {
   12593           0 :                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
   12594             :                                     "table must be of length 1");
   12595           0 :                     goto err;
   12596             :                 }
   12597           0 :                 kind = PyUnicode_KIND(key);
   12598           0 :                 data = PyUnicode_DATA(key);
   12599           0 :                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
   12600           0 :                 if (!newkey)
   12601           0 :                     goto err;
   12602           0 :                 res = PyDict_SetItem(new, newkey, value);
   12603           0 :                 Py_DECREF(newkey);
   12604           0 :                 if (res < 0)
   12605           0 :                     goto err;
   12606           0 :             } else if (PyLong_Check(key)) {
   12607             :                 /* just keep integer keys */
   12608           0 :                 if (PyDict_SetItem(new, key, value) < 0)
   12609           0 :                     goto err;
   12610             :             } else {
   12611           0 :                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
   12612             :                                 "be strings or integers");
   12613           0 :                 goto err;
   12614             :             }
   12615             :         }
   12616             :     }
   12617           0 :     return new;
   12618             :   err:
   12619           0 :     Py_DECREF(new);
   12620           0 :     return NULL;
   12621             : }
   12622             : 
   12623             : PyDoc_STRVAR(translate__doc__,
   12624             :              "S.translate(table) -> str\n\
   12625             : \n\
   12626             : Return a copy of the string S, where all characters have been mapped\n\
   12627             : through the given translation table, which must be a mapping of\n\
   12628             : Unicode ordinals to Unicode ordinals, strings, or None.\n\
   12629             : Unmapped characters are left untouched. Characters mapped to None\n\
   12630             : are deleted.");
   12631             : 
   12632             : static PyObject*
   12633           0 : unicode_translate(PyObject *self, PyObject *table)
   12634             : {
   12635           0 :     return _PyUnicode_TranslateCharmap(self, table, "ignore");
   12636             : }
   12637             : 
   12638             : PyDoc_STRVAR(upper__doc__,
   12639             :              "S.upper() -> str\n\
   12640             : \n\
   12641             : Return a copy of S converted to uppercase.");
   12642             : 
   12643             : static PyObject*
   12644           0 : unicode_upper(PyObject *self)
   12645             : {
   12646           0 :     if (PyUnicode_READY(self) == -1)
   12647           0 :         return NULL;
   12648           0 :     if (PyUnicode_IS_ASCII(self))
   12649           0 :         return ascii_upper_or_lower(self, 0);
   12650           0 :     return case_operation(self, do_upper);
   12651             : }
   12652             : 
   12653             : PyDoc_STRVAR(zfill__doc__,
   12654             :              "S.zfill(width) -> str\n\
   12655             : \n\
   12656             : Pad a numeric string S with zeros on the left, to fill a field\n\
   12657             : of the specified width. The string S is never truncated.");
   12658             : 
   12659             : static PyObject *
   12660           0 : unicode_zfill(PyObject *self, PyObject *args)
   12661             : {
   12662             :     Py_ssize_t fill;
   12663             :     PyObject *u;
   12664             :     Py_ssize_t width;
   12665             :     int kind;
   12666             :     void *data;
   12667             :     Py_UCS4 chr;
   12668             : 
   12669           0 :     if (!PyArg_ParseTuple(args, "n:zfill", &width))
   12670           0 :         return NULL;
   12671             : 
   12672           0 :     if (PyUnicode_READY(self) == -1)
   12673           0 :         return NULL;
   12674             : 
   12675           0 :     if (PyUnicode_GET_LENGTH(self) >= width)
   12676           0 :         return unicode_result_unchanged(self);
   12677             : 
   12678           0 :     fill = width - PyUnicode_GET_LENGTH(self);
   12679             : 
   12680           0 :     u = pad(self, fill, 0, '0');
   12681             : 
   12682           0 :     if (u == NULL)
   12683           0 :         return NULL;
   12684             : 
   12685           0 :     kind = PyUnicode_KIND(u);
   12686           0 :     data = PyUnicode_DATA(u);
   12687           0 :     chr = PyUnicode_READ(kind, data, fill);
   12688             : 
   12689           0 :     if (chr == '+' || chr == '-') {
   12690             :         /* move sign to beginning of string */
   12691           0 :         PyUnicode_WRITE(kind, data, 0, chr);
   12692           0 :         PyUnicode_WRITE(kind, data, fill, '0');
   12693             :     }
   12694             : 
   12695             :     assert(_PyUnicode_CheckConsistency(u, 1));
   12696           0 :     return u;
   12697             : }
   12698             : 
   #if 0
   /* Compiled out.  Thin wrapper that passes SELF to
      PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
   static PyObject *
   unicode__decimal2ascii(PyObject *self)
   {
       PyObject *ascii_form;

       ascii_form = PyUnicode_TransformDecimalAndSpaceToASCII(self);
       return ascii_form;
   }
   #endif
   12706             : 
   12707             : PyDoc_STRVAR(startswith__doc__,
   12708             :              "S.startswith(prefix[, start[, end]]) -> bool\n\
   12709             : \n\
   12710             : Return True if S starts with the specified prefix, False otherwise.\n\
   12711             : With optional start, test S beginning at that position.\n\
   12712             : With optional end, stop comparing S at that position.\n\
   12713             : prefix can also be a tuple of strings to try.");
   12714             : 
   12715             : static PyObject *
   12716         538 : unicode_startswith(PyObject *self,
   12717             :                    PyObject *args)
   12718             : {
   12719             :     PyObject *subobj;
   12720             :     PyObject *substring;
   12721         538 :     Py_ssize_t start = 0;
   12722         538 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   12723             :     int result;
   12724             : 
   12725         538 :     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
   12726           0 :         return NULL;
   12727         538 :     if (PyTuple_Check(subobj)) {
   12728             :         Py_ssize_t i;
   12729          48 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   12730          36 :             substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
   12731          36 :             if (substring == NULL)
   12732           0 :                 return NULL;
   12733          36 :             result = tailmatch(self, substring, start, end, -1);
   12734          36 :             Py_DECREF(substring);
   12735          36 :             if (result) {
   12736           0 :                 Py_RETURN_TRUE;
   12737             :             }
   12738             :         }
   12739             :         /* nothing matched */
   12740          12 :         Py_RETURN_FALSE;
   12741             :     }
   12742         526 :     substring = PyUnicode_FromObject(subobj);
   12743         526 :     if (substring == NULL) {
   12744           0 :         if (PyErr_ExceptionMatches(PyExc_TypeError))
   12745           0 :             PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
   12746           0 :                          "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
   12747           0 :         return NULL;
   12748             :     }
   12749         526 :     result = tailmatch(self, substring, start, end, -1);
   12750         526 :     Py_DECREF(substring);
   12751         526 :     return PyBool_FromLong(result);
   12752             : }
   12753             : 
   12754             : 
   12755             : PyDoc_STRVAR(endswith__doc__,
   12756             :              "S.endswith(suffix[, start[, end]]) -> bool\n\
   12757             : \n\
   12758             : Return True if S ends with the specified suffix, False otherwise.\n\
   12759             : With optional start, test S beginning at that position.\n\
   12760             : With optional end, stop comparing S at that position.\n\
   12761             : suffix can also be a tuple of strings to try.");
   12762             : 
   12763             : static PyObject *
   12764         151 : unicode_endswith(PyObject *self,
   12765             :                  PyObject *args)
   12766             : {
   12767             :     PyObject *subobj;
   12768             :     PyObject *substring;
   12769         151 :     Py_ssize_t start = 0;
   12770         151 :     Py_ssize_t end = PY_SSIZE_T_MAX;
   12771             :     int result;
   12772             : 
   12773         151 :     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
   12774           0 :         return NULL;
   12775         151 :     if (PyTuple_Check(subobj)) {
   12776             :         Py_ssize_t i;
   12777           0 :         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
   12778           0 :             substring = PyUnicode_FromObject(
   12779           0 :                 PyTuple_GET_ITEM(subobj, i));
   12780           0 :             if (substring == NULL)
   12781           0 :                 return NULL;
   12782           0 :             result = tailmatch(self, substring, start, end, +1);
   12783           0 :             Py_DECREF(substring);
   12784           0 :             if (result) {
   12785           0 :                 Py_RETURN_TRUE;
   12786             :             }
   12787             :         }
   12788           0 :         Py_RETURN_FALSE;
   12789             :     }
   12790         151 :     substring = PyUnicode_FromObject(subobj);
   12791         151 :     if (substring == NULL) {
   12792           0 :         if (PyErr_ExceptionMatches(PyExc_TypeError))
   12793           0 :             PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
   12794           0 :                          "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
   12795           0 :         return NULL;
   12796             :     }
   12797         151 :     result = tailmatch(self, substring, start, end, +1);
   12798         151 :     Py_DECREF(substring);
   12799         151 :     return PyBool_FromLong(result);
   12800             : }
   12801             : 
   12802             : Py_LOCAL_INLINE(void)
   12803          68 : _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
   12804             : {
   12805          68 :     writer->size = PyUnicode_GET_LENGTH(writer->buffer);
   12806          68 :     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
   12807          68 :     writer->data = PyUnicode_DATA(writer->buffer);
   12808          68 :     writer->kind = PyUnicode_KIND(writer->buffer);
   12809          68 : }
   12810             : 
   12811             : void
   12812          66 : _PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
   12813             : {
   12814          66 :     memset(writer, 0, sizeof(*writer));
   12815             : #ifdef Py_DEBUG
   12816             :     writer->kind = 5;    /* invalid kind */
   12817             : #endif
   12818          66 :     writer->min_length = Py_MAX(min_length, 100);
   12819          66 :     writer->overallocate = (min_length > 0);
   12820          66 : }
   12821             : 
   12822             : int
   12823          66 : _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
   12824             :                                  Py_ssize_t length, Py_UCS4 maxchar)
   12825             : {
   12826             :     Py_ssize_t newlen;
   12827             :     PyObject *newbuffer;
   12828             : 
   12829             :     assert(length > 0);
   12830             : 
   12831          66 :     if (length > PY_SSIZE_T_MAX - writer->pos) {
   12832           0 :         PyErr_NoMemory();
   12833           0 :         return -1;
   12834             :     }
   12835          66 :     newlen = writer->pos + length;
   12836             : 
   12837          66 :     if (writer->buffer == NULL) {
   12838          64 :         if (writer->overallocate) {
   12839             :             /* overallocate 25% to limit the number of resize */
   12840          64 :             if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
   12841          64 :                 newlen += newlen / 4;
   12842          64 :             if (newlen < writer->min_length)
   12843          63 :                 newlen = writer->min_length;
   12844             :         }
   12845          64 :         writer->buffer = PyUnicode_New(newlen, maxchar);
   12846          64 :         if (writer->buffer == NULL)
   12847           0 :             return -1;
   12848          64 :         _PyUnicodeWriter_Update(writer);
   12849          64 :         return 0;
   12850             :     }
   12851             : 
   12852           2 :     if (newlen > writer->size) {
   12853           2 :         if (writer->overallocate) {
   12854             :             /* overallocate 25% to limit the number of resize */
   12855           2 :             if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
   12856           2 :                 newlen += newlen / 4;
   12857           2 :             if (newlen < writer->min_length)
   12858           0 :                 newlen = writer->min_length;
   12859             :         }
   12860             : 
   12861           2 :         if (maxchar > writer->maxchar || writer->readonly) {
   12862             :             /* resize + widen */
   12863           0 :             newbuffer = PyUnicode_New(newlen, maxchar);
   12864           0 :             if (newbuffer == NULL)
   12865           0 :                 return -1;
   12866           0 :             _PyUnicode_FastCopyCharacters(newbuffer, 0,
   12867             :                                           writer->buffer, 0, writer->pos);
   12868           0 :             Py_DECREF(writer->buffer);
   12869           0 :             writer->readonly = 0;
   12870             :         }
   12871             :         else {
   12872           2 :             newbuffer = resize_compact(writer->buffer, newlen);
   12873           2 :             if (newbuffer == NULL)
   12874           0 :                 return -1;
   12875             :         }
   12876           2 :         writer->buffer = newbuffer;
   12877           2 :         _PyUnicodeWriter_Update(writer);
   12878             :     }
   12879           0 :     else if (maxchar > writer->maxchar) {
   12880             :         assert(!writer->readonly);
   12881           0 :         newbuffer = PyUnicode_New(writer->size, maxchar);
   12882           0 :         if (newbuffer == NULL)
   12883           0 :             return -1;
   12884           0 :         _PyUnicode_FastCopyCharacters(newbuffer, 0,
   12885             :                                       writer->buffer, 0, writer->pos);
   12886           0 :         Py_DECREF(writer->buffer);
   12887           0 :         writer->buffer = newbuffer;
   12888           0 :         _PyUnicodeWriter_Update(writer);
   12889             :     }
   12890           2 :     return 0;
   12891             : }
   12892             : 
   12893             : int
   12894         105 : _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
   12895             : {
   12896             :     Py_UCS4 maxchar;
   12897             :     Py_ssize_t len;
   12898             : 
   12899         105 :     if (PyUnicode_READY(str) == -1)
   12900           0 :         return -1;
   12901         105 :     len = PyUnicode_GET_LENGTH(str);
   12902         105 :     if (len == 0)
   12903           0 :         return 0;
   12904         105 :     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
   12905         105 :     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
   12906          27 :         if (writer->buffer == NULL && !writer->overallocate) {
   12907           2 :             Py_INCREF(str);
   12908           2 :             writer->buffer = str;
   12909           2 :             _PyUnicodeWriter_Update(writer);
   12910           2 :             writer->readonly = 1;
   12911           2 :             writer->size = 0;
   12912           2 :             writer->pos += len;
   12913           2 :             return 0;
   12914             :         }
   12915          25 :         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
   12916           0 :             return -1;
   12917             :     }
   12918         103 :     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
   12919             :                                   str, 0, len);
   12920         103 :     writer->pos += len;
   12921         103 :     return 0;
   12922             : }
   12923             : 
   12924             : PyObject *
   12925          66 : _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
   12926             : {
   12927          66 :     if (writer->pos == 0) {
   12928           0 :         Py_XDECREF(writer->buffer);
   12929           0 :         Py_INCREF(unicode_empty);
   12930           0 :         return unicode_empty;
   12931             :     }
   12932          66 :     if (writer->readonly) {
   12933             :         assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
   12934           2 :         return writer->buffer;
   12935             :     }
   12936          64 :     if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
   12937             :         PyObject *newbuffer;
   12938          64 :         newbuffer = resize_compact(writer->buffer, writer->pos);
   12939          64 :         if (newbuffer == NULL) {
   12940           0 :             Py_DECREF(writer->buffer);
   12941           0 :             return NULL;
   12942             :         }
   12943          64 :         writer->buffer = newbuffer;
   12944             :     }
   12945             :     assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
   12946          64 :     return writer->buffer;
   12947             : }
   12948             : 
   12949             : void
   12950           0 : _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
   12951             : {
   12952           0 :     Py_CLEAR(writer->buffer);
   12953           0 : }
   12954             : 
   12955             : #include "stringlib/unicode_format.h"
   12956             : 
   12957             : PyDoc_STRVAR(format__doc__,
   12958             :              "S.format(*args, **kwargs) -> str\n\
   12959             : \n\
   12960             : Return a formatted version of S, using substitutions from args and kwargs.\n\
   12961             : The substitutions are identified by braces ('{' and '}').");
   12962             : 
   12963             : PyDoc_STRVAR(format_map__doc__,
   12964             :              "S.format_map(mapping) -> str\n\
   12965             : \n\
   12966             : Return a formatted version of S, using substitutions from mapping.\n\
   12967             : The substitutions are identified by braces ('{' and '}').");
   12968             : 
   12969             : static PyObject *
   12970           0 : unicode__format__(PyObject* self, PyObject* args)
   12971             : {
   12972             :     PyObject *format_spec;
   12973             :     _PyUnicodeWriter writer;
   12974             :     int ret;
   12975             : 
   12976           0 :     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
   12977           0 :         return NULL;
   12978             : 
   12979           0 :     if (PyUnicode_READY(self) == -1)
   12980           0 :         return NULL;
   12981           0 :     _PyUnicodeWriter_Init(&writer, 0);
   12982           0 :     ret = _PyUnicode_FormatAdvancedWriter(&writer,
   12983             :                                           self, format_spec, 0,
   12984           0 :                                           PyUnicode_GET_LENGTH(format_spec));
   12985           0 :     if (ret == -1) {
   12986           0 :         _PyUnicodeWriter_Dealloc(&writer);
   12987           0 :         return NULL;
   12988             :     }
   12989           0 :     return _PyUnicodeWriter_Finish(&writer);
   12990             : }
   12991             : 
   12992             : PyDoc_STRVAR(p_format__doc__,
   12993             :              "S.__format__(format_spec) -> str\n\
   12994             : \n\
   12995             : Return a formatted version of S as described by format_spec.");
   12996             : 
   12997             : static PyObject *
   12998           0 : unicode__sizeof__(PyObject *v)
   12999             : {
   13000             :     Py_ssize_t size;
   13001             : 
   13002             :     /* If it's a compact object, account for base structure +
   13003             :        character data. */
   13004           0 :     if (PyUnicode_IS_COMPACT_ASCII(v))
   13005           0 :         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
   13006           0 :     else if (PyUnicode_IS_COMPACT(v))
   13007           0 :         size = sizeof(PyCompactUnicodeObject) +
   13008           0 :             (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
   13009             :     else {
   13010             :         /* If it is a two-block object, account for base object, and
   13011             :            for character block if present. */
   13012           0 :         size = sizeof(PyUnicodeObject);
   13013           0 :         if (_PyUnicode_DATA_ANY(v))
   13014           0 :             size += (PyUnicode_GET_LENGTH(v) + 1) *
   13015           0 :                 PyUnicode_KIND(v);
   13016             :     }
   13017             :     /* If the wstr pointer is present, account for it unless it is shared
   13018             :        with the data pointer. Check if the data is not shared. */
   13019           0 :     if (_PyUnicode_HAS_WSTR_MEMORY(v))
   13020           0 :         size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
   13021           0 :     if (_PyUnicode_HAS_UTF8_MEMORY(v))
   13022           0 :         size += PyUnicode_UTF8_LENGTH(v) + 1;
   13023             : 
   13024           0 :     return PyLong_FromSsize_t(size);
   13025             : }
   13026             : 
   13027             : PyDoc_STRVAR(sizeof__doc__,
   13028             :              "S.__sizeof__() -> size of S in memory, in bytes");
   13029             : 
   13030             : static PyObject *
   13031           0 : unicode_getnewargs(PyObject *v)
   13032             : {
   13033           0 :     PyObject *copy = _PyUnicode_Copy(v);
   13034           0 :     if (!copy)
   13035           0 :         return NULL;
   13036           0 :     return Py_BuildValue("(N)", copy);
   13037             : }
   13038             : 
   13039             : static PyMethodDef unicode_methods[] = {
   13040             :     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
   13041             :     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
   13042             :     {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
   13043             :     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
   13044             :     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
   13045             :     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
   13046             :     {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
   13047             :     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
   13048             :     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
   13049             :     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
   13050             :     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
   13051             :     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
   13052             :     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
   13053             :     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
   13054             :     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
   13055             :     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
   13056             :     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
   13057             :     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
   13058             :     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
   13059             :     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
   13060             :     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
   13061             :     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
   13062             :     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
   13063             :     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
   13064             :     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
   13065             :     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
   13066             :     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
   13067             :     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
   13068             :     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
   13069             :     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
   13070             :     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
   13071             :     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
   13072             :     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
   13073             :     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
   13074             :     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
   13075             :     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
   13076             :     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
   13077             :     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
   13078             :     {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
   13079             :     {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
   13080             :     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
   13081             :     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
   13082             :     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
   13083             :     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
   13084             :     {"maketrans", (PyCFunction) unicode_maketrans,
   13085             :      METH_VARARGS | METH_STATIC, maketrans__doc__},
   13086             :     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
   13087             : #if 0
   13088             :     /* These methods are just used for debugging the implementation. */
   13089             :     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
   13090             : #endif
   13091             : 
   13092             :     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
   13093             :     {NULL, NULL}
   13094             : };
   13095             : 
   13096             : static PyObject *
   13097           0 : unicode_mod(PyObject *v, PyObject *w)
   13098             : {
   13099           0 :     if (!PyUnicode_Check(v))
   13100           0 :         Py_RETURN_NOTIMPLEMENTED;
   13101           0 :     return PyUnicode_Format(v, w);
   13102             : }
   13103             : 
   13104             : static PyNumberMethods unicode_as_number = {
   13105             :     0,              /*nb_add*/
   13106             :     0,              /*nb_subtract*/
   13107             :     0,              /*nb_multiply*/
   13108             :     unicode_mod,            /*nb_remainder*/
   13109             : };
   13110             : 
   13111             : static PySequenceMethods unicode_as_sequence = {
   13112             :     (lenfunc) unicode_length,       /* sq_length */
   13113             :     PyUnicode_Concat,           /* sq_concat */
   13114             :     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
   13115             :     (ssizeargfunc) unicode_getitem,     /* sq_item */
   13116             :     0,                  /* sq_slice */
   13117             :     0,                  /* sq_ass_item */
   13118             :     0,                  /* sq_ass_slice */
   13119             :     PyUnicode_Contains,         /* sq_contains */
   13120             : };
   13121             : 
   13122             : static PyObject*
   13123        8285 : unicode_subscript(PyObject* self, PyObject* item)
   13124             : {
   13125        8285 :     if (PyUnicode_READY(self) == -1)
   13126           0 :         return NULL;
   13127             : 
   13128        8285 :     if (PyIndex_Check(item)) {
   13129        4058 :         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
   13130        4058 :         if (i == -1 && PyErr_Occurred())
   13131           0 :             return NULL;
   13132        4058 :         if (i < 0)
   13133         392 :             i += PyUnicode_GET_LENGTH(self);
   13134        4058 :         return unicode_getitem(self, i);
   13135        4227 :     } else if (PySlice_Check(item)) {
   13136             :         Py_ssize_t start, stop, step, slicelength, cur, i;
   13137             :         PyObject *result;
   13138             :         void *src_data, *dest_data;
   13139             :         int src_kind, dest_kind;
   13140             :         Py_UCS4 ch, max_char, kind_limit;
   13141             : 
   13142        4227 :         if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
   13143             :                                  &start, &stop, &step, &slicelength) < 0) {
   13144           0 :             return NULL;
   13145             :         }
   13146             : 
   13147        4227 :         if (slicelength <= 0) {
   13148          49 :             Py_INCREF(unicode_empty);
   13149          49 :             return unicode_empty;
   13150        4285 :         } else if (start == 0 && step == 1 &&
   13151         107 :                    slicelength == PyUnicode_GET_LENGTH(self)) {
   13152          26 :             return unicode_result_unchanged(self);
   13153        4152 :         } else if (step == 1) {
   13154        4152 :             return PyUnicode_Substring(self,
   13155             :                                        start, start + slicelength);
   13156             :         }
   13157             :         /* General case */
   13158           0 :         src_kind = PyUnicode_KIND(self);
   13159           0 :         src_data = PyUnicode_DATA(self);
   13160           0 :         if (!PyUnicode_IS_ASCII(self)) {
   13161           0 :             kind_limit = kind_maxchar_limit(src_kind);
   13162           0 :             max_char = 0;
   13163           0 :             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13164           0 :                 ch = PyUnicode_READ(src_kind, src_data, cur);
   13165           0 :                 if (ch > max_char) {
   13166           0 :                     max_char = ch;
   13167           0 :                     if (max_char >= kind_limit)
   13168           0 :                         break;
   13169             :                 }
   13170             :             }
   13171             :         }
   13172             :         else
   13173           0 :             max_char = 127;
   13174           0 :         result = PyUnicode_New(slicelength, max_char);
   13175           0 :         if (result == NULL)
   13176           0 :             return NULL;
   13177           0 :         dest_kind = PyUnicode_KIND(result);
   13178           0 :         dest_data = PyUnicode_DATA(result);
   13179             : 
   13180           0 :         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
   13181           0 :             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
   13182           0 :             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
   13183             :         }
   13184             :         assert(_PyUnicode_CheckConsistency(result, 1));
   13185           0 :         return result;
   13186             :     } else {
   13187           0 :         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
   13188           0 :         return NULL;
   13189             :     }
   13190             : }
   13191             : 
   13192             : static PyMappingMethods unicode_as_mapping = {
   13193             :     (lenfunc)unicode_length,        /* mp_length */
   13194             :     (binaryfunc)unicode_subscript,  /* mp_subscript */
   13195             :     (objobjargproc)0,           /* mp_ass_subscript */
   13196             : };
   13197             : 
   13198             : 
   13199             : /* Helpers for PyUnicode_Format() */
   13200             : 
   13201             : static PyObject *
   13202           9 : getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
   13203             : {
   13204           9 :     Py_ssize_t argidx = *p_argidx;
   13205           9 :     if (argidx < arglen) {
   13206           9 :         (*p_argidx)++;
   13207           9 :         if (arglen < 0)
   13208           7 :             return args;
   13209             :         else
   13210           2 :             return PyTuple_GetItem(args, argidx);
   13211             :     }
   13212           0 :     PyErr_SetString(PyExc_TypeError,
   13213             :                     "not enough arguments for format string");
   13214           0 :     return NULL;
   13215             : }
   13216             : 
   13217             : /* Returns a new reference to a PyUnicode object, or NULL on failure. */
   13218             : 
   13219             : static int
   13220           0 : formatfloat(PyObject *v, int flags, int prec, int type,
   13221             :             PyObject **p_output, _PyUnicodeWriter *writer)
   13222             : {
   13223             :     char *p;
   13224             :     double x;
   13225             :     Py_ssize_t len;
   13226             : 
   13227           0 :     x = PyFloat_AsDouble(v);
   13228           0 :     if (x == -1.0 && PyErr_Occurred())
   13229           0 :         return -1;
   13230             : 
   13231           0 :     if (prec < 0)
   13232           0 :         prec = 6;
   13233             : 
   13234           0 :     p = PyOS_double_to_string(x, type, prec,
   13235           0 :                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
   13236           0 :     if (p == NULL)
   13237           0 :         return -1;
   13238           0 :     len = strlen(p);
   13239           0 :     if (writer) {
   13240           0 :         if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
   13241           0 :             PyMem_Free(p);
   13242           0 :             return -1;
   13243             :         }
   13244           0 :         unicode_write_cstr(writer->buffer, writer->pos, p, len);
   13245           0 :         writer->pos += len;
   13246             :     }
   13247             :     else
   13248           0 :         *p_output = _PyUnicode_FromASCII(p, len);
   13249           0 :     PyMem_Free(p);
   13250           0 :     return 0;
   13251             : }
   13252             : 
   13253             : /* formatlong() emulates the format codes d, u, o, x and X, and
   13254             :  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
   13255             :  * Python's regular ints.
   13256             :  * Return value:  a new PyUnicodeObject*, or NULL if error.
   13257             :  *     The output string is of the form
   13258             :  *         "-"? ("0x" | "0X")? digit+
   13259             :  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
   13260             :  *         set in flags.  The case of hex digits will be correct,
   13261             :  *     There will be at least prec digits, zero-filled on the left if
   13262             :  *         necessary to get that many.
   13263             :  * val          object to be converted
   13264             :  * flags        bitmask of format flags; only F_ALT is looked at
   13265             :  * prec         minimum number of digits; 0-fill on left if needed
   13266             :  * type         a character in [duoxX]; u acts the same as d
   13267             :  *
   13268             :  * CAUTION:  o, x and X conversions on regular ints can never
   13269             :  * produce a '-' sign, but can for Python's unbounded ints.
   13270             :  */
   13271             : static PyObject*
   13272           0 : formatlong(PyObject *val, int flags, int prec, int type)
   13273             : {
   13274           0 :     PyObject *result = NULL;
   13275             :     char *buf;
   13276             :     Py_ssize_t i;
   13277             :     int sign;           /* 1 if '-', else 0 */
   13278             :     int len;            /* number of characters */
   13279             :     Py_ssize_t llen;
   13280             :     int numdigits;      /* len == numnondigits + numdigits */
   13281           0 :     int numnondigits = 0;
   13282             : 
   13283             :     /* Avoid exceeding SSIZE_T_MAX */
   13284           0 :     if (prec > INT_MAX-3) {
   13285           0 :         PyErr_SetString(PyExc_OverflowError,
   13286             :                         "precision too large");
   13287           0 :         return NULL;
   13288             :     }
   13289             : 
   13290             :     assert(PyLong_Check(val));
   13291             : 
   13292           0 :     switch (type) {
   13293             :     case 'd':
   13294             :     case 'u':
   13295             :         /* Special-case boolean: we want 0/1 */
   13296           0 :         if (PyBool_Check(val))
   13297           0 :             result = PyNumber_ToBase(val, 10);
   13298             :         else
   13299           0 :             result = Py_TYPE(val)->tp_str(val);
   13300           0 :         break;
   13301             :     case 'o':
   13302           0 :         numnondigits = 2;
   13303           0 :         result = PyNumber_ToBase(val, 8);
   13304           0 :         break;
   13305             :     case 'x':
   13306             :     case 'X':
   13307           0 :         numnondigits = 2;
   13308           0 :         result = PyNumber_ToBase(val, 16);
   13309           0 :         break;
   13310             :     default:
   13311             :         assert(!"'type' not in [duoxX]");
   13312             :     }
   13313           0 :     if (!result)
   13314           0 :         return NULL;
   13315             : 
   13316             :     assert(unicode_modifiable(result));
   13317             :     assert(PyUnicode_IS_READY(result));
   13318             :     assert(PyUnicode_IS_ASCII(result));
   13319             : 
   13320             :     /* To modify the string in-place, there can only be one reference. */
   13321           0 :     if (Py_REFCNT(result) != 1) {
   13322           0 :         PyErr_BadInternalCall();
   13323           0 :         return NULL;
   13324             :     }
   13325           0 :     buf = PyUnicode_DATA(result);
   13326           0 :     llen = PyUnicode_GET_LENGTH(result);
   13327             :     if (llen > INT_MAX) {
   13328             :         PyErr_SetString(PyExc_ValueError,
   13329             :                         "string too large in _PyBytes_FormatLong");
   13330             :         return NULL;
   13331             :     }
   13332           0 :     len = (int)llen;
   13333           0 :     sign = buf[0] == '-';
   13334           0 :     numnondigits += sign;
   13335           0 :     numdigits = len - numnondigits;
   13336             :     assert(numdigits > 0);
   13337             : 
   13338             :     /* Get rid of base marker unless F_ALT */
   13339           0 :     if (((flags & F_ALT) == 0 &&
   13340           0 :         (type == 'o' || type == 'x' || type == 'X'))) {
   13341             :         assert(buf[sign] == '0');
   13342             :         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
   13343             :                buf[sign+1] == 'o');
   13344           0 :         numnondigits -= 2;
   13345           0 :         buf += 2;
   13346           0 :         len -= 2;
   13347           0 :         if (sign)
   13348           0 :             buf[0] = '-';
   13349             :         assert(len == numnondigits + numdigits);
   13350             :         assert(numdigits > 0);
   13351             :     }
   13352             : 
   13353             :     /* Fill with leading zeroes to meet minimum width. */
   13354           0 :     if (prec > numdigits) {
   13355           0 :         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
   13356             :                                 numnondigits + prec);
   13357             :         char *b1;
   13358           0 :         if (!r1) {
   13359           0 :             Py_DECREF(result);
   13360           0 :             return NULL;
   13361             :         }
   13362           0 :         b1 = PyBytes_AS_STRING(r1);
   13363           0 :         for (i = 0; i < numnondigits; ++i)
   13364           0 :             *b1++ = *buf++;
   13365           0 :         for (i = 0; i < prec - numdigits; i++)
   13366           0 :             *b1++ = '0';
   13367           0 :         for (i = 0; i < numdigits; i++)
   13368           0 :             *b1++ = *buf++;
   13369           0 :         *b1 = '\0';
   13370           0 :         Py_DECREF(result);
   13371           0 :         result = r1;
   13372           0 :         buf = PyBytes_AS_STRING(result);
   13373           0 :         len = numnondigits + prec;
   13374             :     }
   13375             : 
   13376             :     /* Fix up case for hex conversions. */
   13377           0 :     if (type == 'X') {
   13378             :         /* Need to convert all lower case letters to upper case.
   13379             :            and need to convert 0x to 0X (and -0x to -0X). */
   13380           0 :         for (i = 0; i < len; i++)
   13381           0 :             if (buf[i] >= 'a' && buf[i] <= 'x')
   13382           0 :                 buf[i] -= 'a'-'A';
   13383             :     }
   13384           0 :     if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
   13385             :         PyObject *unicode;
   13386           0 :         unicode = _PyUnicode_FromASCII(buf, len);
   13387           0 :         Py_DECREF(result);
   13388           0 :         result = unicode;
   13389             :     }
   13390           0 :     return result;
   13391             : }
   13392             : 
   13393             : static Py_UCS4
   13394           0 : formatchar(PyObject *v)
   13395             : {
   13396             :     /* presume that the buffer is at least 3 characters long */
   13397           0 :     if (PyUnicode_Check(v)) {
   13398           0 :         if (PyUnicode_GET_LENGTH(v) == 1) {
   13399           0 :             return PyUnicode_READ_CHAR(v, 0);
   13400             :         }
   13401           0 :         goto onError;
   13402             :     }
   13403             :     else {
   13404             :         /* Integer input truncated to a character */
   13405             :         long x;
   13406           0 :         x = PyLong_AsLong(v);
   13407           0 :         if (x == -1 && PyErr_Occurred())
   13408           0 :             goto onError;
   13409             : 
   13410           0 :         if (x < 0 || x > MAX_UNICODE) {
   13411           0 :             PyErr_SetString(PyExc_OverflowError,
   13412             :                             "%c arg not in range(0x110000)");
   13413           0 :             return (Py_UCS4) -1;
   13414             :         }
   13415             : 
   13416           0 :         return (Py_UCS4) x;
   13417             :     }
   13418             : 
   13419             :   onError:
   13420           0 :     PyErr_SetString(PyExc_TypeError,
   13421             :                     "%c requires int or char");
   13422           0 :     return (Py_UCS4) -1;
   13423             : }
   13424             : 
   13425             : PyObject *
   13426           8 : PyUnicode_Format(PyObject *format, PyObject *args)
   13427             : {
   13428             :     Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
   13429           8 :     int args_owned = 0;
   13430           8 :     PyObject *dict = NULL;
   13431           8 :     PyObject *temp = NULL;
   13432           8 :     PyObject *second = NULL;
   13433             :     PyObject *uformat;
   13434             :     void *fmt;
   13435             :     enum PyUnicode_Kind kind, fmtkind;
   13436             :     _PyUnicodeWriter writer;
   13437             :     Py_ssize_t sublen;
   13438             :     Py_UCS4 maxchar;
   13439             : 
   13440           8 :     if (format == NULL || args == NULL) {
   13441           0 :         PyErr_BadInternalCall();
   13442           0 :         return NULL;
   13443             :     }
   13444           8 :     uformat = PyUnicode_FromObject(format);
   13445           8 :     if (uformat == NULL)
   13446           0 :         return NULL;
   13447           8 :     if (PyUnicode_READY(uformat) == -1)
   13448           0 :         Py_DECREF(uformat);
   13449             : 
   13450           8 :     fmt = PyUnicode_DATA(uformat);
   13451           8 :     fmtkind = PyUnicode_KIND(uformat);
   13452           8 :     fmtcnt = PyUnicode_GET_LENGTH(uformat);
   13453           8 :     fmtpos = 0;
   13454             : 
   13455           8 :     _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
   13456             : 
   13457           8 :     if (PyTuple_Check(args)) {
   13458           1 :         arglen = PyTuple_Size(args);
   13459           1 :         argidx = 0;
   13460             :     }
   13461             :     else {
   13462           7 :         arglen = -1;
   13463           7 :         argidx = -2;
   13464             :     }
   13465          15 :     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
   13466           7 :         !PyUnicode_Check(args))
   13467           0 :         dict = args;
   13468             : 
   13469          37 :     while (--fmtcnt >= 0) {
   13470          21 :         if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
   13471             :             Py_ssize_t nonfmtpos;
   13472          12 :             nonfmtpos = fmtpos++;
   13473         367 :             while (fmtcnt >= 0 &&
   13474         175 :                    PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
   13475         168 :                 fmtpos++;
   13476         168 :                 fmtcnt--;
   13477             :             }
   13478          12 :             if (fmtcnt < 0)
   13479           5 :                 fmtpos--;
   13480          12 :             sublen = fmtpos - nonfmtpos;
   13481          12 :             maxchar = _PyUnicode_FindMaxChar(uformat,
   13482             :                                              nonfmtpos, nonfmtpos + sublen);
   13483          12 :             if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
   13484           0 :                 goto onError;
   13485             : 
   13486          12 :             _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
   13487             :                                           uformat, nonfmtpos, sublen);
   13488          12 :             writer.pos += sublen;
   13489             :         }
   13490             :         else {
   13491             :             /* Got a format specifier */
   13492           9 :             int flags = 0;
   13493           9 :             Py_ssize_t width = -1;
   13494           9 :             int prec = -1;
   13495           9 :             Py_UCS4 c = '\0';
   13496             :             Py_UCS4 fill;
   13497             :             int sign;
   13498             :             Py_UCS4 signchar;
   13499             :             int isnumok;
   13500           9 :             PyObject *v = NULL;
   13501           9 :             void *pbuf = NULL;
   13502             :             Py_ssize_t pindex, len;
   13503             :             Py_UCS4 bufmaxchar;
   13504             :             Py_ssize_t buflen;
   13505             : 
   13506           9 :             fmtpos++;
   13507           9 :             c = PyUnicode_READ(fmtkind, fmt, fmtpos);
   13508           9 :             if (c == '(') {
   13509             :                 Py_ssize_t keystart;
   13510             :                 Py_ssize_t keylen;
   13511             :                 PyObject *key;
   13512           0 :                 int pcount = 1;
   13513             : 
   13514           0 :                 if (dict == NULL) {
   13515           0 :                     PyErr_SetString(PyExc_TypeError,
   13516             :                                     "format requires a mapping");
   13517           0 :                     goto onError;
   13518             :                 }
   13519           0 :                 ++fmtpos;
   13520           0 :                 --fmtcnt;
   13521           0 :                 keystart = fmtpos;
   13522             :                 /* Skip over balanced parentheses */
   13523           0 :                 while (pcount > 0 && --fmtcnt >= 0) {
   13524           0 :                     c = PyUnicode_READ(fmtkind, fmt, fmtpos);
   13525           0 :                     if (c == ')')
   13526           0 :                         --pcount;
   13527           0 :                     else if (c == '(')
   13528           0 :                         ++pcount;
   13529           0 :                     fmtpos++;
   13530             :                 }
   13531           0 :                 keylen = fmtpos - keystart - 1;
   13532           0 :                 if (fmtcnt < 0 || pcount > 0) {
   13533           0 :                     PyErr_SetString(PyExc_ValueError,
   13534             :                                     "incomplete format key");
   13535           0 :                     goto onError;
   13536             :                 }
   13537           0 :                 key = PyUnicode_Substring(uformat,
   13538             :                                           keystart, keystart + keylen);
   13539           0 :                 if (key == NULL)
   13540           0 :                     goto onError;
   13541           0 :                 if (args_owned) {
   13542           0 :                     Py_DECREF(args);
   13543           0 :                     args_owned = 0;
   13544             :                 }
   13545           0 :                 args = PyObject_GetItem(dict, key);
   13546           0 :                 Py_DECREF(key);
   13547           0 :                 if (args == NULL) {
   13548           0 :                     goto onError;
   13549             :                 }
   13550           0 :                 args_owned = 1;
   13551           0 :                 arglen = -1;
   13552           0 :                 argidx = -2;
   13553             :             }
   13554          18 :             while (--fmtcnt >= 0) {
   13555           9 :                 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13556           9 :                 switch (c) {
   13557           0 :                 case '-': flags |= F_LJUST; continue;
   13558           0 :                 case '+': flags |= F_SIGN; continue;
   13559           0 :                 case ' ': flags |= F_BLANK; continue;
   13560           0 :                 case '#': flags |= F_ALT; continue;
   13561           0 :                 case '0': flags |= F_ZERO; continue;
   13562             :                 }
   13563           9 :                 break;
   13564             :             }
   13565           9 :             if (c == '*') {
   13566           0 :                 v = getnextarg(args, arglen, &argidx);
   13567           0 :                 if (v == NULL)
   13568           0 :                     goto onError;
   13569           0 :                 if (!PyLong_Check(v)) {
   13570           0 :                     PyErr_SetString(PyExc_TypeError,
   13571             :                                     "* wants int");
   13572           0 :                     goto onError;
   13573             :                 }
   13574           0 :                 width = PyLong_AsLong(v);
   13575           0 :                 if (width == -1 && PyErr_Occurred())
   13576           0 :                     goto onError;
   13577           0 :                 if (width < 0) {
   13578           0 :                     flags |= F_LJUST;
   13579           0 :                     width = -width;
   13580             :                 }
   13581           0 :                 if (--fmtcnt >= 0)
   13582           0 :                     c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13583             :             }
   13584           9 :             else if (c >= '0' && c <= '9') {
   13585           0 :                 width = c - '0';
   13586           0 :                 while (--fmtcnt >= 0) {
   13587           0 :                     c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13588           0 :                     if (c < '0' || c > '9')
   13589             :                         break;
   13590             :                     /* Since c is unsigned, the RHS would end up as unsigned,
   13591             :                        mixing signed and unsigned comparison. Since c is between
   13592             :                        '0' and '9', casting to int is safe. */
   13593           0 :                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
   13594           0 :                         PyErr_SetString(PyExc_ValueError,
   13595             :                                         "width too big");
   13596           0 :                         goto onError;
   13597             :                     }
   13598           0 :                     width = width*10 + (c - '0');
   13599             :                 }
   13600             :             }
   13601           9 :             if (c == '.') {
   13602           1 :                 prec = 0;
   13603           1 :                 if (--fmtcnt >= 0)
   13604           1 :                     c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13605           1 :                 if (c == '*') {
   13606           0 :                     v = getnextarg(args, arglen, &argidx);
   13607           0 :                     if (v == NULL)
   13608           0 :                         goto onError;
   13609           0 :                     if (!PyLong_Check(v)) {
   13610           0 :                         PyErr_SetString(PyExc_TypeError,
   13611             :                                         "* wants int");
   13612           0 :                         goto onError;
   13613             :                     }
   13614           0 :                     prec = PyLong_AsLong(v);
   13615           0 :                     if (prec == -1 && PyErr_Occurred())
   13616           0 :                         goto onError;
   13617           0 :                     if (prec < 0)
   13618           0 :                         prec = 0;
   13619           0 :                     if (--fmtcnt >= 0)
   13620           0 :                         c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13621             :                 }
   13622           1 :                 else if (c >= '0' && c <= '9') {
   13623           1 :                     prec = c - '0';
   13624           2 :                     while (--fmtcnt >= 0) {
   13625           1 :                         c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13626           1 :                         if (c < '0' || c > '9')
   13627             :                             break;
   13628           0 :                         if (prec > (INT_MAX - ((int)c - '0')) / 10) {
   13629           0 :                             PyErr_SetString(PyExc_ValueError,
   13630             :                                             "prec too big");
   13631           0 :                             goto onError;
   13632             :                         }
   13633           0 :                         prec = prec*10 + (c - '0');
   13634             :                     }
   13635             :                 }
   13636             :             } /* prec */
   13637           9 :             if (fmtcnt >= 0) {
   13638           9 :                 if (c == 'h' || c == 'l' || c == 'L') {
   13639           0 :                     if (--fmtcnt >= 0)
   13640           0 :                         c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
   13641             :                 }
   13642             :             }
   13643           9 :             if (fmtcnt < 0) {
   13644           0 :                 PyErr_SetString(PyExc_ValueError,
   13645             :                                 "incomplete format");
   13646           0 :                 goto onError;
   13647             :             }
   13648           9 :             if (fmtcnt == 0)
   13649           3 :                 writer.overallocate = 0;
   13650             : 
   13651           9 :             if (c == '%') {
   13652           0 :                 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
   13653           0 :                     goto onError;
   13654           0 :                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
   13655           0 :                 writer.pos += 1;
   13656           0 :                 continue;
   13657             :             }
   13658             : 
   13659           9 :             v = getnextarg(args, arglen, &argidx);
   13660           9 :             if (v == NULL)
   13661           0 :                 goto onError;
   13662             : 
   13663           9 :             sign = 0;
   13664           9 :             signchar = '\0';
   13665           9 :             fill = ' ';
   13666           9 :             switch (c) {
   13667             : 
   13668             :             case 's':
   13669             :             case 'r':
   13670             :             case 'a':
   13671           9 :                 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
   13672             :                     /* Fast path */
   13673           0 :                     if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
   13674           0 :                         goto onError;
   13675           0 :                     goto nextarg;
   13676             :                 }
   13677             : 
   13678           9 :                 if (PyUnicode_CheckExact(v) && c == 's') {
   13679           9 :                     temp = v;
   13680           9 :                     Py_INCREF(temp);
   13681             :                 }
   13682             :                 else {
   13683           0 :                     if (c == 's')
   13684           0 :                         temp = PyObject_Str(v);
   13685           0 :                     else if (c == 'r')
   13686           0 :                         temp = PyObject_Repr(v);
   13687             :                     else
   13688           0 :                         temp = PyObject_ASCII(v);
   13689             :                 }
   13690           9 :                 break;
   13691             : 
   13692             :             case 'i':
   13693             :             case 'd':
   13694             :             case 'u':
   13695             :             case 'o':
   13696             :             case 'x':
   13697             :             case 'X':
   13698           0 :                 if (PyLong_CheckExact(v)
   13699           0 :                     && width == -1 && prec == -1
   13700           0 :                     && !(flags & (F_SIGN | F_BLANK)))
   13701             :                 {
   13702             :                     /* Fast path */
   13703           0 :                     switch(c)
   13704             :                     {
   13705             :                     case 'd':
   13706             :                     case 'i':
   13707             :                     case 'u':
   13708           0 :                         if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
   13709           0 :                             goto onError;
   13710           0 :                         goto nextarg;
   13711             :                     case 'x':
   13712           0 :                         if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
   13713           0 :                             goto onError;
   13714           0 :                         goto nextarg;
   13715             :                     case 'o':
   13716           0 :                         if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
   13717           0 :                             goto onError;
   13718           0 :                         goto nextarg;
   13719             :                     default:
   13720           0 :                         break;
   13721             :                     }
   13722             :                 }
   13723             : 
   13724           0 :                 isnumok = 0;
   13725           0 :                 if (PyNumber_Check(v)) {
   13726           0 :                     PyObject *iobj=NULL;
   13727             : 
   13728           0 :                     if (PyLong_Check(v)) {
   13729           0 :                         iobj = v;
   13730           0 :                         Py_INCREF(iobj);
   13731             :                     }
   13732             :                     else {
   13733           0 :                         iobj = PyNumber_Long(v);
   13734             :                     }
   13735           0 :                     if (iobj!=NULL) {
   13736           0 :                         if (PyLong_Check(iobj)) {
   13737           0 :                             isnumok = 1;
   13738           0 :                             sign = 1;
   13739           0 :                             temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
   13740           0 :                             Py_DECREF(iobj);
   13741             :                         }
   13742             :                         else {
   13743           0 :                             Py_DECREF(iobj);
   13744             :                         }
   13745             :                     }
   13746             :                 }
   13747           0 :                 if (!isnumok) {
   13748           0 :                     PyErr_Format(PyExc_TypeError,
   13749             :                                  "%%%c format: a number is required, "
   13750           0 :                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
   13751           0 :                     goto onError;
   13752             :                 }
   13753           0 :                 if (flags & F_ZERO)
   13754           0 :                     fill = '0';
   13755           0 :                 break;
   13756             : 
   13757             :             case 'e':
   13758             :             case 'E':
   13759             :             case 'f':
   13760             :             case 'F':
   13761             :             case 'g':
   13762             :             case 'G':
   13763           0 :                 if (width == -1 && prec == -1
   13764           0 :                     && !(flags & (F_SIGN | F_BLANK)))
   13765             :                 {
   13766             :                     /* Fast path */
   13767           0 :                     if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
   13768           0 :                         goto onError;
   13769           0 :                     goto nextarg;
   13770             :                 }
   13771             : 
   13772           0 :                 sign = 1;
   13773           0 :                 if (flags & F_ZERO)
   13774           0 :                     fill = '0';
   13775           0 :                 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
   13776           0 :                     temp = NULL;
   13777           0 :                 break;
   13778             : 
   13779             :             case 'c':
   13780             :             {
   13781           0 :                 Py_UCS4 ch = formatchar(v);
   13782           0 :                 if (ch == (Py_UCS4) -1)
   13783           0 :                     goto onError;
   13784           0 :                 if (width == -1 && prec == -1) {
   13785             :                     /* Fast path */
   13786           0 :                     if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
   13787           0 :                         goto onError;
   13788           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
   13789           0 :                     writer.pos += 1;
   13790           0 :                     goto nextarg;
   13791             :                 }
   13792           0 :                 temp = PyUnicode_FromOrdinal(ch);
   13793           0 :                 break;
   13794             :             }
   13795             : 
   13796             :             default:
   13797           0 :                 PyErr_Format(PyExc_ValueError,
   13798             :                              "unsupported format character '%c' (0x%x) "
   13799             :                              "at index %zd",
   13800           0 :                              (31<=c && c<=126) ? (char)c : '?',
   13801             :                              (int)c,
   13802             :                              fmtpos - 1);
   13803           0 :                 goto onError;
   13804             :             }
   13805           9 :             if (temp == NULL)
   13806           0 :                 goto onError;
   13807             :             assert (PyUnicode_Check(temp));
   13808             : 
   13809           9 :             if (width == -1 && prec == -1
   13810           8 :                 && !(flags & (F_SIGN | F_BLANK)))
   13811             :             {
   13812             :                 /* Fast path */
   13813           8 :                 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
   13814           0 :                     goto onError;
   13815           8 :                 goto nextarg;
   13816             :             }
   13817             : 
   13818           1 :             if (PyUnicode_READY(temp) == -1) {
   13819           0 :                 Py_CLEAR(temp);
   13820           0 :                 goto onError;
   13821             :             }
   13822           1 :             kind = PyUnicode_KIND(temp);
   13823           1 :             pbuf = PyUnicode_DATA(temp);
   13824           1 :             len = PyUnicode_GET_LENGTH(temp);
   13825             : 
   13826           1 :             if (c == 's' || c == 'r' || c == 'a') {
   13827           1 :                 if (prec >= 0 && len > prec)
   13828           1 :                     len = prec;
   13829             :             }
   13830             : 
   13831             :             /* pbuf is initialized here. */
   13832           1 :             pindex = 0;
   13833           1 :             if (sign) {
   13834           0 :                 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
   13835           0 :                 if (ch == '-' || ch == '+') {
   13836           0 :                     signchar = ch;
   13837           0 :                     len--;
   13838           0 :                     pindex++;
   13839             :                 }
   13840           0 :                 else if (flags & F_SIGN)
   13841           0 :                     signchar = '+';
   13842           0 :                 else if (flags & F_BLANK)
   13843           0 :                     signchar = ' ';
   13844             :                 else
   13845           0 :                     sign = 0;
   13846             :             }
   13847           1 :             if (width < len)
   13848           1 :                 width = len;
   13849             : 
   13850             :             /* Compute the length and maximum character of the
   13851             :                written characters */
   13852           1 :             bufmaxchar = 127;
   13853           1 :             if (!(flags & F_LJUST)) {
   13854           1 :                 if (sign) {
   13855           0 :                     if ((width-1) > len)
   13856           0 :                         bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
   13857             :                 }
   13858             :                 else {
   13859           1 :                     if (width > len)
   13860           0 :                         bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
   13861             :                 }
   13862             :             }
   13863           1 :             maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
   13864           1 :             bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
   13865             : 
   13866           1 :             buflen = width;
   13867           1 :             if (sign && len == width)
   13868           0 :                 buflen++;
   13869             : 
   13870           1 :             if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
   13871           0 :                 goto onError;
   13872             : 
   13873             :             /* Write characters */
   13874           1 :             if (sign) {
   13875           0 :                 if (fill != ' ') {
   13876           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
   13877           0 :                     writer.pos += 1;
   13878             :                 }
   13879           0 :                 if (width > len)
   13880           0 :                     width--;
   13881             :             }
   13882           1 :             if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
   13883             :                 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   13884             :                 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
   13885           0 :                 if (fill != ' ') {
   13886           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
   13887           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
   13888           0 :                     writer.pos += 2;
   13889           0 :                     pindex += 2;
   13890             :                 }
   13891           0 :                 width -= 2;
   13892           0 :                 if (width < 0)
   13893           0 :                     width = 0;
   13894           0 :                 len -= 2;
   13895             :             }
   13896           1 :             if (width > len && !(flags & F_LJUST)) {
   13897           0 :                 sublen = width - len;
   13898           0 :                 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
   13899           0 :                 writer.pos += sublen;
   13900           0 :                 width = len;
   13901             :             }
   13902           1 :             if (fill == ' ') {
   13903           1 :                 if (sign) {
   13904           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
   13905           0 :                     writer.pos += 1;
   13906             :                 }
   13907           1 :                 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
   13908             :                     assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
   13909             :                     assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
   13910           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
   13911           0 :                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
   13912           0 :                     writer.pos += 2;
   13913           0 :                     pindex += 2;
   13914             :                 }
   13915             :             }
   13916             : 
   13917           1 :             if (len) {
   13918           1 :                 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
   13919             :                                               temp, pindex, len);
   13920           1 :                 writer.pos += len;
   13921             :             }
   13922           1 :             if (width > len) {
   13923           0 :                 sublen = width - len;
   13924           0 :                 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
   13925           0 :                 writer.pos += sublen;
   13926             :             }
   13927             : 
   13928             : nextarg:
   13929           9 :             if (dict && (argidx < arglen) && c != '%') {
   13930           0 :                 PyErr_SetString(PyExc_TypeError,
   13931             :                                 "not all arguments converted during string formatting");
   13932           0 :                 goto onError;
   13933             :             }
   13934           9 :             Py_CLEAR(temp);
   13935             :         } /* '%' */
   13936             :     } /* until end */
   13937           8 :     if (argidx < arglen && !dict) {
   13938           0 :         PyErr_SetString(PyExc_TypeError,
   13939             :                         "not all arguments converted during string formatting");
   13940           0 :         goto onError;
   13941             :     }
   13942             : 
   13943           8 :     if (args_owned) {
   13944           0 :         Py_DECREF(args);
   13945             :     }
   13946           8 :     Py_DECREF(uformat);
   13947           8 :     Py_XDECREF(temp);
   13948           8 :     Py_XDECREF(second);
   13949           8 :     return _PyUnicodeWriter_Finish(&writer);
   13950             : 
   13951             :   onError:
   13952           0 :     Py_DECREF(uformat);
   13953           0 :     Py_XDECREF(temp);
   13954           0 :     Py_XDECREF(second);
   13955           0 :     _PyUnicodeWriter_Dealloc(&writer);
   13956           0 :     if (args_owned) {
   13957           0 :         Py_DECREF(args);
   13958             :     }
   13959           0 :     return NULL;
   13960             : }
   13961             : 
   13962             : static PyObject *
   13963             : unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
   13964             : 
   13965             : static PyObject *
   13966        1899 : unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   13967             : {
   13968        1899 :     PyObject *x = NULL;
   13969             :     static char *kwlist[] = {"object", "encoding", "errors", 0};
   13970        1899 :     char *encoding = NULL;
   13971        1899 :     char *errors = NULL;
   13972             : 
   13973        1899 :     if (type != &PyUnicode_Type)
   13974           0 :         return unicode_subtype_new(type, args, kwds);
   13975        1899 :     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
   13976             :                                      kwlist, &x, &encoding, &errors))
   13977           0 :         return NULL;
   13978        1899 :     if (x == NULL) {
   13979           0 :         Py_INCREF(unicode_empty);
   13980           0 :         return unicode_empty;
   13981             :     }
   13982        1899 :     if (encoding == NULL && errors == NULL)
   13983        1899 :         return PyObject_Str(x);
   13984             :     else
   13985           0 :         return PyUnicode_FromEncodedObject(x, encoding, errors);
   13986             : }
   13987             : 
   13988             : static PyObject *
   13989           0 : unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
   13990             : {
   13991             :     PyObject *unicode, *self;
   13992             :     Py_ssize_t length, char_size;
   13993             :     int share_wstr, share_utf8;
   13994             :     unsigned int kind;
   13995             :     void *data;
   13996             : 
   13997             :     assert(PyType_IsSubtype(type, &PyUnicode_Type));
   13998             : 
   13999           0 :     unicode = unicode_new(&PyUnicode_Type, args, kwds);
   14000           0 :     if (unicode == NULL)
   14001           0 :         return NULL;
   14002             :     assert(_PyUnicode_CHECK(unicode));
   14003           0 :     if (PyUnicode_READY(unicode) == -1) {
   14004           0 :         Py_DECREF(unicode);
   14005           0 :         return NULL;
   14006             :     }
   14007             : 
   14008           0 :     self = type->tp_alloc(type, 0);
   14009           0 :     if (self == NULL) {
   14010           0 :         Py_DECREF(unicode);
   14011           0 :         return NULL;
   14012             :     }
   14013           0 :     kind = PyUnicode_KIND(unicode);
   14014           0 :     length = PyUnicode_GET_LENGTH(unicode);
   14015             : 
   14016           0 :     _PyUnicode_LENGTH(self) = length;
   14017             : #ifdef Py_DEBUG
   14018             :     _PyUnicode_HASH(self) = -1;
   14019             : #else
   14020           0 :     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   14021             : #endif
   14022           0 :     _PyUnicode_STATE(self).interned = 0;
   14023           0 :     _PyUnicode_STATE(self).kind = kind;
   14024           0 :     _PyUnicode_STATE(self).compact = 0;
   14025           0 :     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
   14026           0 :     _PyUnicode_STATE(self).ready = 1;
   14027           0 :     _PyUnicode_WSTR(self) = NULL;
   14028           0 :     _PyUnicode_UTF8_LENGTH(self) = 0;
   14029           0 :     _PyUnicode_UTF8(self) = NULL;
   14030           0 :     _PyUnicode_WSTR_LENGTH(self) = 0;
   14031           0 :     _PyUnicode_DATA_ANY(self) = NULL;
   14032             : 
   14033           0 :     share_utf8 = 0;
   14034           0 :     share_wstr = 0;
   14035           0 :     if (kind == PyUnicode_1BYTE_KIND) {
   14036           0 :         char_size = 1;
   14037           0 :         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
   14038           0 :             share_utf8 = 1;
   14039             :     }
   14040           0 :     else if (kind == PyUnicode_2BYTE_KIND) {
   14041           0 :         char_size = 2;
   14042             :         if (sizeof(wchar_t) == 2)
   14043             :             share_wstr = 1;
   14044             :     }
   14045             :     else {
   14046             :         assert(kind == PyUnicode_4BYTE_KIND);
   14047           0 :         char_size = 4;
   14048             :         if (sizeof(wchar_t) == 4)
   14049           0 :             share_wstr = 1;
   14050             :     }
   14051             : 
   14052             :     /* Ensure we won't overflow the length. */
   14053           0 :     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
   14054           0 :         PyErr_NoMemory();
   14055           0 :         goto onError;
   14056             :     }
   14057           0 :     data = PyObject_MALLOC((length + 1) * char_size);
   14058           0 :     if (data == NULL) {
   14059           0 :         PyErr_NoMemory();
   14060           0 :         goto onError;
   14061             :     }
   14062             : 
   14063           0 :     _PyUnicode_DATA_ANY(self) = data;
   14064           0 :     if (share_utf8) {
   14065           0 :         _PyUnicode_UTF8_LENGTH(self) = length;
   14066           0 :         _PyUnicode_UTF8(self) = data;
   14067             :     }
   14068           0 :     if (share_wstr) {
   14069           0 :         _PyUnicode_WSTR_LENGTH(self) = length;
   14070           0 :         _PyUnicode_WSTR(self) = (wchar_t *)data;
   14071             :     }
   14072             : 
   14073           0 :     Py_MEMCPY(data, PyUnicode_DATA(unicode),
   14074           0 :               kind * (length + 1));
   14075             :     assert(_PyUnicode_CheckConsistency(self, 1));
   14076             : #ifdef Py_DEBUG
   14077             :     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
   14078             : #endif
   14079           0 :     Py_DECREF(unicode);
   14080           0 :     return self;
   14081             : 
   14082             : onError:
   14083           0 :     Py_DECREF(unicode);
   14084           0 :     Py_DECREF(self);
   14085           0 :     return NULL;
   14086             : }
   14087             : 
   14088             : PyDoc_STRVAR(unicode_doc,
   14089             :              "str(object[, encoding[, errors]]) -> str\n\
   14090             : \n\
   14091             : Create a new string object from the given object. If encoding or\n\
   14092             : errors is specified, then the object must expose a data buffer\n\
   14093             : that will be decoded using the given encoding and error handler.\n\
   14094             : Otherwise, returns the result of object.__str__() (if defined)\n\
   14095             : or repr(object).\n\
   14096             : encoding defaults to sys.getdefaultencoding().\n\
   14097             : errors defaults to 'strict'.");
   14098             : 
   14099             : static PyObject *unicode_iter(PyObject *seq);
   14100             : 
   14101             : PyTypeObject PyUnicode_Type = {
   14102             :     PyVarObject_HEAD_INIT(&PyType_Type, 0)
   14103             :     "str",              /* tp_name */
   14104             :     sizeof(PyUnicodeObject),        /* tp_size */
   14105             :     0,                  /* tp_itemsize */
   14106             :     /* Slots */
   14107             :     (destructor)unicode_dealloc,    /* tp_dealloc */
   14108             :     0,                  /* tp_print */
   14109             :     0,                  /* tp_getattr */
   14110             :     0,                  /* tp_setattr */
   14111             :     0,                  /* tp_reserved */
   14112             :     unicode_repr,           /* tp_repr */
   14113             :     &unicode_as_number,         /* tp_as_number */
   14114             :     &unicode_as_sequence,       /* tp_as_sequence */
   14115             :     &unicode_as_mapping,        /* tp_as_mapping */
   14116             :     (hashfunc) unicode_hash,        /* tp_hash*/
   14117             :     0,                  /* tp_call*/
   14118             :     (reprfunc) unicode_str,     /* tp_str */
   14119             :     PyObject_GenericGetAttr,        /* tp_getattro */
   14120             :     0,                  /* tp_setattro */
   14121             :     0,                  /* tp_as_buffer */
   14122             :     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
   14123             :     Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
   14124             :     unicode_doc,            /* tp_doc */
   14125             :     0,                  /* tp_traverse */
   14126             :     0,                  /* tp_clear */
   14127             :     PyUnicode_RichCompare,      /* tp_richcompare */
   14128             :     0,                  /* tp_weaklistoffset */
   14129             :     unicode_iter,           /* tp_iter */
   14130             :     0,                  /* tp_iternext */
   14131             :     unicode_methods,            /* tp_methods */
   14132             :     0,                  /* tp_members */
   14133             :     0,                  /* tp_getset */
   14134             :     &PyBaseObject_Type,         /* tp_base */
   14135             :     0,                  /* tp_dict */
   14136             :     0,                  /* tp_descr_get */
   14137             :     0,                  /* tp_descr_set */
   14138             :     0,                  /* tp_dictoffset */
   14139             :     0,                  /* tp_init */
   14140             :     0,                  /* tp_alloc */
   14141             :     unicode_new,            /* tp_new */
   14142             :     PyObject_Del,           /* tp_free */
   14143             : };
   14144             : 
   14145             : /* Initialize the Unicode implementation */
   14146             : 
   14147           1 : int _PyUnicode_Init(void)
   14148             : {
   14149             :     int i;
   14150             : 
   14151             :     /* XXX - move this array to unicodectype.c ? */
   14152           1 :     Py_UCS2 linebreak[] = {
   14153             :         0x000A, /* LINE FEED */
   14154             :         0x000D, /* CARRIAGE RETURN */
   14155             :         0x001C, /* FILE SEPARATOR */
   14156             :         0x001D, /* GROUP SEPARATOR */
   14157             :         0x001E, /* RECORD SEPARATOR */
   14158             :         0x0085, /* NEXT LINE */
   14159             :         0x2028, /* LINE SEPARATOR */
   14160             :         0x2029, /* PARAGRAPH SEPARATOR */
   14161             :     };
   14162             : 
   14163             :     /* Init the implementation */
   14164           1 :     unicode_empty = PyUnicode_New(0, 0);
   14165           1 :     if (!unicode_empty)
   14166           0 :         Py_FatalError("Can't create empty string");
   14167             :     assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
   14168             : 
   14169         257 :     for (i = 0; i < 256; i++)
   14170         256 :         unicode_latin1[i] = NULL;
   14171           1 :     if (PyType_Ready(&PyUnicode_Type) < 0)
   14172           0 :         Py_FatalError("Can't initialize 'unicode'");
   14173             : 
   14174             :     /* initialize the linebreak bloom filter */
   14175           1 :     bloom_linebreak = make_bloom_mask(
   14176             :         PyUnicode_2BYTE_KIND, linebreak,
   14177             :         Py_ARRAY_LENGTH(linebreak));
   14178             : 
   14179           1 :     PyType_Ready(&EncodingMapType);
   14180             : 
   14181             : #ifdef HAVE_MBCS
   14182             :     winver.dwOSVersionInfoSize = sizeof(winver);
   14183             :     if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
   14184             :         PyErr_SetFromWindowsErr(0);
   14185             :         return -1;
   14186             :     }
   14187             : #endif
   14188           1 :     return 0;
   14189             : }
   14190             : 
   14191             : /* Finalize the Unicode implementation */
   14192             : 
   14193             : int
   14194           0 : PyUnicode_ClearFreeList(void)
   14195             : {
   14196           0 :     return 0;
   14197             : }
   14198             : 
   14199             : void
   14200           0 : _PyUnicode_Fini(void)
   14201             : {
   14202             :     int i;
   14203             : 
   14204           0 :     Py_XDECREF(unicode_empty);
   14205           0 :     unicode_empty = NULL;
   14206             : 
   14207           0 :     for (i = 0; i < 256; i++) {
   14208           0 :         if (unicode_latin1[i]) {
   14209           0 :             Py_DECREF(unicode_latin1[i]);
   14210           0 :             unicode_latin1[i] = NULL;
   14211             :         }
   14212             :     }
   14213           0 :     _PyUnicode_ClearStaticStrings();
   14214           0 :     (void)PyUnicode_ClearFreeList();
   14215           0 : }
   14216             : 
   14217             : void
   14218       40669 : PyUnicode_InternInPlace(PyObject **p)
   14219             : {
   14220       40669 :     register PyObject *s = *p;
   14221             :     PyObject *t;
   14222             : #ifdef Py_DEBUG
   14223             :     assert(s != NULL);
   14224             :     assert(_PyUnicode_CHECK(s));
   14225             : #else
   14226       40669 :     if (s == NULL || !PyUnicode_Check(s))
   14227           0 :         return;
   14228             : #endif
   14229             :     /* If it's a subclass, we don't really know what putting
   14230             :        it in the interned dict might do. */
   14231       40669 :     if (!PyUnicode_CheckExact(s))
   14232           0 :         return;
   14233       40669 :     if (PyUnicode_CHECK_INTERNED(s))
   14234       16791 :         return;
   14235       23878 :     if (interned == NULL) {
   14236           1 :         interned = PyDict_New();
   14237           1 :         if (interned == NULL) {
   14238           0 :             PyErr_Clear(); /* Don't leave an exception */
   14239           0 :             return;
   14240             :         }
   14241             :     }
   14242             :     /* It might be that the GetItem call fails even
   14243             :        though the key is present in the dictionary,
   14244             :        namely when this happens during a stack overflow. */
   14245       23878 :     Py_ALLOW_RECURSION
   14246       23878 :     t = PyDict_GetItem(interned, s);
   14247       23878 :     Py_END_ALLOW_RECURSION
   14248             : 
   14249       23878 :         if (t) {
   14250       17722 :             Py_INCREF(t);
   14251       17722 :             Py_DECREF(*p);
   14252       17722 :             *p = t;
   14253       17722 :             return;
   14254             :         }
   14255             : 
   14256        6156 :     PyThreadState_GET()->recursion_critical = 1;
   14257        6156 :     if (PyDict_SetItem(interned, s, s) < 0) {
   14258           0 :         PyErr_Clear();
   14259           0 :         PyThreadState_GET()->recursion_critical = 0;
   14260           0 :         return;
   14261             :     }
   14262        6156 :     PyThreadState_GET()->recursion_critical = 0;
   14263             :     /* The two references in interned are not counted by refcnt.
   14264             :        The deallocator will take care of this */
   14265        6156 :     Py_REFCNT(s) -= 2;
   14266        6156 :     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
   14267             : }
   14268             : 
   14269             : void
   14270           0 : PyUnicode_InternImmortal(PyObject **p)
   14271             : {
   14272           0 :     PyUnicode_InternInPlace(p);
   14273           0 :     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
   14274           0 :         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
   14275           0 :         Py_INCREF(*p);
   14276             :     }
   14277           0 : }
   14278             : 
   14279             : PyObject *
   14280        4864 : PyUnicode_InternFromString(const char *cp)
   14281             : {
   14282        4864 :     PyObject *s = PyUnicode_FromString(cp);
   14283        4864 :     if (s == NULL)
   14284           0 :         return NULL;
   14285        4864 :     PyUnicode_InternInPlace(&s);
   14286        4864 :     return s;
   14287             : }
   14288             : 
   14289             : void
   14290           0 : _Py_ReleaseInternedUnicodeStrings(void)
   14291             : {
   14292             :     PyObject *keys;
   14293             :     PyObject *s;
   14294             :     Py_ssize_t i, n;
   14295           0 :     Py_ssize_t immortal_size = 0, mortal_size = 0;
   14296             : 
   14297           0 :     if (interned == NULL || !PyDict_Check(interned))
   14298           0 :         return;
   14299           0 :     keys = PyDict_Keys(interned);
   14300           0 :     if (keys == NULL || !PyList_Check(keys)) {
   14301           0 :         PyErr_Clear();
   14302           0 :         return;
   14303             :     }
   14304             : 
   14305             :     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
   14306             :        detector, interned unicode strings are not forcibly deallocated;
   14307             :        rather, we give them their stolen references back, and then clear
   14308             :        and DECREF the interned dict. */
   14309             : 
   14310           0 :     n = PyList_GET_SIZE(keys);
   14311           0 :     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
   14312             :             n);
   14313           0 :     for (i = 0; i < n; i++) {
   14314           0 :         s = PyList_GET_ITEM(keys, i);
   14315           0 :         if (PyUnicode_READY(s) == -1) {
   14316             :             assert(0 && "could not ready string");
   14317           0 :             fprintf(stderr, "could not ready string\n");
   14318             :         }
   14319           0 :         switch (PyUnicode_CHECK_INTERNED(s)) {
   14320             :         case SSTATE_NOT_INTERNED:
   14321             :             /* XXX Shouldn't happen */
   14322           0 :             break;
   14323             :         case SSTATE_INTERNED_IMMORTAL:
   14324           0 :             Py_REFCNT(s) += 1;
   14325           0 :             immortal_size += PyUnicode_GET_LENGTH(s);
   14326           0 :             break;
   14327             :         case SSTATE_INTERNED_MORTAL:
   14328           0 :             Py_REFCNT(s) += 2;
   14329           0 :             mortal_size += PyUnicode_GET_LENGTH(s);
   14330           0 :             break;
   14331             :         default:
   14332           0 :             Py_FatalError("Inconsistent interned string state.");
   14333             :         }
   14334           0 :         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
   14335             :     }
   14336           0 :     fprintf(stderr, "total size of all interned strings: "
   14337             :             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
   14338             :             "mortal/immortal\n", mortal_size, immortal_size);
   14339           0 :     Py_DECREF(keys);
   14340           0 :     PyDict_Clear(interned);
   14341           0 :     Py_DECREF(interned);
   14342           0 :     interned = NULL;
   14343             : }
   14344             : 
   14345             : 
   14346             : /********************* Unicode Iterator **************************/
   14347             : 
   14348             : typedef struct {
   14349             :     PyObject_HEAD
   14350             :     Py_ssize_t it_index;
   14351             :     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
   14352             : } unicodeiterobject;
   14353             : 
   14354             : static void
   14355          84 : unicodeiter_dealloc(unicodeiterobject *it)
   14356             : {
   14357          84 :     _PyObject_GC_UNTRACK(it);
   14358          84 :     Py_XDECREF(it->it_seq);
   14359          84 :     PyObject_GC_Del(it);
   14360          84 : }
   14361             : 
   14362             : static int
   14363           0 : unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
   14364             : {
   14365           0 :     Py_VISIT(it->it_seq);
   14366           0 :     return 0;
   14367             : }
   14368             : 
   14369             : static PyObject *
   14370         470 : unicodeiter_next(unicodeiterobject *it)
   14371             : {
   14372             :     PyObject *seq, *item;
   14373             : 
   14374             :     assert(it != NULL);
   14375         470 :     seq = it->it_seq;
   14376         470 :     if (seq == NULL)
   14377           0 :         return NULL;
   14378             :     assert(_PyUnicode_CHECK(seq));
   14379             : 
   14380         470 :     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
   14381         387 :         int kind = PyUnicode_KIND(seq);
   14382         387 :         void *data = PyUnicode_DATA(seq);
   14383         387 :         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
   14384         387 :         item = PyUnicode_FromOrdinal(chr);
   14385         387 :         if (item != NULL)
   14386         387 :             ++it->it_index;
   14387         387 :         return item;
   14388             :     }
   14389             : 
   14390          83 :     Py_DECREF(seq);
   14391          83 :     it->it_seq = NULL;
   14392          83 :     return NULL;
   14393             : }
   14394             : 
   14395             : static PyObject *
   14396           0 : unicodeiter_len(unicodeiterobject *it)
   14397             : {
   14398           0 :     Py_ssize_t len = 0;
   14399           0 :     if (it->it_seq)
   14400           0 :         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
   14401           0 :     return PyLong_FromSsize_t(len);
   14402             : }
   14403             : 
   14404             : PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
   14405             : 
   14406             : static PyObject *
   14407           0 : unicodeiter_reduce(unicodeiterobject *it)
   14408             : {
   14409           0 :     if (it->it_seq != NULL) {
   14410           0 :         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
   14411             :                              it->it_seq, it->it_index);
   14412             :     } else {
   14413           0 :         PyObject *u = PyUnicode_FromUnicode(NULL, 0);
   14414           0 :         if (u == NULL)
   14415           0 :             return NULL;
   14416           0 :         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
   14417             :     }
   14418             : }
   14419             : 
   14420             : PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
   14421             : 
   14422             : static PyObject *
   14423           0 : unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
   14424             : {
   14425           0 :     Py_ssize_t index = PyLong_AsSsize_t(state);
   14426           0 :     if (index == -1 && PyErr_Occurred())
   14427           0 :         return NULL;
   14428           0 :     if (index < 0)
   14429           0 :         index = 0;
   14430           0 :     it->it_index = index;
   14431           0 :     Py_RETURN_NONE;
   14432             : }
   14433             : 
   14434             : PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
   14435             : 
/* Method table of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
   14445             : 
/* Type object of the str iterator ("str_iterator").  Instances take part
   in garbage collection (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* slot following tp_methods (tp_members in PyTypeObject order) */
};
   14478             : 
   14479             : static PyObject *
   14480          84 : unicode_iter(PyObject *seq)
   14481             : {
   14482             :     unicodeiterobject *it;
   14483             : 
   14484          84 :     if (!PyUnicode_Check(seq)) {
   14485           0 :         PyErr_BadInternalCall();
   14486           0 :         return NULL;
   14487             :     }
   14488          84 :     if (PyUnicode_READY(seq) == -1)
   14489           0 :         return NULL;
   14490          84 :     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
   14491          84 :     if (it == NULL)
   14492           0 :         return NULL;
   14493          84 :     it->it_index = 0;
   14494          84 :     Py_INCREF(seq);
   14495          84 :     it->it_seq = seq;
   14496          84 :     _PyObject_GC_TRACK(it);
   14497          84 :     return (PyObject *)it;
   14498             : }
   14499             : 
   14500             : 
   14501             : size_t
   14502           0 : Py_UNICODE_strlen(const Py_UNICODE *u)
   14503             : {
   14504           0 :     int res = 0;
   14505           0 :     while(*u++)
   14506           0 :         res++;
   14507           0 :     return res;
   14508             : }
   14509             : 
   14510             : Py_UNICODE*
   14511           0 : Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
   14512             : {
   14513           0 :     Py_UNICODE *u = s1;
   14514           0 :     while ((*u++ = *s2++));
   14515           0 :     return s1;
   14516             : }
   14517             : 
   14518             : Py_UNICODE*
   14519           0 : Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   14520             : {
   14521           0 :     Py_UNICODE *u = s1;
   14522           0 :     while ((*u++ = *s2++))
   14523           0 :         if (n-- == 0)
   14524           0 :             break;
   14525           0 :     return s1;
   14526             : }
   14527             : 
   14528             : Py_UNICODE*
   14529           0 : Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
   14530             : {
   14531           0 :     Py_UNICODE *u1 = s1;
   14532           0 :     u1 += Py_UNICODE_strlen(u1);
   14533           0 :     Py_UNICODE_strcpy(u1, s2);
   14534           0 :     return s1;
   14535             : }
   14536             : 
   14537             : int
   14538           0 : Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
   14539             : {
   14540           0 :     while (*s1 && *s2 && *s1 == *s2)
   14541           0 :         s1++, s2++;
   14542           0 :     if (*s1 && *s2)
   14543           0 :         return (*s1 < *s2) ? -1 : +1;
   14544           0 :     if (*s1)
   14545           0 :         return 1;
   14546           0 :     if (*s2)
   14547           0 :         return -1;
   14548           0 :     return 0;
   14549             : }
   14550             : 
   14551             : int
   14552           0 : Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
   14553             : {
   14554             :     register Py_UNICODE u1, u2;
   14555           0 :     for (; n != 0; n--) {
   14556           0 :         u1 = *s1;
   14557           0 :         u2 = *s2;
   14558           0 :         if (u1 != u2)
   14559           0 :             return (u1 < u2) ? -1 : +1;
   14560           0 :         if (u1 == '\0')
   14561           0 :             return 0;
   14562           0 :         s1++;
   14563           0 :         s2++;
   14564             :     }
   14565           0 :     return 0;
   14566             : }
   14567             : 
   14568             : Py_UNICODE*
   14569           0 : Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
   14570             : {
   14571             :     const Py_UNICODE *p;
   14572           0 :     for (p = s; *p; p++)
   14573           0 :         if (*p == c)
   14574           0 :             return (Py_UNICODE*)p;
   14575           0 :     return NULL;
   14576             : }
   14577             : 
   14578             : Py_UNICODE*
   14579           0 : Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
   14580             : {
   14581             :     const Py_UNICODE *p;
   14582           0 :     p = s + Py_UNICODE_strlen(s);
   14583           0 :     while (p != s) {
   14584           0 :         p--;
   14585           0 :         if (*p == c)
   14586           0 :             return (Py_UNICODE*)p;
   14587             :     }
   14588           0 :     return NULL;
   14589             : }
   14590             : 
   14591             : Py_UNICODE*
   14592           0 : PyUnicode_AsUnicodeCopy(PyObject *unicode)
   14593             : {
   14594             :     Py_UNICODE *u, *copy;
   14595             :     Py_ssize_t len, size;
   14596             : 
   14597           0 :     if (!PyUnicode_Check(unicode)) {
   14598           0 :         PyErr_BadArgument();
   14599           0 :         return NULL;
   14600             :     }
   14601           0 :     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
   14602           0 :     if (u == NULL)
   14603           0 :         return NULL;
   14604             :     /* Ensure we won't overflow the size. */
   14605           0 :     if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
   14606           0 :         PyErr_NoMemory();
   14607           0 :         return NULL;
   14608             :     }
   14609           0 :     size = len + 1; /* copy the null character */
   14610           0 :     size *= sizeof(Py_UNICODE);
   14611           0 :     copy = PyMem_Malloc(size);
   14612           0 :     if (copy == NULL) {
   14613           0 :         PyErr_NoMemory();
   14614           0 :         return NULL;
   14615             :     }
   14616           0 :     memcpy(copy, u, size);
   14617           0 :     return copy;
   14618             : }
   14619             : 
   14620             : /* A _string module, to export formatter_parser and formatter_field_name_split
   14621             :    to the string.Formatter class implemented in Python. */
   14622             : 
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
   14630             : 
/* Module definition for _string.  Field names in the comments follow the
   PyModuleDef layout of this Python version -- confirm against the header
   if the struct changes. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
   14642             : 
/* Module initialisation entry point for _string: creates the module
   object from _string_module and returns the result of PyModule_Create(). */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
   14648             : 
   14649             : 
   14650             : #ifdef __cplusplus
   14651             : }
   14652             : #endif

Generated by: LCOV version 1.10