Line data Source code
1 : /* ------------------------------------------------------------------------
2 :
3 : unicodedata -- Provides access to the Unicode database.
4 :
5 : Data was extracted from the UnicodeData.txt file.
6 : The current version number is reported in the unidata_version constant.
7 :
8 : Written by Marc-Andre Lemburg (mal@lemburg.com).
9 : Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 : Modified by Martin v. Löwis (martin@v.loewis.de)
11 :
12 : Copyright (c) Corporation for National Research Initiatives.
13 :
14 : ------------------------------------------------------------------------ */
15 :
16 : #include "Python.h"
17 : #include "ucnhash.h"
18 : #include "structmember.h"
19 :
20 : /* character properties */
21 :
/* One record of the static Unicode property database generated into
   unicodedata_db.h.  Every field fits in one byte; string-valued
   properties are stored as indexes into shared name tables. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                           _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
33 :
/* Per-codepoint delta between the current database and an older Unicode
   version (e.g. 3.2.0).  The byte-sized *_changed fields use 0xFF as a
   sentinel meaning "unchanged"; category_changed == 0 means the character
   was unassigned in the old version (see the lookup functions below). */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42 :
43 : /* data file generated by Tools/unicode/makeunicodedata.py */
44 : #include "unicodedata_db.h"
45 :
46 : static const _PyUnicode_DatabaseRecord*
47 0 : _getrecord_ex(Py_UCS4 code)
48 : {
49 : int index;
50 0 : if (code >= 0x110000)
51 0 : index = 0;
52 : else {
53 0 : index = index1[(code>>SHIFT)];
54 0 : index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 : }
56 :
57 0 : return &_PyUnicode_Database_Records[index];
58 : }
59 :
60 : /* ------------- Previous-version API ------------------------------------- */
/* A UCD object exposing an older snapshot of the Unicode database.
   Per-version behavior is routed through the two function pointers. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                            /* version string, e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4);  /* per-codepoint deltas */
    Py_UCS4 (*normalization)(Py_UCS4);           /* normalization corrections */
} PreviousDBVersion;
67 :
68 : #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 :
/* Attributes exposed on UCD objects: only the read-only version string,
   published to Python as "unidata_version". */
static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};
74 :
75 : /* forward declaration */
76 : static PyTypeObject UCD_Type;
77 : #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
78 :
79 : static PyObject*
80 0 : new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 : Py_UCS4 (*normalization)(Py_UCS4))
82 : {
83 : PreviousDBVersion *self;
84 0 : self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 0 : if (self == NULL)
86 0 : return NULL;
87 0 : self->name = name;
88 0 : self->getrecord = getrecord;
89 0 : self->normalization = normalization;
90 0 : return (PyObject*)self;
91 : }
92 :
93 :
94 0 : static Py_UCS4 getuchar(PyUnicodeObject *obj)
95 : {
96 0 : if (PyUnicode_READY(obj))
97 0 : return (Py_UCS4)-1;
98 0 : if (PyUnicode_GET_LENGTH(obj) == 1) {
99 0 : if (PyUnicode_READY(obj))
100 0 : return (Py_UCS4)-1;
101 0 : return PyUnicode_READ_CHAR(obj, 0);
102 : }
103 0 : PyErr_SetString(PyExc_TypeError,
104 : "need a single Unicode character as parameter");
105 0 : return (Py_UCS4)-1;
106 : }
107 :
108 : /* --- Module API --------------------------------------------------------- */
109 :
110 : PyDoc_STRVAR(unicodedata_decimal__doc__,
111 : "decimal(unichr[, default])\n\
112 : \n\
113 : Returns the decimal value assigned to the Unicode character unichr\n\
114 : as integer. If no such value is defined, default is returned, or, if\n\
115 : not given, ValueError is raised.");
116 :
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;        /* optional fallback value */
    int have_old = 0;               /* old-version delta supplied rc */
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    /* When called on a previous-version UCD object, apply the recorded
       deltas for that Unicode version before consulting the current data. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            /* 0xFF means "unchanged"; any other byte is the old value. */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        /* No decimal value: return the default if given, else raise. */
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyLong_FromLong(rc);
}
160 :
161 : PyDoc_STRVAR(unicodedata_digit__doc__,
162 : "digit(unichr[, default])\n\
163 : \n\
164 : Returns the digit value assigned to the Unicode character unichr as\n\
165 : integer. If no such value is defined, default is returned, or, if\n\
166 : not given, ValueError is raised.");
167 :
168 : static PyObject *
169 0 : unicodedata_digit(PyObject *self, PyObject *args)
170 : {
171 : PyUnicodeObject *v;
172 0 : PyObject *defobj = NULL;
173 : long rc;
174 : Py_UCS4 c;
175 :
176 0 : if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
177 0 : return NULL;
178 0 : c = getuchar(v);
179 0 : if (c == (Py_UCS4)-1)
180 0 : return NULL;
181 0 : rc = Py_UNICODE_TODIGIT(c);
182 0 : if (rc < 0) {
183 0 : if (defobj == NULL) {
184 0 : PyErr_SetString(PyExc_ValueError, "not a digit");
185 0 : return NULL;
186 : }
187 : else {
188 0 : Py_INCREF(defobj);
189 0 : return defobj;
190 : }
191 : }
192 0 : return PyLong_FromLong(rc);
193 : }
194 :
195 : PyDoc_STRVAR(unicodedata_numeric__doc__,
196 : "numeric(unichr[, default])\n\
197 : \n\
198 : Returns the numeric value assigned to the Unicode character unichr\n\
199 : as float. If no such value is defined, default is returned, or, if\n\
200 : not given, ValueError is raised.");
201 :
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;        /* optional fallback value */
    int have_old = 0;               /* old-version delta supplied rc */
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    /* When called on a previous-version UCD object, apply the recorded
       deltas for that Unicode version before consulting the current data. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            /* NOTE(review): this reads decimal_changed even though
               change_record also carries a numeric_changed (double) field
               that is unused here -- looks like it should consult
               numeric_changed; confirm against merge_old_version in
               Tools/unicode/makeunicodedata.py. */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    /* Py_UNICODE_TONUMERIC uses -1.0 as its "no value" sentinel. */
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
244 :
245 : PyDoc_STRVAR(unicodedata_category__doc__,
246 : "category(unichr)\n\
247 : \n\
248 : Returns the general category assigned to the Unicode character\n\
249 : unichr as string.");
250 :
251 : static PyObject *
252 0 : unicodedata_category(PyObject *self, PyObject *args)
253 : {
254 : PyUnicodeObject *v;
255 : int index;
256 : Py_UCS4 c;
257 :
258 0 : if (!PyArg_ParseTuple(args, "O!:category",
259 : &PyUnicode_Type, &v))
260 0 : return NULL;
261 0 : c = getuchar(v);
262 0 : if (c == (Py_UCS4)-1)
263 0 : return NULL;
264 0 : index = (int) _getrecord_ex(c)->category;
265 0 : if (self && UCD_Check(self)) {
266 0 : const change_record *old = get_old_record(self, c);
267 0 : if (old->category_changed != 0xFF)
268 0 : index = old->category_changed;
269 : }
270 0 : return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
271 : }
272 :
273 : PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274 : "bidirectional(unichr)\n\
275 : \n\
276 : Returns the bidirectional category assigned to the Unicode character\n\
277 : unichr as string. If no such value is defined, an empty string is\n\
278 : returned.");
279 :
280 : static PyObject *
281 0 : unicodedata_bidirectional(PyObject *self, PyObject *args)
282 : {
283 : PyUnicodeObject *v;
284 : int index;
285 : Py_UCS4 c;
286 :
287 0 : if (!PyArg_ParseTuple(args, "O!:bidirectional",
288 : &PyUnicode_Type, &v))
289 0 : return NULL;
290 0 : c = getuchar(v);
291 0 : if (c == (Py_UCS4)-1)
292 0 : return NULL;
293 0 : index = (int) _getrecord_ex(c)->bidirectional;
294 0 : if (self && UCD_Check(self)) {
295 0 : const change_record *old = get_old_record(self, c);
296 0 : if (old->category_changed == 0)
297 0 : index = 0; /* unassigned */
298 0 : else if (old->bidir_changed != 0xFF)
299 0 : index = old->bidir_changed;
300 : }
301 0 : return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
302 : }
303 :
304 : PyDoc_STRVAR(unicodedata_combining__doc__,
305 : "combining(unichr)\n\
306 : \n\
307 : Returns the canonical combining class assigned to the Unicode\n\
308 : character unichr as integer. Returns 0 if no combining class is\n\
309 : defined.");
310 :
311 : static PyObject *
312 0 : unicodedata_combining(PyObject *self, PyObject *args)
313 : {
314 : PyUnicodeObject *v;
315 : int index;
316 : Py_UCS4 c;
317 :
318 0 : if (!PyArg_ParseTuple(args, "O!:combining",
319 : &PyUnicode_Type, &v))
320 0 : return NULL;
321 0 : c = getuchar(v);
322 0 : if (c == (Py_UCS4)-1)
323 0 : return NULL;
324 0 : index = (int) _getrecord_ex(c)->combining;
325 0 : if (self && UCD_Check(self)) {
326 0 : const change_record *old = get_old_record(self, c);
327 0 : if (old->category_changed == 0)
328 0 : index = 0; /* unassigned */
329 : }
330 0 : return PyLong_FromLong(index);
331 : }
332 :
333 : PyDoc_STRVAR(unicodedata_mirrored__doc__,
334 : "mirrored(unichr)\n\
335 : \n\
336 : Returns the mirrored property assigned to the Unicode character\n\
337 : unichr as integer. Returns 1 if the character has been identified as\n\
338 : a \"mirrored\" character in bidirectional text, 0 otherwise.");
339 :
340 : static PyObject *
341 0 : unicodedata_mirrored(PyObject *self, PyObject *args)
342 : {
343 : PyUnicodeObject *v;
344 : int index;
345 : Py_UCS4 c;
346 :
347 0 : if (!PyArg_ParseTuple(args, "O!:mirrored",
348 : &PyUnicode_Type, &v))
349 0 : return NULL;
350 0 : c = getuchar(v);
351 0 : if (c == (Py_UCS4)-1)
352 0 : return NULL;
353 0 : index = (int) _getrecord_ex(c)->mirrored;
354 0 : if (self && UCD_Check(self)) {
355 0 : const change_record *old = get_old_record(self, c);
356 0 : if (old->category_changed == 0)
357 0 : index = 0; /* unassigned */
358 0 : else if (old->mirrored_changed != 0xFF)
359 0 : index = old->mirrored_changed;
360 : }
361 0 : return PyLong_FromLong(index);
362 : }
363 :
364 : PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365 : "east_asian_width(unichr)\n\
366 : \n\
367 : Returns the east asian width assigned to the Unicode character\n\
368 : unichr as string.");
369 :
370 : static PyObject *
371 0 : unicodedata_east_asian_width(PyObject *self, PyObject *args)
372 : {
373 : PyUnicodeObject *v;
374 : int index;
375 : Py_UCS4 c;
376 :
377 0 : if (!PyArg_ParseTuple(args, "O!:east_asian_width",
378 : &PyUnicode_Type, &v))
379 0 : return NULL;
380 0 : c = getuchar(v);
381 0 : if (c == (Py_UCS4)-1)
382 0 : return NULL;
383 0 : index = (int) _getrecord_ex(c)->east_asian_width;
384 0 : if (self && UCD_Check(self)) {
385 0 : const change_record *old = get_old_record(self, c);
386 0 : if (old->category_changed == 0)
387 0 : index = 0; /* unassigned */
388 : }
389 0 : return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 : }
391 :
392 : PyDoc_STRVAR(unicodedata_decomposition__doc__,
393 : "decomposition(unichr)\n\
394 : \n\
395 : Returns the character decomposition mapping assigned to the Unicode\n\
396 : character unichr as string. An empty string is returned in case no\n\
397 : such mapping is defined.");
398 :
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];               /* assembled result string */
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    /* A previous-version database reports unassigned characters as
       having no decomposition at all. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString(""); /* unassigned */
    }

    /* Two-level trie lookup (same scheme as _getrecord_ex, but over the
       decomposition tables). */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix (the compatibility tag, e.g. "<compat>"; empty for
       canonical decompositions) */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    /* append each decomposition codepoint as space-separated uppercase
       hex, at least four digits wide */
    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}
459 :
/* Decode the decomposition entry for 'code': on return *index points at
   the first decomposition codepoint in decomp_data, *prefix is the index
   of the compatibility tag in decomp_prefix (0 for canonical
   decompositions), and *count is the number of codepoints.  Unassigned
   or out-of-range codepoints yield entry 0 (count == 0). */
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && UCD_Check(self) &&
               get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        /* two-level trie lookup */
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    /* advance past the count/prefix word to the first codepoint */
    (*index)++;
}
483 :
484 : #define SBase 0xAC00
485 : #define LBase 0x1100
486 : #define VBase 0x1161
487 : #define TBase 0x11A7
488 : #define LCount 19
489 : #define VCount 21
490 : #define TCount 28
491 : #define NCount (VCount*TCount)
492 : #define SCount (LCount*NCount)
493 :
/* Decompose 'input' to NFD (k == 0) or NFKD (k == 1) and return a new
   string (new reference, NULL on error).  Two passes: first fully
   decompose each character via a small explicit work stack, then
   bubble combining marks into canonical order in place. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    /* Overallocate atmost 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    osize = space;
    output = PyMem_Malloc(space * sizeof(Py_UCS4));
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need atleast that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition: purely arithmetic split into L, V
               and optional T jamo -- no table lookup needed. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes: old-version corrections replace the
               codepoint, which is then reprocessed from the stack */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically: combining marks (class > 0) must appear in
       non-decreasing class order between starters (class 0). */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            /* swap positions o and o+1, then keep sinking leftwards */
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
619 :
620 : static int
621 0 : find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
622 : {
623 : unsigned int index;
624 0 : for (index = 0; nfc[index].start; index++) {
625 0 : unsigned int start = nfc[index].start;
626 0 : if (code < start)
627 0 : return -1;
628 0 : if (code <= start + nfc[index].count) {
629 0 : unsigned int delta = code - start;
630 0 : return nfc[index].index + delta;
631 : }
632 : }
633 0 : return -1;
634 : }
635 :
/* Compose 'input' to NFC (k == 0) or NFKC (k == 1).  First decomposes
   via nfd_nfkd(), then combines each starter with the following
   combinable characters using the nfc_first/nfc_last pair tables.
   Characters consumed by a composition are remembered in 'skipped' and
   dropped when the scan reaches them.  Returns a new reference. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];     /* positions already consumed by a composition */
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_Malloc(len * sizeof(Py_UCS4));
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        /* Drop characters that an earlier composition already consumed. */
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
            /* arithmetic L+V(+T) -> syllable composition */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
                TBase <= PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            /* not a composition starter: copy through */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* look up the composed character in the pair table */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* the composed character may itself start further compositions */
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
770 :
771 : /* Return 1 if the input is certainly normalized, 0 if it might not be. */
772 : static int
773 0 : is_normalized(PyObject *self, PyObject *input, int nfc, int k)
774 : {
775 : Py_ssize_t i, len;
776 : int kind;
777 : void *data;
778 0 : unsigned char prev_combining = 0, quickcheck_mask;
779 :
780 : /* An older version of the database is requested, quickchecks must be
781 : disabled. */
782 0 : if (self && UCD_Check(self))
783 0 : return 0;
784 :
785 : /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
786 : as described in http://unicode.org/reports/tr15/#Annex8. */
787 0 : quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
788 :
789 0 : i = 0;
790 0 : kind = PyUnicode_KIND(input);
791 0 : data = PyUnicode_DATA(input);
792 0 : len = PyUnicode_GET_LENGTH(input);
793 0 : while (i < len) {
794 0 : Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
795 0 : const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
796 0 : unsigned char combining = record->combining;
797 0 : unsigned char quickcheck = record->normalization_quick_check;
798 :
799 0 : if (quickcheck & quickcheck_mask)
800 0 : return 0; /* this string might need normalization */
801 0 : if (combining && prev_combining > combining)
802 0 : return 0; /* non-canonical sort order, not normalized */
803 0 : prev_combining = combining;
804 : }
805 0 : return 1; /* certainly normalized */
806 : }
807 :
808 : PyDoc_STRVAR(unicodedata_normalize__doc__,
809 : "normalize(form, unistr)\n\
810 : \n\
811 : Return the normal form 'form' for the Unicode string unistr. Valid\n\
812 : values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
813 :
814 : static PyObject*
815 0 : unicodedata_normalize(PyObject *self, PyObject *args)
816 : {
817 : char *form;
818 : PyObject *input;
819 :
820 0 : if(!PyArg_ParseTuple(args, "sO!:normalize",
821 : &form, &PyUnicode_Type, &input))
822 0 : return NULL;
823 :
824 0 : if (PyUnicode_READY(input) == -1)
825 0 : return NULL;
826 :
827 0 : if (PyUnicode_GET_LENGTH(input) == 0) {
828 : /* Special case empty input strings, since resizing
829 : them later would cause internal errors. */
830 0 : Py_INCREF(input);
831 0 : return input;
832 : }
833 :
834 0 : if (strcmp(form, "NFC") == 0) {
835 0 : if (is_normalized(self, input, 1, 0)) {
836 0 : Py_INCREF(input);
837 0 : return input;
838 : }
839 0 : return nfc_nfkc(self, input, 0);
840 : }
841 0 : if (strcmp(form, "NFKC") == 0) {
842 0 : if (is_normalized(self, input, 1, 1)) {
843 0 : Py_INCREF(input);
844 0 : return input;
845 : }
846 0 : return nfc_nfkc(self, input, 1);
847 : }
848 0 : if (strcmp(form, "NFD") == 0) {
849 0 : if (is_normalized(self, input, 0, 0)) {
850 0 : Py_INCREF(input);
851 0 : return input;
852 : }
853 0 : return nfd_nfkd(self, input, 0);
854 : }
855 0 : if (strcmp(form, "NFKD") == 0) {
856 0 : if (is_normalized(self, input, 0, 1)) {
857 0 : Py_INCREF(input);
858 0 : return input;
859 : }
860 0 : return nfd_nfkd(self, input, 1);
861 : }
862 0 : PyErr_SetString(PyExc_ValueError, "invalid normalization form");
863 0 : return NULL;
864 : }
865 :
866 : /* -------------------------------------------------------------------- */
867 : /* unicode character name tables */
868 :
869 : /* data file generated by Tools/unicode/makeunicodedata.py */
870 : #include "unicodename_db.h"
871 :
872 : /* -------------------------------------------------------------------- */
873 : /* database code (cut and pasted from the unidb package) */
874 :
/* Hash an ASCII name, case-insensitively, into 24 bits using the given
   multiplier; must match the hash used by makeunicodedata.py. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int pos;
    unsigned long h = 0;

    for (pos = 0; pos < len; pos++) {
        unsigned long overflow;
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        /* fold any bits above 24 back into the low bits */
        overflow = h & 0xff000000;
        if (overflow)
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
889 :
/* Jamo short-name fragments used to build Hangul syllable names, per the
   Unicode name-derivation algorithm.  Column 0 = leading consonant (L,
   19 entries), column 1 = vowel (V, 21 entries), column 2 = trailing
   consonant (T, 28 entries); rows past a column's count are 0. */
static char *hangul_syllables[][3] = {
    { "G", "A", "" },
    { "GG", "AE", "G" },
    { "N", "YA", "GG" },
    { "D", "YAE", "GS" },
    { "DD", "EO", "N", },
    { "R", "E", "NJ" },
    { "M", "YEO", "NH" },
    { "B", "YE", "D" },
    { "BB", "O", "L" },
    { "S", "WA", "LG" },
    { "SS", "WAE", "LM" },
    { "", "OE", "LB" },
    { "J", "YO", "LS" },
    { "JJ", "U", "LT" },
    { "C", "WEO", "LP" },
    { "K", "WE", "LH" },
    { "T", "WI", "M" },
    { "P", "YU", "B" },
    { "H", "EU", "BS" },
    { 0, "YI", "S" },
    { 0, "I", "SS" },
    { 0, 0, "NG" },
    { 0, 0, "J" },
    { 0, 0, "C" },
    { 0, 0, "K" },
    { 0, 0, "T" },
    { 0, 0, "P" },
    { 0, 0, "H" }
};
920 :
921 : /* These ranges need to match makeunicodedata.py:cjk_ranges. */
922 : static int
923 0 : is_unified_ideograph(Py_UCS4 code)
924 : {
925 0 : return
926 0 : (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
927 0 : (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
928 0 : (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
929 0 : (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
930 0 : (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
931 : }
932 :
933 : /* macros used to determine if the given codepoint is in the PUA range that
934 : * we are using to store aliases and named sequences */
935 : #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
936 : #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
937 : (cp < named_sequences_end))
938 :
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    /* Find the name associated with the given codepoint and copy it,
     * NUL-terminated, into buffer/buflen; returns 1 on success, 0 when
     * there is no name or the buffer is too small.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the codepoints in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: name is derived algorithmically from the jamo
           short names (see hangul_syllables), not stored in the tables. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        /* CJK ideograph names are also algorithmic. */
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    /* Decode the phrasebook entry: a sequence of lexicon word indexes,
       either one byte or (for indexes >= phrasebook_short) two bytes. */
    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
1040 :
1041 : static int
1042 0 : _cmpname(PyObject *self, int code, const char* name, int namelen)
1043 : {
1044 : /* check if code corresponds to the given name */
1045 : int i;
1046 : char buffer[NAME_MAXLEN];
1047 0 : if (!_getucname(self, code, buffer, sizeof(buffer), 1))
1048 0 : return 0;
1049 0 : for (i = 0; i < namelen; i++) {
1050 0 : if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1051 0 : return 0;
1052 : }
1053 0 : return buffer[namelen] == '\0';
1054 : }
1055 :
1056 : static void
1057 0 : find_syllable(const char *str, int *len, int *pos, int count, int column)
1058 : {
1059 : int i, len1;
1060 0 : *len = -1;
1061 0 : for (i = 0; i < count; i++) {
1062 0 : char *s = hangul_syllables[i][column];
1063 0 : len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1064 0 : if (len1 <= *len)
1065 0 : continue;
1066 0 : if (strncmp(str, s, len1) == 0) {
1067 0 : *len = len1;
1068 0 : *pos = i;
1069 : }
1070 : }
1071 0 : if (*len == -1) {
1072 0 : *len = 0;
1073 : }
1074 0 : }
1075 :
1076 : static int
1077 0 : _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1078 : {
1079 : /* check if named sequences are allowed */
1080 0 : if (!with_named_seq && IS_NAMED_SEQ(cp))
1081 0 : return 0;
1082 : /* if the codepoint is in the PUA range that we use for aliases,
1083 : * convert it to obtain the right codepoint */
1084 0 : if (IS_ALIAS(cp))
1085 0 : *code = name_aliases[cp-aliases_start];
1086 : else
1087 0 : *code = cp;
1088 0 : return 1;
1089 : }
1090 :
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
         int with_named_seq)
{
    /* Return the codepoint associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
     * using for the named sequence, and the caller must then convert it.
     * Returns 1 on success, 0 if the name is not found. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        /* Match the leading consonant, vowel and trailing consonant parts
           in turn (columns 0, 1, 2 of hangul_syllables).  find_syllable()
           sets len to 0 when a part does not match, leaving the index -1. */
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* All three parts must have matched and consumed the whole name. */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            /* Standard hangul syllable composition formula. */
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            /* Accept only 0-9 and upper-case A-F. */
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        /* The value must actually lie in a unified-ideograph block. */
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;  /* empty slot: unknown name */
    if (_cmpname(self, v, name, namelen))
        return _check_alias_and_seq(v, code, with_named_seq);
    /* Collision: probe the open-addressed table.  The increment sequence is
       derived from the hash and reduced by the polynomial code_poly,
       mirroring the table generator in Tools/unicode/makeunicodedata.py. */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;  /* hit an empty slot: unknown name */
        if (_cmpname(self, v, name, namelen))
            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
1171 :
/* Name-lookup function table exported to other modules through the
   "ucnhash_CAPI" capsule (see PyInit_unicodedata); lets code outside this
   module map codepoints to names and back without linking to it directly. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1178 :
1179 : /* -------------------------------------------------------------------- */
1180 : /* Python bindings */
1181 :
1182 : PyDoc_STRVAR(unicodedata_name__doc__,
1183 : "name(unichr[, default])\n\
1184 : Returns the name assigned to the Unicode character unichr as a\n\
1185 : string. If no name is defined, default is returned, or, if not\n\
1186 : given, ValueError is raised.");
1187 :
1188 : static PyObject *
1189 0 : unicodedata_name(PyObject* self, PyObject* args)
1190 : {
1191 : char name[NAME_MAXLEN];
1192 : Py_UCS4 c;
1193 :
1194 : PyUnicodeObject* v;
1195 0 : PyObject* defobj = NULL;
1196 0 : if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1197 0 : return NULL;
1198 :
1199 0 : c = getuchar(v);
1200 0 : if (c == (Py_UCS4)-1)
1201 0 : return NULL;
1202 :
1203 0 : if (!_getucname(self, c, name, sizeof(name), 0)) {
1204 0 : if (defobj == NULL) {
1205 0 : PyErr_SetString(PyExc_ValueError, "no such name");
1206 0 : return NULL;
1207 : }
1208 : else {
1209 0 : Py_INCREF(defobj);
1210 0 : return defobj;
1211 : }
1212 : }
1213 :
1214 0 : return PyUnicode_FromString(name);
1215 : }
1216 :
1217 : PyDoc_STRVAR(unicodedata_lookup__doc__,
1218 : "lookup(name)\n\
1219 : \n\
1220 : Look up character by name. If a character with the\n\
1221 : given name is found, return the corresponding Unicode\n\
1222 : character. If not found, KeyError is raised.");
1223 :
1224 : static PyObject *
1225 0 : unicodedata_lookup(PyObject* self, PyObject* args)
1226 : {
1227 : Py_UCS4 code;
1228 :
1229 : char* name;
1230 : int namelen;
1231 : unsigned int index;
1232 0 : if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1233 0 : return NULL;
1234 :
1235 0 : if (!_getcode(self, name, namelen, &code, 1)) {
1236 0 : PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1237 0 : return NULL;
1238 : }
1239 : /* check if code is in the PUA range that we use for named sequences
1240 : and convert it */
1241 0 : if (IS_NAMED_SEQ(code)) {
1242 0 : index = code-named_sequences_start;
1243 0 : return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1244 0 : named_sequences[index].seq,
1245 : named_sequences[index].seqlen);
1246 : }
1247 0 : return PyUnicode_FromOrdinal(code);
1248 : }
1249 :
/* Module method table; every entry's docstring is defined alongside its
   implementation above.  (These same methods are also installed on UCD_Type
   below, so they work on previous-version database objects too.) */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1274 :
/* Type of "previous database version" objects such as ucd_3_2_0 (created by
   new_previous_version() in PyInit_unicodedata).  Instances reuse the module
   method table, so every lookup function can also run against an older
   snapshot of the database. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1320 :
1321 : PyDoc_STRVAR(unicodedata_docstring,
1322 : "This module provides access to the Unicode Character Database which\n\
1323 : defines character properties for all Unicode characters. The data in\n\
1324 : this database is based on the UnicodeData.txt file version\n\
1325 : 6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
1326 : \n\
1327 : The module uses the same names and symbols as defined by the\n\
1328 : UnicodeData File Format 6.0.0 (see\n\
1329 : http://www.unicode.org/reports/tr44/tr44-6.html).");
1330 :
1331 :
/* Module definition.  m_size is -1: the module relies on process-wide static
   state (the generated database tables) and cannot be re-initialized. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",          /* m_name */
        unicodedata_docstring,  /* m_doc */
        -1,                     /* m_size */
        unicodedata_functions,  /* m_methods */
        NULL,                   /* m_reload */
        NULL,                   /* m_traverse */
        NULL,                   /* m_clear */
        NULL                    /* m_free */
};
1343 :
1344 : PyMODINIT_FUNC
1345 0 : PyInit_unicodedata(void)
1346 : {
1347 : PyObject *m, *v;
1348 :
1349 0 : Py_TYPE(&UCD_Type) = &PyType_Type;
1350 :
1351 0 : m = PyModule_Create(&unicodedatamodule);
1352 0 : if (!m)
1353 0 : return NULL;
1354 :
1355 0 : PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1356 0 : Py_INCREF(&UCD_Type);
1357 0 : PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1358 :
1359 : /* Previous versions */
1360 0 : v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1361 0 : if (v != NULL)
1362 0 : PyModule_AddObject(m, "ucd_3_2_0", v);
1363 :
1364 : /* Export C API */
1365 0 : v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1366 0 : if (v != NULL)
1367 0 : PyModule_AddObject(m, "ucnhash_CAPI", v);
1368 0 : return m;
1369 : }
1370 :
1371 : /*
1372 : Local variables:
1373 : c-basic-offset: 4
1374 : indent-tabs-mode: nil
1375 : End:
1376 : */
|