Line data Source code
1 : /* ------------------------------------------------------------------------
2 :
3 : unicodedata -- Provides access to the Unicode database.
4 :
5 : Data was extracted from the UnicodeData.txt file.
6 : The current version number is reported in the unidata_version constant.
7 :
8 : Written by Marc-Andre Lemburg (mal@lemburg.com).
9 : Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 : Modified by Martin v. Löwis (martin@v.loewis.de)
11 :
12 : Copyright (c) Corporation for National Research Initiatives.
13 :
14 : ------------------------------------------------------------------------ */
15 :
16 : #include "Python.h"
17 : #include "ucnhash.h"
18 : #include "structmember.h"
19 :
20 : /* character properties */
21 :
/* One record of the static Unicode property database generated into
   unicodedata_db.h.  Every field fits in one byte; string-valued
   properties are stored as indexes into shared name tables. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                           _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
33 :
/* Per-codepoint delta between the current database and an older Unicode
   version (e.g. 3.2.0).  The byte-sized *_changed fields use 0xFF as a
   sentinel meaning "unchanged"; category_changed == 0 means the character
   was unassigned in the old version (see the lookup functions below). */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
42 :
43 : /* data file generated by Tools/unicode/makeunicodedata.py */
44 : #include "unicodedata_db.h"
45 :
46 : static const _PyUnicode_DatabaseRecord*
47 0 : _getrecord_ex(Py_UCS4 code)
48 : {
49 : int index;
50 0 : if (code >= 0x110000)
51 0 : index = 0;
52 : else {
53 0 : index = index1[(code>>SHIFT)];
54 0 : index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 : }
56 :
57 0 : return &_PyUnicode_Database_Records[index];
58 : }
59 :
60 : /* ------------- Previous-version API ------------------------------------- */
/* A UCD object exposing an older snapshot of the Unicode database.
   Per-version behavior is routed through the two function pointers. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                            /* version string, e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4);  /* per-codepoint deltas */
    Py_UCS4 (*normalization)(Py_UCS4);           /* normalization corrections */
} PreviousDBVersion;
67 :
68 : #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 :
/* Attributes exposed on UCD objects: only the read-only version string,
   published to Python as "unidata_version". */
static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};
74 :
75 : /* forward declaration */
76 : static PyTypeObject UCD_Type;
77 : #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
78 :
79 : static PyObject*
80 0 : new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
81 : Py_UCS4 (*normalization)(Py_UCS4))
82 : {
83 : PreviousDBVersion *self;
84 0 : self = PyObject_New(PreviousDBVersion, &UCD_Type);
85 0 : if (self == NULL)
86 0 : return NULL;
87 0 : self->name = name;
88 0 : self->getrecord = getrecord;
89 0 : self->normalization = normalization;
90 0 : return (PyObject*)self;
91 : }
92 :
93 :
94 0 : static Py_UCS4 getuchar(PyUnicodeObject *obj)
95 : {
96 0 : if (PyUnicode_READY(obj))
97 0 : return (Py_UCS4)-1;
98 0 : if (PyUnicode_GET_LENGTH(obj) == 1) {
99 0 : if (PyUnicode_READY(obj))
100 0 : return (Py_UCS4)-1;
101 0 : return PyUnicode_READ_CHAR(obj, 0);
102 : }
103 0 : PyErr_SetString(PyExc_TypeError,
104 : "need a single Unicode character as parameter");
105 0 : return (Py_UCS4)-1;
106 : }
107 :
108 : /* --- Module API --------------------------------------------------------- */
109 :
110 : PyDoc_STRVAR(unicodedata_decimal__doc__,
111 : "decimal(unichr[, default])\n\
112 : \n\
113 : Returns the decimal value assigned to the Unicode character unichr\n\
114 : as integer. If no such value is defined, default is returned, or, if\n\
115 : not given, ValueError is raised.");
116 :
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;        /* optional fallback value */
    int have_old = 0;               /* old-version delta supplied rc */
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    /* When called on a previous-version UCD object, apply the recorded
       deltas for that Unicode version before consulting the current data. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            /* 0xFF means "unchanged"; any other byte is the old value. */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        /* No decimal value: return the default if given, else raise. */
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyLong_FromLong(rc);
}
160 :
161 : PyDoc_STRVAR(unicodedata_digit__doc__,
162 : "digit(unichr[, default])\n\
163 : \n\
164 : Returns the digit value assigned to the Unicode character unichr as\n\
165 : integer. If no such value is defined, default is returned, or, if\n\
166 : not given, ValueError is raised.");
167 :
168 : static PyObject *
169 0 : unicodedata_digit(PyObject *self, PyObject *args)
170 : {
171 : PyUnicodeObject *v;
172 0 : PyObject *defobj = NULL;
173 : long rc;
174 : Py_UCS4 c;
175 :
176 0 : if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
177 0 : return NULL;
178 0 : c = getuchar(v);
179 0 : if (c == (Py_UCS4)-1)
180 0 : return NULL;
181 0 : rc = Py_UNICODE_TODIGIT(c);
182 0 : if (rc < 0) {
183 0 : if (defobj == NULL) {
184 0 : PyErr_SetString(PyExc_ValueError, "not a digit");
185 0 : return NULL;
186 : }
187 : else {
188 0 : Py_INCREF(defobj);
189 0 : return defobj;
190 : }
191 : }
192 0 : return PyLong_FromLong(rc);
193 : }
194 :
195 : PyDoc_STRVAR(unicodedata_numeric__doc__,
196 : "numeric(unichr[, default])\n\
197 : \n\
198 : Returns the numeric value assigned to the Unicode character unichr\n\
199 : as float. If no such value is defined, default is returned, or, if\n\
200 : not given, ValueError is raised.");
201 :
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;        /* optional fallback value */
    int have_old = 0;               /* old-version delta supplied rc */
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    /* When called on a previous-version UCD object, apply the recorded
       deltas for that Unicode version before consulting the current data. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            /* NOTE(review): this reads decimal_changed even though
               change_record also carries a numeric_changed (double) field
               that is unused here -- looks like it should consult
               numeric_changed; confirm against merge_old_version in
               Tools/unicode/makeunicodedata.py. */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    /* Py_UNICODE_TONUMERIC uses -1.0 as its "no value" sentinel. */
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
244 :
245 : PyDoc_STRVAR(unicodedata_category__doc__,
246 : "category(unichr)\n\
247 : \n\
248 : Returns the general category assigned to the Unicode character\n\
249 : unichr as string.");
250 :
251 : static PyObject *
252 0 : unicodedata_category(PyObject *self, PyObject *args)
253 : {
254 : PyUnicodeObject *v;
255 : int index;
256 : Py_UCS4 c;
257 :
258 0 : if (!PyArg_ParseTuple(args, "O!:category",
259 : &PyUnicode_Type, &v))
260 0 : return NULL;
261 0 : c = getuchar(v);
262 0 : if (c == (Py_UCS4)-1)
263 0 : return NULL;
264 0 : index = (int) _getrecord_ex(c)->category;
265 0 : if (self && UCD_Check(self)) {
266 0 : const change_record *old = get_old_record(self, c);
267 0 : if (old->category_changed != 0xFF)
268 0 : index = old->category_changed;
269 : }
270 0 : return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
271 : }
272 :
273 : PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274 : "bidirectional(unichr)\n\
275 : \n\
276 : Returns the bidirectional category assigned to the Unicode character\n\
277 : unichr as string. If no such value is defined, an empty string is\n\
278 : returned.");
279 :
280 : static PyObject *
281 0 : unicodedata_bidirectional(PyObject *self, PyObject *args)
282 : {
283 : PyUnicodeObject *v;
284 : int index;
285 : Py_UCS4 c;
286 :
287 0 : if (!PyArg_ParseTuple(args, "O!:bidirectional",
288 : &PyUnicode_Type, &v))
289 0 : return NULL;
290 0 : c = getuchar(v);
291 0 : if (c == (Py_UCS4)-1)
292 0 : return NULL;
293 0 : index = (int) _getrecord_ex(c)->bidirectional;
294 0 : if (self && UCD_Check(self)) {
295 0 : const change_record *old = get_old_record(self, c);
296 0 : if (old->category_changed == 0)
297 0 : index = 0; /* unassigned */
298 0 : else if (old->bidir_changed != 0xFF)
299 0 : index = old->bidir_changed;
300 : }
301 0 : return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
302 : }
303 :
304 : PyDoc_STRVAR(unicodedata_combining__doc__,
305 : "combining(unichr)\n\
306 : \n\
307 : Returns the canonical combining class assigned to the Unicode\n\
308 : character unichr as integer. Returns 0 if no combining class is\n\
309 : defined.");
310 :
311 : static PyObject *
312 0 : unicodedata_combining(PyObject *self, PyObject *args)
313 : {
314 : PyUnicodeObject *v;
315 : int index;
316 : Py_UCS4 c;
317 :
318 0 : if (!PyArg_ParseTuple(args, "O!:combining",
319 : &PyUnicode_Type, &v))
320 0 : return NULL;
321 0 : c = getuchar(v);
322 0 : if (c == (Py_UCS4)-1)
323 0 : return NULL;
324 0 : index = (int) _getrecord_ex(c)->combining;
325 0 : if (self && UCD_Check(self)) {
326 0 : const change_record *old = get_old_record(self, c);
327 0 : if (old->category_changed == 0)
328 0 : index = 0; /* unassigned */
329 : }
330 0 : return PyLong_FromLong(index);
331 : }
332 :
333 : PyDoc_STRVAR(unicodedata_mirrored__doc__,
334 : "mirrored(unichr)\n\
335 : \n\
336 : Returns the mirrored property assigned to the Unicode character\n\
337 : unichr as integer. Returns 1 if the character has been identified as\n\
338 : a \"mirrored\" character in bidirectional text, 0 otherwise.");
339 :
340 : static PyObject *
341 0 : unicodedata_mirrored(PyObject *self, PyObject *args)
342 : {
343 : PyUnicodeObject *v;
344 : int index;
345 : Py_UCS4 c;
346 :
347 0 : if (!PyArg_ParseTuple(args, "O!:mirrored",
348 : &PyUnicode_Type, &v))
349 0 : return NULL;
350 0 : c = getuchar(v);
351 0 : if (c == (Py_UCS4)-1)
352 0 : return NULL;
353 0 : index = (int) _getrecord_ex(c)->mirrored;
354 0 : if (self && UCD_Check(self)) {
355 0 : const change_record *old = get_old_record(self, c);
356 0 : if (old->category_changed == 0)
357 0 : index = 0; /* unassigned */
358 0 : else if (old->mirrored_changed != 0xFF)
359 0 : index = old->mirrored_changed;
360 : }
361 0 : return PyLong_FromLong(index);
362 : }
363 :
364 : PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365 : "east_asian_width(unichr)\n\
366 : \n\
367 : Returns the east asian width assigned to the Unicode character\n\
368 : unichr as string.");
369 :
370 : static PyObject *
371 0 : unicodedata_east_asian_width(PyObject *self, PyObject *args)
372 : {
373 : PyUnicodeObject *v;
374 : int index;
375 : Py_UCS4 c;
376 :
377 0 : if (!PyArg_ParseTuple(args, "O!:east_asian_width",
378 : &PyUnicode_Type, &v))
379 0 : return NULL;
380 0 : c = getuchar(v);
381 0 : if (c == (Py_UCS4)-1)
382 0 : return NULL;
383 0 : index = (int) _getrecord_ex(c)->east_asian_width;
384 0 : if (self && UCD_Check(self)) {
385 0 : const change_record *old = get_old_record(self, c);
386 0 : if (old->category_changed == 0)
387 0 : index = 0; /* unassigned */
388 : }
389 0 : return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 : }
391 :
392 : PyDoc_STRVAR(unicodedata_decomposition__doc__,
393 : "decomposition(unichr)\n\
394 : \n\
395 : Returns the character decomposition mapping assigned to the Unicode\n\
396 : character unichr as string. An empty string is returned in case no\n\
397 : such mapping is defined.");
398 :
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];               /* assembled result string */
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    /* A previous-version database reports unassigned characters as
       having no decomposition at all. */
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString(""); /* unassigned */
    }

    /* Two-level trie lookup (same scheme as _getrecord_ex, but over the
       decomposition tables). */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix (the compatibility tag, e.g. "<compat>"; empty for
       canonical decompositions) */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    /* append each decomposition codepoint as space-separated uppercase
       hex, at least four digits wide */
    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}
459 :
/* Decode the decomposition entry for 'code': on return *index points at
   the first decomposition codepoint in decomp_data, *prefix is the index
   of the compatibility tag in decomp_prefix (0 for canonical
   decompositions), and *count is the number of codepoints.  Unassigned
   or out-of-range codepoints yield entry 0 (count == 0). */
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && UCD_Check(self) &&
               get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        /* two-level trie lookup */
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    /* advance past the count/prefix word to the first codepoint */
    (*index)++;
}
483 :
484 : #define SBase 0xAC00
485 : #define LBase 0x1100
486 : #define VBase 0x1161
487 : #define TBase 0x11A7
488 : #define LCount 19
489 : #define VCount 21
490 : #define TCount 28
491 : #define NCount (VCount*TCount)
492 : #define SCount (LCount*NCount)
493 :
/* Decompose 'input' to NFD (k == 0) or NFKD (k == 1) and return a new
   string (new reference, NULL on error).  Two passes: first fully
   decompose each character via a small explicit work stack, then
   bubble combining marks into canonical order in place. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    /* Overallocate atmost 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    osize = space;
    output = PyMem_Malloc(space * sizeof(Py_UCS4));
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need atleast that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition: purely arithmetic split into L, V
               and optional T jamo -- no table lookup needed. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes: old-version corrections replace the
               codepoint, which is then reprocessed from the stack */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically: combining marks (class > 0) must appear in
       non-decreasing class order between starters (class 0). */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            /* swap positions o and o+1, then keep sinking leftwards */
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
619 :
620 : static int
621 0 : find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
622 : {
623 : unsigned int index;
624 0 : for (index = 0; nfc[index].start; index++) {
625 0 : unsigned int start = nfc[index].start;
626 0 : if (code < start)
627 0 : return -1;
628 0 : if (code <= start + nfc[index].count) {
629 0 : unsigned int delta = code - start;
630 0 : return nfc[index].index + delta;
631 : }
632 : }
633 0 : return -1;
634 : }
635 :
/* Compose 'input' to NFC (k == 0) or NFKC (k == 1).  First decomposes
   via nfd_nfkd(), then combines each starter with the following
   combinable characters using the nfc_first/nfc_last pair tables.
   Characters consumed by a composition are remembered in 'skipped' and
   dropped when the scan reaches them.  Returns a new reference. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];     /* positions already consumed by a composition */
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_Malloc(len * sizeof(Py_UCS4));
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        /* Drop characters that an earlier composition already consumed. */
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
            /* arithmetic L+V(+T) -> syllable composition */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
                TBase <= PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            /* not a composition starter: copy through */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* look up the composed character in the pair table */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* the composed character may itself start further compositions */
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
770 :
771 : /* Return 1 if the input is certainly normalized, 0 if it might not be. */
772 : static int
773 0 : is_normalized(PyObject *self, PyObject *input, int nfc, int k)
774 : {
775 : Py_ssize_t i, len;
776 : int kind;
777 : void *data;
778 0 : unsigned char prev_combining = 0, quickcheck_mask;
779 :
780 : /* An older version of the database is requested, quickchecks must be
781 : disabled. */
782 0 : if (self && UCD_Check(self))
783 0 : return 0;
784 :
785 : /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
786 : as described in http://unicode.org/reports/tr15/#Annex8. */
787 0 : quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
788 :
789 0 : i = 0;
790 0 : kind = PyUnicode_KIND(input);
791 0 : data = PyUnicode_DATA(input);
792 0 : len = PyUnicode_GET_LENGTH(input);
793 0 : while (i < len) {
794 0 : Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
795 0 : const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
796 0 : unsigned char combining = record->combining;
797 0 : unsigned char quickcheck = record->normalization_quick_check;
798 :
799 0 : if (quickcheck & quickcheck_mask)
800 0 : return 0; /* this string might need normalization */
801 0 : if (combining && prev_combining > combining)
802 0 : return 0; /* non-canonical sort order, not normalized */
803 0 : prev_combining = combining;
804 : }
805 0 : return 1; /* certainly normalized */
806 : }
807 :
808 : PyDoc_STRVAR(unicodedata_normalize__doc__,
809 : "normalize(form, unistr)\n\
810 : \n\
811 : Return the normal form 'form' for the Unicode string unistr. Valid\n\
812 : values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
813 :
814 : static PyObject*
815 0 : unicodedata_normalize(PyObject *self, PyObject *args)
816 : {
817 : char *form;
818 : PyObject *input;
819 :
820 0 : if(!PyArg_ParseTuple(args, "sO!:normalize",
821 : &form, &PyUnicode_Type, &input))
822 0 : return NULL;
823 :
824 0 : if (PyUnicode_READY(input) == -1)
825 0 : return NULL;
826 :
827 0 : if (PyUnicode_GET_LENGTH(input) == 0) {
828 : /* Special case empty input strings, since resizing
829 : them later would cause internal errors. */
830 0 : Py_INCREF(input);
831 0 : return input;
832 : }
833 :
834 0 : if (strcmp(form, "NFC") == 0) {
835 0 : if (is_normalized(self, input, 1, 0)) {
836 0 : Py_INCREF(input);
837 0 : return input;
838 : }
839 0 : return nfc_nfkc(self, input, 0);
840 : }
841 0 : if (strcmp(form, "NFKC") == 0) {
842 0 : if (is_normalized(self, input, 1, 1)) {
843 0 : Py_INCREF(input);
844 0 : return input;
845 : }
846 0 : return nfc_nfkc(self, input, 1);
847 : }
848 0 : if (strcmp(form, "NFD") == 0) {
849 0 : if (is_normalized(self, input, 0, 0)) {
850 0 : Py_INCREF(input);
851 0 : return input;
852 : }
853 0 : return nfd_nfkd(self, input, 0);
854 : }
855 0 : if (strcmp(form, "NFKD") == 0) {
856 0 : if (is_normalized(self, input, 0, 1)) {
857 0 : Py_INCREF(input);
858 0 : return input;
859 : }
860 0 : return nfd_nfkd(self, input, 1);
861 : }
862 0 : PyErr_SetString(PyExc_ValueError, "invalid normalization form");
863 0 : return NULL;
864 : }
865 :
866 : /* -------------------------------------------------------------------- */
867 : /* unicode character name tables */
868 :
869 : /* data file generated by Tools/unicode/makeunicodedata.py */
870 : #include "unicodename_db.h"
871 :
872 : /* -------------------------------------------------------------------- */
873 : /* database code (cut and pasted from the unidb package) */
874 :
/* Hash an ASCII name, case-insensitively, into 24 bits using the given
   multiplier; must match the hash used by makeunicodedata.py. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int pos;
    unsigned long h = 0;

    for (pos = 0; pos < len; pos++) {
        unsigned long overflow;
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        /* fold any bits above 24 back into the low bits */
        overflow = h & 0xff000000;
        if (overflow)
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
889 :
/* Jamo short-name fragments used to build Hangul syllable names, per the
   Unicode name-derivation algorithm.  Column 0 = leading consonant (L,
   19 entries), column 1 = vowel (V, 21 entries), column 2 = trailing
   consonant (T, 28 entries); rows past a column's count are 0. */
static char *hangul_syllables[][3] = {
    { "G", "A", "" },
    { "GG", "AE", "G" },
    { "N", "YA", "GG" },
    { "D", "YAE", "GS" },
    { "DD", "EO", "N", },
    { "R", "E", "NJ" },
    { "M", "YEO", "NH" },
    { "B", "YE", "D" },
    { "BB", "O", "L" },
    { "S", "WA", "LG" },
    { "SS", "WAE", "LM" },
    { "", "OE", "LB" },
    { "J", "YO", "LS" },
    { "JJ", "U", "LT" },
    { "C", "WEO", "LP" },
    { "K", "WE", "LH" },
    { "T", "WI", "M" },
    { "P", "YU", "B" },
    { "H", "EU", "BS" },
    { 0, "YI", "S" },
    { 0, "I", "SS" },
    { 0, 0, "NG" },
    { 0, 0, "J" },
    { 0, 0, "C" },
    { 0, 0, "K" },
    { 0, 0, "T" },
    { 0, 0, "P" },
    { 0, 0, "H" }
};
920 :
921 : /* These ranges need to match makeunicodedata.py:cjk_ranges. */
922 : static int
923 0 : is_unified_ideograph(Py_UCS4 code)
924 : {
925 0 : return
926 0 : (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
927 0 : (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
928 0 : (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
929 0 : (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
930 0 : (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
931 : }
932 :
933 : /* macros used to determine if the given codepoint is in the PUA range that
934 : * we are using to store aliases and named sequences */
935 : #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
936 : #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
937 : (cp < named_sequences_end))
938 :
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    /* Find the name associated with the given codepoint and copy it,
     * NUL-terminated, into buffer/buflen; returns 1 on success, 0 when
     * there is no name or the buffer is too small.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the codepoints in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: name is derived algorithmically from the jamo
           short names (see hangul_syllables), not stored in the tables. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        /* CJK ideograph names are also algorithmic. */
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    /* Decode the phrasebook entry: a sequence of lexicon word indexes,
       either one byte or (for indexes >= phrasebook_short) two bytes. */
    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
1040 :
1041 : static int
1042 0 : _cmpname(PyObject *self, int code, const char* name, int namelen)
1043 : {
1044 : /* check if code corresponds to the given name */
1045 : int i;
1046 : char buffer[NAME_MAXLEN];
1047 0 : if (!_getucname(self, code, buffer, sizeof(buffer), 1))
1048 0 : return 0;
1049 0 : for (i = 0; i < namelen; i++) {
1050 0 : if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1051 0 : return 0;
1052 : }
1053 0 : return buffer[namelen] == '\0';
1054 : }
1055 :
1056 : static void
1057 0 : find_syllable(const char *str, int *len, int *pos, int count, int column)
1058 : {
1059 : int i, len1;
1060 0 : *len = -1;
1061 0 : for (i = 0; i < count; i++) {
1062 0 : char *s = hangul_syllables[i][column];
1063 0 : len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1064 0 : if (len1 <= *len)
1065 0 : continue;
1066 0 : if (strncmp(str, s, len1) == 0) {
1067 0 : *len = len1;
1068 0 : *pos = i;
1069 : }
1070 : }
1071 0 : if (*len == -1) {
1072 0 : *len = 0;
1073 : }
1074 0 : }
1075 :
1076 : static int
1077 0 : _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1078 : {
1079 : /* check if named sequences are allowed */
1080 0 : if (!with_named_seq && IS_NAMED_SEQ(cp))
1081 0 : return 0;
1082 : /* if the codepoint is in the PUA range that we use for aliases,
1083 : * convert it to obtain the right codepoint */
1084 0 : if (IS_ALIAS(cp))
1085 0 : *code = name_aliases[cp-aliases_start];
1086 : else
1087 0 : *code = cp;
1088 0 : return 1;
1089 : }
1090 :
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
         int with_named_seq)
{
    /* Return the codepoint associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
     * using for the named sequence, and the caller must then convert it.
     * Returns 1 on success, 0 if the name is not found. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        /* Match the leading consonant, vowel and trailing consonant parts
           in turn (columns 0, 1, 2 of hangul_syllables).  find_syllable()
           sets len to 0 when a part does not match, leaving the index -1. */
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* All three parts must have matched and consumed the whole name. */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            /* Standard hangul syllable composition formula. */
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            /* Accept only 0-9 and upper-case A-F. */
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        /* The value must actually lie in a unified-ideograph block. */
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;  /* empty slot: unknown name */
    if (_cmpname(self, v, name, namelen))
        return _check_alias_and_seq(v, code, with_named_seq);
    /* Collision: probe the open-addressed table.  The increment sequence is
       derived from the hash and reduced by the polynomial code_poly,
       mirroring the table generator in Tools/unicode/makeunicodedata.py. */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;  /* hit an empty slot: unknown name */
        if (_cmpname(self, v, name, namelen))
            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
1171 :
/* Name-lookup function table exported to other modules through the
   "ucnhash_CAPI" capsule (see PyInit_unicodedata); lets code outside this
   module map codepoints to names and back without linking to it directly. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1178 :
1179 : /* -------------------------------------------------------------------- */
1180 : /* Python bindings */
1181 :
1182 : PyDoc_STRVAR(unicodedata_name__doc__,
1183 : "name(unichr[, default])\n\
1184 : Returns the name assigned to the Unicode character unichr as a\n\
1185 : string. If no name is defined, default is returned, or, if not\n\
1186 : given, ValueError is raised.");
1187 :
1188 : static PyObject *
1189 0 : unicodedata_name(PyObject* self, PyObject* args)
1190 : {
1191 : char name[NAME_MAXLEN];
1192 : Py_UCS4 c;
1193 :
1194 : PyUnicodeObject* v;
1195 0 : PyObject* defobj = NULL;
1196 0 : if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1197 0 : return NULL;
1198 :
1199 0 : c = getuchar(v);
1200 0 : if (c == (Py_UCS4)-1)
1201 0 : return NULL;
1202 :
1203 0 : if (!_getucname(self, c, name, sizeof(name), 0)) {
1204 0 : if (defobj == NULL) {
1205 0 : PyErr_SetString(PyExc_ValueError, "no such name");
1206 0 : return NULL;
1207 : }
1208 : else {
1209 0 : Py_INCREF(defobj);
1210 0 : return defobj;
1211 : }
1212 : }
1213 :
1214 0 : return PyUnicode_FromString(name);
1215 : }
1216 :
1217 : PyDoc_STRVAR(unicodedata_lookup__doc__,
1218 : "lookup(name)\n\
1219 : \n\
1220 : Look up character by name. If a character with the\n\
1221 : given name is found, return the corresponding Unicode\n\
1222 : character. If not found, KeyError is raised.");
1223 :
1224 : static PyObject *
1225 0 : unicodedata_lookup(PyObject* self, PyObject* args)
1226 : {
1227 : Py_UCS4 code;
1228 :
1229 : char* name;
1230 : int namelen;
1231 : unsigned int index;
1232 0 : if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1233 0 : return NULL;
1234 :
1235 0 : if (!_getcode(self, name, namelen, &code, 1)) {
1236 0 : PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1237 0 : return NULL;
1238 : }
1239 : /* check if code is in the PUA range that we use for named sequences
1240 : and convert it */
1241 0 : if (IS_NAMED_SEQ(code)) {
1242 0 : index = code-named_sequences_start;
1243 0 : return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1244 0 : named_sequences[index].seq,
1245 : named_sequences[index].seqlen);
1246 : }
1247 0 : return PyUnicode_FromOrdinal(code);
1248 : }
1249 :
/* Module method table; every entry's docstring is defined alongside its
   implementation above.  (These same methods are also installed on UCD_Type
   below, so they work on previous-version database objects too.) */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1274 :
/* Type of "previous database version" objects such as ucd_3_2_0 (created by
   new_previous_version() in PyInit_unicodedata).  Instances reuse the module
   method table, so every lookup function can also run against an older
   snapshot of the database. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1320 :
1321 : PyDoc_STRVAR(unicodedata_docstring,
1322 : "This module provides access to the Unicode Character Database which\n\
1323 : defines character properties for all Unicode characters. The data in\n\
1324 : this database is based on the UnicodeData.txt file version\n\
1325 : 6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
1326 : \n\
1327 : The module uses the same names and symbols as defined by the\n\
1328 : UnicodeData File Format 6.0.0 (see\n\
1329 : http://www.unicode.org/reports/tr44/tr44-6.html).");
1330 :
1331 :
/* Module definition.  m_size is -1: the module relies on process-wide static
   state (the generated database tables) and cannot be re-initialized. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",          /* m_name */
        unicodedata_docstring,  /* m_doc */
        -1,                     /* m_size */
        unicodedata_functions,  /* m_methods */
        NULL,                   /* m_reload */
        NULL,                   /* m_traverse */
        NULL,                   /* m_clear */
        NULL                    /* m_free */
};
1343 :
1344 : PyMODINIT_FUNC
1345 0 : PyInit_unicodedata(void)
1346 : {
1347 : PyObject *m, *v;
1348 :
1349 0 : Py_TYPE(&UCD_Type) = &PyType_Type;
1350 :
1351 0 : m = PyModule_Create(&unicodedatamodule);
1352 0 : if (!m)
1353 0 : return NULL;
1354 :
1355 0 : PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1356 0 : Py_INCREF(&UCD_Type);
1357 0 : PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1358 :
1359 : /* Previous versions */
1360 0 : v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1361 0 : if (v != NULL)
1362 0 : PyModule_AddObject(m, "ucd_3_2_0", v);
1363 :
1364 : /* Export C API */
1365 0 : v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1366 0 : if (v != NULL)
1367 0 : PyModule_AddObject(m, "ucnhash_CAPI", v);
1368 0 : return m;
1369 : }
1370 :
1371 : /*
1372 : Local variables:
1373 : c-basic-offset: 4
1374 : indent-tabs-mode: nil
1375 : End:
1376 : */
|