Line data Source code
1 : /* stringlib: codec implementations */
2 :
3 : #if STRINGLIB_IS_UNICODE
4 :
5 : /* Mask to check or force alignment of a pointer to C 'long' boundaries */
6 : #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
7 :
8 : /* Mask to quickly check whether a C 'long' contains a
9 : non-ASCII, UTF8-encoded char. */
10 : #if (SIZEOF_LONG == 8)
11 : # define ASCII_CHAR_MASK 0x8080808080808080UL
12 : #elif (SIZEOF_LONG == 4)
13 : # define ASCII_CHAR_MASK 0x80808080UL
14 : #else
15 : # error C 'long' size should be either 4 or 8!
16 : #endif
17 :
18 : /* 10xxxxxx */
19 : #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
20 :
/* Decode UTF-8 bytes from *inptr up to (not including) end, storing one
   STRINGLIB_CHAR per decoded code point at dest[*outpos] onwards.

   On return *inptr points at the first byte that was not consumed and
   *outpos is the index just past the last character stored.

   Return value:
     0      the loop stopped normally: either all input was consumed, or the
            input ends in the middle of a multi-byte sequence (*inptr is then
            left on that sequence's lead byte, so *inptr != end);
     1      *inptr points at an invalid start byte (80-C1 or F5-FF);
     2      *inptr points at a lead byte whose following bytes are invalid
            (also used for overlong forms, surrogates and values > 10FFFF);
     other  a decoded code point that does not fit in STRINGLIB_CHAR.  Its
            bytes have been consumed (*inptr is past them) but it has NOT
            been stored in dest.

   No bound for dest is passed in, so nothing here checks that dest has
   room for the output. */
21 : Py_LOCAL_INLINE(Py_UCS4)
22 163 : STRINGLIB(utf8_decode)(const char **inptr, const char *end,
23 : STRINGLIB_CHAR *dest,
24 : Py_ssize_t *outpos)
25 : {
26 : Py_UCS4 ch;
27 163 : const char *s = *inptr;
/* end rounded down to a multiple of SIZEOF_LONG: a whole-long read that
   starts below this address stays inside the input. */
28 163 : const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
29 163 : STRINGLIB_CHAR *p = dest + *outpos;
30 :
31 1054 : while (s < end) {
32 822 : ch = (unsigned char)*s;
33 :
34 822 : if (ch < 0x80) {
35 : /* Fast path for runs of ASCII characters. Given that common UTF-8
36 : input will consist of an overwhelming majority of ASCII
37 : characters, we try to optimize for this case by checking
38 : as many characters as a C 'long' can contain.
39 : First, check if we can do an aligned read, as most CPUs have
40 : a penalty for unaligned reads.
41 : */
42 544 : if (!((size_t) s & LONG_PTR_MASK)) {
43 : /* Help register allocation */
44 187 : register const char *_s = s;
45 187 : register STRINGLIB_CHAR *_p = p;
46 4788 : while (_s < aligned_end) {
47 : /* Read a whole long at a time (either 4 or 8 bytes),
48 : and do a fast unrolled copy if it only contains ASCII
49 : characters. */
50 4551 : unsigned long value = *(unsigned long *) _s;
/* Any byte with its top bit set means non-ASCII: fall back to the
   byte-at-a-time code below without consuming this long. */
51 4551 : if (value & ASCII_CHAR_MASK)
52 137 : break;
/* Unpack the bytes in memory order; which end of 'value' holds the first
   byte depends on the host byte order. */
53 : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
54 4414 : _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
55 4414 : _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
56 4414 : _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
57 4414 : _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
58 : # if SIZEOF_LONG == 8
59 : _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
60 : _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
61 : _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
62 : _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
63 : # endif
64 : #else
65 : # if SIZEOF_LONG == 8
66 : _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
67 : _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
68 : _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
69 : _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
70 : _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
71 : _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
72 : _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
73 : _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
74 : # else
75 : _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
76 : _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
77 : _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
78 : _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
79 : # endif
80 : #endif
81 4414 : _s += SIZEOF_LONG;
82 4414 : _p += SIZEOF_LONG;
83 : }
84 187 : s = _s;
85 187 : p = _p;
/* The long-at-a-time loop may have consumed the whole input. */
86 187 : if (s == end)
87 7 : break;
/* s may have moved, so reload the current byte before testing it again. */
88 180 : ch = (unsigned char)*s;
89 : }
/* Single ASCII byte: s was unaligned, or this is the byte the fast loop
   stopped on / the tail after aligned_end. */
90 537 : if (ch < 0x80) {
91 505 : s++;
92 505 : *p++ = ch;
93 505 : continue;
94 : }
95 : }
96 :
97 310 : if (ch < 0xC2) {
98 : /* invalid sequence
99 : \x80-\xBF -- continuation byte
100 : \xC0-\xC1 -- fake 0000-007F */
101 0 : goto InvalidStart;
102 : }
103 :
104 310 : if (ch < 0xE0) {
105 : /* \xC2\x80-\xDF\xBF -- 0080-07FF */
106 : Py_UCS4 ch2;
107 102 : if (end - s < 2) {
108 : /* unexpected end of data: the caller will decide whether
109 : it's an error or not */
110 0 : break;
111 : }
112 102 : ch2 = (unsigned char)s[1];
113 102 : if (!IS_CONTINUATION_BYTE(ch2))
114 : /* invalid continuation byte */
115 : goto InvalidContinuation;
/* Combine the payload bits; the 110xxxxx / 10xxxxxx tag bits are removed
   by subtracting them once, already shifted into place. */
116 102 : ch = (ch << 6) + ch2 -
117 : ((0xC0 << 6) + 0x80);
118 : assert ((ch > 0x007F) && (ch <= 0x07FF));
/* s is advanced before the range test, so a jump to Overflow returns with
   the sequence already consumed. */
119 102 : s += 2;
/* First operand: kinds that can never hold a 2-byte code point always
   overflow here (presumably folded at compile time -- STRINGLIB_MAX_CHAR
   is defined outside this view). */
120 4 : if (STRINGLIB_MAX_CHAR <= 0x007F ||
121 : (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
122 12 : goto Overflow;
123 90 : *p++ = ch;
124 90 : continue;
125 : }
126 :
127 208 : if (ch < 0xF0) {
128 : /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
129 : Py_UCS4 ch2, ch3;
130 208 : if (end - s < 3) {
131 : /* unexpected end of data: the caller will decide whether
132 : it's an error or not */
133 0 : break;
134 : }
135 208 : ch2 = (unsigned char)s[1];
136 208 : ch3 = (unsigned char)s[2];
137 208 : if (!IS_CONTINUATION_BYTE(ch2) ||
138 208 : !IS_CONTINUATION_BYTE(ch3)) {
139 : /* invalid continuation byte */
140 : goto InvalidContinuation;
141 : }
142 208 : if (ch == 0xE0) {
143 0 : if (ch2 < 0xA0)
144 : /* invalid sequence (overlong form)
145 : \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
146 0 : goto InvalidContinuation;
147 : }
148 208 : else if (ch == 0xED && ch2 > 0x9F) {
149 : /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
150 : will result in surrogates in range D800-DFFF. Surrogates are
151 : not valid UTF-8 so they are rejected.
152 : See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
153 : (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
154 0 : goto InvalidContinuation;
155 : }
156 208 : ch = (ch << 12) + (ch2 << 6) + ch3 -
157 : ((0xE0 << 12) + (0x80 << 6) + 0x80);
158 : assert ((ch > 0x07FF) && (ch <= 0xFFFF));
159 208 : s += 3;
160 : if (STRINGLIB_MAX_CHAR <= 0x07FF ||
161 : (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
162 75 : goto Overflow;
163 133 : *p++ = ch;
164 133 : continue;
165 : }
166 :
167 0 : if (ch < 0xF5) {
168 : /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
169 : Py_UCS4 ch2, ch3, ch4;
170 0 : if (end - s < 4) {
171 : /* unexpected end of data: the caller will decide whether
172 : it's an error or not */
173 0 : break;
174 : }
175 0 : ch2 = (unsigned char)s[1];
176 0 : ch3 = (unsigned char)s[2];
177 0 : ch4 = (unsigned char)s[3];
178 0 : if (!IS_CONTINUATION_BYTE(ch2) ||
179 0 : !IS_CONTINUATION_BYTE(ch3) ||
180 0 : !IS_CONTINUATION_BYTE(ch4)) {
181 : /* invalid continuation byte */
182 : goto InvalidContinuation;
183 : }
184 0 : if (ch == 0xF0) {
185 0 : if (ch2 < 0x90)
186 : /* invalid sequence (overlong form)
187 : \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
188 0 : goto InvalidContinuation;
189 : }
190 0 : else if (ch == 0xF4 && ch2 > 0x8F) {
191 : /* invalid sequence
192 : \xF4\x90\x80\x80- -- 110000- overflow */
193 0 : goto InvalidContinuation;
194 : }
195 0 : ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
196 : ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
197 : assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
198 0 : s += 4;
199 : if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
200 : (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
201 0 : goto Overflow;
202 0 : *p++ = ch;
203 0 : continue;
204 : }
/* \xF5-\xFF can never start a sequence. */
205 0 : goto InvalidStart;
206 : }
/* Reached when the loop ends or is left with 'break' (input exhausted or
   truncated sequence): report "no error". */
207 76 : ch = 0;
/* Overflow arrives here with ch still holding the decoded code point, which
   becomes the return value; it has not been written to dest. */
208 : Overflow:
209 : Return:
210 163 : *inptr = s;
211 163 : *outpos = p - dest;
212 326 : return ch;
/* For both error codes s was not advanced, so *inptr is left on the lead
   byte of the offending sequence. */
213 : InvalidStart:
214 0 : ch = 1;
215 0 : goto Return;
216 : InvalidContinuation:
217 0 : ch = 2;
218 0 : goto Return;
219 : }
220 :
221 : #undef ASCII_CHAR_MASK
222 : #undef IS_CONTINUATION_BYTE
223 :
224 :
225 : /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
226 : PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
227 : UCS-1 strings don't need to handle surrogates for example. */
/* Encode the 'size' characters at 'data' as UTF-8.

   unicode  the string object being encoded; in this function it is only
            handed to the error handler and to raise_encode_exception().
   data     the characters to encode, as STRINGLIB_CHAR.
   size     number of characters in data (asserted >= 0).
   errors   error-handler name, forwarded to
            unicode_encode_call_errorhandler() when a surrogate is met.

   Returns the bytes object created here, or NULL on failure.

   Output is first produced into a worst-case sized buffer
   (max_char_size bytes per character): a stack buffer when
   size <= MAX_SHORT_UNICHARS, otherwise an over-allocated bytes object
   that is shrunk at the end.

   Surrogate handling is compiled in only when STRINGLIB_SIZEOF_CHAR > 1,
   and the 4-byte branch only when STRINGLIB_SIZEOF_CHAR > 2. */
228 : Py_LOCAL_INLINE(PyObject *)
229 1 : STRINGLIB(utf8_encoder)(PyObject *unicode,
230 : STRINGLIB_CHAR *data,
231 : Py_ssize_t size,
232 : const char *errors)
233 : {
234 : #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
235 :
236 : Py_ssize_t i; /* index into data of the next input character */
237 : PyObject *result; /* result string object */
238 : char *p; /* next free byte in output buffer */
239 : Py_ssize_t nallocated; /* number of result bytes allocated */
240 : Py_ssize_t nneeded; /* number of result bytes needed */
241 : #if STRINGLIB_SIZEOF_CHAR > 1
242 0 : PyObject *errorHandler = NULL;
243 0 : PyObject *exc = NULL;
244 0 : PyObject *rep = NULL;
245 : #endif
/* max_char_size is the largest number of UTF-8 bytes one character of this
   kind can need; the stack buffer is sized for MAX_SHORT_UNICHARS of them. */
246 : #if STRINGLIB_SIZEOF_CHAR == 1
247 1 : const Py_ssize_t max_char_size = 2;
248 : char stackbuf[MAX_SHORT_UNICHARS * 2];
249 : #elif STRINGLIB_SIZEOF_CHAR == 2
250 0 : const Py_ssize_t max_char_size = 3;
251 : char stackbuf[MAX_SHORT_UNICHARS * 3];
252 : #else /* STRINGLIB_SIZEOF_CHAR == 4 */
253 0 : const Py_ssize_t max_char_size = 4;
254 : char stackbuf[MAX_SHORT_UNICHARS * 4];
255 : #endif
256 :
257 : assert(size >= 0);
258 :
259 1 : if (size <= MAX_SHORT_UNICHARS) {
260 : /* Write into the stack buffer; nallocated can't overflow.
261 : * At the end, we'll allocate exactly as much heap space as it
262 : * turns out we need.
263 : */
264 0 : nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
/* result == NULL doubles as the "still writing into stackbuf" flag for
   the rest of the function. */
265 0 : result = NULL; /* will allocate after we're done */
266 0 : p = stackbuf;
267 : }
268 : else {
269 1 : if (size > PY_SSIZE_T_MAX / max_char_size) {
270 : /* integer overflow */
271 0 : return PyErr_NoMemory();
272 : }
273 : /* Overallocate on the heap, and give the excess back at the end. */
274 1 : nallocated = size * max_char_size;
275 1 : result = PyBytes_FromStringAndSize(NULL, nallocated);
276 1 : if (result == NULL)
277 0 : return NULL;
278 1 : p = PyBytes_AS_STRING(result);
279 : }
280 :
281 6520 : for (i = 0; i < size;) {
282 6518 : Py_UCS4 ch = data[i++];
283 :
284 6518 : if (ch < 0x80) {
285 : /* Encode ASCII */
286 6515 : *p++ = (char) ch;
287 :
288 : }
289 : else
/* For 1-byte kinds the test below is compiled out, so every non-ASCII
   character takes the two-byte branch. */
290 : #if STRINGLIB_SIZEOF_CHAR > 1
291 0 : if (ch < 0x0800)
292 : #endif
293 : {
294 : /* Encode as two bytes (U+0080..U+07FF) */
295 3 : *p++ = (char)(0xc0 | (ch >> 6));
296 3 : *p++ = (char)(0x80 | (ch & 0x3f));
297 : }
298 : #if STRINGLIB_SIZEOF_CHAR > 1
299 0 : else if (Py_UNICODE_IS_SURROGATE(ch)) {
/* Surrogates are not encoded directly: ask the error handler for a
   replacement, which may be a bytes object or a str. */
300 : Py_ssize_t newpos;
301 : Py_ssize_t repsize, k, startpos;
302 0 : startpos = i-1;
/* NOTE(review): newpos is filled in by the call below but never read in
   this function; encoding simply resumes at i. */
303 0 : rep = unicode_encode_call_errorhandler(
304 : errors, &errorHandler, "utf-8", "surrogates not allowed",
305 : unicode, &exc, startpos, startpos+1, &newpos);
306 0 : if (!rep)
307 : goto error;
308 :
/* NOTE(review): the str length is taken here, before the
   PyUnicode_READY() call further down -- confirm that is acceptable. */
309 0 : if (PyBytes_Check(rep))
310 0 : repsize = PyBytes_GET_SIZE(rep);
311 : else
312 0 : repsize = PyUnicode_GET_LENGTH(rep);
313 :
/* The buffer already budgets max_char_size bytes for this character, so it
   only has to grow when the replacement is longer than that. */
314 0 : if (repsize > max_char_size) {
315 : Py_ssize_t offset;
316 :
/* Remember the write position as an offset: the buffer may move. */
317 0 : if (result == NULL)
318 0 : offset = p - stackbuf;
319 : else
320 0 : offset = p - PyBytes_AS_STRING(result);
321 :
322 0 : if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
323 : /* integer overflow */
324 0 : PyErr_NoMemory();
325 : goto error;
326 : }
327 0 : nallocated += repsize - max_char_size;
328 0 : if (result != NULL) {
329 0 : if (_PyBytes_Resize(&result, nallocated) < 0)
330 : goto error;
331 : } else {
/* Was still on the stack: move to a heap object and copy what has been
   written so far. */
332 0 : result = PyBytes_FromStringAndSize(NULL, nallocated);
333 0 : if (result == NULL)
334 : goto error;
335 0 : Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
336 : }
337 0 : p = PyBytes_AS_STRING(result) + offset;
338 : }
339 :
340 0 : if (PyBytes_Check(rep)) {
/* A bytes replacement is copied to the output verbatim. */
341 0 : char *prep = PyBytes_AS_STRING(rep);
342 0 : for(k = repsize; k > 0; k--)
343 0 : *p++ = *prep++;
344 : } else /* rep is unicode */ {
345 : enum PyUnicode_Kind repkind;
346 : void *repdata;
347 :
348 0 : if (PyUnicode_READY(rep) < 0)
349 : goto error;
350 0 : repkind = PyUnicode_KIND(rep);
351 0 : repdata = PyUnicode_DATA(rep);
352 :
/* A str replacement must be pure ASCII: each character is emitted as one
   byte, anything >= 0x80 raises the encode exception. */
353 0 : for(k=0; k<repsize; k++) {
354 0 : Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
355 0 : if (0x80 <= c) {
356 0 : raise_encode_exception(&exc, "utf-8",
357 : unicode,
358 : i-1, i,
359 : "surrogates not allowed");
360 : goto error;
361 : }
362 0 : *p++ = (char)c;
363 : }
364 : }
/* Py_CLEAR also resets rep to NULL, so the error path cannot release it a
   second time. */
365 0 : Py_CLEAR(rep);
366 : }
367 : else
/* For 2-byte kinds the test below is compiled out: every remaining
   character takes the three-byte branch. */
368 : #if STRINGLIB_SIZEOF_CHAR > 2
369 0 : if (ch < 0x10000)
370 : #endif
371 : {
372 0 : *p++ = (char)(0xe0 | (ch >> 12));
373 0 : *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
374 0 : *p++ = (char)(0x80 | (ch & 0x3f));
375 : }
376 : #if STRINGLIB_SIZEOF_CHAR > 2
377 : else /* ch >= 0x10000 */
378 : {
379 : assert(ch <= MAX_UNICODE);
380 : /* Encode UCS4 Unicode ordinals */
381 0 : *p++ = (char)(0xf0 | (ch >> 18));
382 0 : *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
383 0 : *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
384 0 : *p++ = (char)(0x80 | (ch & 0x3f));
385 : }
386 : #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
387 : #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
388 : }
389 :
390 1 : if (result == NULL) {
391 : /* This was stack allocated. */
392 0 : nneeded = p - stackbuf;
393 : assert(nneeded <= nallocated);
394 0 : result = PyBytes_FromStringAndSize(stackbuf, nneeded);
395 : }
396 : else {
397 : /* Cut back to size actually needed. */
398 1 : nneeded = p - PyBytes_AS_STRING(result);
399 : assert(nneeded <= nallocated);
/* NOTE(review): the return value is not checked here; presumably result is
   set to NULL on failure so NULL is returned below -- confirm. */
400 1 : _PyBytes_Resize(&result, nneeded);
401 : }
402 :
403 : #if STRINGLIB_SIZEOF_CHAR > 1
404 0 : Py_XDECREF(errorHandler);
405 0 : Py_XDECREF(exc);
406 : #endif
407 1 : return result;
408 :
/* The error label exists only for kinds that can contain surrogates; every
   'goto error' above is inside the same #if. */
409 : #if STRINGLIB_SIZEOF_CHAR > 1
410 : error:
411 0 : Py_XDECREF(rep);
412 0 : Py_XDECREF(errorHandler);
413 0 : Py_XDECREF(exc);
414 0 : Py_XDECREF(result);
415 0 : return NULL;
416 : #endif
417 :
418 : #undef MAX_SHORT_UNICHARS
419 : }
420 :
421 : /* The pattern for constructing UCS2-repeated masks. */
422 : #if SIZEOF_LONG == 8
423 : # define UCS2_REPEAT_MASK 0x0001000100010001ul
424 : #elif SIZEOF_LONG == 4
425 : # define UCS2_REPEAT_MASK 0x00010001ul
426 : #else
427 : # error C 'long' size should be either 4 or 8!
428 : #endif
429 :
430 : /* The mask for fast checking. */
431 : #if STRINGLIB_SIZEOF_CHAR == 1
432 : /* The mask for fast checking of whether a C 'long' contains any
433 : non-ASCII or non-Latin1 UTF16-encoded characters. */
434 : # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
435 : #else
436 : /* The mask for fast checking of whether a C 'long' may contain
437 : UTF16-encoded surrogate characters. This is an efficient heuristic,
438 : assuming that non-surrogate characters with a code point >= 0x8000 are
439 : rare in most input.
440 : */
441 : # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
442 : #endif
443 : /* The mask for fast byte-swapping. */
444 : #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
445 : /* Swap bytes. */
446 : #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
447 : (((value) & STRIPPED_MASK) << 8))
448 :
/* Decode UTF-16 code units from *inptr up to (not including) e, storing one
   STRINGLIB_CHAR per code point at dest[*outpos] onwards.

   native_ordering  non-zero when each byte pair is already in host byte
                    order, zero when the two bytes must be swapped.

   On return *inptr points at the first byte not consumed and *outpos is the
   index just past the last character stored.

   Return value:
     0      fewer than two bytes are left (a single trailing byte, if any,
            is not consumed);
     1      a surrogate was read but fewer than two bytes follow it (checked
            before the high/low test, so this also covers a trailing low
            surrogate); *inptr is past that surrogate;
     2      a surrogate that is not a high surrogate appeared first;
            *inptr is past it;
     3      a high surrogate was followed by a unit that is not a low
            surrogate; *inptr is past both units;
     other  a code point that does not fit in STRINGLIB_CHAR.  Its bytes
            have been consumed but it has NOT been stored in dest.

   No bound for dest is passed in, so nothing here checks that dest has
   room for the output. */
449 : Py_LOCAL_INLINE(Py_UCS4)
450 0 : STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
451 : STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
452 : int native_ordering)
453 : {
454 : Py_UCS4 ch;
/* Computed from the original e (before the --e below): a whole-long read
   that starts below this address stays inside the input. */
455 0 : const unsigned char *aligned_end =
456 0 : (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
457 0 : const unsigned char *q = *inptr;
458 0 : STRINGLIB_CHAR *p = dest + *outpos;
459 : /* Offsets from q for retrieving byte pairs in the right order. */
460 : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
461 0 : int ihi = !!native_ordering, ilo = !native_ordering;
462 : #else
463 : int ihi = !native_ordering, ilo = !!native_ordering;
464 : #endif
/* From here on 'q < e' means "at least two bytes remain". */
465 0 : --e;
466 :
467 0 : while (q < e) {
468 : Py_UCS4 ch2;
469 : /* First check for possible aligned read of a C 'long'. Unaligned
470 : reads are more expensive, better to defer to another iteration. */
471 0 : if (!((size_t) q & LONG_PTR_MASK)) {
472 : /* Fast path for runs of in-range non-surrogate chars. */
473 0 : register const unsigned char *_q = q;
474 0 : while (_q < aligned_end) {
475 0 : unsigned long block = * (unsigned long *) _q;
476 0 : if (native_ordering) {
477 : /* Can use buffer directly */
478 0 : if (block & FAST_CHAR_MASK)
479 0 : break;
480 : }
481 : else {
482 : /* Need to byte-swap */
/* Test against the swapped mask first, so the block itself is swapped only
   when it is actually going to be copied. */
483 0 : if (block & SWAB(FAST_CHAR_MASK))
484 0 : break;
/* 1-byte kinds: the test above has just shown that the low byte of every
   16-bit unit is zero, so a plain shift gives the same result as a byte
   swap (assuming STRINGLIB_MAX_CHAR <= 0xFF for these kinds -- its
   definition is outside this view). */
485 : #if STRINGLIB_SIZEOF_CHAR == 1
486 0 : block >>= 8;
487 : #else
488 0 : block = SWAB(block);
489 : #endif
490 : }
/* Unpack the 16-bit units in memory order; which end of 'block' holds the
   first unit depends on the host byte order. */
491 : #ifdef BYTEORDER_IS_LITTLE_ENDIAN
492 : # if SIZEOF_LONG == 4
493 0 : p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
494 0 : p[1] = (STRINGLIB_CHAR)(block >> 16);
495 : # elif SIZEOF_LONG == 8
496 : p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
497 : p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
498 : p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
499 : p[3] = (STRINGLIB_CHAR)(block >> 48);
500 : # endif
501 : #else
502 : # if SIZEOF_LONG == 4
503 : p[0] = (STRINGLIB_CHAR)(block >> 16);
504 : p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
505 : # elif SIZEOF_LONG == 8
506 : p[0] = (STRINGLIB_CHAR)(block >> 48);
507 : p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
508 : p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
509 : p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
510 : # endif
511 : #endif
512 0 : _q += SIZEOF_LONG;
513 0 : p += SIZEOF_LONG / 2;
514 : }
515 0 : q = _q;
/* The fast loop may have consumed everything but a possible odd byte. */
516 0 : if (q >= e)
517 0 : break;
518 : }
519 :
/* Slow path: assemble one code unit from its two bytes. */
520 0 : ch = (q[ihi] << 8) | q[ilo];
521 0 : q += 2;
522 0 : if (!Py_UNICODE_IS_SURROGATE(ch)) {
523 : #if STRINGLIB_SIZEOF_CHAR < 2
524 0 : if (ch > STRINGLIB_MAX_CHAR)
525 : /* Out-of-range */
526 0 : goto Return;
527 : #endif
528 0 : *p++ = (STRINGLIB_CHAR)ch;
529 0 : continue;
530 : }
531 :
532 : /* UTF-16 code pair: */
/* The second unit of the pair needs two more bytes. */
533 0 : if (q >= e)
534 0 : goto UnexpectedEnd;
535 0 : if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
536 : goto IllegalEncoding;
537 0 : ch2 = (q[ihi] << 8) | q[ilo];
538 0 : q += 2;
539 0 : if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
540 : goto IllegalSurrogate;
541 0 : ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
/* Kinds narrower than 4 bytes never store a joined pair: it is always
   returned to the caller as an out-of-range code point. */
542 : #if STRINGLIB_SIZEOF_CHAR < 4
543 : /* Out-of-range */
544 0 : goto Return;
545 : #else
546 0 : *p++ = (STRINGLIB_CHAR)ch;
547 : #endif
548 : }
/* Reached when the loop ends or is left with 'break': report "no error". */
549 0 : ch = 0;
550 : Return:
551 0 : *inptr = q;
552 0 : *outpos = p - dest;
553 0 : return ch;
554 : UnexpectedEnd:
555 0 : ch = 1;
556 0 : goto Return;
557 : IllegalEncoding:
558 0 : ch = 2;
559 0 : goto Return;
560 : IllegalSurrogate:
561 0 : ch = 3;
562 0 : goto Return;
563 : }
564 : #undef UCS2_REPEAT_MASK
565 : #undef FAST_CHAR_MASK
566 : #undef STRIPPED_MASK
567 : #undef SWAB
568 : #undef LONG_PTR_MASK
569 :
570 :
/* Encode the 'len' characters at 'in' as UTF-16 code units written to 'out'.

   native_ordering  non-zero: units are stored as-is (host byte order);
                    zero: the two bytes of every unit are swapped.

   Kinds with STRINGLIB_MAX_CHAR < 0x10000 produce exactly one unit per
   character.  Otherwise characters >= 0x10000 are written as a high/low
   surrogate pair (two units).

   Nothing is returned and no output length is passed in, so nothing here
   checks that 'out' is large enough. */
571 : Py_LOCAL_INLINE(void)
572 0 : STRINGLIB(utf16_encode)(unsigned short *out,
573 : const STRINGLIB_CHAR *in,
574 : Py_ssize_t len,
575 : int native_ordering)
576 : {
577 0 : const STRINGLIB_CHAR *end = in + len;
/* SWAB2 swaps the two bytes of a 16-bit unit.  For 1-byte characters a
   left shift is used instead of the full swap. */
578 : #if STRINGLIB_SIZEOF_CHAR == 1
579 : # define SWAB2(CH) ((CH) << 8)
580 : #else
581 : # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
582 : #endif
583 : #if STRINGLIB_MAX_CHAR < 0x10000
584 0 : if (native_ordering) {
/* 2-byte characters are copied as raw bytes; 1-byte characters go through
   the element-by-element conversion macro. */
585 : # if STRINGLIB_SIZEOF_CHAR == 2
586 0 : Py_MEMCPY(out, in, 2 * len);
587 : # else
588 0 : _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
589 : # endif
590 : } else {
/* Byte-swapping copy, unrolled four characters at a time; len rounded down
   to a multiple of 4 bounds the unrolled part. */
591 0 : const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3);
592 0 : while (in < unrolled_end) {
593 0 : out[0] = SWAB2(in[0]);
594 0 : out[1] = SWAB2(in[1]);
595 0 : out[2] = SWAB2(in[2]);
596 0 : out[3] = SWAB2(in[3]);
597 0 : in += 4; out += 4;
598 : }
/* Remaining 0-3 characters. */
599 0 : while (in < end) {
600 0 : *out++ = SWAB2(*in);
601 0 : ++in;
602 : }
603 : }
604 : #else
605 0 : if (native_ordering) {
606 0 : while (in < end) {
607 0 : Py_UCS4 ch = *in++;
608 0 : if (ch < 0x10000)
609 0 : *out++ = ch;
610 : else {
/* Needs two units: high surrogate first, then low surrogate. */
611 0 : out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
612 0 : out[1] = Py_UNICODE_LOW_SURROGATE(ch);
613 0 : out += 2;
614 : }
615 : }
616 : } else {
617 0 : while (in < end) {
618 0 : Py_UCS4 ch = *in++;
619 0 : if (ch < 0x10000)
/* Narrow to 16 bits before swapping. */
620 0 : *out++ = SWAB2((Py_UCS2)ch);
621 : else {
/* Each surrogate is held in a 16-bit temporary and swapped on its own;
   the pair keeps its high-then-low order. */
622 0 : Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
623 0 : Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
624 0 : out[0] = SWAB2(ch1);
625 0 : out[1] = SWAB2(ch2);
626 0 : out += 2;
627 : }
628 : }
629 : }
630 : #endif
631 : #undef SWAB2
632 0 : }
633 : #endif /* STRINGLIB_IS_UNICODE */
|