Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "surrogates.hxx"
21 :
22 : #include "osl/diagnose.h"
23 : #include "rtl/strbuf.hxx"
24 : #include "rtl/textenc.h"
25 : #include "rtl/textcvt.h"
26 : #include "rtl/uri.h"
27 : #include "rtl/ustrbuf.h"
28 : #include "rtl/ustrbuf.hxx"
29 : #include "rtl/ustring.h"
30 : #include "rtl/ustring.hxx"
31 : #include "sal/types.h"
32 : #include "sal/macros.h"
33 :
34 : #include <cstddef>
35 :
36 : namespace {
37 :
38 : std::size_t const nCharClassSize = 128;
39 :
40 : sal_Unicode const cEscapePrefix = 0x25; // '%'
41 :
42 30455 : inline bool isDigit(sal_uInt32 nUtf32)
43 : {
44 30455 : return nUtf32 >= 0x30 && nUtf32 <= 0x39; // '0'--'9'
45 : }
46 :
47 204190 : inline bool isAlpha(sal_uInt32 nUtf32)
48 : {
49 : // 'A'--'Z', 'a'--'z'
50 : return (
51 : (nUtf32 >= 0x41 && nUtf32 <= 0x5A) ||
52 : (nUtf32 >= 0x61 && nUtf32 <= 0x7A)
53 204190 : );
54 : }
55 :
56 59634440 : inline bool isHighSurrogate(sal_uInt32 nUtf16)
57 : {
58 59634440 : return SAL_RTL_IS_HIGH_SURROGATE(nUtf16);
59 : }
60 :
61 22 : inline bool isLowSurrogate(sal_uInt32 nUtf16)
62 : {
63 22 : return SAL_RTL_IS_LOW_SURROGATE(nUtf16);
64 : }
65 :
66 5 : inline sal_uInt32 combineSurrogates(sal_uInt32 high, sal_uInt32 low)
67 : {
68 5 : return SAL_RTL_COMBINE_SURROGATES(high, low);
69 : }
70 :
71 359394 : inline int getHexWeight(sal_uInt32 nUtf32)
72 : {
73 : return nUtf32 >= 0x30 && nUtf32 <= 0x39 ? // '0'--'9'
74 : static_cast< int >(nUtf32 - 0x30) :
75 : nUtf32 >= 0x41 && nUtf32 <= 0x46 ? // 'A'--'F'
76 : static_cast< int >(nUtf32 - 0x41 + 10) :
77 : nUtf32 >= 0x61 && nUtf32 <= 0x66 ? // 'a'--'f'
78 : static_cast< int >(nUtf32 - 0x61 + 10) :
79 359394 : -1; // not a hex digit
80 : }
81 :
82 20549862 : inline bool isValid(sal_Bool const * pCharClass, sal_uInt32 nUtf32)
83 : {
84 20549862 : return nUtf32 < nCharClassSize && pCharClass[nUtf32];
85 : }
86 :
87 59962423 : inline void writeUnicode(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
88 : sal_Unicode cChar)
89 : {
90 59962423 : rtl_uStringbuffer_insert(pBuffer, pCapacity, (*pBuffer)->length, &cChar, 1);
91 59962423 : }
92 :
// Classifies what readUcs4 consumed from its input.
enum EscapeType
{
    // A literal character (or surrogate pair); no escape sequence was decoded.
    EscapeNo,
    // One or more escape sequences that were decoded into a character.
    EscapeChar,
    // A single "%XX" escape that could not be decoded into a character; the
    // value reported is the raw octet.
    EscapeOctet
};
99 :
100 : /* Read any of the following:
101 :
102 : - sequence of escape sequences representing character from eCharset,
103 : translated to single UCS4 character; or
104 :
105 : - pair of UTF-16 surrogates, translated to single UCS4 character; or
106 :
107 : _ single UTF-16 character, extended to UCS4 character.
108 : */
109 59814065 : sal_uInt32 readUcs4(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
110 : bool bEncoded, rtl_TextEncoding eCharset,
111 : EscapeType * pType)
112 : {
113 59814065 : sal_uInt32 nChar = *(*pBegin)++;
114 : int nWeight1;
115 : int nWeight2;
116 60173356 : if (nChar == cEscapePrefix && bEncoded && pEnd - *pBegin >= 2
117 179646 : && (nWeight1 = getHexWeight((*pBegin)[0])) >= 0
118 179645 : && (nWeight2 = getHexWeight((*pBegin)[1])) >= 0)
119 : {
120 179645 : *pBegin += 2;
121 179645 : nChar = static_cast< sal_uInt32 >(nWeight1 << 4 | nWeight2);
122 179645 : if (nChar <= 0x7F)
123 179596 : *pType = EscapeChar;
124 49 : else if (eCharset == RTL_TEXTENCODING_UTF8)
125 : {
126 39 : if (nChar >= 0xC0 && nChar <= 0xF4)
127 : {
128 : sal_uInt32 nEncoded;
129 : int nShift;
130 : sal_uInt32 nMin;
131 22 : if (nChar <= 0xDF)
132 : {
133 4 : nEncoded = (nChar & 0x1F) << 6;
134 4 : nShift = 0;
135 4 : nMin = 0x80;
136 : }
137 18 : else if (nChar <= 0xEF)
138 : {
139 16 : nEncoded = (nChar & 0x0F) << 12;
140 16 : nShift = 6;
141 16 : nMin = 0x800;
142 : }
143 : else
144 : {
145 2 : nEncoded = (nChar & 0x07) << 18;
146 2 : nShift = 12;
147 2 : nMin = 0x10000;
148 : }
149 22 : sal_Unicode const * p = *pBegin;
150 22 : bool bUTF8 = true;
151 62 : for (; nShift >= 0; nShift -= 6)
152 : {
153 122 : if (pEnd - p < 3 || p[0] != cEscapePrefix
154 41 : || (nWeight1 = getHexWeight(p[1])) < 8
155 : || nWeight1 > 11
156 40 : || (nWeight2 = getHexWeight(p[2])) < 0)
157 : {
158 1 : bUTF8 = sal_False;
159 1 : break;
160 : }
161 40 : p += 3;
162 40 : nEncoded |= ((nWeight1 & 3) << 4 | nWeight2) << nShift;
163 : }
164 37 : if (bUTF8 && nEncoded >= nMin && !isHighSurrogate(nEncoded)
165 15 : && !isLowSurrogate(nEncoded) && nEncoded <= 0x10FFFF)
166 : {
167 13 : *pBegin = p;
168 13 : *pType = EscapeChar;
169 13 : return nEncoded;
170 : }
171 : }
172 26 : *pType = EscapeOctet;
173 : }
174 : else
175 : {
176 10 : rtl::OStringBuffer aBuf;
177 10 : aBuf.append(static_cast< char >(nChar));
178 : rtl_TextToUnicodeConverter aConverter
179 10 : = rtl_createTextToUnicodeConverter(eCharset);
180 10 : sal_Unicode const * p = *pBegin;
181 14 : for (;;)
182 : {
183 : sal_Unicode aDst[2];
184 : sal_uInt32 nInfo;
185 : sal_Size nConverted;
186 : sal_Size nDstSize = rtl_convertTextToUnicode(
187 24 : aConverter, 0, aBuf.getStr(), aBuf.getLength(), aDst,
188 : SAL_N_ELEMENTS( aDst ),
189 : (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
190 : | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
191 : | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
192 24 : &nInfo, &nConverted);
193 24 : if (nInfo == 0)
194 : {
195 : assert( nConverted
196 : == sal::static_int_cast< sal_uInt32 >(
197 : aBuf.getLength()));
198 8 : rtl_destroyTextToUnicodeConverter(aConverter);
199 8 : *pBegin = p;
200 8 : *pType = EscapeChar;
201 : assert( nDstSize == 1
202 : || (nDstSize == 2 && isHighSurrogate(aDst[0])
203 : && isLowSurrogate(aDst[1])));
204 : return nDstSize == 1
205 8 : ? aDst[0] : combineSurrogates(aDst[0], aDst[1]);
206 : }
207 38 : else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
208 : && pEnd - p >= 3 && p[0] == cEscapePrefix
209 11 : && (nWeight1 = getHexWeight(p[1])) >= 0
210 11 : && (nWeight2 = getHexWeight(p[2])) >= 0)
211 : {
212 11 : p += 3;
213 11 : aBuf.append(static_cast< char >(nWeight1 << 4 | nWeight2));
214 : }
215 5 : else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
216 : && p != pEnd && *p <= 0x7F)
217 : {
218 3 : aBuf.append(static_cast< char >(*p++));
219 : }
220 : else
221 : {
222 : assert(
223 : (nInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)
224 : == 0);
225 : break;
226 : }
227 : }
228 2 : rtl_destroyTextToUnicodeConverter(aConverter);
229 2 : *pType = EscapeOctet;
230 : }
231 179624 : return nChar;
232 : }
233 : else
234 : {
235 59634420 : *pType = EscapeNo;
236 59634420 : return isHighSurrogate(nChar) && *pBegin < pEnd
237 7 : && isLowSurrogate(**pBegin) ?
238 59634427 : combineSurrogates(nChar, *(*pBegin)++) : nChar;
239 : }
240 : }
241 :
242 39264174 : void writeUcs4(rtl_uString ** pBuffer, sal_Int32 * pCapacity, sal_uInt32 nUtf32)
243 : {
244 : assert(nUtf32 <= 0x10FFFF); // bad UTF-32 char
245 39264174 : if (nUtf32 <= 0xFFFF) {
246 : writeUnicode(
247 39264171 : pBuffer, pCapacity, static_cast< sal_Unicode >(nUtf32));
248 : } else {
249 3 : nUtf32 -= 0x10000;
250 : writeUnicode(
251 : pBuffer, pCapacity,
252 3 : static_cast< sal_Unicode >(nUtf32 >> 10 | 0xD800));
253 : writeUnicode(
254 : pBuffer, pCapacity,
255 3 : static_cast< sal_Unicode >((nUtf32 & 0x3FF) | 0xDC00));
256 : }
257 39264174 : }
258 :
259 74164 : void writeEscapeOctet(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
260 : sal_uInt32 nOctet)
261 : {
262 : assert(nOctet <= 0xFF); // bad octet
263 :
264 : static sal_Unicode const aHex[16]
265 : = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
266 : 0x41, 0x42, 0x43, 0x44, 0x45, 0x46 }; /* '0'--'9', 'A'--'F' */
267 :
268 74164 : writeUnicode(pBuffer, pCapacity, cEscapePrefix);
269 74164 : writeUnicode(pBuffer, pCapacity, aHex[nOctet >> 4]);
270 74164 : writeUnicode(pBuffer, pCapacity, aHex[nOctet & 15]);
271 74164 : }
272 :
273 74109 : bool writeEscapeChar(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
274 : sal_uInt32 nUtf32, rtl_TextEncoding eCharset, bool bStrict)
275 : {
276 : assert(nUtf32 <= 0x10FFFF); // bad UTF-32 char
277 74109 : if (eCharset == RTL_TEXTENCODING_UTF8) {
278 60009 : if (nUtf32 < 0x80)
279 59996 : writeEscapeOctet(pBuffer, pCapacity, nUtf32);
280 13 : else if (nUtf32 < 0x800)
281 : {
282 3 : writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 6 | 0xC0);
283 3 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
284 : }
285 10 : else if (nUtf32 < 0x10000)
286 : {
287 7 : writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 12 | 0xE0);
288 7 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
289 7 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
290 : }
291 : else
292 : {
293 3 : writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 18 | 0xF0);
294 3 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 12 & 0x3F) | 0x80);
295 3 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
296 3 : writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
297 : }
298 : } else {
299 : rtl_UnicodeToTextConverter aConverter
300 14100 : = rtl_createUnicodeToTextConverter(eCharset);
301 : sal_Unicode aSrc[2];
302 : sal_Size nSrcSize;
303 14100 : if (nUtf32 <= 0xFFFF)
304 : {
305 14099 : aSrc[0] = static_cast< sal_Unicode >(nUtf32);
306 14099 : nSrcSize = 1;
307 : }
308 : else
309 : {
310 : aSrc[0] = static_cast< sal_Unicode >(
311 1 : ((nUtf32 - 0x10000) >> 10) | 0xD800);
312 : aSrc[1] = static_cast< sal_Unicode >(
313 1 : ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00);
314 1 : nSrcSize = 2;
315 : }
316 : sal_Char aDst[32]; // FIXME random value
317 : sal_uInt32 nInfo;
318 : sal_Size nConverted;
319 : sal_Size nDstSize = rtl_convertUnicodeToText(
320 : aConverter, 0, aSrc, nSrcSize, aDst, sizeof aDst,
321 : RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR
322 : | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR
323 : | RTL_UNICODETOTEXT_FLAGS_FLUSH,
324 14100 : &nInfo, &nConverted);
325 : assert((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0);
326 14100 : rtl_destroyUnicodeToTextConverter(aConverter);
327 14100 : if (nInfo == 0) {
328 : assert(nConverted == nSrcSize); // bad rtl_convertUnicodeToText
329 28200 : for (sal_Size i = 0; i < nDstSize; ++i)
330 : writeEscapeOctet(pBuffer, pCapacity,
331 14103 : static_cast< unsigned char >(aDst[i]));
332 : // FIXME all octets are escaped, even if there is no need
333 : } else {
334 3 : if (bStrict) {
335 2 : return false;
336 : } else {
337 1 : writeUcs4(pBuffer, pCapacity, nUtf32);
338 : }
339 : }
340 : }
341 74107 : return true;
342 : }
343 :
344 : struct Component
345 : {
346 : sal_Unicode const * pBegin;
347 : sal_Unicode const * pEnd;
348 :
349 107560 : inline Component(): pBegin(0), pEnd(0) {}
350 :
351 53750 : inline bool isPresent() const { return pBegin != 0; }
352 :
353 : inline sal_Int32 getLength() const;
354 : };
355 :
356 32244 : inline sal_Int32 Component::getLength() const
357 : {
358 : assert(isPresent()); // taking length of non-present component
359 32244 : return static_cast< sal_Int32 >(pEnd - pBegin);
360 : }
361 :
362 21512 : struct Components
363 : {
364 : Component aScheme;
365 : Component aAuthority;
366 : Component aPath;
367 : Component aQuery;
368 : Component aFragment;
369 : };
370 :
371 21512 : void parseUriRef(rtl_uString const * pUriRef, Components * pComponents)
372 : {
373 : // This algorithm is liberal and accepts various forms of illegal input.
374 :
375 21512 : sal_Unicode const * pBegin = pUriRef->buffer;
376 21512 : sal_Unicode const * pEnd = pBegin + pUriRef->length;
377 21512 : sal_Unicode const * pPos = pBegin;
378 :
379 21512 : if (pPos != pEnd && isAlpha(*pPos))
380 : {
381 197793 : for (sal_Unicode const * p = pPos + 1; p != pEnd; ++p)
382 : {
383 196129 : if (*p == ':')
384 : {
385 13450 : pComponents->aScheme.pBegin = pBegin;
386 13450 : pComponents->aScheme.pEnd = ++p;
387 13450 : pPos = p;
388 13450 : break;
389 : }
390 182679 : else if (!isAlpha(*p) && !isDigit(*p) && *p != '+' && *p != '-'
391 : && *p != '.')
392 : {
393 6372 : break;
394 : }
395 : }
396 : }
397 :
398 21512 : if (pEnd - pPos >= 2 && pPos[0] == '/' && pPos[1] == '/')
399 : {
400 8060 : pComponents->aAuthority.pBegin = pPos;
401 8060 : pPos += 2;
402 16164 : while (pPos != pEnd && *pPos != '/' && *pPos != '?' && *pPos != '#')
403 44 : ++pPos;
404 8060 : pComponents->aAuthority.pEnd = pPos;
405 : }
406 :
407 21512 : pComponents->aPath.pBegin = pPos;
408 875993 : while (pPos != pEnd && *pPos != '?' && * pPos != '#')
409 832969 : ++pPos;
410 21512 : pComponents->aPath.pEnd = pPos;
411 :
412 21512 : if (pPos != pEnd && *pPos == '?')
413 : {
414 48 : pComponents->aQuery.pBegin = pPos++;
415 153 : while (pPos != pEnd && * pPos != '#')
416 57 : ++pPos;
417 48 : pComponents->aQuery.pEnd = pPos;
418 : }
419 :
420 21512 : if (pPos != pEnd)
421 : {
422 : assert(*pPos == '#');
423 6 : pComponents->aFragment.pBegin = pPos;
424 6 : pComponents->aFragment.pEnd = pEnd;
425 : }
426 21512 : }
427 :
428 8053 : rtl::OUString joinPaths(Component const & rBasePath, Component const & rRelPath)
429 : {
430 : assert(rBasePath.isPresent() && *rBasePath.pBegin == '/');
431 : assert(rRelPath.isPresent());
432 :
433 : // The invariant of aBuffer is that it always starts and ends with a slash
434 : // (until probably right at the end of the algorithm, when the last segment
435 : // of rRelPath is added, which does not necessarily end in a slash):
436 8053 : rtl::OUStringBuffer aBuffer(rBasePath.getLength() + rRelPath.getLength());
437 : // XXX numeric overflow
438 :
439 : // Segments "." and ".." within rBasePath are not conisdered special (but
440 : // are also not removed by ".." segments within rRelPath), RFC 2396 seems a
441 : // bit unclear about this point:
442 8053 : sal_Int32 nFixed = 1;
443 8053 : sal_Unicode const * p = rBasePath.pBegin + 1;
444 527165 : for (sal_Unicode const * q = p; q != rBasePath.pEnd; ++q)
445 519112 : if (*q == '/')
446 : {
447 56186 : if (
448 : (q - p == 1 && p[0] == '.') ||
449 1 : (q - p == 2 && p[0] == '.' && p[1] == '.')
450 : )
451 : {
452 2 : nFixed = q + 1 - rBasePath.pBegin;
453 : }
454 56185 : p = q + 1;
455 : }
456 8053 : aBuffer.append(rBasePath.pBegin, p - rBasePath.pBegin);
457 :
458 8053 : p = rRelPath.pBegin;
459 8053 : if (p != rRelPath.pEnd)
460 36 : for (;;)
461 : {
462 8088 : sal_Unicode const * q = p;
463 : sal_Unicode const * r;
464 107929 : for (;;)
465 : {
466 116017 : if (q == rRelPath.pEnd)
467 : {
468 8052 : r = q;
469 8052 : break;
470 : }
471 107965 : if (*q == '/')
472 : {
473 36 : r = q + 1;
474 36 : break;
475 : }
476 107929 : ++q;
477 : }
478 8088 : if (q - p == 2 && p[0] == '.' && p[1] == '.')
479 : {
480 : // Erroneous excess segments ".." within rRelPath are left
481 : // intact, as the examples in RFC 2396, section C.2, suggest:
482 24 : sal_Int32 i = aBuffer.getLength() - 1;
483 24 : if (i < nFixed)
484 : {
485 6 : aBuffer.append(p, r - p);
486 6 : nFixed += 3;
487 : }
488 : else
489 : {
490 58 : while (i > 0 && aBuffer[i - 1] != '/')
491 22 : --i;
492 18 : aBuffer.setLength(i);
493 24 : }
494 : }
495 8064 : else if (q - p != 1 || *p != '.')
496 8054 : aBuffer.append(p, r - p);
497 8088 : if (q == rRelPath.pEnd)
498 8052 : break;
499 36 : p = q + 1;
500 : }
501 :
502 8053 : return aBuffer.makeStringAndClear();
503 : }
504 :
505 : }
506 :
507 9296 : sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
508 : SAL_THROW_EXTERN_C()
509 : {
510 : static sal_Bool const aCharClass[][nCharClassSize]
511 : = {{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* None */
512 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
513 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* !"#$%&'()*+,-./*/
514 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*0123456789:;<=>?*/
515 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*@ABCDEFGHIJKLMNO*/
516 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*PQRSTUVWXYZ[\]^_*/
517 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*`abcdefghijklmno*/
518 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /*pqrstuvwxyz{|}~ */
519 : },
520 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Uric */
521 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
522 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* !"#$%&'()*+,-./*/
523 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
524 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
525 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*PQRSTUVWXYZ[\]^_*/
526 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
527 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
528 : },
529 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UricNoSlash */
530 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
531 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
532 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
533 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
534 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
535 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
536 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
537 : },
538 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RelSegment */
539 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
540 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
541 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
542 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
543 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
544 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
545 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
546 : },
547 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RegName */
548 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
550 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
551 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
552 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
553 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
554 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
555 : },
556 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Userinfo */
557 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
559 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
560 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
561 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
562 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
563 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
564 : },
565 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Pchar */
566 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
567 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
568 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, /*0123456789:;<=>?*/
569 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
570 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
571 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
572 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
573 : },
574 : { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UnoParamValue */
575 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
576 : 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* !"#$%&'()*+,-./*/
577 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*0123456789:;<=>?*/
578 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
579 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
580 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
581 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 /*pqrstuvwxyz{|}~ */
582 : }};
583 : assert(
584 : (eCharClass >= 0
585 : && (sal::static_int_cast< std::size_t >(eCharClass)
586 : < SAL_N_ELEMENTS(aCharClass)))); // bad eCharClass
587 9296 : return aCharClass[eCharClass];
588 : }
589 :
590 271028 : void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
591 : rtl_UriEncodeMechanism eMechanism,
592 : rtl_TextEncoding eCharset, rtl_uString ** pResult)
593 : SAL_THROW_EXTERN_C()
594 : {
595 : assert(!pCharClass[0x25]); // make sure the percent sign is encoded...
596 :
597 271028 : sal_Unicode const * p = pText->buffer;
598 271028 : sal_Unicode const * pEnd = p + pText->length;
599 271028 : sal_Int32 nCapacity = pText->length;
600 271028 : rtl_uString_new_WithLength(pResult, nCapacity);
601 271028 : while (p < pEnd)
602 : {
603 : EscapeType eType;
604 : sal_uInt32 nUtf32 = readUcs4(
605 : &p, pEnd,
606 : (eMechanism == rtl_UriEncodeKeepEscapes
607 : || eMechanism == rtl_UriEncodeCheckEscapes
608 : || eMechanism == rtl_UriEncodeStrictKeepEscapes),
609 20549867 : eCharset, &eType);
610 20549867 : switch (eType)
611 : {
612 : case EscapeNo:
613 20549859 : if (isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
614 : writeUnicode(pResult, &nCapacity,
615 20475754 : static_cast< sal_Unicode >(nUtf32));
616 74105 : else if (!writeEscapeChar(
617 : pResult, &nCapacity, nUtf32, eCharset,
618 : (eMechanism == rtl_UriEncodeStrict
619 74105 : || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
620 : {
621 2 : rtl_uString_new(pResult);
622 : return;
623 : }
624 20549857 : break;
625 :
626 : case EscapeChar:
627 7 : if (eMechanism == rtl_UriEncodeCheckEscapes
628 3 : && isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
629 : writeUnicode(pResult, &nCapacity,
630 0 : static_cast< sal_Unicode >(nUtf32));
631 4 : else if (!writeEscapeChar(
632 : pResult, &nCapacity, nUtf32, eCharset,
633 : (eMechanism == rtl_UriEncodeStrict
634 4 : || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
635 : {
636 0 : rtl_uString_new(pResult);
637 : return;
638 : }
639 4 : break;
640 :
641 : case EscapeOctet:
642 4 : writeEscapeOctet(pResult, &nCapacity, nUtf32);
643 4 : break;
644 : }
645 : }
646 271026 : *pResult = rtl_uStringBuffer_makeStringAndClear( pResult, &nCapacity );
647 : }
648 :
649 391073 : void SAL_CALL rtl_uriDecode(rtl_uString * pText,
650 : rtl_UriDecodeMechanism eMechanism,
651 : rtl_TextEncoding eCharset, rtl_uString ** pResult)
652 : SAL_THROW_EXTERN_C()
653 : {
654 391073 : switch (eMechanism)
655 : {
656 : case rtl_UriDecodeNone:
657 0 : rtl_uString_assign(pResult, pText);
658 0 : break;
659 :
660 : case rtl_UriDecodeToIuri:
661 5 : eCharset = RTL_TEXTENCODING_UTF8;
662 : default: // rtl_UriDecodeWithCharset, rtl_UriDecodeStrict
663 : {
664 391073 : sal_Unicode const * p = pText->buffer;
665 391073 : sal_Unicode const * pEnd = p + pText->length;
666 391073 : sal_Int32 nCapacity = pText->length;
667 391073 : rtl_uString_new_WithLength(pResult, nCapacity);
668 40046341 : while (p < pEnd)
669 : {
670 : EscapeType eType;
671 39264198 : sal_uInt32 nUtf32 = readUcs4(&p, pEnd, true, eCharset, &eType);
672 39264198 : switch (eType)
673 : {
674 : case EscapeChar:
675 179613 : if (nUtf32 <= 0x7F && eMechanism == rtl_UriDecodeToIuri)
676 : {
677 1 : writeEscapeOctet(pResult, &nCapacity, nUtf32);
678 1 : break;
679 : }
680 : case EscapeNo:
681 39264173 : writeUcs4(pResult, &nCapacity, nUtf32);
682 39264173 : break;
683 :
684 : case EscapeOctet:
685 24 : if (eMechanism == rtl_UriDecodeStrict) {
686 3 : rtl_uString_new(pResult);
687 391073 : return;
688 : }
689 21 : writeEscapeOctet(pResult, &nCapacity, nUtf32);
690 21 : break;
691 : }
692 : }
693 391070 : *pResult = rtl_uStringBuffer_makeStringAndClear( pResult, &nCapacity );
694 : }
695 391070 : break;
696 : }
697 : }
698 :
699 13451 : sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
700 : rtl_uString * pRelUriRef,
701 : rtl_uString ** pResult,
702 : rtl_uString ** pException)
703 : SAL_THROW_EXTERN_C()
704 : {
705 : // If pRelUriRef starts with a scheme component it is an absolute URI
706 : // reference, and we are done (i.e., this algorithm does not support
707 : // backwards-compatible relative URIs starting with a scheme component, see
708 : // RFC 2396, section 5.2, step 3):
709 13451 : Components aRelComponents;
710 13451 : parseUriRef(pRelUriRef, &aRelComponents);
711 13451 : if (aRelComponents.aScheme.isPresent())
712 : {
713 5390 : rtl_uString_assign(pResult, pRelUriRef);
714 5390 : return true;
715 : }
716 :
717 : // Parse pBaseUriRef; if the scheme component is not present or not valid,
718 : // or the path component is not empty and starts with anything but a slash,
719 : // an exception is raised:
720 8061 : Components aBaseComponents;
721 8061 : parseUriRef(pBaseUriRef, &aBaseComponents);
722 8061 : if (!aBaseComponents.aScheme.isPresent())
723 : {
724 1 : rtl::OUString aMessage(pBaseUriRef);
725 : aMessage += rtl::OUString(
726 : RTL_CONSTASCII_USTRINGPARAM(
727 1 : " does not start with a scheme component"));
728 : rtl_uString_assign(pException,
729 1 : const_cast< rtl::OUString & >(aMessage).pData);
730 1 : return false;
731 : }
732 8060 : if (aBaseComponents.aPath.pBegin != aBaseComponents.aPath.pEnd
733 : && *aBaseComponents.aPath.pBegin != '/')
734 : {
735 1 : rtl::OUString aMessage(pBaseUriRef);
736 : aMessage += rtl::OUString(
737 : RTL_CONSTASCII_USTRINGPARAM(
738 1 : "path component does not start with slash"));
739 1 : rtl_uString_assign(pException, aMessage.pData);
740 1 : return false;
741 : }
742 :
743 : // Use the algorithm from RFC 2396, section 5.2, to turn the relative URI
744 : // into an absolute one (if the relative URI is a reference to the "current
745 : // document," the "current document" is here taken to be the base URI):
746 8059 : rtl::OUStringBuffer aBuffer;
747 : aBuffer.append(aBaseComponents.aScheme.pBegin,
748 8059 : aBaseComponents.aScheme.getLength());
749 8059 : if (aRelComponents.aAuthority.isPresent())
750 : {
751 : aBuffer.append(aRelComponents.aAuthority.pBegin,
752 1 : aRelComponents.aAuthority.getLength());
753 : aBuffer.append(aRelComponents.aPath.pBegin,
754 1 : aRelComponents.aPath.getLength());
755 1 : if (aRelComponents.aQuery.isPresent())
756 : aBuffer.append(aRelComponents.aQuery.pBegin,
757 0 : aRelComponents.aQuery.getLength());
758 : }
759 : else
760 : {
761 8058 : if (aBaseComponents.aAuthority.isPresent())
762 : aBuffer.append(aBaseComponents.aAuthority.pBegin,
763 8058 : aBaseComponents.aAuthority.getLength());
764 8061 : if (aRelComponents.aPath.pBegin == aRelComponents.aPath.pEnd
765 3 : && !aRelComponents.aQuery.isPresent())
766 : {
767 : aBuffer.append(aBaseComponents.aPath.pBegin,
768 2 : aBaseComponents.aPath.getLength());
769 2 : if (aBaseComponents.aQuery.isPresent())
770 : aBuffer.append(aBaseComponents.aQuery.pBegin,
771 2 : aBaseComponents.aQuery.getLength());
772 : }
773 : else
774 : {
775 8056 : if (*aRelComponents.aPath.pBegin == '/')
776 : aBuffer.append(aRelComponents.aPath.pBegin,
777 3 : aRelComponents.aPath.getLength());
778 : else
779 : aBuffer.append(joinPaths(aBaseComponents.aPath,
780 8053 : aRelComponents.aPath));
781 8056 : if (aRelComponents.aQuery.isPresent())
782 : aBuffer.append(aRelComponents.aQuery.pBegin,
783 6 : aRelComponents.aQuery.getLength());
784 : }
785 : }
786 8059 : if (aRelComponents.aFragment.isPresent())
787 : aBuffer.append(aRelComponents.aFragment.pBegin,
788 6 : aRelComponents.aFragment.getLength());
789 8059 : rtl_uString_assign(pResult, aBuffer.makeStringAndClear().pData);
790 8059 : return true;
791 : }
792 :
793 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|