Line data Source code
1 : /*
2 : *******************************************************************************
3 : *
4 : * Copyright (c) 1995-2013 International Business Machines Corporation and others
5 : *
6 : * All rights reserved.
7 : *
8 : * Permission is hereby granted, free of charge, to any person obtaining a copy of
9 : * this software and associated documentation files (the "Software"), to deal in
10 : * the Software without restriction, including without limitation the rights to
11 : * use, copy, modify, merge, publish, distribute, and/or sell copies of the
12 : * Software, and to permit persons to whom the Software is furnished to do so,
13 : * provided that the above copyright notice(s) and this permission notice appear
14 : * in all copies of the Software and that both the above copyright notice(s) and
15 : * this permission notice appear in supporting documentation.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 : * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN
20 : * NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
21 : * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
22 : * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
23 : * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
24 : * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25 : *
26 : * Except as contained in this notice, the name of a copyright holder shall not be
27 : * used in advertising or otherwise to promote the sale, use or other dealings in
28 : * this Software without prior written authorization of the copyright holder.
29 : *
30 : *******************************************************************************
31 : * file name: scrptrun.cpp
32 : *
33 : * created on: 10/17/2001
34 : * created by: Eric R. Mader
35 : */
36 : /**
37 : * This file is largely copied from the ICU project,
38 : * under folder source/extra/scrptrun/scrptrun.cpp
39 : */
40 : #include "unicode/utypes.h"
41 : #include "unicode/uscript.h"
42 :
43 : #include "scrptrun.h"
44 : #include <algorithm>
45 :
46 : namespace {
47 :
48 : struct PairIndices
49 : {
50 : int8_t ma00[0xff];
51 : int8_t ma20[0x7f];
52 : int8_t ma30[0x7f];
53 :
54 267 : PairIndices()
55 : {
56 267 : std::fill_n(ma00, 0xff, -1);
57 267 : std::fill_n(ma20, 0x7f, -1);
58 267 : std::fill_n(ma30, 0x7f, -1);
59 :
60 : // characters in the range 0x0000 - 0x007e (inclusive)
61 : // ascii paired punctuation
62 267 : ma00[0x28] = 0;
63 267 : ma00[0x29] = 1;
64 267 : ma00[0x3c] = 2;
65 267 : ma00[0x3e] = 3;
66 267 : ma00[0x5b] = 4;
67 267 : ma00[0x5d] = 5;
68 267 : ma00[0x7b] = 6;
69 267 : ma00[0x7d] = 7;
70 : // guillemets
71 267 : ma00[0xab] = 8;
72 267 : ma00[0xbb] = 9;
73 :
74 : // characters in the range 0x2000 - 0x207e (inclusive)
75 : // general punctuation
76 267 : ma20[0x18] = 10;
77 267 : ma20[0x19] = 11;
78 267 : ma20[0x1c] = 12;
79 267 : ma20[0x1d] = 13;
80 267 : ma20[0x39] = 14;
81 267 : ma20[0x3a] = 15;
82 :
83 : // characters in the range 0x3000 - 0x307e (inclusive)
84 : // chinese paired punctuation
85 267 : ma30[0x08] = 16;
86 267 : ma30[0x09] = 17;
87 267 : ma30[0x0a] = 18;
88 267 : ma30[0x0b] = 19;
89 267 : ma30[0x0c] = 20;
90 267 : ma30[0x0d] = 21;
91 267 : ma30[0x0e] = 22;
92 267 : ma30[0x0f] = 23;
93 267 : ma30[0x10] = 24;
94 267 : ma30[0x11] = 25;
95 267 : ma30[0x14] = 26;
96 267 : ma30[0x15] = 27;
97 267 : ma30[0x16] = 28;
98 267 : ma30[0x17] = 29;
99 267 : ma30[0x18] = 30;
100 267 : ma30[0x19] = 31;
101 267 : ma30[0x1a] = 32;
102 267 : ma30[0x1b] = 33;
103 267 : }
104 :
105 92234277 : inline int32_t getPairIndex(UChar32 ch) const
106 : {
107 92234277 : if (ch < 0xff)
108 92054380 : return ma00[ch];
109 179897 : if (ch >= 0x2000 && ch < 0x207f)
110 17383 : return ma20[ch - 0x2000];
111 162514 : if (ch >= 0x3000 && ch < 0x307f)
112 45 : return ma30[ch - 0x3000];
113 162469 : return -1;
114 : }
115 :
116 : };
117 :
118 : }
119 :
120 267 : static const PairIndices gPairIndices;
121 :
122 :
123 : namespace vcl {
124 :
125 : const char ScriptRun::fgClassID=0;
126 :
127 92234277 : static inline UBool sameScript(int32_t scriptOne, int32_t scriptTwo)
128 : {
129 92234277 : return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
130 : }
131 :
132 2983696 : UBool ScriptRun::next()
133 : {
134 2983696 : int32_t startSP = parenSP; // used to find the first new open character
135 2983696 : UErrorCode error = U_ZERO_ERROR;
136 :
137 : // if we've fallen off the end of the text, we're done
138 2983696 : if (scriptEnd >= charLimit) {
139 1513433 : return false;
140 : }
141 :
142 1470263 : scriptCode = USCRIPT_COMMON;
143 :
144 93701491 : for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
145 92234277 : UChar high = charArray[scriptEnd];
146 92234277 : UChar32 ch = high;
147 :
148 : // if the character is a high surrogate and it's not the last one
149 : // in the text, see if it's followed by a low surrogate
150 92234277 : if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1)
151 : {
152 0 : UChar low = charArray[scriptEnd + 1];
153 :
154 : // if it is followed by a low surrogate,
155 : // consume it and form the full character
156 0 : if (low >= 0xDC00 && low <= 0xDFFF) {
157 0 : ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
158 0 : scriptEnd += 1;
159 : }
160 : }
161 :
162 92234277 : UScriptCode sc = uscript_getScript(ch, &error);
163 92234277 : int32_t pairIndex = gPairIndices.getPairIndex(ch);
164 :
165 : // Paired character handling:
166 :
167 : // if it's an open character, push it onto the stack.
168 : // if it's a close character, find the matching open on the
169 : // stack, and use that script code. Any non-matching open
170 : // characters above it on the stack will be poped.
171 92234277 : if (pairIndex >= 0) {
172 90759 : if ((pairIndex & 1) == 0) {
173 43048 : ++parenSP;
174 43048 : int32_t nVecSize = parenStack.size();
175 43048 : if (parenSP == nVecSize)
176 0 : parenStack.resize(nVecSize + 128);
177 43048 : parenStack[parenSP].pairIndex = pairIndex;
178 43048 : parenStack[parenSP].scriptCode = scriptCode;
179 47711 : } else if (parenSP >= 0) {
180 36339 : int32_t pi = pairIndex & ~1;
181 :
182 73467 : while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
183 789 : parenSP -= 1;
184 : }
185 :
186 36339 : if (parenSP < startSP) {
187 125 : startSP = parenSP;
188 : }
189 :
190 36339 : if (parenSP >= 0) {
191 35753 : sc = parenStack[parenSP].scriptCode;
192 : }
193 : }
194 : }
195 :
196 92234277 : if (sameScript(scriptCode, sc)) {
197 92231228 : if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
198 953329 : scriptCode = sc;
199 :
200 : // now that we have a final script code, fix any open
201 : // characters we pushed before we knew the script code.
202 1913998 : while (startSP < parenSP) {
203 7340 : parenStack[++startSP].scriptCode = scriptCode;
204 : }
205 : }
206 :
207 : // if this character is a close paired character,
208 : // pop it from the stack
209 92231228 : if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
210 35732 : parenSP -= 1;
211 : /* decrement startSP only if it is >= 0,
212 : decrementing it unnecessarily will lead to memory corruption
213 : while processing the above while block.
214 : e.g. startSP = -4 , parenSP = -1
215 : */
216 35732 : if (startSP >= 0) {
217 7287 : startSP -= 1;
218 : }
219 : }
220 : } else {
221 : // if the run broke on a surrogate pair,
222 : // end it before the high surrogate
223 3049 : if (ch >= 0x10000) {
224 0 : scriptEnd -= 1;
225 : }
226 :
227 3049 : break;
228 : }
229 : }
230 :
231 1470263 : return true;
232 : }
233 :
234 801 : }
|