Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <rtl/ustrbuf.hxx>
21 : #include <i18nutil/casefolding.hxx>
22 : #include <i18nutil/unicode.hxx>
23 :
24 : #include <comphelper/processfactory.hxx>
25 : #include <comphelper/string.hxx>
26 : #include <osl/diagnose.h>
27 :
28 : #include <string.h>
29 :
30 : #include "characterclassificationImpl.hxx"
31 : #include "breakiteratorImpl.hxx"
32 :
33 : #include "transliteration_body.hxx"
34 : #include <boost/scoped_array.hpp>
35 :
36 : using namespace ::com::sun::star::uno;
37 : using namespace ::com::sun::star::lang;
38 :
39 : namespace com { namespace sun { namespace star { namespace i18n {
40 :
41 39816 : Transliteration_body::Transliteration_body()
42 : {
43 39816 : nMappingType = 0;
44 39816 : transliterationName = "Transliteration_body";
45 39816 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body";
46 39816 : }
47 :
48 0 : sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException, std::exception)
49 : {
50 0 : return TransliterationType::ONE_TO_ONE;
51 : }
52 :
53 0 : sal_Bool SAL_CALL Transliteration_body::equals(
54 : const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/,
55 : const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/)
56 : throw(RuntimeException, std::exception)
57 : {
58 0 : throw RuntimeException();
59 : }
60 :
61 : Sequence< OUString > SAL_CALL
62 0 : Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 )
63 : throw( RuntimeException, std::exception)
64 : {
65 0 : Sequence< OUString > ostr(2);
66 0 : ostr[0] = str1;
67 0 : ostr[1] = str2;
68 0 : return ostr;
69 : }
70 :
71 12 : static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar )
72 : {
73 12 : sal_uInt8 nRes = nMappingType;
74 :
75 : // take care of TOGGLE_CASE transliteration:
76 : // nMappingType should not be a combination of flags, thuse we decide now
77 : // which one to use.
78 12 : if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
79 : {
80 12 : const sal_Int16 nType = unicode::getUnicodeType( cChar );
81 12 : if (nType & 0x02 /* lower case*/)
82 10 : nRes = MappingTypeLowerToUpper;
83 : else
84 : {
85 : // should also work properly for non-upper characters like white spacs, numbers, ...
86 2 : nRes = MappingTypeUpperToLower;
87 : }
88 : }
89 :
90 12 : return nRes;
91 : }
92 :
93 : OUString SAL_CALL
94 13777411 : Transliteration_body::transliterate(
95 : const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
96 : Sequence< sal_Int32 >& offset)
97 : throw(RuntimeException, std::exception)
98 : {
99 13777411 : const sal_Unicode *in = inStr.getStr() + startPos;
100 :
101 : // Two different blocks to eliminate the if(useOffset) condition inside the
102 : // inner k loop. Yes, on massive use even such small things do count.
103 13777411 : if ( useOffset )
104 : {
105 458 : sal_Int32 nOffCount = 0, i;
106 4397 : for (i = 0; i < nCount; i++)
107 : {
108 : // take care of TOGGLE_CASE transliteration:
109 3939 : sal_uInt8 nTmpMappingType = nMappingType;
110 3939 : if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
111 6 : nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
112 :
113 3939 : const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
114 3939 : nOffCount += map.nmap;
115 : }
116 458 : rtl_uString* pStr = rtl_uString_alloc(nOffCount);
117 458 : sal_Unicode* out = pStr->buffer;
118 :
119 458 : if ( nOffCount != offset.getLength() )
120 0 : offset.realloc( nOffCount );
121 :
122 458 : sal_Int32 j = 0;
123 458 : sal_Int32 * pArr = offset.getArray();
124 4397 : for (i = 0; i < nCount; i++)
125 : {
126 : // take care of TOGGLE_CASE transliteration:
127 3939 : sal_uInt8 nTmpMappingType = nMappingType;
128 3939 : if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
129 6 : nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
130 :
131 3939 : const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
132 7878 : for (sal_Int32 k = 0; k < map.nmap; k++)
133 : {
134 3939 : pArr[j] = i + startPos;
135 3939 : out[j++] = map.map[k];
136 : }
137 : }
138 458 : out[j] = 0;
139 :
140 458 : return OUString( pStr, SAL_NO_ACQUIRE );
141 : }
142 : else
143 : {
144 : // In the simple case of no offset sequence used we can eliminate the
145 : // first getValue() loop. We could also assume that most calls result
146 : // in identical string lengths, thus using a preallocated
147 : // OUStringBuffer could be an easy way to assemble the return string
148 : // without too much hassle. However, for single characters the
149 : // OUStringBuffer::append() method is quite expensive compared to a
150 : // simple array operation, so it pays here to copy the final result
151 : // instead.
152 :
153 : // Allocate the max possible buffer. Try to use stack instead of heap,
154 : // which would have to be reallocated most times anyways.
155 13776953 : const sal_Int32 nLocalBuf = 2048;
156 13776953 : sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf;
157 13776953 : boost::scoped_array<sal_Unicode> pHeapBuf;
158 13776953 : if ( nCount > nLocalBuf ) {
159 1 : pHeapBuf.reset(new sal_Unicode[ nCount * NMAPPINGMAX ]);
160 1 : out = pHeapBuf.get();
161 : }
162 :
163 13776953 : sal_Int32 j = 0;
164 66930221 : for ( sal_Int32 i = 0; i < nCount; i++)
165 : {
166 : // take care of TOGGLE_CASE transliteration:
167 53153268 : sal_uInt8 nTmpMappingType = nMappingType;
168 53153268 : if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
169 0 : nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
170 :
171 53153268 : const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
172 106306540 : for (sal_Int32 k = 0; k < map.nmap; k++)
173 : {
174 53153272 : out[j++] = map.map[k];
175 : }
176 : }
177 :
178 27553906 : OUString aRet( out, j );
179 27553906 : return aRet;
180 : }
181 : }
182 :
183 : OUString SAL_CALL
184 0 : Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException, std::exception)
185 : {
186 0 : const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
187 0 : rtl_uString* pStr = rtl_uString_alloc(map.nmap);
188 0 : sal_Unicode* out = pStr->buffer;
189 : sal_Int32 i;
190 :
191 0 : for (i = 0; i < map.nmap; i++)
192 0 : out[i] = map.map[i];
193 0 : out[i] = 0;
194 :
195 0 : return OUString( pStr, SAL_NO_ACQUIRE );
196 : }
197 :
198 : sal_Unicode SAL_CALL
199 9 : Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException, std::exception)
200 : {
201 9 : const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType);
202 9 : if (map.nmap > 1)
203 0 : throw MultipleCharsOutputException();
204 9 : return map.map[0];
205 : }
206 :
207 : OUString SAL_CALL
208 0 : Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
209 : Sequence< sal_Int32 >& offset) throw(RuntimeException, std::exception)
210 : {
211 0 : return this->transliterate(inStr, startPos, nCount, offset);
212 : }
213 :
214 39615 : Transliteration_casemapping::Transliteration_casemapping()
215 : {
216 39615 : nMappingType = 0;
217 39615 : transliterationName = "casemapping(generic)";
218 39615 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping";
219 39615 : }
220 :
221 : void SAL_CALL
222 13776908 : Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale )
223 : {
224 13776908 : nMappingType = rMappingType;
225 13776908 : aLocale = rLocale;
226 13776908 : }
227 :
228 2 : Transliteration_u2l::Transliteration_u2l()
229 : {
230 2 : nMappingType = MappingTypeUpperToLower;
231 2 : transliterationName = "upper_to_lower(generic)";
232 2 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l";
233 2 : }
234 :
235 6 : Transliteration_l2u::Transliteration_l2u()
236 : {
237 6 : nMappingType = MappingTypeLowerToUpper;
238 6 : transliterationName = "lower_to_upper(generic)";
239 6 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u";
240 6 : }
241 :
242 1 : Transliteration_togglecase::Transliteration_togglecase()
243 : {
244 : // usually nMappingType must NOT be a combiantion of different flages here,
245 : // but we take care of that problem in Transliteration_body::transliterate above
246 : // before that value is used. There we will decide which of both is to be used on
247 : // a per character basis.
248 1 : nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
249 1 : transliterationName = "toggle(generic)";
250 1 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
251 1 : }
252 :
253 1 : Transliteration_titlecase::Transliteration_titlecase()
254 : {
255 1 : nMappingType = MappingTypeToTitle;
256 1 : transliterationName = "title(generic)";
257 1 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
258 1 : }
259 :
260 2 : static OUString transliterate_titlecase_Impl(
261 : const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
262 : const Locale &rLocale,
263 : Sequence< sal_Int32 >& offset )
264 : throw(RuntimeException)
265 : {
266 2 : const OUString aText( inStr.copy( startPos, nCount ) );
267 :
268 2 : OUString aRes;
269 2 : if (!aText.isEmpty())
270 : {
271 2 : Reference< XComponentContext > xContext = ::comphelper::getProcessComponentContext();
272 4 : CharacterClassificationImpl aCharClassImpl( xContext );
273 :
274 : // because aCharClassImpl.toTitle does not handle ligatures or Beta but will raise
275 : // an exception we need to handle the first chara manually...
276 :
277 : // we don't want to change surrogates by accident, thuse we use proper code point iteration
278 2 : sal_Int32 nPos = 0;
279 2 : sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos );
280 4 : OUString aResolvedLigature( &cFirstChar, 1 );
281 : // toUpper can be used to properly resolve ligatures and characters like Beta
282 2 : aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
283 : // since toTitle will leave all-uppercase text unchanged we first need to
284 : // use toLower to bring possible 2nd and following charas in lowercase
285 2 : aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
286 2 : sal_Int32 nResolvedLen = aResolvedLigature.getLength();
287 :
288 : // now we can properly use toTitle to get the expected result for the resolved string.
289 : // The rest of the text should just become lowercase.
290 2 : aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale );
291 2 : aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale );
292 2 : offset.realloc( aRes.getLength() );
293 :
294 2 : sal_Int32 *pOffset = offset.getArray();
295 2 : sal_Int32 nLen = offset.getLength();
296 14 : for (sal_Int32 i = 0; i < nLen; ++i)
297 : {
298 12 : sal_Int32 nIdx = 0;
299 12 : if (i >= nResolvedLen)
300 10 : nIdx = i - nResolvedLen + 1;
301 12 : pOffset[i] = nIdx;
302 2 : }
303 : }
304 : #if OSL_DEBUG_LEVEL > 1
305 : const sal_Int32 *pCOffset = offset.getConstArray();
306 : (void) pCOffset;
307 : #endif
308 :
309 2 : return aRes;
310 : }
311 :
312 : // this function expects to be called on a word-by-word basis,
313 : // namely that startPos points to the first char of the word
314 1 : OUString SAL_CALL Transliteration_titlecase::transliterate(
315 : const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
316 : Sequence< sal_Int32 >& offset )
317 : throw(RuntimeException, std::exception)
318 : {
319 1 : return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
320 : }
321 :
322 1 : Transliteration_sentencecase::Transliteration_sentencecase()
323 : {
324 1 : nMappingType = MappingTypeToTitle; // though only to be applied to the first word...
325 1 : transliterationName = "sentence(generic)";
326 1 : implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
327 1 : }
328 :
329 : // this function expects to be called on a sentence-by-sentence basis,
330 : // namely that startPos points to the first word (NOT first char!) in the sentence
331 1 : OUString SAL_CALL Transliteration_sentencecase::transliterate(
332 : const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
333 : Sequence< sal_Int32 >& offset )
334 : throw(RuntimeException, std::exception)
335 : {
336 1 : return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
337 : }
338 :
339 : } } } }
340 :
341 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|