Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 : #include <string.h>
20 :
21 : #include <algorithm>
22 :
23 : #include <sal/types.h>
24 :
25 : #include <rtl/textenc.h>
26 : #include <rtl/tencinfo.h>
27 :
28 : #include <com/sun/star/io/XInputStream.hpp>
29 :
30 : using namespace ::com::sun::star::uno;
31 : using namespace ::com::sun::star::io;
32 :
33 :
34 : #include "xml2utf.hxx"
35 : #include <boost/scoped_array.hpp>
36 :
37 : namespace sax_expatwrap {
38 :
39 89241 : sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
40 : throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
41 : {
42 89241 : if( ! m_in.is() ) {
43 0 : throw NotConnectedException();
44 : }
45 89241 : if( ! m_bStarted ) {
46 : // it should be possible to find the encoding attribute
47 : // within the first 512 bytes == 128 chars in UCS-4
48 42949 : nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
49 : }
50 :
51 : sal_Int32 nRead;
52 89241 : Sequence< sal_Int8 > seqStart;
53 : while( true )
54 : {
55 89244 : nRead = m_in->readSomeBytes( seq , nMaxToRead );
56 :
57 89244 : if( nRead + seqStart.getLength())
58 : {
59 : // if nRead is 0, the file is already eof.
60 46300 : if( ! m_bStarted && nRead )
61 : {
62 : // ensure that enough data is available to parse encoding
63 42359 : if( seqStart.getLength() )
64 : {
65 : // prefix with what we had so far.
66 2 : sal_Int32 nLength = seq.getLength();
67 2 : seq.realloc( seqStart.getLength() + nLength );
68 :
69 4 : memmove (seq.getArray() + seqStart.getLength(),
70 2 : seq.getConstArray(),
71 6 : nLength);
72 2 : memcpy (seq.getArray(),
73 2 : seqStart.getConstArray(),
74 4 : seqStart.getLength());
75 : }
76 :
77 : // autodetection with the first bytes
78 42359 : if( ! isEncodingRecognizable( seq ) )
79 : {
80 : // remember what we have so far.
81 3 : seqStart = seq;
82 :
83 : // read more !
84 3 : continue;
85 : }
86 42356 : if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
87 : // initialize decoding
88 42044 : initializeDecoding();
89 : }
90 42356 : seqStart = Sequence < sal_Int8 > ();
91 : }
92 :
93 : // do the encoding
94 46297 : if( m_pText2Unicode && m_pUnicode2Text &&
95 46297 : m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
96 :
97 0 : Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
98 0 : seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
99 : }
100 :
101 46297 : if( ! m_bStarted )
102 : {
103 : // it must now be ensured, that no encoding attribute exist anymore
104 : // ( otherwise the expat-Parser will crash )
105 : // This must be done after decoding !
106 : // ( e.g. Files decoded in ucs-4 cannot be read properly )
107 42357 : m_bStarted = true;
108 42357 : removeEncoding( seq );
109 : }
110 46297 : nRead = seq.getLength();
111 : }
112 :
113 89241 : break;
114 : }
115 89241 : return nRead;
116 : }
117 :
118 :
119 172930 : XMLFile2UTFConverter::~XMLFile2UTFConverter()
120 : {
121 86465 : if( m_pText2Unicode )
122 0 : delete m_pText2Unicode;
123 86465 : if( m_pUnicode2Text )
124 0 : delete m_pUnicode2Text;
125 86465 : }
126 :
127 :
128 42357 : void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
129 : {
130 42357 : const sal_Int8 *pSource = seq.getArray();
131 42357 : if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4) )
132 : {
133 :
134 : // scan for encoding
135 42044 : OString str( reinterpret_cast<char const *>(pSource), seq.getLength() );
136 :
137 : // cut sequence to first line break
138 : // find first line break;
139 42044 : int nMax = str.indexOf( 10 );
140 42044 : if( nMax >= 0 )
141 : {
142 41975 : str = str.copy( 0 , nMax );
143 : }
144 :
145 42044 : int nFound = str.indexOf( " encoding" );
146 42044 : if( nFound >= 0 ) {
147 : int nStop;
148 42027 : int nStart = str.indexOf( "\"" , nFound );
149 42027 : if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
150 : {
151 42026 : nStart = str.indexOf( "'" , nFound );
152 42026 : nStop = str.indexOf( "'" , nStart +1 );
153 : }
154 : else
155 : {
156 1 : nStop = str.indexOf( "\"" , nStart +1);
157 : }
158 :
159 42027 : if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
160 : {
161 : // remove encoding tag from file
162 2 : memmove( &( seq.getArray()[nFound] ) ,
163 1 : &( seq.getArray()[nStop+1]) ,
164 3 : seq.getLength() - nStop -1);
165 1 : seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
166 : // str = String( (char * ) seq.getArray() , seq.getLen() );
167 : }
168 42044 : }
169 : }
170 42357 : }
171 :
172 : // Checks, if enough data has been accumulated to recognize the encoding
173 42359 : bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
174 : {
175 42359 : const sal_Int8 *pSource = seq.getConstArray();
176 42359 : bool bCheckIfFirstClosingBracketExsists = false;
177 :
178 42359 : if( seq.getLength() < 8 ) {
179 : // no recognition possible, when less than 8 bytes are available
180 1 : return false;
181 : }
182 :
183 42358 : if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
184 : // scan if the <?xml tag finishes within this buffer
185 42016 : bCheckIfFirstClosingBracketExsists = true;
186 : }
187 654 : else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
188 624 : ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
189 : {
190 : // check for utf-16
191 0 : bCheckIfFirstClosingBracketExsists = true;
192 : }
193 372 : else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
194 60 : ( '?' == pSource[5] || '?' == pSource[7] ) )
195 : {
196 : // check for
197 0 : bCheckIfFirstClosingBracketExsists = true;
198 : }
199 :
200 42358 : if( bCheckIfFirstClosingBracketExsists )
201 : {
202 2136853 : for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
203 : {
204 : // whole <?xml tag is valid
205 2136851 : if( '>' == pSource[ i ] )
206 : {
207 42014 : return true;
208 : }
209 : }
210 2 : return false;
211 : }
212 :
213 : // No <? tag in front, no need for a bigger buffer
214 342 : return true;
215 : }
216 :
217 42356 : bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
218 : {
219 42356 : const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
220 42356 : bool bReturn = true;
221 :
222 42356 : if( seq.getLength() < 4 ) {
223 : // no recognition possible, when less than 4 bytes are available
224 0 : return false;
225 : }
226 :
227 : // first level : detect possible file formats
228 42356 : if( ! strncmp( reinterpret_cast<const char *>(pSource), "<?xml", 4 ) ) {
229 :
230 : // scan for encoding
231 42014 : OString str( reinterpret_cast<const char *>(pSource), seq.getLength() );
232 :
233 : // cut sequence to first line break
234 : //find first line break;
235 42014 : int nMax = str.indexOf( 10 );
236 42014 : if( nMax >= 0 )
237 : {
238 41957 : str = str.copy( 0 , nMax );
239 : }
240 :
241 42014 : int nFound = str.indexOf( " encoding" );
242 42014 : if( nFound >= 0 ) {
243 : int nStop;
244 41997 : int nStart = str.indexOf( "\"" , nFound );
245 41997 : if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
246 : {
247 41996 : nStart = str.indexOf( "'" , nFound );
248 41996 : nStop = str.indexOf( "'" , nStart +1 );
249 : }
250 : else
251 : {
252 1 : nStop = str.indexOf( "\"" , nStart +1);
253 : }
254 41997 : if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
255 : {
256 : // encoding found finally
257 1 : m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
258 : }
259 42014 : }
260 : }
261 342 : else if( 0xFE == pSource[0] &&
262 0 : 0xFF == pSource[1] ) {
263 : // UTF-16 big endian
264 : // conversion is done so that encoding information can be easily extracted
265 0 : m_sEncoding = "utf-16";
266 : }
267 342 : else if( 0xFF == pSource[0] &&
268 0 : 0xFE == pSource[1] ) {
269 : // UTF-16 little endian
270 : // conversion is done so that encoding information can be easily extracted
271 0 : m_sEncoding = "utf-16";
272 : }
273 342 : else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
274 : // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
275 : // The byte order mark is simply added
276 :
277 : // simply add the byte order mark !
278 0 : seq.realloc( seq.getLength() + 2 );
279 0 : memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
280 0 : reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE;
281 0 : reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
282 :
283 0 : m_sEncoding = "utf-16";
284 : }
285 342 : else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
286 : // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
287 : // The byte order mark is simply added
288 :
289 0 : seq.realloc( seq.getLength() + 2 );
290 0 : memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
291 0 : reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFF;
292 0 : reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFE;
293 :
294 0 : m_sEncoding = "utf-16";
295 : }
296 372 : else if( 0xEF == pSource[0] &&
297 60 : 0xBB == pSource[1] &&
298 30 : 0xBF == pSource[2] )
299 : {
300 : // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
301 : // The BOM is removed.
302 30 : memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
303 30 : seq.realloc( seq.getLength() - 3 );
304 30 : m_sEncoding = "utf-8";
305 : }
306 312 : else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
307 : // UCS-4 big endian
308 0 : m_sEncoding = "ucs-4";
309 : }
310 312 : else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
311 : // UCS-4 little endian
312 0 : m_sEncoding = "ucs-4";
313 : }
314 : /* TODO: no need to test for the moment since we return sal_False like default case anyway
315 : else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
316 : 0xa7 == static_cast<unsigned char> (pSource[2]) &&
317 : 0x94 == static_cast<unsigned char> (pSource[3]) ) {
318 : // EBCDIC
319 : bReturn = sal_False; // must be extended
320 : }
321 : */
322 : else {
323 : // other
324 : // UTF8 is directly recognized by the parser.
325 312 : bReturn = false;
326 : }
327 :
328 42356 : return bReturn;
329 : }
330 :
331 42044 : void XMLFile2UTFConverter::initializeDecoding()
332 : {
333 :
334 42044 : if( !m_sEncoding.isEmpty() )
335 : {
336 31 : rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
337 31 : if( encoding != RTL_TEXTENCODING_UTF8 )
338 : {
339 0 : m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
340 0 : m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
341 : }
342 : }
343 42044 : }
344 :
345 :
346 :
347 :
348 : // Text2UnicodeConverter
349 :
350 :
351 0 : Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
352 : : m_convText2Unicode(NULL)
353 : , m_contextText2Unicode(NULL)
354 0 : , m_rtlEncoding(RTL_TEXTENCODING_DONTKNOW)
355 : {
356 0 : rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357 0 : if( RTL_TEXTENCODING_DONTKNOW == encoding )
358 : {
359 0 : m_bCanContinue = false;
360 0 : m_bInitialized = false;
361 : }
362 : else
363 : {
364 0 : init( encoding );
365 : }
366 0 : }
367 :
368 0 : Text2UnicodeConverter::~Text2UnicodeConverter()
369 : {
370 0 : if( m_bInitialized )
371 : {
372 0 : rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373 0 : rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
374 : }
375 0 : }
376 :
377 0 : void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
378 : {
379 0 : m_bCanContinue = true;
380 0 : m_bInitialized = true;
381 :
382 0 : m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
383 0 : m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384 0 : m_rtlEncoding = encoding;
385 0 : }
386 :
387 :
388 0 : Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
389 : {
390 : sal_uInt32 uiInfo;
391 0 : sal_Size nSrcCvtBytes = 0;
392 0 : sal_Size nTargetCount = 0;
393 0 : sal_Size nSourceCount = 0;
394 :
395 : // the whole source size
396 0 : sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
397 0 : Sequence<sal_Unicode> seqUnicode ( nSourceSize );
398 :
399 0 : const sal_Int8 *pbSource = seqText.getConstArray();
400 0 : boost::scoped_array<sal_Int8> pbTempMem;
401 :
402 0 : if( m_seqSource.getLength() ) {
403 : // put old rest and new byte sequence into one array
404 0 : pbTempMem.reset(new sal_Int8[ nSourceSize ]);
405 0 : memcpy( pbTempMem.get() , m_seqSource.getConstArray() , m_seqSource.getLength() );
406 0 : memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
407 0 : pbSource = pbTempMem.get();
408 :
409 : // set to zero again
410 0 : m_seqSource = Sequence< sal_Int8 >();
411 : }
412 :
413 : while( true ) {
414 :
415 : /* All invalid characters are transformed to the unicode undefined char */
416 : nTargetCount += rtl_convertTextToUnicode(
417 : m_convText2Unicode,
418 : m_contextText2Unicode,
419 : reinterpret_cast<const char *>(&( pbSource[nSourceCount] )),
420 : nSourceSize - nSourceCount ,
421 0 : &( seqUnicode.getArray()[ nTargetCount ] ),
422 0 : seqUnicode.getLength() - nTargetCount,
423 : RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
424 : RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
425 : RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
426 : &uiInfo,
427 0 : &nSrcCvtBytes );
428 0 : nSourceCount += nSrcCvtBytes;
429 :
430 0 : if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
431 : // save necessary bytes for next conversion
432 0 : seqUnicode.realloc( seqUnicode.getLength() * 2 );
433 0 : continue;
434 : }
435 0 : break;
436 : }
437 0 : if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
438 0 : m_seqSource.realloc( nSourceSize - nSourceCount );
439 0 : memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
440 : }
441 :
442 : // set to correct unicode size
443 0 : seqUnicode.realloc( nTargetCount );
444 :
445 0 : return seqUnicode;
446 : }
447 :
448 :
449 :
450 :
451 :
452 : // Unicode2TextConverter
453 :
454 :
455 0 : Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
456 : {
457 0 : init( encoding );
458 0 : }
459 :
460 :
461 0 : Unicode2TextConverter::~Unicode2TextConverter()
462 : {
463 0 : if( m_bInitialized ) {
464 0 : rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
465 0 : rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
466 : }
467 0 : }
468 :
469 :
470 0 : Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
471 : {
472 0 : boost::scoped_array<sal_Unicode> puTempMem;
473 :
474 0 : if( m_seqSource.getLength() ) {
475 : // For surrogates !
476 : // put old rest and new byte sequence into one array
477 : // In general when surrogates are used, they should be rarely
478 : // cut off between two convert()-calls. So this code is used
479 : // rarely and the extra copy is acceptable.
480 0 : puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
481 0 : memcpy( puTempMem.get() ,
482 0 : m_seqSource.getConstArray() ,
483 0 : m_seqSource.getLength() * sizeof( sal_Unicode ) );
484 : memcpy(
485 0 : &(puTempMem[ m_seqSource.getLength() ]) ,
486 : puSource ,
487 0 : nSourceSize*sizeof( sal_Unicode ) );
488 0 : puSource = puTempMem.get();
489 0 : nSourceSize += m_seqSource.getLength();
490 :
491 0 : m_seqSource = Sequence< sal_Unicode > ();
492 : }
493 :
494 :
495 0 : sal_Size nTargetCount = 0;
496 0 : sal_Size nSourceCount = 0;
497 :
498 : sal_uInt32 uiInfo;
499 : sal_Size nSrcCvtChars;
500 :
501 : // take nSourceSize * 3 as preference
502 : // this is an upper boundary for converting to utf8,
503 : // which most often used as the target.
504 0 : sal_Int32 nSeqSize = nSourceSize * 3;
505 :
506 0 : Sequence<sal_Int8> seqText( nSeqSize );
507 0 : sal_Char *pTarget = reinterpret_cast<char *>(seqText.getArray());
508 : while( true ) {
509 :
510 : nTargetCount += rtl_convertUnicodeToText(
511 : m_convUnicode2Text,
512 : m_contextUnicode2Text,
513 : &( puSource[nSourceCount] ),
514 : nSourceSize - nSourceCount ,
515 : &( pTarget[nTargetCount] ),
516 : nSeqSize - nTargetCount,
517 : RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
518 : RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
519 : &uiInfo,
520 0 : &nSrcCvtChars);
521 0 : nSourceCount += nSrcCvtChars;
522 :
523 0 : if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
524 0 : nSeqSize = nSeqSize *2;
525 0 : seqText.realloc( nSeqSize ); // double array size
526 0 : pTarget = reinterpret_cast<char *>(seqText.getArray());
527 0 : continue;
528 : }
529 0 : break;
530 : }
531 :
532 : // for surrogates
533 0 : if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
534 0 : m_seqSource.realloc( nSourceSize - nSourceCount );
535 0 : memcpy( m_seqSource.getArray() ,
536 0 : &(puSource[nSourceCount]),
537 0 : (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
538 : }
539 :
540 : // reduce the size of the buffer (fast, no copy necessary)
541 0 : seqText.realloc( nTargetCount );
542 :
543 0 : return seqText;
544 : }
545 :
546 0 : void Unicode2TextConverter::init( rtl_TextEncoding encoding )
547 : {
548 0 : m_bCanContinue = true;
549 0 : m_bInitialized = true;
550 :
551 0 : m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
552 0 : m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
553 0 : m_rtlEncoding = encoding;
554 0 : };
555 :
556 :
557 : }
558 :
559 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|