Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 : #include <string.h>
20 :
21 : #include <algorithm>
22 :
23 : #include <sal/types.h>
24 :
25 : #include <rtl/textenc.h>
26 : #include <rtl/tencinfo.h>
27 :
28 : #include <com/sun/star/io/XInputStream.hpp>
29 :
30 : using namespace ::com::sun::star::uno;
31 : using namespace ::com::sun::star::io;
32 :
33 :
34 : #include "xml2utf.hxx"
35 :
36 : namespace sax_expatwrap {
37 :
38 11779 : sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
39 : throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
40 : {
41 :
42 11779 : Sequence<sal_Int8> seqIn;
43 :
44 11779 : if( ! m_in.is() ) {
45 0 : throw NotConnectedException();
46 : }
47 11779 : if( ! m_bStarted ) {
48 : // it should be possible to find the encoding attribute
49 : // within the first 512 bytes == 128 chars in UCS-4
50 6029 : nMaxToRead = ::std::max( sal_Int32(512) , nMaxToRead );
51 : }
52 :
53 : sal_Int32 nRead;
54 23558 : Sequence< sal_Int8 > seqStart;
55 : while( true )
56 : {
57 11779 : nRead = m_in->readSomeBytes( seq , nMaxToRead );
58 :
59 11779 : if( nRead + seqStart.getLength())
60 : {
61 : // if nRead is 0, the file is already eof.
62 5751 : if( ! m_bStarted && nRead )
63 : {
64 : // ensure that enough data is available to parse encoding
65 5407 : if( seqStart.getLength() )
66 : {
67 : // prefix with what we had so far.
68 0 : sal_Int32 nLength = seq.getLength();
69 0 : seq.realloc( seqStart.getLength() + nLength );
70 :
71 0 : memmove (seq.getArray() + seqStart.getLength(),
72 0 : seq.getConstArray(),
73 0 : nLength);
74 0 : memcpy (seq.getArray(),
75 0 : seqStart.getConstArray(),
76 0 : seqStart.getLength());
77 : }
78 :
79 : // autodetection with the first bytes
80 5407 : if( ! isEncodingRecognizable( seq ) )
81 : {
82 : // remember what we have so far.
83 0 : seqStart = seq;
84 :
85 : // read more !
86 0 : continue;
87 : }
88 5407 : if( scanForEncoding( seq ) || !m_sEncoding.isEmpty() ) {
89 : // initialize decoding
90 5405 : initializeDecoding();
91 : }
92 5407 : nRead = seq.getLength();
93 5407 : seqStart = Sequence < sal_Int8 > ();
94 : }
95 :
96 : // do the encoding
97 5751 : if( m_pText2Unicode && m_pUnicode2Text &&
98 5751 : m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
99 :
100 0 : Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
101 0 : seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
102 : }
103 :
104 5751 : if( ! m_bStarted )
105 : {
106 : // it must now be ensured, that no encoding attribute exist anymore
107 : // ( otherwise the expat-Parser will crash )
108 : // This must be done after decoding !
109 : // ( e.g. Files decoded in ucs-4 cannot be read properly )
110 5407 : m_bStarted = sal_True;
111 5407 : removeEncoding( seq );
112 : }
113 5751 : nRead = seq.getLength();
114 : }
115 :
116 11779 : break;
117 : }
118 23558 : return nRead;
119 : }
120 :
121 :
122 24324 : XMLFile2UTFConverter::~XMLFile2UTFConverter()
123 : {
124 12162 : if( m_pText2Unicode )
125 0 : delete m_pText2Unicode;
126 12162 : if( m_pUnicode2Text )
127 0 : delete m_pUnicode2Text;
128 12162 : }
129 :
130 :
131 5407 : void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
132 : {
133 5407 : const sal_Int8 *pSource = seq.getArray();
134 5407 : if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
135 : {
136 :
137 : // scan for encoding
138 5405 : OString str( (sal_Char * ) pSource , seq.getLength() );
139 :
140 : // cut sequence to first line break
141 : // find first line break;
142 5405 : int nMax = str.indexOf( 10 );
143 5405 : if( nMax >= 0 )
144 : {
145 5374 : str = str.copy( 0 , nMax );
146 : }
147 :
148 5405 : int nFound = str.indexOf( " encoding" );
149 5405 : if( nFound >= 0 ) {
150 : int nStop;
151 5402 : int nStart = str.indexOf( "\"" , nFound );
152 5402 : if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
153 : {
154 5402 : nStart = str.indexOf( "'" , nFound );
155 5402 : nStop = str.indexOf( "'" , nStart +1 );
156 : }
157 : else
158 : {
159 0 : nStop = str.indexOf( "\"" , nStart +1);
160 : }
161 :
162 5402 : if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
163 : {
164 : // remove encoding tag from file
165 0 : memmove( &( seq.getArray()[nFound] ) ,
166 0 : &( seq.getArray()[nStop+1]) ,
167 0 : seq.getLength() - nStop -1);
168 0 : seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
169 : // str = String( (char * ) seq.getArray() , seq.getLen() );
170 : }
171 5405 : }
172 : }
173 5407 : }
174 :
175 : // Checks, if enough data has been accumulated to recognize the encoding
176 5407 : sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
177 : {
178 5407 : const sal_Int8 *pSource = seq.getConstArray();
179 5407 : sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
180 :
181 5407 : if( seq.getLength() < 8 ) {
182 : // no recognition possible, when less than 8 bytes are available
183 0 : return sal_False;
184 : }
185 :
186 5407 : if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
187 : // scan if the <?xml tag finishes within this buffer
188 5399 : bCheckIfFirstClosingBracketExsists = sal_True;
189 : }
190 10 : else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
191 4 : ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
192 : {
193 : // check for utf-16
194 0 : bCheckIfFirstClosingBracketExsists = sal_True;
195 : }
196 14 : else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
197 12 : ( '?' == pSource[5] || '?' == pSource[7] ) )
198 : {
199 : // check for
200 0 : bCheckIfFirstClosingBracketExsists = sal_True;
201 : }
202 :
203 5407 : if( bCheckIfFirstClosingBracketExsists )
204 : {
205 240918 : for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
206 : {
207 : // whole <?xml tag is valid
208 240918 : if( '>' == pSource[ i ] )
209 : {
210 5399 : return sal_True;
211 : }
212 : }
213 0 : return sal_False;
214 : }
215 :
216 : // No <? tag in front, no need for a bigger buffer
217 8 : return sal_True;
218 : }
219 :
220 5407 : sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
221 : {
222 5407 : const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
223 5407 : sal_Bool bReturn = sal_True;
224 :
225 5407 : if( seq.getLength() < 4 ) {
226 : // no recognition possible, when less than 4 bytes are available
227 0 : return sal_False;
228 : }
229 :
230 : // first level : detect possible file formats
231 5407 : if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
232 :
233 : // scan for encoding
234 5399 : OString str( (const sal_Char *) pSource , seq.getLength() );
235 :
236 : // cut sequence to first line break
237 : //find first line break;
238 5399 : int nMax = str.indexOf( 10 );
239 5399 : if( nMax >= 0 )
240 : {
241 5374 : str = str.copy( 0 , nMax );
242 : }
243 :
244 5399 : int nFound = str.indexOf( " encoding" );
245 5399 : if( nFound < str.getLength() ) {
246 : int nStop;
247 5399 : int nStart = str.indexOf( "\"" , nFound );
248 5399 : if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
249 : {
250 5399 : nStart = str.indexOf( "'" , nFound );
251 5399 : nStop = str.indexOf( "'" , nStart +1 );
252 : }
253 : else
254 : {
255 0 : nStop = str.indexOf( "\"" , nStart +1);
256 : }
257 5399 : if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
258 : {
259 : // encoding found finally
260 0 : m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
261 : }
262 5399 : }
263 : }
264 8 : else if( 0xFE == pSource[0] &&
265 0 : 0xFF == pSource[1] ) {
266 : // UTF-16 big endian
267 : // conversion is done so that encoding information can be easily extracted
268 0 : m_sEncoding = "utf-16";
269 : }
270 8 : else if( 0xFF == pSource[0] &&
271 0 : 0xFE == pSource[1] ) {
272 : // UTF-16 little endian
273 : // conversion is done so that encoding information can be easily extracted
274 0 : m_sEncoding = "utf-16";
275 : }
276 8 : else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
277 : // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
278 : // The byte order mark is simply added
279 :
280 : // simply add the byte order mark !
281 0 : seq.realloc( seq.getLength() + 2 );
282 0 : memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
283 0 : ((sal_uInt8*)seq.getArray())[0] = 0xFE;
284 0 : ((sal_uInt8*)seq.getArray())[1] = 0xFF;
285 :
286 0 : m_sEncoding = "utf-16";
287 : }
288 8 : else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
289 : // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
290 : // The byte order mark is simply added
291 :
292 0 : seq.realloc( seq.getLength() + 2 );
293 0 : memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
294 0 : ((sal_uInt8*)seq.getArray())[0] = 0xFF;
295 0 : ((sal_uInt8*)seq.getArray())[1] = 0xFE;
296 :
297 0 : m_sEncoding = "utf-16";
298 : }
299 14 : else if( 0xEF == pSource[0] &&
300 12 : 0xBB == pSource[1] &&
301 6 : 0xBF == pSource[2] )
302 : {
303 : // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
304 : // The BOM is removed.
305 6 : memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
306 6 : seq.realloc( seq.getLength() - 3 );
307 6 : m_sEncoding = "utf-8";
308 : }
309 2 : else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
310 : // UCS-4 big endian
311 0 : m_sEncoding = "ucs-4";
312 : }
313 2 : else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
314 : // UCS-4 little endian
315 0 : m_sEncoding = "ucs-4";
316 : }
317 2 : else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
318 0 : 0xa7 == static_cast<unsigned char> (pSource[2]) &&
319 0 : 0x94 == static_cast<unsigned char> (pSource[3]) ) {
320 : // EBCDIC
321 0 : bReturn = sal_False; // must be extended
322 : }
323 : else {
324 : // other
325 : // UTF8 is directly recognized by the parser.
326 2 : bReturn = sal_False;
327 : }
328 :
329 5407 : return bReturn;
330 : }
331 :
332 5405 : void XMLFile2UTFConverter::initializeDecoding()
333 : {
334 :
335 5405 : if( !m_sEncoding.isEmpty() )
336 : {
337 6 : rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
338 6 : if( encoding != RTL_TEXTENCODING_UTF8 )
339 : {
340 0 : m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
341 0 : m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
342 : }
343 : }
344 5405 : }
345 :
346 :
347 : //----------------------------------------------
348 : //
349 : // Text2UnicodeConverter
350 : //
351 : //----------------------------------------------
352 0 : Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
353 : {
354 0 : rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
355 0 : if( RTL_TEXTENCODING_DONTKNOW == encoding )
356 : {
357 0 : m_bCanContinue = sal_False;
358 0 : m_bInitialized = sal_False;
359 : }
360 : else
361 : {
362 0 : init( encoding );
363 : }
364 0 : }
365 :
366 0 : Text2UnicodeConverter::~Text2UnicodeConverter()
367 : {
368 0 : if( m_bInitialized )
369 : {
370 0 : rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
371 0 : rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
372 : }
373 0 : }
374 :
375 0 : void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
376 : {
377 0 : m_bCanContinue = sal_True;
378 0 : m_bInitialized = sal_True;
379 :
380 0 : m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
381 0 : m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
382 0 : m_rtlEncoding = encoding;
383 0 : }
384 :
385 :
386 0 : Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
387 : {
388 : sal_uInt32 uiInfo;
389 0 : sal_Size nSrcCvtBytes = 0;
390 0 : sal_Size nTargetCount = 0;
391 0 : sal_Size nSourceCount = 0;
392 :
393 : // the whole source size
394 0 : sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
395 0 : Sequence<sal_Unicode> seqUnicode ( nSourceSize );
396 :
397 0 : const sal_Int8 *pbSource = seqText.getConstArray();
398 0 : sal_Int8 *pbTempMem = 0;
399 :
400 0 : if( m_seqSource.getLength() ) {
401 : // put old rest and new byte sequence into one array
402 0 : pbTempMem = new sal_Int8[ nSourceSize ];
403 0 : memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
404 0 : memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
405 0 : pbSource = pbTempMem;
406 :
407 : // set to zero again
408 0 : m_seqSource = Sequence< sal_Int8 >();
409 : }
410 :
411 : while( true ) {
412 :
413 : /* All invalid characters are transformed to the unicode undefined char */
414 : nTargetCount += rtl_convertTextToUnicode(
415 : m_convText2Unicode,
416 : m_contextText2Unicode,
417 : ( const sal_Char * ) &( pbSource[nSourceCount] ),
418 : nSourceSize - nSourceCount ,
419 0 : &( seqUnicode.getArray()[ nTargetCount ] ),
420 0 : seqUnicode.getLength() - nTargetCount,
421 : RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
422 : RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
423 : RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
424 : &uiInfo,
425 0 : &nSrcCvtBytes );
426 0 : nSourceCount += nSrcCvtBytes;
427 :
428 0 : if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
429 : // save necessary bytes for next conversion
430 0 : seqUnicode.realloc( seqUnicode.getLength() * 2 );
431 0 : continue;
432 : }
433 0 : break;
434 : }
435 0 : if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
436 0 : m_seqSource.realloc( nSourceSize - nSourceCount );
437 0 : memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
438 : }
439 :
440 :
441 0 : if( pbTempMem ) {
442 0 : delete [] pbTempMem;
443 : }
444 :
445 : // set to correct unicode size
446 0 : seqUnicode.realloc( nTargetCount );
447 :
448 0 : return seqUnicode;
449 : }
450 :
451 :
452 :
453 : //----------------------------------------------
454 : //
455 : // Unicode2TextConverter
456 : //
457 : //----------------------------------------------
458 0 : Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
459 : {
460 0 : init( encoding );
461 0 : }
462 :
463 :
464 0 : Unicode2TextConverter::~Unicode2TextConverter()
465 : {
466 0 : if( m_bInitialized ) {
467 0 : rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
468 0 : rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
469 : }
470 0 : }
471 :
472 :
473 0 : Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
474 : {
475 0 : sal_Unicode *puTempMem = 0;
476 :
477 0 : if( m_seqSource.getLength() ) {
478 : // For surrogates !
479 : // put old rest and new byte sequence into one array
480 : // In general when surrogates are used, they should be rarely
481 : // cut off between two convert()-calls. So this code is used
482 : // rarely and the extra copy is acceptable.
483 0 : puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
484 : memcpy( puTempMem ,
485 0 : m_seqSource.getConstArray() ,
486 0 : m_seqSource.getLength() * sizeof( sal_Unicode ) );
487 : memcpy(
488 0 : &(puTempMem[ m_seqSource.getLength() ]) ,
489 : puSource ,
490 0 : nSourceSize*sizeof( sal_Unicode ) );
491 0 : puSource = puTempMem;
492 0 : nSourceSize += m_seqSource.getLength();
493 :
494 0 : m_seqSource = Sequence< sal_Unicode > ();
495 : }
496 :
497 :
498 0 : sal_Size nTargetCount = 0;
499 0 : sal_Size nSourceCount = 0;
500 :
501 : sal_uInt32 uiInfo;
502 : sal_Size nSrcCvtChars;
503 :
504 : // take nSourceSize * 3 as preference
505 : // this is an upper boundary for converting to utf8,
506 : // which most often used as the target.
507 0 : sal_Int32 nSeqSize = nSourceSize * 3;
508 :
509 0 : Sequence<sal_Int8> seqText( nSeqSize );
510 0 : sal_Char *pTarget = (sal_Char *) seqText.getArray();
511 : while( true ) {
512 :
513 : nTargetCount += rtl_convertUnicodeToText(
514 : m_convUnicode2Text,
515 : m_contextUnicode2Text,
516 : &( puSource[nSourceCount] ),
517 : nSourceSize - nSourceCount ,
518 : &( pTarget[nTargetCount] ),
519 : nSeqSize - nTargetCount,
520 : RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
521 : RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
522 : &uiInfo,
523 0 : &nSrcCvtChars);
524 0 : nSourceCount += nSrcCvtChars;
525 :
526 0 : if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
527 0 : nSeqSize = nSeqSize *2;
528 0 : seqText.realloc( nSeqSize ); // double array size
529 0 : pTarget = ( sal_Char * ) seqText.getArray();
530 0 : continue;
531 : }
532 0 : break;
533 : }
534 :
535 : // for surrogates
536 0 : if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
537 0 : m_seqSource.realloc( nSourceSize - nSourceCount );
538 0 : memcpy( m_seqSource.getArray() ,
539 0 : &(puSource[nSourceCount]),
540 0 : (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
541 : }
542 :
543 0 : if( puTempMem ) {
544 0 : delete [] puTempMem;
545 : }
546 :
547 : // reduce the size of the buffer (fast, no copy necessary)
548 0 : seqText.realloc( nTargetCount );
549 :
550 0 : return seqText;
551 : }
552 :
553 0 : void Unicode2TextConverter::init( rtl_TextEncoding encoding )
554 : {
555 0 : m_bCanContinue = sal_True;
556 0 : m_bInitialized = sal_True;
557 :
558 0 : m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
559 0 : m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
560 0 : m_rtlEncoding = encoding;
561 0 : };
562 :
563 :
564 : }
565 :
566 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|