Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "oox/vml/vmlinputstream.hxx"
21 :
22 : #include <com/sun/star/io/XTextInputStream2.hpp>
23 : #include <map>
24 : #include <string.h>
25 : #include <rtl/strbuf.hxx>
26 : #include "oox/helper/helper.hxx"
27 : #include "oox/helper/textinputstream.hxx"
28 :
29 : namespace oox {
30 : namespace vml {
31 :
32 : using namespace ::com::sun::star::io;
33 : using namespace ::com::sun::star::uno;
34 :
35 : namespace {
36 :
37 52 : inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
38 : {
39 52 : sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
40 52 : return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
41 : }
42 :
43 6414 : inline bool lclIsWhiteSpace( sal_Char cChar )
44 : {
45 6414 : return cChar < 32;
46 : }
47 :
48 280 : const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
49 : {
50 6176 : for( ; pcBeg < pcEnd; ++pcBeg )
51 5918 : if( lclIsWhiteSpace( *pcBeg ) )
52 22 : return pcBeg;
53 258 : return pcEnd;
54 : }
55 :
56 526 : const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
57 : {
58 802 : for( ; pcBeg < pcEnd; ++pcBeg )
59 460 : if( !lclIsWhiteSpace( *pcBeg ) )
60 184 : return pcBeg;
61 342 : return pcEnd;
62 : }
63 :
64 24 : const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
65 : {
66 48 : while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
67 0 : --pcEnd;
68 24 : return pcEnd;
69 : }
70 :
71 288 : inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
72 : {
73 288 : rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
74 288 : }
75 :
76 20 : void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
77 : {
78 : /* Map attribute names to char-pointer of all attributes. This map is used
79 : to find multiple occurrences of attributes with the same name. The
80 : mapped pointers are used as map key in the next map below. */
81 : typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
82 20 : AttributeNameMap aAttributeNames;
83 :
84 : /* Map the char-pointers of all attributes to the full attribute definition
85 : string. This preserves the original order of the used attributes. */
86 : typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
87 40 : AttributeDataMap aAttributes;
88 :
89 20 : bool bOk = true;
90 20 : const sal_Char* pcNameBeg = pcBeg;
91 68 : while( bOk && (pcNameBeg < pcEnd) )
92 : {
93 : // pcNameBeg points to begin of attribute name, find equality sign
94 28 : const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
95 28 : if ((bOk = (pcEqualSign < pcEnd)) == true)
96 : {
97 : // find end of attribute name (ignore whitespace between name and equality sign)
98 24 : const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
99 24 : if( (bOk = (pcNameBeg < pcNameEnd)) == true )
100 : {
101 : // find begin of attribute value (must be single or double quote)
102 24 : const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
103 24 : if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
104 : {
105 : // find end of attribute value (matching quote character)
106 24 : const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
107 24 : if( (bOk = (pcValueEnd < pcEnd)) == true )
108 : {
109 24 : ++pcValueEnd;
110 24 : OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
111 48 : OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
112 : // search for an existing attribute with the same name
113 24 : AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
114 : // remove its definition from the data map
115 24 : if( aIt != aAttributeNames.end() )
116 0 : aAttributes.erase( aIt->second );
117 : // insert the attribute into both maps
118 24 : aAttributeNames[ aAttribName ] = pcNameBeg;
119 24 : aAttributes[ pcNameBeg ] = aAttribData;
120 : // continue with next attribute (skip whitespace after this attribute)
121 24 : pcNameBeg = pcValueEnd;
122 24 : if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
123 32 : pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
124 : }
125 : }
126 : }
127 : }
128 : }
129 :
130 : // if no error has occurred, build the resulting attribute list
131 20 : if( bOk )
132 32 : for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
133 20 : rBuffer.append( ' ' ).append( aIt->second );
134 : // on error, just append the complete passed string
135 : else
136 28 : lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
137 20 : }
138 :
139 214 : void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
140 : {
141 : // check that passed string starts and ends with the brackets of an XML element
142 214 : sal_Int32 nElementLen = rElement.getLength();
143 214 : if( nElementLen == 0 )
144 214 : return;
145 :
146 214 : const sal_Char* pcOpen = rElement.getStr();
147 214 : const sal_Char* pcClose = pcOpen + nElementLen - 1;
148 :
149 : // no complete element found
150 214 : if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
151 : {
152 : // just append all passed characters
153 0 : rBuffer.append( rElement );
154 : }
155 :
156 : // skip parser instructions: '<![...]>'
157 214 : else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
158 : {
159 : // do nothing
160 : }
161 :
162 : // replace '<br>' element with newline
163 214 : else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
164 : {
165 0 : rBuffer.append( '\n' );
166 : }
167 :
168 : // check start elements and simple elements for repeated attributes
169 214 : else if( pcOpen[ 1 ] != '/' )
170 : {
171 : // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
172 148 : const sal_Char* pcContentBeg = pcOpen + 1;
173 148 : bool bIsEmptyElement = pcClose[ -1 ] == '/';
174 148 : const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
175 : // append opening bracket and element name to buffer
176 148 : const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
177 148 : lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
178 : // find begin of attributes, and process all attributes
179 148 : const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
180 148 : if( pcAttribBeg < pcContentEnd )
181 20 : lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
182 : // close the element
183 148 : if( bIsEmptyElement )
184 82 : rBuffer.append( '/' );
185 148 : rBuffer.append( '>' );
186 : }
187 :
188 : // append end elements without further processing
189 : else
190 : {
191 66 : rBuffer.append( rElement );
192 : }
193 : }
194 :
195 214 : bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
196 : {
197 : /* MSO has a very weird way to store and handle whitespaces. The stream
198 : may contain lots of spaces, tabs, and newlines which have to be handled
199 : as single space character. This will be done in this function.
200 :
201 : If the element text contains a literal line break, it will be stored as
202 : <br> tag (without matching </br> element). This input stream wrapper
203 : will replace this element with a literal LF character (see below).
204 :
205 : A single space character for its own is stored as is. Example: The
206 : element
207 : <font> </font>
208 : represents a single space character. The XML parser will ignore this
209 : space character completely without issuing a 'characters' event. The
210 : VML import filter implementation has to react on this case manually.
211 :
212 : A single space character following another character is stored
213 : literally and must not be stipped away here. Example: The element
214 : <font>abc </font>
215 : contains the three letters a, b, and c, followed by a space character.
216 :
217 : Consecutive space characters, or a leading single space character, are
218 : stored in a <span> element. If there are N space characters (N > 1),
219 : then the <span> element contains exactly (N-1) NBSP (non-breaking
220 : space) characters, followed by a regular space character. Examples:
221 : The element
222 : <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
223 : represents 4 consecutive space characters. Has to be handled by the
224 : implementation. The element
225 : <font><span style='mso-spacerun:yes'> abc</span></font>
226 : represents a space characters followed by the letters a, b, c. These
227 : strings have to be handled by the VML import filter implementation.
228 : */
229 :
230 : // passed string ends with the leading opening bracket of an XML element
231 214 : const sal_Char* pcBeg = rChars.getStr();
232 214 : const sal_Char* pcEnd = pcBeg + rChars.getLength();
233 214 : bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
234 214 : if( bHasBracket ) --pcEnd;
235 :
236 : // skip leading whitespace
237 214 : const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
238 560 : while( pcContentsBeg < pcEnd )
239 : {
240 132 : const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
241 132 : lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
242 132 : if( pcWhitespaceBeg < pcEnd )
243 2 : rBuffer.append( ' ' );
244 132 : pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
245 : }
246 :
247 214 : return bHasBracket;
248 : }
249 :
250 : } // namespace
251 :
252 8 : InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
253 : // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
254 : mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
255 : maOpeningBracket( 1 ),
256 : maClosingBracket( 1 ),
257 : maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
258 : maClosingCData( CREATE_OSTRING( "]]>" ) ),
259 8 : mnBufferPos( 0 )
260 : {
261 8 : if (!mxTextStrm.is())
262 0 : throw IOException();
263 8 : maOpeningBracket[ 0 ] = '<';
264 8 : maClosingBracket[ 0 ] = '>';
265 8 : }
266 :
267 16 : InputStream::~InputStream()
268 : {
269 16 : }
270 :
271 16 : sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
272 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
273 : {
274 16 : if( nBytesToRead < 0 )
275 0 : throw IOException();
276 :
277 16 : rData.realloc( nBytesToRead );
278 16 : sal_Int8* pcDest = rData.getArray();
279 16 : sal_Int32 nRet = 0;
280 246 : while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
281 : {
282 214 : updateBuffer();
283 214 : sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
284 214 : if( nReadSize > 0 )
285 : {
286 214 : memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
287 214 : mnBufferPos += nReadSize;
288 214 : nBytesToRead -= nReadSize;
289 214 : nRet += nReadSize;
290 : }
291 : }
292 16 : if( nRet < rData.getLength() )
293 16 : rData.realloc( nRet );
294 16 : return nRet;
295 : }
296 :
297 16 : sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
298 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
299 : {
300 16 : return readBytes( rData, nMaxBytesToRead );
301 : }
302 :
303 0 : void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
304 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
305 : {
306 0 : if( nBytesToSkip < 0 )
307 0 : throw IOException();
308 :
309 0 : while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
310 : {
311 0 : updateBuffer();
312 0 : sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
313 0 : mnBufferPos += nSkipSize;
314 0 : nBytesToSkip -= nSkipSize;
315 : }
316 0 : }
317 :
318 8 : sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException, std::exception)
319 : {
320 8 : updateBuffer();
321 8 : return maBuffer.getLength() - mnBufferPos;
322 : }
323 :
324 0 : void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException, std::exception)
325 : {
326 0 : mxTextStrm->closeInput();
327 0 : }
328 :
329 : // private --------------------------------------------------------------------
330 :
331 222 : void InputStream::updateBuffer() throw (IOException, RuntimeException)
332 : {
333 658 : while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
334 : {
335 : // collect new contents in a string buffer
336 214 : OStringBuffer aBuffer;
337 :
338 : // read and process characters until the opening bracket of the next XML element
339 428 : OString aChars = readToElementBegin();
340 214 : bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
341 :
342 : // read and process characters until (and including) closing bracket (an XML element)
343 : OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
344 214 : if( bHasOpeningBracket && !mxTextStrm->isEOF() )
345 : {
346 : // read the element text (add the leading opening bracket manually)
347 214 : OString aElement = OString( '<' ) + readToElementEnd();
348 : // check for CDATA part, starting with '<![CDATA['
349 214 : if( aElement.match( maOpeningCData ) )
350 : {
351 : // search the end tag ']]>'
352 0 : while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.endsWith( maClosingCData )) && !mxTextStrm->isEOF() )
353 0 : aElement += readToElementEnd();
354 : // copy the entire CDATA part
355 0 : aBuffer.append( aElement );
356 : }
357 : else
358 : {
359 : // no CDATA part - process the contents of the element
360 214 : lclProcessElement( aBuffer, aElement );
361 214 : }
362 : }
363 :
364 214 : maBuffer = aBuffer.makeStringAndClear();
365 214 : mnBufferPos = 0;
366 214 : }
367 222 : }
368 :
369 214 : OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
370 : {
371 214 : return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
372 : }
373 :
374 214 : OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
375 : {
376 214 : OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
377 : OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
378 214 : return aText;
379 : }
380 :
381 : } // namespace vml
382 : } // namespace oox
383 :
384 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|