Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "oox/vml/vmlinputstream.hxx"
21 :
22 : #include <com/sun/star/io/XTextInputStream2.hpp>
23 : #include <map>
24 : #include <string.h>
25 : #include <rtl/strbuf.hxx>
26 : #include <osl/diagnose.h>
27 : #include "oox/helper/helper.hxx"
28 : #include "oox/helper/textinputstream.hxx"
29 :
30 : namespace oox {
31 : namespace vml {
32 :
33 : using namespace ::com::sun::star::io;
34 : using namespace ::com::sun::star::uno;
35 :
36 : namespace {
37 :
38 36 : inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
39 : {
40 36 : sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
41 36 : return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
42 : }
43 :
/** Returns true if the passed character is a control character, i.e. its
    byte value is in the range 0 to 31 (tab, line feed, carriage return, ...).

    The regular space character (code 32) is not matched by this test. Bytes
    with the high bit set (e.g. the ISO-8859-1 NBSP 0xA0) are not matched
    either.
 */
inline bool lclIsWhiteSpace( sal_Char cChar )
{
    /*  Compare the unsigned byte value. If sal_Char is a signed character
        type, all bytes >= 0x80 are negative and would otherwise pass the
        "less than 32" test and be treated as whitespace. */
    return static_cast< unsigned char >( cChar ) < 32;
}
48 :
49 194 : const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
50 : {
51 3831 : for( ; pcBeg < pcEnd; ++pcBeg )
52 3652 : if( lclIsWhiteSpace( *pcBeg ) )
53 15 : return pcBeg;
54 179 : return pcEnd;
55 : }
56 :
57 364 : const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
58 : {
59 567 : for( ; pcBeg < pcEnd; ++pcBeg )
60 336 : if( !lclIsWhiteSpace( *pcBeg ) )
61 133 : return pcBeg;
62 231 : return pcEnd;
63 : }
64 :
65 17 : const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
66 : {
67 34 : while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
68 0 : --pcEnd;
69 17 : return pcEnd;
70 : }
71 :
72 200 : inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
73 : {
74 200 : rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
75 200 : }
76 :
77 14 : void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
78 : {
79 : /* Map attribute names to char-pointer of all attributes. This map is used
80 : to find multiple occurrences of attributes with the same name. The
81 : mapped pointers are used as map key in the next map below. */
82 : typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
83 14 : AttributeNameMap aAttributeNames;
84 :
85 : /* Map the char-pointers of all attributes to the full attribute definition
86 : string. This preserves the original order of the used attributes. */
87 : typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
88 28 : AttributeDataMap aAttributes;
89 :
90 14 : bool bOk = true;
91 14 : const sal_Char* pcNameBeg = pcBeg;
92 47 : while( bOk && (pcNameBeg < pcEnd) )
93 : {
94 : // pcNameBeg points to begin of attribute name, find equality sign
95 19 : const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
96 19 : if ((bOk = (pcEqualSign < pcEnd)))
97 : {
98 : // find end of attribute name (ignore whitespace between name and equality sign)
99 17 : const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
100 17 : if( (bOk = (pcNameBeg < pcNameEnd)) )
101 : {
102 : // find begin of attribute value (must be single or double quote)
103 17 : const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
104 17 : if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) )
105 : {
106 : // find end of attribute value (matching quote character)
107 17 : const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
108 17 : if( (bOk = (pcValueEnd < pcEnd)) )
109 : {
110 17 : ++pcValueEnd;
111 17 : OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
112 34 : OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
113 : // search for an existing attribute with the same name
114 17 : AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
115 : // remove its definition from the data map
116 17 : if( aIt != aAttributeNames.end() )
117 0 : aAttributes.erase( aIt->second );
118 : // insert the attribute into both maps
119 17 : aAttributeNames[ aAttribName ] = pcNameBeg;
120 17 : aAttributes[ pcNameBeg ] = aAttribData;
121 : // continue with next attribute (skip whitespace after this attribute)
122 17 : pcNameBeg = pcValueEnd;
123 17 : if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg ))) )
124 22 : pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
125 : }
126 : }
127 : }
128 : }
129 : }
130 :
131 : // if no error has occurred, build the resulting attribute list
132 14 : if( bOk )
133 21 : for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
134 13 : rBuffer.append( ' ' ).append( aIt->second );
135 : // on error, just append the complete passed string
136 : else
137 20 : lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
138 14 : }
139 :
140 143 : void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
141 : {
142 : // check that passed string starts and ends with the brackets of an XML element
143 143 : sal_Int32 nElementLen = rElement.getLength();
144 143 : if( nElementLen == 0 )
145 143 : return;
146 :
147 143 : const sal_Char* pcOpen = rElement.getStr();
148 143 : const sal_Char* pcClose = pcOpen + nElementLen - 1;
149 :
150 : // no complete element found
151 143 : if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
152 : {
153 : // just append all passed characters
154 0 : rBuffer.append( rElement );
155 : }
156 :
157 : // skip parser instructions: '<![...]>'
158 143 : else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
159 : {
160 : // do nothing
161 : }
162 :
163 : // replace '<br>' element with newline
164 143 : else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
165 : {
166 0 : rBuffer.append( '\n' );
167 : }
168 :
169 : // check start elements and simple elements for repeated attributes
170 143 : else if( pcOpen[ 1 ] != '/' )
171 : {
172 : // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
173 97 : const sal_Char* pcContentBeg = pcOpen + 1;
174 97 : bool bIsEmptyElement = pcClose[ -1 ] == '/';
175 97 : const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
176 : // append opening bracket and element name to buffer
177 97 : const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
178 97 : lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
179 : // find begin of attributes, and process all attributes
180 97 : const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
181 97 : if( pcAttribBeg < pcContentEnd )
182 14 : lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
183 : // close the element
184 97 : if( bIsEmptyElement )
185 51 : rBuffer.append( '/' );
186 97 : rBuffer.append( '>' );
187 : }
188 :
189 : // append end elements without further processing
190 : else
191 : {
192 46 : rBuffer.append( rElement );
193 : }
194 : }
195 :
196 148 : bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
197 : {
198 : /* MSO has a very weird way to store and handle whitespaces. The stream
199 : may contain lots of spaces, tabs, and newlines which have to be handled
200 : as single space character. This will be done in this function.
201 :
202 : If the element text contains a literal line break, it will be stored as
203 : <br> tag (without matching </br> element). This input stream wrapper
204 : will replace this element with a literal LF character (see below).
205 :
206 : A single space character for its own is stored as is. Example: The
207 : element
208 : <font> </font>
209 : represents a single space character. The XML parser will ignore this
210 : space character completely without issuing a 'characters' event. The
211 : VML import filter implementation has to react on this case manually.
212 :
213 : A single space character following another character is stored
214 : literally and must not be stipped away here. Example: The element
215 : <font>abc </font>
216 : contains the three letters a, b, and c, followed by a space character.
217 :
218 : Consecutive space characters, or a leading single space character, are
219 : stored in a <span> element. If there are N space characters (N > 1),
220 : then the <span> element contains exactly (N-1) NBSP (non-breaking
221 : space) characters, followed by a regular space character. Examples:
222 : The element
223 : <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
224 : represents 4 consecutive space characters. Has to be handled by the
225 : implementation. The element
226 : <font><span style='mso-spacerun:yes'> abc</span></font>
227 : represents a space characters followed by the letters a, b, c. These
228 : strings have to be handled by the VML import filter implementation.
229 : */
230 :
231 : // passed string ends with the leading opening bracket of an XML element
232 148 : const sal_Char* pcBeg = rChars.getStr();
233 148 : const sal_Char* pcEnd = pcBeg + rChars.getLength();
234 148 : bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
235 148 : if( bHasBracket ) --pcEnd;
236 :
237 : // skip leading whitespace
238 148 : const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
239 393 : while( pcContentsBeg < pcEnd )
240 : {
241 97 : const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
242 97 : lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
243 97 : if( pcWhitespaceBeg < pcEnd )
244 1 : rBuffer.append( ' ' );
245 97 : pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
246 : }
247 :
248 148 : return bHasBracket;
249 : }
250 :
251 : } // namespace
252 :
253 5 : InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
254 : // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
255 : mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
256 : maOpeningBracket( 1 ),
257 : maClosingBracket( 1 ),
258 : maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
259 : maClosingCData( CREATE_OSTRING( "]]>" ) ),
260 5 : mnBufferPos( 0 )
261 : {
262 5 : if (!mxTextStrm.is())
263 0 : throw IOException();
264 5 : maOpeningBracket[ 0 ] = '<';
265 5 : maClosingBracket[ 0 ] = '>';
266 5 : }
267 :
268 10 : InputStream::~InputStream()
269 : {
270 10 : }
271 :
272 10 : sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
273 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
274 : {
275 10 : if( nBytesToRead < 0 )
276 0 : throw IOException();
277 :
278 10 : rData.realloc( nBytesToRead );
279 10 : sal_Int8* pcDest = rData.getArray();
280 10 : sal_Int32 nRet = 0;
281 168 : while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
282 : {
283 148 : updateBuffer();
284 148 : sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
285 148 : if( nReadSize > 0 )
286 : {
287 143 : memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
288 143 : mnBufferPos += nReadSize;
289 143 : nBytesToRead -= nReadSize;
290 143 : nRet += nReadSize;
291 : }
292 : }
293 10 : if( nRet < rData.getLength() )
294 10 : rData.realloc( nRet );
295 10 : return nRet;
296 : }
297 :
298 10 : sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
299 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
300 : {
301 10 : return readBytes( rData, nMaxBytesToRead );
302 : }
303 :
304 0 : void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
305 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
306 : {
307 0 : if( nBytesToSkip < 0 )
308 0 : throw IOException();
309 :
310 0 : while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
311 : {
312 0 : updateBuffer();
313 0 : sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
314 0 : mnBufferPos += nSkipSize;
315 0 : nBytesToSkip -= nSkipSize;
316 : }
317 0 : }
318 :
319 5 : sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException, std::exception)
320 : {
321 5 : updateBuffer();
322 5 : return maBuffer.getLength() - mnBufferPos;
323 : }
324 :
325 0 : void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException, std::exception)
326 : {
327 0 : mxTextStrm->closeInput();
328 0 : }
329 :
330 : // private --------------------------------------------------------------------
331 :
332 153 : void InputStream::updateBuffer() throw (IOException, RuntimeException)
333 : {
334 454 : while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
335 : {
336 : // collect new contents in a string buffer
337 148 : OStringBuffer aBuffer;
338 :
339 : // read and process characters until the opening bracket of the next XML element
340 296 : OString aChars = readToElementBegin();
341 148 : bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
342 :
343 : // read and process characters until (and including) closing bracket (an XML element)
344 : OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
345 148 : if( bHasOpeningBracket && !mxTextStrm->isEOF() )
346 : {
347 : // read the element text (add the leading opening bracket manually)
348 143 : OString aElement = OString( '<' ) + readToElementEnd();
349 : // check for CDATA part, starting with '<![CDATA['
350 143 : if( aElement.match( maOpeningCData ) )
351 : {
352 : // search the end tag ']]>'
353 0 : while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.endsWith( maClosingCData )) && !mxTextStrm->isEOF() )
354 0 : aElement += readToElementEnd();
355 : // copy the entire CDATA part
356 0 : aBuffer.append( aElement );
357 : }
358 : else
359 : {
360 : // no CDATA part - process the contents of the element
361 143 : lclProcessElement( aBuffer, aElement );
362 143 : }
363 : }
364 :
365 148 : maBuffer = aBuffer.makeStringAndClear();
366 148 : mnBufferPos = 0;
367 148 : }
368 153 : }
369 :
370 148 : OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
371 : {
372 148 : return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
373 : }
374 :
375 143 : OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
376 : {
377 143 : OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
378 : OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
379 143 : return aText;
380 : }
381 :
382 : } // namespace vml
383 : } // namespace oox
384 :
385 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|