Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "oox/vml/vmlinputstream.hxx"
21 :
22 : #include <com/sun/star/io/XTextInputStream2.hpp>
23 : #include <map>
24 : #include <string.h>
25 : #include <rtl/strbuf.hxx>
26 : #include "oox/helper/helper.hxx"
27 : #include "oox/helper/textinputstream.hxx"
28 :
29 : namespace oox {
30 : namespace vml {
31 :
32 :
33 :
34 : using namespace ::com::sun::star::io;
35 : using namespace ::com::sun::star::uno;
36 :
37 :
38 :
39 : namespace {
40 :
41 0 : inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
42 : {
43 0 : sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
44 0 : return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
45 : }
46 :
47 0 : inline bool lclIsWhiteSpace( sal_Char cChar )
48 : {
49 0 : return cChar < 32;
50 : }
51 :
52 0 : const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
53 : {
54 0 : for( ; pcBeg < pcEnd; ++pcBeg )
55 0 : if( lclIsWhiteSpace( *pcBeg ) )
56 0 : return pcBeg;
57 0 : return pcEnd;
58 : }
59 :
60 0 : const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
61 : {
62 0 : for( ; pcBeg < pcEnd; ++pcBeg )
63 0 : if( !lclIsWhiteSpace( *pcBeg ) )
64 0 : return pcBeg;
65 0 : return pcEnd;
66 : }
67 :
68 0 : const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
69 : {
70 0 : while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
71 0 : --pcEnd;
72 0 : return pcEnd;
73 : }
74 :
75 0 : inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
76 : {
77 0 : rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
78 0 : }
79 :
80 :
81 :
82 0 : void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
83 : {
84 : /* Map attribute names to char-pointer of all attributes. This map is used
85 : to find multiple occurrences of attributes with the same name. The
86 : mapped pointers are used as map key in the next map below. */
87 : typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
88 0 : AttributeNameMap aAttributeNames;
89 :
90 : /* Map the char-pointers of all attributes to the full attribute definition
91 : string. This preserves the original order of the used attributes. */
92 : typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
93 0 : AttributeDataMap aAttributes;
94 :
95 0 : bool bOk = true;
96 0 : const sal_Char* pcNameBeg = pcBeg;
97 0 : while( bOk && (pcNameBeg < pcEnd) )
98 : {
99 : // pcNameBeg points to begin of attribute name, find equality sign
100 0 : const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
101 0 : if ((bOk = (pcEqualSign < pcEnd)) == true)
102 : {
103 : // find end of attribute name (ignore whitespace between name and equality sign)
104 0 : const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
105 0 : if( (bOk = (pcNameBeg < pcNameEnd)) == true )
106 : {
107 : // find begin of attribute value (must be single or double quote)
108 0 : const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
109 0 : if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
110 : {
111 : // find end of attribute value (matching quote character)
112 0 : const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
113 0 : if( (bOk = (pcValueEnd < pcEnd)) == true )
114 : {
115 0 : ++pcValueEnd;
116 0 : OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
117 0 : OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
118 : // search for an existing attribute with the same name
119 0 : AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
120 : // remove its definition from the data map
121 0 : if( aIt != aAttributeNames.end() )
122 0 : aAttributes.erase( aIt->second );
123 : // insert the attribute into both maps
124 0 : aAttributeNames[ aAttribName ] = pcNameBeg;
125 0 : aAttributes[ pcNameBeg ] = aAttribData;
126 : // continue with next attribute (skip whitespace after this attribute)
127 0 : pcNameBeg = pcValueEnd;
128 0 : if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
129 0 : pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
130 : }
131 : }
132 : }
133 : }
134 : }
135 :
136 : // if no error has occurred, build the resulting attribute list
137 0 : if( bOk )
138 0 : for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
139 0 : rBuffer.append( ' ' ).append( aIt->second );
140 : // on error, just append the complete passed string
141 : else
142 0 : lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
143 0 : }
144 :
145 0 : void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
146 : {
147 : // check that passed string starts and ends with the brackets of an XML element
148 0 : sal_Int32 nElementLen = rElement.getLength();
149 0 : if( nElementLen == 0 )
150 0 : return;
151 :
152 0 : const sal_Char* pcOpen = rElement.getStr();
153 0 : const sal_Char* pcClose = pcOpen + nElementLen - 1;
154 :
155 : // no complete element found
156 0 : if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
157 : {
158 : // just append all passed characters
159 0 : rBuffer.append( rElement );
160 : }
161 :
162 : // skip parser instructions: '<![...]>'
163 0 : else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
164 : {
165 : // do nothing
166 : }
167 :
168 : // replace '<br>' element with newline
169 0 : else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
170 : {
171 0 : rBuffer.append( '\n' );
172 : }
173 :
174 : // check start elements and simple elements for repeated attributes
175 0 : else if( pcOpen[ 1 ] != '/' )
176 : {
177 : // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
178 0 : const sal_Char* pcContentBeg = pcOpen + 1;
179 0 : bool bIsEmptyElement = pcClose[ -1 ] == '/';
180 0 : const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
181 : // append opening bracket and element name to buffer
182 0 : const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
183 0 : lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
184 : // find begin of attributes, and process all attributes
185 0 : const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
186 0 : if( pcAttribBeg < pcContentEnd )
187 0 : lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
188 : // close the element
189 0 : if( bIsEmptyElement )
190 0 : rBuffer.append( '/' );
191 0 : rBuffer.append( '>' );
192 : }
193 :
194 : // append end elements without further processing
195 : else
196 : {
197 0 : rBuffer.append( rElement );
198 : }
199 : }
200 :
201 0 : bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
202 : {
203 : /* MSO has a very weird way to store and handle whitespaces. The stream
204 : may contain lots of spaces, tabs, and newlines which have to be handled
205 : as single space character. This will be done in this function.
206 :
207 : If the element text contains a literal line break, it will be stored as
208 : <br> tag (without matching </br> element). This input stream wrapper
209 : will replace this element with a literal LF character (see below).
210 :
211 : A single space character for its own is stored as is. Example: The
212 : element
213 : <font> </font>
214 : represents a single space character. The XML parser will ignore this
215 : space character completely without issuing a 'characters' event. The
216 : VML import filter implementation has to react on this case manually.
217 :
218 : A single space character following another character is stored
219 : literally and must not be stipped away here. Example: The element
220 : <font>abc </font>
221 : contains the three letters a, b, and c, followed by a space character.
222 :
223 : Consecutive space characters, or a leading single space character, are
224 : stored in a <span> element. If there are N space characters (N > 1),
225 : then the <span> element contains exactly (N-1) NBSP (non-breaking
226 : space) characters, followed by a regular space character. Examples:
227 : The element
228 : <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
229 : represents 4 consecutive space characters. Has to be handled by the
230 : implementation. The element
231 : <font><span style='mso-spacerun:yes'> abc</span></font>
232 : represents a space characters followed by the letters a, b, c. These
233 : strings have to be handled by the VML import filter implementation.
234 : */
235 :
236 : // passed string ends with the leading opening bracket of an XML element
237 0 : const sal_Char* pcBeg = rChars.getStr();
238 0 : const sal_Char* pcEnd = pcBeg + rChars.getLength();
239 0 : bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
240 0 : if( bHasBracket ) --pcEnd;
241 :
242 : // skip leading whitespace
243 0 : const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
244 0 : while( pcContentsBeg < pcEnd )
245 : {
246 0 : const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
247 0 : lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
248 0 : if( pcWhitespaceBeg < pcEnd )
249 0 : rBuffer.append( ' ' );
250 0 : pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
251 : }
252 :
253 0 : return bHasBracket;
254 : }
255 :
256 : } // namespace
257 :
258 :
259 :
/** Wraps the passed binary input stream into a text input stream and
    prepares the delimiters and CDATA markers used while reading.

    @param rxContext  Component context passed on to the text input stream
        factory function.
    @param rxInStrm  The input stream to be wrapped.
 */
InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
    // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
    mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
    // delimiter lists with one entry each, filled in the constructor body
    maOpeningBracket( 1 ),
    maClosingBracket( 1 ),
    // markers used by updateBuffer() to detect and copy CDATA sections
    maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
    maClosingCData( CREATE_OSTRING( "]]>" ) ),
    // start with an empty buffer, nothing has been read yet
    mnBufferPos( 0 )
{
    // the delimiters are passed to readString() in readToElementBegin() and readToElementEnd()
    maOpeningBracket[ 0 ] = '<';
    maClosingBracket[ 0 ] = '>';
}
272 :
/** Destructor; performs no explicit cleanup. */
InputStream::~InputStream()
{
}
276 :
277 0 : sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
278 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
279 : {
280 0 : if( nBytesToRead < 0 )
281 0 : throw IOException();
282 :
283 0 : rData.realloc( nBytesToRead );
284 0 : sal_Int8* pcDest = rData.getArray();
285 0 : sal_Int32 nRet = 0;
286 0 : while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
287 : {
288 0 : updateBuffer();
289 0 : sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
290 0 : if( nReadSize > 0 )
291 : {
292 0 : memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
293 0 : mnBufferPos += nReadSize;
294 0 : nBytesToRead -= nReadSize;
295 0 : nRet += nReadSize;
296 : }
297 : }
298 0 : if( nRet < rData.getLength() )
299 0 : rData.realloc( nRet );
300 0 : return nRet;
301 : }
302 :
303 0 : sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
304 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
305 : {
306 0 : return readBytes( rData, nMaxBytesToRead );
307 : }
308 :
309 0 : void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
310 : throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException, std::exception)
311 : {
312 0 : if( nBytesToSkip < 0 )
313 0 : throw IOException();
314 :
315 0 : while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
316 : {
317 0 : updateBuffer();
318 0 : sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
319 0 : mnBufferPos += nSkipSize;
320 0 : nBytesToSkip -= nSkipSize;
321 : }
322 0 : }
323 :
324 0 : sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException, std::exception)
325 : {
326 0 : updateBuffer();
327 0 : return maBuffer.getLength() - mnBufferPos;
328 : }
329 :
/** Closes the input by forwarding the request to the wrapped text input stream. */
void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException, std::exception)
{
    mxTextStrm->closeInput();
}
334 :
335 : // private --------------------------------------------------------------------
336 :
337 0 : void InputStream::updateBuffer() throw (IOException, RuntimeException)
338 : {
339 0 : while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
340 : {
341 : // collect new contents in a string buffer
342 0 : OStringBuffer aBuffer;
343 :
344 : // read and process characters until the opening bracket of the next XML element
345 0 : OString aChars = readToElementBegin();
346 0 : bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
347 :
348 : // read and process characters until (and including) closing bracket (an XML element)
349 : OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
350 0 : if( bHasOpeningBracket && !mxTextStrm->isEOF() )
351 : {
352 : // read the element text (add the leading opening bracket manually)
353 0 : OString aElement = OString( '<' ) + readToElementEnd();
354 : // check for CDATA part, starting with '<![CDATA['
355 0 : if( aElement.match( maOpeningCData ) )
356 : {
357 : // search the end tag ']]>'
358 0 : while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.endsWith( maClosingCData )) && !mxTextStrm->isEOF() )
359 0 : aElement += readToElementEnd();
360 : // copy the entire CDATA part
361 0 : aBuffer.append( aElement );
362 : }
363 : else
364 : {
365 : // no CDATA part - process the contents of the element
366 0 : lclProcessElement( aBuffer, aElement );
367 0 : }
368 : }
369 :
370 0 : maBuffer = aBuffer.makeStringAndClear();
371 0 : mnBufferPos = 0;
372 0 : }
373 0 : }
374 :
375 0 : OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
376 : {
377 0 : return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
378 : }
379 :
380 0 : OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
381 : {
382 0 : OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
383 : OSL_ENSURE( aText.endsWith(">"), "InputStream::readToElementEnd - missing closing bracket of XML element" );
384 0 : return aText;
385 : }
386 :
387 :
388 :
389 : } // namespace vml
} // namespace oox
391 :
392 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|