Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include "filterdetect.hxx"
11 :
12 : #include <svtools/htmltokn.h>
13 : #include <tools/urlobj.hxx>
14 : #include <tools/zcodec.hxx>
15 : #include <ucbhelper/content.hxx>
16 : #include <unotools/mediadescriptor.hxx>
17 : #include <unotools/streamwrap.hxx>
18 : #include <unotools/ucbstreamhelper.hxx>
19 :
20 : #include <com/sun/star/lang/XMultiServiceFactory.hpp>
21 : #include <com/sun/star/io/XInputStream.hpp>
22 : #include <cppuhelper/supportsservice.hxx>
23 : #include <boost/scoped_ptr.hpp>
24 :
25 : #define WRITER_TEXT_FILTER "Text"
26 : #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)"
27 :
28 : #define WEB_HTML_FILTER "HTML"
29 : #define WRITER_HTML_FILTER "HTML (StarWriter)"
30 : #define CALC_HTML_FILTER "calc_HTML_WebQuery"
31 :
32 : #define WRITER_DOCSERVICE "com.sun.star.text.TextDocument"
33 : #define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument"
34 :
35 : using namespace ::com::sun::star;
36 : using utl::MediaDescriptor;
37 :
38 : namespace {
39 :
40 15 : bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
41 : {
42 15 : boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
43 15 : if ( !pInStream || pInStream->GetError() )
44 : // No stream
45 0 : return false;
46 :
47 : // Read the stream header
48 15 : pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
49 15 : const sal_Size nUniPos = pInStream->Tell();
50 15 : const sal_uInt16 nSize = 4096;
51 :
52 30 : OString sHeader;
53 15 : if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
54 15 : sHeader = read_uInt8s_ToOString( *pInStream, nSize );
55 : else // UTF-16 (nUniPos = 2)
56 0 : sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
57 :
58 : // Now check whether the stream begins with a known HTML tag.
59 : enum DetectPhase { BeforeTag, TagOpened, InTagName };
60 15 : DetectPhase dp = BeforeTag;
61 :
62 15 : const char* pHeader = sHeader.getStr();
63 15 : const int nLength = sHeader.getLength();
64 15 : int i = 0, nStartOfTagIndex = 0;
65 :
66 26 : for ( i = 0; i < nLength; ++i, ++pHeader )
67 : {
68 26 : char c = *pHeader;
69 26 : if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
70 : {
71 0 : if ( dp == TagOpened )
72 0 : return false; // Invalid: Should start with a tag name
73 0 : else if ( dp == InTagName )
74 0 : break; // End of tag name reached
75 : }
76 26 : else if ( c == '<' )
77 : {
78 11 : if ( dp == BeforeTag )
79 11 : dp = TagOpened;
80 : else
81 0 : return false; // Invalid: Nested '<'
82 : }
83 15 : else if ( c == '>' )
84 : {
85 0 : if ( dp == InTagName )
86 0 : break; // End of tag name reached
87 : else
88 0 : return false; // Invalid: Empty tag or before '<'
89 : }
90 15 : else if ( c == '!' )
91 : {
92 11 : if ( dp == TagOpened )
93 11 : return true; // "<!" - DOCTYPE or comments block
94 : else
95 0 : return false; // Invalid: '!' before '<' or inside tag name
96 : }
97 : else
98 : {
99 4 : if ( dp == BeforeTag )
100 4 : return false; // Invalid: Should start with a tag
101 0 : else if ( dp == TagOpened )
102 : {
103 0 : nStartOfTagIndex = i;
104 0 : dp = InTagName;
105 : }
106 : }
107 : }
108 :
109 : // The string following '<' has to be a known HTML token.
110 0 : OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
111 0 : if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 )
112 0 : return true;
113 :
114 15 : return false;
115 : }
116 :
117 : }
118 :
119 20 : PlainTextFilterDetect::PlainTextFilterDetect(const uno::Reference<uno::XComponentContext>& xCxt) :
120 20 : mxCxt(xCxt) {}
121 :
122 40 : PlainTextFilterDetect::~PlainTextFilterDetect() {}
123 :
124 19 : OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception)
125 : {
126 19 : MediaDescriptor aMediaDesc(lDescriptor);
127 :
128 38 : OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() );
129 38 : OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() );
130 :
131 19 : if ((aType == "generic_HTML") || (aType == "calc_HTML"))
132 : {
133 15 : uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
134 15 : if (!xInStream.is() || !IsHTMLStream(xInStream))
135 4 : return OUString();
136 :
137 11 : if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
138 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
139 11 : else if (aDocService == WRITER_DOCSERVICE)
140 10 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER);
141 : else
142 1 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER);
143 : }
144 :
145 4 : else if (aType == "generic_Text")
146 : {
147 4 : uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM()], uno::UNO_QUERY);
148 8 : uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
149 4 : if (xStream.is() || xInStream.is())
150 : {
151 4 : ZCodec aCodecGZ;
152 8 : std::unique_ptr<SvStream> pInStream;
153 4 : if (xStream.is())
154 1 : pInStream.reset(utl::UcbStreamHelper::CreateStream(xStream));
155 : else
156 3 : pInStream.reset(utl::UcbStreamHelper::CreateStream(xInStream));
157 8 : std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
158 4 : if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream, false, true))
159 : {
160 0 : uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(*pDecompressedStream));
161 0 : pDecompressedStream.release();
162 0 : aMediaDesc[MediaDescriptor::PROP_STREAM()] <<= xStreamDecompressed;
163 0 : aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()] <<= xStreamDecompressed->getInputStream();
164 0 : OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() );
165 0 : sal_Int32 nIdx = aURL.lastIndexOf(".gz");
166 0 : if (nIdx != -1)
167 0 : aMediaDesc[MediaDescriptor::PROP_URL()] <<= aURL.copy(0, nIdx);
168 4 : }
169 : }
170 : // Get the file name extension.
171 8 : INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ) );
172 8 : OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET);
173 4 : aExt = aExt.toAsciiLowerCase();
174 8 : OUString aName = aParser.getName().toAsciiLowerCase();
175 :
176 : // Decide which filter to use based on the document service first,
177 : // then on extension if that's not available.
178 4 : if (aDocService == CALC_DOCSERVICE)
179 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
180 4 : else if (aDocService == WRITER_DOCSERVICE)
181 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
182 4 : else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
183 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
184 : else
185 8 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
186 : }
187 :
188 : else
189 : // Nothing to detect.
190 0 : return OUString();
191 :
192 15 : aMediaDesc >> lDescriptor;
193 34 : return aType;
194 : }
195 :
196 : // XInitialization
197 :
198 0 : void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
199 : throw (uno::Exception, uno::RuntimeException, std::exception)
200 : {
201 0 : }
202 :
203 1 : OUString PlainTextFilterDetect_getImplementationName()
204 : {
205 1 : return OUString("com.sun.star.comp.filters.PlainTextFilterDetect");
206 : }
207 :
208 1 : uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
209 : {
210 1 : uno::Sequence<OUString> aRet(2);
211 1 : OUString* pArray = aRet.getArray();
212 1 : pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
213 1 : pArray[1] = "com.sun.star.comp.filters.PlainTextFilterDetect";
214 1 : return aRet;
215 : }
216 :
217 : // XServiceInfo
218 1 : OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
219 : throw (uno::RuntimeException, std::exception)
220 : {
221 1 : return PlainTextFilterDetect_getImplementationName();
222 : }
223 :
224 0 : sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
225 : throw (uno::RuntimeException, std::exception)
226 : {
227 0 : return cppu::supportsService(this, rServiceName);
228 : }
229 :
230 1 : uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
231 : throw (uno::RuntimeException, std::exception)
232 : {
233 1 : return PlainTextFilterDetect_getSupportedServiceNames();
234 : }
235 :
236 : extern "C" SAL_DLLPUBLIC_EXPORT ::com::sun::star::uno::XInterface* SAL_CALL
237 20 : com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(::com::sun::star::uno::XComponentContext* component,
238 : ::com::sun::star::uno::Sequence<css::uno::Any> const &)
239 : {
240 20 : return cppu::acquire(new PlainTextFilterDetect(component));
241 : }
242 :
243 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|