Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include "filterdetect.hxx"
11 :
12 : #include <svtools/htmltokn.h>
13 : #include <tools/urlobj.hxx>
14 : #include <ucbhelper/content.hxx>
15 : #include <unotools/mediadescriptor.hxx>
16 : #include <unotools/ucbstreamhelper.hxx>
17 :
18 : #include <com/sun/star/lang/XMultiServiceFactory.hpp>
19 : #include <com/sun/star/io/XInputStream.hpp>
20 : #include <cppuhelper/supportsservice.hxx>
21 : #include <boost/scoped_ptr.hpp>
22 :
23 : #define WRITER_TEXT_FILTER "Text"
24 : #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)"
25 :
26 : #define WEB_HTML_FILTER "HTML"
27 : #define WRITER_HTML_FILTER "HTML (StarWriter)"
28 : #define CALC_HTML_FILTER "calc_HTML_WebQuery"
29 :
30 : #define WRITER_DOCSERVICE "com.sun.star.text.TextDocument"
31 : #define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument"
32 :
33 : using namespace ::com::sun::star;
34 : using utl::MediaDescriptor;
35 :
36 : namespace {
37 :
38 3 : bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
39 : {
40 3 : boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
41 3 : if ( !pInStream || pInStream->GetError() )
42 : // No stream
43 0 : return false;
44 :
45 : // Read the stream header
46 3 : pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
47 3 : const sal_Size nUniPos = pInStream->Tell();
48 3 : const sal_uInt16 nSize = 4096;
49 :
50 6 : OString sHeader;
51 3 : if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
52 3 : sHeader = read_uInt8s_ToOString( *pInStream, nSize );
53 : else // UTF-16 (nUniPos = 2)
54 0 : sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
55 :
56 : // Now check whether the stream begins with a known HTML tag.
57 : enum DetectPhase { BeforeTag, TagOpened, InTagName };
58 3 : DetectPhase dp = BeforeTag;
59 :
60 3 : const char* pHeader = sHeader.getStr();
61 3 : const int nLength = sHeader.getLength();
62 3 : int i = 0, nStartOfTagIndex = 0;
63 :
64 6 : for ( i = 0; i < nLength; ++i, ++pHeader )
65 : {
66 6 : char c = *pHeader;
67 6 : if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
68 : {
69 0 : if ( dp == TagOpened )
70 0 : return false; // Invalid: Should start with a tag name
71 0 : else if ( dp == InTagName )
72 0 : break; // End of tag name reached
73 : }
74 6 : else if ( c == '<' )
75 : {
76 3 : if ( dp == BeforeTag )
77 3 : dp = TagOpened;
78 : else
79 0 : return false; // Invalid: Nested '<'
80 : }
81 3 : else if ( c == '>' )
82 : {
83 0 : if ( dp == InTagName )
84 0 : break; // End of tag name reached
85 : else
86 0 : return false; // Invalid: Empty tag or before '<'
87 : }
88 3 : else if ( c == '!' )
89 : {
90 3 : if ( dp == TagOpened )
91 3 : return true; // "<!" - DOCTYPE or comments block
92 : else
93 0 : return false; // Invalid: '!' before '<' or inside tag name
94 : }
95 : else
96 : {
97 0 : if ( dp == BeforeTag )
98 0 : return false; // Invalid: Should start with a tag
99 0 : else if ( dp == TagOpened )
100 : {
101 0 : nStartOfTagIndex = i;
102 0 : dp = InTagName;
103 : }
104 : }
105 : }
106 :
107 : // The string following '<' has to be a known HTML token.
108 0 : OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
109 0 : if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 )
110 0 : return true;
111 :
112 3 : return false;
113 : }
114 :
115 : }
116 :
117 3 : PlainTextFilterDetect::PlainTextFilterDetect(const uno::Reference<uno::XComponentContext>& xCxt) :
118 3 : mxCxt(xCxt) {}
119 :
120 6 : PlainTextFilterDetect::~PlainTextFilterDetect() {}
121 :
122 3 : OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception)
123 : {
124 3 : MediaDescriptor aMediaDesc(lDescriptor);
125 :
126 6 : OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() );
127 6 : OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() );
128 6 : OUString aUrl = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() );
129 :
130 : // Get the file name extension.
131 6 : INetURLObject aParser(aUrl);
132 6 : OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET);
133 3 : aExt = aExt.toAsciiLowerCase();
134 :
135 3 : if (aType == "generic_HTML")
136 : {
137 3 : uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
138 3 : if (!xInStream.is() || !IsHTMLStream(xInStream))
139 0 : return OUString();
140 :
141 : // Decide which filter to use based on the document service first,
142 : // then on extension if that's not available.
143 :
144 3 : if (aDocService == CALC_DOCSERVICE)
145 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
146 3 : else if (aDocService == WRITER_DOCSERVICE)
147 2 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER);
148 1 : else if (aExt == "xls")
149 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
150 : else
151 1 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER);
152 : }
153 :
154 0 : else if (aType == "generic_Text")
155 : {
156 0 : if (aDocService == CALC_DOCSERVICE)
157 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
158 0 : else if (aDocService == WRITER_DOCSERVICE)
159 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
160 0 : else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls")
161 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
162 : else
163 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
164 : }
165 :
166 : else
167 : // Nothing to detect.
168 0 : return OUString();
169 :
170 3 : aMediaDesc >> lDescriptor;
171 6 : return aType;
172 : }
173 :
174 : // XInitialization
175 :
176 0 : void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
177 : throw (uno::Exception, uno::RuntimeException, std::exception)
178 : {
179 0 : }
180 :
181 2 : OUString PlainTextFilterDetect_getImplementationName()
182 : {
183 2 : return OUString("com.sun.star.comp.filters.PlainTextFilterDetect");
184 : }
185 :
186 2 : uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
187 : {
188 2 : uno::Sequence<OUString> aRet(2);
189 2 : OUString* pArray = aRet.getArray();
190 2 : pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
191 2 : pArray[1] = "com.sun.star.comp.filters.PlainTextFilterDetect";
192 2 : return aRet;
193 : }
194 :
195 3 : uno::Reference<uno::XInterface> PlainTextFilterDetect_createInstance(
196 : const uno::Reference<uno::XComponentContext> & rCxt)
197 : {
198 3 : return (cppu::OWeakObject*) new PlainTextFilterDetect(rCxt);
199 : }
200 :
201 : // XServiceInfo
202 0 : OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
203 : throw (uno::RuntimeException, std::exception)
204 : {
205 0 : return PlainTextFilterDetect_getImplementationName();
206 : }
207 :
208 0 : sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
209 : throw (uno::RuntimeException, std::exception)
210 : {
211 0 : return cppu::supportsService(this, rServiceName);
212 : }
213 :
214 0 : uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
215 : throw (uno::RuntimeException, std::exception)
216 : {
217 0 : return PlainTextFilterDetect_getSupportedServiceNames();
218 : }
219 :
220 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|