Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include "filterdetect.hxx"
11 :
12 : #include <svtools/htmltokn.h>
13 : #include <tools/urlobj.hxx>
14 : #include <ucbhelper/content.hxx>
15 : #include <unotools/mediadescriptor.hxx>
16 : #include <unotools/ucbstreamhelper.hxx>
17 :
18 : #include <com/sun/star/lang/XMultiServiceFactory.hpp>
19 : #include <com/sun/star/io/XInputStream.hpp>
20 : #include <cppuhelper/supportsservice.hxx>
21 : #include <boost/scoped_ptr.hpp>
22 :
23 : #define WRITER_TEXT_FILTER "Text"
24 : #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)"
25 :
26 : #define WEB_HTML_FILTER "HTML"
27 : #define WRITER_HTML_FILTER "HTML (StarWriter)"
28 : #define CALC_HTML_FILTER "calc_HTML_WebQuery"
29 :
30 : #define WRITER_DOCSERVICE "com.sun.star.text.TextDocument"
31 : #define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument"
32 :
33 : using namespace ::com::sun::star;
34 : using utl::MediaDescriptor;
35 :
36 : namespace {
37 :
38 16 : bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
39 : {
40 16 : boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
41 16 : if ( !pInStream || pInStream->GetError() )
42 : // No stream
43 0 : return false;
44 :
45 : // Read the stream header
46 16 : pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
47 16 : const sal_Size nUniPos = pInStream->Tell();
48 16 : const sal_uInt16 nSize = 4096;
49 :
50 32 : OString sHeader;
51 16 : if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
52 16 : sHeader = read_uInt8s_ToOString( *pInStream, nSize );
53 : else // UTF-16 (nUniPos = 2)
54 0 : sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
55 :
56 : // Now check whether the stream begins with a known HTML tag.
57 : enum DetectPhase { BeforeTag, TagOpened, InTagName };
58 16 : DetectPhase dp = BeforeTag;
59 :
60 16 : const char* pHeader = sHeader.getStr();
61 16 : const int nLength = sHeader.getLength();
62 16 : int i = 0, nStartOfTagIndex = 0;
63 :
64 32 : for ( i = 0; i < nLength; ++i, ++pHeader )
65 : {
66 32 : char c = *pHeader;
67 32 : if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
68 : {
69 0 : if ( dp == TagOpened )
70 0 : return false; // Invalid: Should start with a tag name
71 0 : else if ( dp == InTagName )
72 0 : break; // End of tag name reached
73 : }
74 32 : else if ( c == '<' )
75 : {
76 16 : if ( dp == BeforeTag )
77 16 : dp = TagOpened;
78 : else
79 0 : return false; // Invalid: Nested '<'
80 : }
81 16 : else if ( c == '>' )
82 : {
83 0 : if ( dp == InTagName )
84 0 : break; // End of tag name reached
85 : else
86 0 : return false; // Invalid: Empty tag or before '<'
87 : }
88 16 : else if ( c == '!' )
89 : {
90 16 : if ( dp == TagOpened )
91 16 : return true; // "<!" - DOCTYPE or comments block
92 : else
93 0 : return false; // Invalid: '!' before '<' or inside tag name
94 : }
95 : else
96 : {
97 0 : if ( dp == BeforeTag )
98 0 : return false; // Invalid: Should start with a tag
99 0 : else if ( dp == TagOpened )
100 : {
101 0 : nStartOfTagIndex = i;
102 0 : dp = InTagName;
103 : }
104 : }
105 : }
106 :
107 : // The string following '<' has to be a known HTML token.
108 0 : OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
109 0 : if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 )
110 0 : return true;
111 :
112 16 : return false;
113 : }
114 :
115 : }
116 :
117 16 : PlainTextFilterDetect::PlainTextFilterDetect(const uno::Reference<uno::XComponentContext>& xCxt) :
118 16 : mxCxt(xCxt) {}
119 :
120 32 : PlainTextFilterDetect::~PlainTextFilterDetect() {}
121 :
122 16 : OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception)
123 : {
124 16 : MediaDescriptor aMediaDesc(lDescriptor);
125 :
126 32 : OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() );
127 32 : OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() );
128 :
129 16 : if ((aType == "generic_HTML") || (aType == "calc_HTML"))
130 : {
131 16 : uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
132 16 : if (!xInStream.is() || !IsHTMLStream(xInStream))
133 0 : return OUString();
134 :
135 16 : if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
136 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
137 16 : else if (aDocService == WRITER_DOCSERVICE)
138 14 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER);
139 : else
140 2 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER);
141 : }
142 :
143 0 : else if (aType == "generic_Text")
144 : {
145 : // Get the file name extension.
146 0 : INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ) );
147 0 : OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET);
148 0 : aExt = aExt.toAsciiLowerCase();
149 :
150 : // Decide which filter to use based on the document service first,
151 : // then on extension if that's not available.
152 0 : if (aDocService == CALC_DOCSERVICE)
153 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
154 0 : else if (aDocService == WRITER_DOCSERVICE)
155 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
156 0 : else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls")
157 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
158 : else
159 0 : aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
160 : }
161 :
162 : else
163 : // Nothing to detect.
164 0 : return OUString();
165 :
166 16 : aMediaDesc >> lDescriptor;
167 32 : return aType;
168 : }
169 :
170 : // XInitialization
171 :
172 0 : void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
173 : throw (uno::Exception, uno::RuntimeException, std::exception)
174 : {
175 0 : }
176 :
177 6 : OUString PlainTextFilterDetect_getImplementationName()
178 : {
179 6 : return OUString("com.sun.star.comp.filters.PlainTextFilterDetect");
180 : }
181 :
182 6 : uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
183 : {
184 6 : uno::Sequence<OUString> aRet(2);
185 6 : OUString* pArray = aRet.getArray();
186 6 : pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
187 6 : pArray[1] = "com.sun.star.comp.filters.PlainTextFilterDetect";
188 6 : return aRet;
189 : }
190 :
191 16 : uno::Reference<uno::XInterface> PlainTextFilterDetect_createInstance(
192 : const uno::Reference<uno::XComponentContext> & rCxt)
193 : {
194 16 : return (cppu::OWeakObject*) new PlainTextFilterDetect(rCxt);
195 : }
196 :
197 : // XServiceInfo
198 0 : OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
199 : throw (uno::RuntimeException, std::exception)
200 : {
201 0 : return PlainTextFilterDetect_getImplementationName();
202 : }
203 :
204 0 : sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
205 : throw (uno::RuntimeException, std::exception)
206 : {
207 0 : return cppu::supportsService(this, rServiceName);
208 : }
209 :
210 0 : uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
211 : throw (uno::RuntimeException, std::exception)
212 : {
213 0 : return PlainTextFilterDetect_getSupportedServiceNames();
214 : }
215 :
216 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|