LCOV - code coverage report
Current view: top level - filter/source/textfilterdetect - filterdetect.cxx (source / functions) Hit Total Coverage
Test: commit c8344322a7af75b84dd3ca8f78b05543a976dfd5 Lines: 77 111 69.4 %
Date: 2015-06-13 12:38:46 Functions: 10 12 83.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  */
       9             : 
      10             : #include "filterdetect.hxx"
      11             : 
      12             : #include <svtools/htmltokn.h>
      13             : #include <tools/urlobj.hxx>
      14             : #include <tools/zcodec.hxx>
      15             : #include <ucbhelper/content.hxx>
      16             : #include <unotools/mediadescriptor.hxx>
      17             : #include <unotools/streamwrap.hxx>
      18             : #include <unotools/ucbstreamhelper.hxx>
      19             : 
      20             : #include <com/sun/star/lang/XMultiServiceFactory.hpp>
      21             : #include <com/sun/star/io/XInputStream.hpp>
      22             : #include <cppuhelper/supportsservice.hxx>
      23             : #include <boost/scoped_ptr.hpp>
      24             : 
      25             : #define WRITER_TEXT_FILTER "Text"
      26             : #define CALC_TEXT_FILTER   "Text - txt - csv (StarCalc)"
      27             : 
      28             : #define WEB_HTML_FILTER    "HTML"
      29             : #define WRITER_HTML_FILTER "HTML (StarWriter)"
      30             : #define CALC_HTML_FILTER   "calc_HTML_WebQuery"
      31             : 
      32             : #define WRITER_DOCSERVICE  "com.sun.star.text.TextDocument"
      33             : #define CALC_DOCSERVICE    "com.sun.star.sheet.SpreadsheetDocument"
      34             : 
      35             : using namespace ::com::sun::star;
      36             : using utl::MediaDescriptor;
      37             : 
      38             : namespace {
      39             : 
      40          15 : bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
      41             : {
      42          15 :     boost::scoped_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
      43          15 :     if ( !pInStream || pInStream->GetError() )
      44             :         // No stream
      45           0 :         return false;
      46             : 
      47             :     // Read the stream header
      48          15 :     pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
      49          15 :     const sal_Size nUniPos = pInStream->Tell();
      50          15 :     const sal_uInt16 nSize = 4096;
      51             : 
      52          30 :     OString sHeader;
      53          15 :     if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
      54          15 :         sHeader = read_uInt8s_ToOString( *pInStream, nSize );
      55             :     else // UTF-16 (nUniPos = 2)
      56           0 :         sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
      57             : 
      58             :     // Now check whether the stream begins with a known HTML tag.
      59             :     enum DetectPhase { BeforeTag, TagOpened, InTagName };
      60          15 :     DetectPhase dp = BeforeTag;
      61             : 
      62          15 :     const char* pHeader = sHeader.getStr();
      63          15 :     const int   nLength = sHeader.getLength();
      64          15 :     int i = 0, nStartOfTagIndex = 0;
      65             : 
      66          26 :     for ( i = 0; i < nLength; ++i, ++pHeader )
      67             :     {
      68          26 :         char c = *pHeader;
      69          26 :         if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
      70             :         {
      71           0 :             if ( dp == TagOpened )
      72           0 :                 return false; // Invalid: Should start with a tag name
      73           0 :             else if ( dp == InTagName )
      74           0 :                 break; // End of tag name reached
      75             :         }
      76          26 :         else if ( c == '<' )
      77             :         {
      78          11 :             if ( dp == BeforeTag )
      79          11 :                 dp = TagOpened;
      80             :             else
      81           0 :                 return false; // Invalid: Nested '<'
      82             :         }
      83          15 :         else if ( c == '>' )
      84             :         {
      85           0 :             if ( dp == InTagName )
      86           0 :                 break; // End of tag name reached
      87             :             else
      88           0 :                 return false; // Invalid: Empty tag or before '<'
      89             :         }
      90          15 :         else if ( c == '!' )
      91             :         {
      92          11 :             if ( dp == TagOpened )
      93          11 :                 return true; // "<!" - DOCTYPE or comments block
      94             :             else
      95           0 :                 return false; // Invalid: '!' before '<' or inside tag name
      96             :         }
      97             :         else
      98             :         {
      99           4 :             if ( dp == BeforeTag )
     100           4 :                 return false; // Invalid: Should start with a tag
     101           0 :             else if ( dp == TagOpened )
     102             :             {
     103           0 :                 nStartOfTagIndex = i;
     104           0 :                 dp = InTagName;
     105             :             }
     106             :         }
     107             :     }
     108             : 
     109             :     // The string following '<' has to be a known HTML token.
     110           0 :     OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
     111           0 :     if ( GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != 0 )
     112           0 :         return true;
     113             : 
     114          15 :     return false;
     115             : }
     116             : 
     117             : }
     118             : 
     119          20 : PlainTextFilterDetect::PlainTextFilterDetect(const uno::Reference<uno::XComponentContext>& xCxt) :
     120          20 :     mxCxt(xCxt) {}
     121             : 
     122          40 : PlainTextFilterDetect::~PlainTextFilterDetect() {}
     123             : 
     124          19 : OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor) throw (uno::RuntimeException, std::exception)
     125             : {
     126          19 :     MediaDescriptor aMediaDesc(lDescriptor);
     127             : 
     128          38 :     OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() );
     129          38 :     OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() );
     130             : 
     131          19 :     if ((aType == "generic_HTML") || (aType == "calc_HTML"))
     132             :     {
     133          15 :         uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
     134          15 :         if (!xInStream.is() || !IsHTMLStream(xInStream))
     135           4 :             return OUString();
     136             : 
     137          11 :         if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
     138           0 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER);
     139          11 :         else if (aDocService == WRITER_DOCSERVICE)
     140          10 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER);
     141             :         else
     142           1 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER);
     143             :     }
     144             : 
     145           4 :     else if (aType == "generic_Text")
     146             :     {
     147           4 :         uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM()], uno::UNO_QUERY);
     148           8 :         uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY);
     149           4 :         if (xStream.is() || xInStream.is())
     150             :         {
     151           4 :             ZCodec aCodecGZ;
     152           8 :             std::unique_ptr<SvStream> pInStream;
     153           4 :             if (xStream.is())
     154           1 :                 pInStream.reset(utl::UcbStreamHelper::CreateStream(xStream));
     155             :             else
     156           3 :                 pInStream.reset(utl::UcbStreamHelper::CreateStream(xInStream));
     157           8 :             std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
     158           4 :             if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream, false, true))
     159             :             {
     160           0 :                 uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(*pDecompressedStream));
     161           0 :                 pDecompressedStream.release();
     162           0 :                 aMediaDesc[MediaDescriptor::PROP_STREAM()] <<= xStreamDecompressed;
     163           0 :                 aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()] <<= xStreamDecompressed->getInputStream();
     164           0 :                 OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() );
     165           0 :                 sal_Int32 nIdx = aURL.lastIndexOf(".gz");
     166           0 :                 if (nIdx != -1)
     167           0 :                     aMediaDesc[MediaDescriptor::PROP_URL()] <<= aURL.copy(0, nIdx);
     168           4 :             }
     169             :         }
     170             :         // Get the file name extension.
     171           8 :         INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ) );
     172           8 :         OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DECODE_WITH_CHARSET);
     173           4 :         aExt = aExt.toAsciiLowerCase();
     174           8 :         OUString aName = aParser.getName().toAsciiLowerCase();
     175             : 
     176             :         // Decide which filter to use based on the document service first,
     177             :         // then on extension if that's not available.
     178           4 :         if (aDocService == CALC_DOCSERVICE)
     179           0 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
     180           4 :         else if (aDocService == WRITER_DOCSERVICE)
     181           0 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
     182           4 :         else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
     183           0 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER);
     184             :         else
     185           8 :             aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER);
     186             :     }
     187             : 
     188             :     else
     189             :         // Nothing to detect.
     190           0 :         return OUString();
     191             : 
     192          15 :     aMediaDesc >> lDescriptor;
     193          34 :     return aType;
     194             : }
     195             : 
     196             : // XInitialization
     197             : 
     198           0 : void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
     199             :     throw (uno::Exception, uno::RuntimeException, std::exception)
     200             : {
     201           0 : }
     202             : 
     203           1 : OUString PlainTextFilterDetect_getImplementationName()
     204             : {
     205           1 :     return OUString("com.sun.star.comp.filters.PlainTextFilterDetect");
     206             : }
     207             : 
     208           1 : uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
     209             : {
     210           1 :     uno::Sequence<OUString> aRet(2);
     211           1 :     OUString* pArray = aRet.getArray();
     212           1 :     pArray[0] = "com.sun.star.document.ExtendedTypeDetection";
     213           1 :     pArray[1] = "com.sun.star.comp.filters.PlainTextFilterDetect";
     214           1 :     return aRet;
     215             : }
     216             : 
     217             : // XServiceInfo
     218           1 : OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
     219             :     throw (uno::RuntimeException, std::exception)
     220             : {
     221           1 :     return PlainTextFilterDetect_getImplementationName();
     222             : }
     223             : 
     224           0 : sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
     225             :     throw (uno::RuntimeException, std::exception)
     226             : {
     227           0 :     return cppu::supportsService(this, rServiceName);
     228             : }
     229             : 
     230           1 : uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
     231             :     throw (uno::RuntimeException, std::exception)
     232             : {
     233           1 :     return PlainTextFilterDetect_getSupportedServiceNames();
     234             : }
     235             : 
     236             : extern "C" SAL_DLLPUBLIC_EXPORT ::com::sun::star::uno::XInterface* SAL_CALL
     237          20 : com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(::com::sun::star::uno::XComponentContext* component,
     238             :                                                                            ::com::sun::star::uno::Sequence<css::uno::Any> const &)
     239             : {
     240          20 :     return cppu::acquire(new PlainTextFilterDetect(component));
     241             : }
     242             : 
     243             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.11