LCOV - commit 0e63ca4fde4e446f346e35849c756a30ca294aab

LCOV - code coverage report

Current view:	top level - svtools/source/svhtml - parhtml.cxx (source / functions)		Hit	Total	Coverage
Test:	commit 0e63ca4fde4e446f346e35849c756a30ca294aab	Lines:	459	1022	44.9 %
Date:	2014-04-11	Functions:	21	32	65.6 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : 
      21             : #include <ctype.h>
      22             : #include <stdio.h>
      23             : #include <comphelper/string.hxx>
      24             : #include <tools/stream.hxx>
      25             : #include <tools/debug.hxx>
      26             : #include <tools/color.hxx>
      27             : #include <rtl/ustrbuf.hxx>
      28             : #include <rtl/strbuf.hxx>
      29             : 
      30             : #include <tools/tenccvt.hxx>
      31             : #include <tools/datetime.hxx>
      32             : #include <svl/inettype.hxx>
      33             : #include <com/sun/star/beans/PropertyAttribute.hpp>
      34             : #include <com/sun/star/document/XDocumentProperties.hpp>
      35             : 
      36             : #include <svtools/parhtml.hxx>
      37             : #include <svtools/htmltokn.h>
      38             : #include <svtools/htmlkywd.hxx>
      39             : 
      40             : #include <memory>
      41             : 
      42             : using namespace ::com::sun::star;
      43             : 
      44             : 
      45             : const sal_Int32 MAX_LEN( 1024L );        // size argument for the text buffer constructed in ScanText()
      46             : 
      47             : const sal_Int32 MAX_ENTITY_LEN( 8L );    // ScanText() collects at most this many characters of a named entity after '&'
      48             : 
      49             : 
      50             : // Tables to convert option values into strings
      51             : 
      52             : // <INPUT TYPE=xxx>: keyword -> HTML_IT_* value, used by HTMLOption::GetInputType()
      53             : static HTMLOptionEnum const aInputTypeOptEnums[] =
      54             : {
      55             :     { OOO_STRING_SVTOOLS_HTML_IT_text,      HTML_IT_TEXT        },
      56             :     { OOO_STRING_SVTOOLS_HTML_IT_password,  HTML_IT_PASSWORD    },
      57             :     { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTML_IT_CHECKBOX    },
      58             :     { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTML_IT_RADIO       },
      59             :     { OOO_STRING_SVTOOLS_HTML_IT_range,     HTML_IT_RANGE       },
      60             :     { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTML_IT_SCRIBBLE    },
      61             :     { OOO_STRING_SVTOOLS_HTML_IT_file,      HTML_IT_FILE        },
      62             :     { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTML_IT_HIDDEN      },
      63             :     { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTML_IT_SUBMIT      },
      64             :     { OOO_STRING_SVTOOLS_HTML_IT_image,     HTML_IT_IMAGE       },
      65             :     { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTML_IT_RESET       },
      66             :     { OOO_STRING_SVTOOLS_HTML_IT_button,    HTML_IT_BUTTON      },
      67             :     { 0,                    0                   }   // terminator: GetEnum() stops at the entry whose name is 0
      68             : };
      69             : 
      70             : // <TABLE FRAME=xxx>: keyword -> HTML_TF_* value, used by HTMLOption::GetTableFrame()
      71             : static HTMLOptionEnum const aTableFrameOptEnums[] =
      72             : {
      73             :     { OOO_STRING_SVTOOLS_HTML_TF_void,  HTML_TF_VOID    },
      74             :     { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE   },
      75             :     { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW   },
      76             :     { OOO_STRING_SVTOOLS_HTML_TF_hsides,    HTML_TF_HSIDES  },
      77             :     { OOO_STRING_SVTOOLS_HTML_TF_lhs,       HTML_TF_LHS     },
      78             :     { OOO_STRING_SVTOOLS_HTML_TF_rhs,       HTML_TF_RHS     },
      79             :     { OOO_STRING_SVTOOLS_HTML_TF_vsides,    HTML_TF_VSIDES  },
      80             :     { OOO_STRING_SVTOOLS_HTML_TF_box,       HTML_TF_BOX     },
      81             :     { OOO_STRING_SVTOOLS_HTML_TF_border,    HTML_TF_BOX     },   // the "border" keyword maps to the same value as "box"
      82             :     { 0,                0               }   // terminator: GetEnum() stops at the entry whose name is 0
      83             : };
      84             : 
      85             : // <TABLE RULES=xxx>: keyword -> HTML_TR_* value, used by HTMLOption::GetTableRules()
      86             : static HTMLOptionEnum const aTableRulesOptEnums[] =
      87             : {
      88             :     { OOO_STRING_SVTOOLS_HTML_TR_none,  HTML_TR_NONE    },
      89             :     { OOO_STRING_SVTOOLS_HTML_TR_groups,    HTML_TR_GROUPS  },
      90             :     { OOO_STRING_SVTOOLS_HTML_TR_rows,  HTML_TR_ROWS    },
      91             :     { OOO_STRING_SVTOOLS_HTML_TR_cols,  HTML_TR_COLS    },
      92             :     { OOO_STRING_SVTOOLS_HTML_TR_all,       HTML_TR_ALL     },
      93             :     { 0,                0               }   // terminator: GetEnum() stops at the entry whose name is 0
      94             : };
      95             : 
      96          58 : sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
      97             : {
                           // Translates this option's value (aValue) into the numeric value of
                           // the table entry whose name equals it, ignoring ASCII case.
                           //
                           // pOptEnums  table to search; it must end with an entry whose pName
                           //            is 0, because that is the only thing that stops the scan
                           // nDflt      value returned when no entry matches
      98          58 :     sal_uInt16 nValue = nDflt;
      99             : 
                           // Advance until a name matches or the terminating entry is reached.
                           // The whole if/else is the (unbraced) body of the while loop.
     100         140 :     while( pOptEnums->pName )
     101          82 :         if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
     102          58 :             break;
     103             :         else
     104          24 :             pOptEnums++;
     105             : 
                           // A non-null pName here means the loop was left via break, i.e. a match.
     106          58 :     if( pOptEnums->pName )
     107          58 :         nValue = pOptEnums->nValue;
     108             : 
     109          58 :     return nValue;
     110             : }
     111             : 
     112          20 : bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
     113             : {
                           // Variant of GetEnum() that reports whether a match was found instead
                           // of falling back to a default.
                           //
                           // rEnum      receives the value of the matching entry; it is left
                           //            untouched when nothing matches
                           // pOptEnums  table to search, terminated by an entry with pName == 0
                           // returns    true if aValue equals (ignoring ASCII case) one of the
                           //            names in the table, false otherwise
     114         168 :     while( pOptEnums->pName )
     115             :     {
     116         142 :         if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
     117          14 :             break;
     118             :         else
     119         128 :             pOptEnums++;
     120             :     }
     121             : 
                           // pName is non-null only if the loop stopped on a matching entry.
     122          20 :     const sal_Char *pName = pOptEnums->pName;
     123          20 :     if( pName )
     124          14 :         rEnum = pOptEnums->nValue;
     125             : 
     126          20 :     return (pName != 0);
     127             : }
     128             : 
     129        1490 : HTMLOption::HTMLOption( sal_uInt16 nTok, const OUString& rToken,
     130             :                         const OUString& rValue )
     131             :     : aValue(rValue)
     132             :     , aToken(rToken)
     133        1490 :     , nToken( nTok )
     134             : {
                           // Stores the option's token id (nTok), its token string (rToken) and
                           // its value string (rValue). The check below is a debug assertion that
                           // the id lies in [HTML_OPTION_START, HTML_OPTION_END).
     135             :     DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
     136             :         "HTMLOption: unknown token" );
     137        1490 : }
     138             : 
     139         276 : sal_uInt32 HTMLOption::GetNumber() const
     140             : {
                           // Returns the option value as an unsigned number. The value is parsed
                           // with toInt32() after stripStart(aValue, ' '); a negative result is
                           // reported as 0.
                           // The assertion documents which options are expected here: the
                           // NUMBER and CONTEXT token ranges plus HTML_O_VALUE.
     141             :     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
     142             :                  nToken<HTML_OPTION_NUMBER_END) ||
     143             :                 (nToken>=HTML_OPTION_CONTEXT_START &&
     144             :                  nToken<HTML_OPTION_CONTEXT_END) ||
     145             :                 nToken==HTML_O_VALUE,
     146             :         "GetNumber: Option not numerical" );
     147         276 :     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
     148         276 :     sal_Int32 nTmp = aTmp.toInt32();
     149         276 :     return nTmp >= 0 ? (sal_uInt32)nTmp : 0;   // clamp negative values to 0
     150             : }
     151             : 
     152           0 : sal_Int32 HTMLOption::GetSNumber() const
     153             : {
                           // Signed counterpart of GetNumber(): returns the toInt32() result
                           // unchanged, so negative values are preserved.
                           // Unlike GetNumber(), the assertion does not accept HTML_O_VALUE.
     154             :     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
     155             :                 (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
     156             :         "GetSNumber: Option not numerical" );
     157           0 :     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
     158           0 :     return aTmp.toInt32();
     159             : }
     160             : 
     161           0 : void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers, bool bSpaceDelim ) const
     162             : {
                           // Extracts a list of non-negative numbers from the option value.
                           //
                           // rNumbers     output; always cleared first, then filled in order
                           // bSpaceDelim  true:  every maximal run of ASCII digits becomes one
                           //                     number and every other character (including a
                           //                     sign) only acts as a separator
                           //              false: the value is split at ','; each field is parsed
                           //                     with toInt32() and negative results become 0
     163           0 :     rNumbers.clear();
     164             : 
     165           0 :     if( bSpaceDelim )
     166             :     {
     167             :         // This is a very simplified scanner: it only searches all
     168             :         // numerals in the string.
     169           0 :         bool bInNum = false;
     170           0 :         sal_uLong nNum = 0;
                               // NOTE(review): nNum is accumulated without an overflow check and is
                               // converted to sal_uInt32 by push_back - confirm that very long digit
                               // runs are not a concern for callers.
     171           0 :         for( sal_Int32 i=0; i<aValue.getLength(); i++ )
     172             :         {
     173           0 :             sal_Unicode c = aValue[ i ];
     174           0 :             if( c>='0' && c<='9' )
     175             :             {
     176           0 :                 nNum *= 10;
     177           0 :                 nNum += (c - '0');
     178           0 :                 bInNum = true;
     179             :             }
     180           0 :             else if( bInNum )
     181             :             {
                                   // first non-digit after a digit run: emit the number
     182           0 :                 rNumbers.push_back( nNum );
     183           0 :                 bInNum = false;
     184           0 :                 nNum = 0;
     185             :             }
     186             :         }
                               // the value ended while still inside a digit run
     187           0 :         if( bInNum )
     188             :         {
     189           0 :             rNumbers.push_back( nNum );
     190             :         }
     191             :     }
     192             :     else
     193             :     {
     194             :         // Check whether numbers are separated by ',' and
     195             :         // insert 0 if necessary
     196           0 :         sal_Int32 nPos = 0;
     197           0 :         while( nPos < aValue.getLength() )
     198             :         {
     199             :             sal_Unicode c;
                                   // skip blanks, tabs and line breaks in front of the field
     200           0 :             while( nPos < aValue.getLength() &&
     201           0 :                    ((c=aValue[nPos]) == ' ' || c == '\t' ||
     202           0 :                    c == '\n' || c== '\r' ) )
     203           0 :                 nPos++;
     204             : 
                                   // only whitespace was left: record a 0 for it
     205           0 :             if( nPos==aValue.getLength() )
     206           0 :                 rNumbers.push_back(0);
     207             :             else
     208             :             {
     209           0 :                 sal_Int32 nEnd = aValue.indexOf( (sal_Unicode)',', nPos );
     210           0 :                 if( -1 == nEnd )
     211             :                 {
                                           // last field: parse the rest of the value and stop
     212           0 :                     sal_Int32 nTmp = aValue.copy(nPos).toInt32();
     213           0 :                     rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
     214           0 :                     nPos = aValue.getLength();
     215             :                 }
     216             :                 else
     217             :                 {
                                           // parse up to the comma and continue behind it
     218           0 :                     sal_Int32 nTmp = aValue.copy(nPos,nEnd-nPos).toInt32();
     219           0 :                     rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
     220           0 :                     nPos = nEnd+1;
     221             :                 }
     222             :             }
     223             :         }
     224             :     }
     225           0 : }
     226             : 
     227         267 : void HTMLOption::GetColor( Color& rColor ) const
     228             : {
                           // Converts the option value into an RGB colour.
                           //
                           // rColor  receives the result; only its red, green and blue
                           //         components are set here.
                           //
                           // A value that does not start with '#' is first passed to
                           // GetHTMLColor(). SAL_MAX_UINT32 serves as the "not resolved yet"
                           // marker: while nColor still has that value, the text is decoded as up
                           // to six hexadecimal digits.
     229             :     DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
     230             :         "GetColor: Option is not a color." );
     231             : 
     232         267 :     OUString aTmp(aValue.toAsciiLowerCase());
     233         267 :     sal_uInt32 nColor = SAL_MAX_UINT32;
                           // Check for an empty value before looking at the first character:
                           // indexing position 0 of an empty string is out of range. An empty
                           // value simply falls through to the hex decoder, which yields 0.
     234         267 :     if( !aTmp.isEmpty() && '#' != aTmp[0] )
     235           0 :         nColor = GetHTMLColor( aTmp );
     236             : 
     237         267 :     if( SAL_MAX_UINT32 == nColor )
     238             :     {
     239         267 :         nColor = 0;
     240         267 :         sal_Int32 nPos = 0;
                               // Six hex digits are assembled; once the text is exhausted the
                               // missing digits are read as '0'.
     241        1869 :         for (sal_uInt32 i=0; i<6; ++i)
     242             :         {
     243             :             // Whatever Netscape does to get color values,
     244             :             // at maximum three characters < '0' are ignored.
                                   // In this code a character below '0' (such as the leading '#') is
                                   // replaced by the following character, at most twice per digit.
     245        1602 :             sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
     246        1602 :             if( c < '0' )
     247             :             {
     248         267 :                 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
     249         267 :                 if( c < '0' )
     250           0 :                     c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
     251             :             }
                                   // Characters that are neither 0-9 nor a-f contribute a 0 digit.
     252        1602 :             nColor *= 16;
     253        1602 :             if( c >= '0' && c <= '9' )
     254          80 :                 nColor += (c - '0');
     255        1522 :             else if( c >= 'a' && c <= 'f' )
     256        1522 :                 nColor += (c + 0xa - 'a');
     257             :         }
     258             :     }
     259             : 
                           // nColor is laid out as 0x00RRGGBB
     260         267 :     rColor.SetRed(   (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
     261         267 :     rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
     262         267 :     rColor.SetBlue(  (sal_uInt8)(nColor & 0x000000ff) );
     263         267 : }
     264             : 
     265           4 : HTMLInputType HTMLOption::GetInputType() const
     266             : {
                           // Maps the value of a TYPE option through aInputTypeOptEnums;
                           // a value that is not in the table yields HTML_IT_TEXT.
     267             :     DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option not TYPE" );
     268           4 :     return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
     269             : }
     270             : 
     271           0 : HTMLTableFrame HTMLOption::GetTableFrame() const
     272             : {
                           // Maps the value of a FRAME option through aTableFrameOptEnums;
                           // a value that is not in the table yields HTML_TF_VOID.
     273             :     DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option not FRAME" );
     274           0 :     return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
     275             : }
     276             : 
     277           0 : HTMLTableRules HTMLOption::GetTableRules() const
     278             : {
                           // Maps the value of a RULES option through aTableRulesOptEnums;
                           // a value that is not in the table yields HTML_TR_NONE.
     279             :     DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option not RULES" );
     280           0 :     return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
     281             : }
     282             : 
     283           7 : HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
     284             :     SvParser( rIn ),
     285             :     bNewDoc(bReadNewDoc),
     286             :     bIsInHeader(true),
     287             :     bIsInBody(false),
     288             :     bReadListing(false),
     289             :     bReadXMP(false),
     290             :     bReadPRE(false),
     291             :     bReadTextArea(false),
     292             :     bReadScript(false),
     293             :     bReadStyle(false),
     294             :     bEndTokenFound(false),
     295             :     bPre_IgnoreNewPara(false),
     296             :     bReadNextChar(false),
     297             :     bReadComment(false),
     298             :     nPre_LinePos(0),
     299           7 :     mnPendingOffToken(0)
     300             : {
                           // Creates a parser reading from rIn. bReadNewDoc is stored in bNewDoc.
                           // The initial state is "in header, not in body", with every
                           // special reading mode (listing, XMP, PRE, textarea, script, style,
                           // comment) switched off.
     301             :     //#i76649, default to UTF-8 for HTML unless we know differently
     302           7 :     SetSrcEncoding(RTL_TEXTENCODING_UTF8);
     303           7 : }
     304             : 
     305           7 : HTMLParser::~HTMLParser()
     306             : {
                           // nothing to do here
     307           7 : }
     308             : 
     309           7 : SvParserState HTMLParser::CallParser()
     310             : {
                           // Starts parsing: sets the state to SVPAR_WORKING, reads the first
                           // character, resets the PRE bookkeeping and runs the token loop.
                           // Returns the parser state after Continue() has come back.
     311           7 :     eState = SVPAR_WORKING;
     312           7 :     nNextCh = GetNextChar();
     313           7 :     SaveState( 0 );
     314             : 
     315           7 :     nPre_LinePos = 0;
     316           7 :     bPre_IgnoreNewPara = false;
     317             : 
                           // Hold a reference while the token loop runs. It is given back right
                           // away unless the state is SVPAR_PENDING, in which case it is kept
                           // (presumably until parsing is resumed later - confirm with callers).
     318           7 :     AddRef();
     319           7 :     Continue( 0 );
     320           7 :     if( SVPAR_PENDING != eState )
     321           7 :         ReleaseRef();       // Parser not needed anymore
     322             : 
     323           7 :     return eState;
     324             : }
     325             : 
     326           7 : void HTMLParser::Continue( int nToken )
     327             : {
                           // Token loop of the parser.
                           //
                           // nToken  token to start with; 0 means "fetch the next token first"
                           //
                           // Runs for as long as IsParserWorking() holds. Each token goes through
                           // FilterToken() and is handed to NextToken() only if the filter did
                           // not turn it into 0.
     328           7 :     if( !nToken )
     329           7 :         nToken = GetNextToken();
     330             : 
     331         288 :     while( IsParserWorking() )
     332             :     {
     333         274 :         SaveState( nToken );
     334         274 :         nToken = FilterToken( nToken );
     335             : 
     336         274 :         if( nToken )
     337         268 :             NextToken( nToken );
     338             : 
                               // NextToken() may have changed the parser state, so check again
     339         274 :         if( IsParserWorking() )
     340         274 :             SaveState( 0 );         // continue with new token
     341             : 
     342         274 :         nToken = GetNextToken();
     343             :     }
     344           7 : }
     345             : 
     346        2881 : int HTMLParser::FilterToken( int nToken )
     347             : {
                           // Keeps the document-structure flags up to date for the token that
                           // was just read and decides whether that token is passed on.
                           //
                           // nToken   token to inspect
                           // returns  the token to process further, or 0 if it must not be
                           //          passed on (Continue() skips NextToken() for 0)
     348        2881 :     switch( nToken )
     349             :     {
                           // NOTE(review): presumably the value the character reader delivers at
                           // the end of the input - confirm against GetNextToken().
     350             :     case sal_Unicode(EOF):
     351           0 :         nToken = 0;
     352           0 :         break;          // don't pass
     353             : 
                           // the end of the header is treated as the start of the body
     354             :     case HTML_HEAD_OFF:
     355           4 :         bIsInBody = true;
     356           4 :         bIsInHeader = false;
     357           4 :         break;
     358             : 
     359             :     case HTML_HEAD_ON:
     360           4 :         bIsInHeader = true;
     361           4 :         break;
     362             : 
     363             :     case HTML_BODY_ON:
     364           6 :         bIsInHeader = false;
     365           6 :         bIsInBody = true;
     366           6 :         break;
     367             : 
                           // a frameset is neither header nor body
     368             :     case HTML_FRAMESET_ON:
     369           0 :         bIsInHeader = false;
     370           0 :         bIsInBody = false;
     371           0 :         break;
     372             : 
                           // leaving the body also ends any open PRE/LISTING/XMP mode
     373             :     case HTML_BODY_OFF:
     374           6 :         bIsInBody = bReadPRE = bReadListing = bReadXMP = false;
     375           6 :         break;
     376             : 
     377             :     case HTML_HTML_OFF:
     378           6 :         nToken = 0;
     379           6 :         bReadPRE = bReadListing = bReadXMP = false;
     380           6 :         break;      // HTML_ON hasn't been passed either !
     381             : 
                           // start and end tokens of the preformatted modes call the matching
                           // Start*/Finish* helper; the token itself is returned unchanged
     382             :     case HTML_PREFORMTXT_ON:
     383           0 :         StartPRE();
     384           0 :         break;
     385             : 
     386             :     case HTML_PREFORMTXT_OFF:
     387           0 :         FinishPRE();
     388           0 :         break;
     389             : 
     390             :     case HTML_LISTING_ON:
     391           0 :         StartListing();
     392           0 :         break;
     393             : 
     394             :     case HTML_LISTING_OFF:
     395           0 :         FinishListing();
     396           0 :         break;
     397             : 
     398             :     case HTML_XMP_ON:
     399           0 :         StartXMP();
     400           0 :         break;
     401             : 
     402             :     case HTML_XMP_OFF:
     403           0 :         FinishXMP();
     404           0 :         break;
     405             : 
                           // any other token is run through the filter of the active mode;
                           // PRE is checked first, then LISTING, then XMP
     406             :     default:
     407        2855 :         if( bReadPRE )
     408           0 :             nToken = FilterPRE( nToken );
     409        2855 :         else if( bReadListing )
     410           0 :             nToken = FilterListing( nToken );
     411        2855 :         else if( bReadXMP )
     412           0 :             nToken = FilterXMP( nToken );
     413             : 
     414        2855 :         break;
     415             :     }
     416             : 
     417        2881 :     return nToken;
     418             : }
     419             : 
     420             : #define HTML_ISDIGIT( c ) comphelper::string::isdigitAscii(c) /* character-class helpers for the scanner code that follows */
     421             : #define HTML_ISALPHA( c ) comphelper::string::isalphaAscii(c)
     422             : #define HTML_ISALNUM( c ) comphelper::string::isalnumAscii(c)
                       // HTML_ISSPACE: true for ' ' and for the code points 0x09 to 0x0d.
                       // HTML_ISPRINTABLE: true for code points >= 32 other than 127.
                       // NOTE(review): both expand their argument unparenthesised and more than
                       // once, so only simple, side-effect free expressions may be passed.
     423             : #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) )
     424             : #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127)
     425             : #define HTML_ISHEXDIGIT( c ) comphelper::string::isxdigitAscii(c)
     426             : 
     427        2188 : int HTMLParser::ScanText( const sal_Unicode cBreak )
     428             : {
     429        2188 :     OUStringBuffer sTmpBuffer( MAX_LEN );
     430        2188 :     bool bContinue = true;
     431        2188 :     bool bEqSignFound = false;
     432        2188 :     sal_Unicode cQuote = 0U;
     433             : 
     434       28616 :     while( bContinue && IsParserWorking() )
     435             :     {
     436       24243 :         bool bNextCh = true;
     437       24243 :         switch( nNextCh )
     438             :         {
     439             :         case '&':
     440           0 :             bEqSignFound = false;
     441           0 :             if( bReadXMP )
     442           0 :                 sTmpBuffer.append( '&' );
     443             :             else
     444             :             {
     445           0 :                 sal_uLong nStreamPos = rInput.Tell();
     446           0 :                 sal_uLong nLinePos = GetLinePos();
     447             : 
     448           0 :                 sal_Unicode cChar = 0U;
     449           0 :                 if( '#' == (nNextCh = GetNextChar()) )
     450             :                 {
     451           0 :                     nNextCh = GetNextChar();
     452           0 :                     const bool bIsHex( 'x' == nNextCh );
     453           0 :                     const bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
     454           0 :                     if ( bIsDecOrHex )
     455             :                     {
     456           0 :                         if ( bIsHex )
     457             :                         {
     458           0 :                             nNextCh = GetNextChar();
     459           0 :                             while ( HTML_ISHEXDIGIT(nNextCh) )
     460             :                             {
     461             :                                 cChar = cChar * 16U +
     462           0 :                                         ( nNextCh <= '9'
     463             :                                           ? sal_Unicode( nNextCh - '0' )
     464           0 :                                           : ( nNextCh <= 'F'
     465             :                                               ? sal_Unicode( nNextCh - 'A' + 10 )
     466           0 :                                               : sal_Unicode( nNextCh - 'a' + 10 ) ) );
     467           0 :                                 nNextCh = GetNextChar();
     468             :                             }
     469             :                         }
     470             :                         else
     471             :                         {
     472           0 :                             do
     473             :                             {
     474           0 :                                 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
     475           0 :                                 nNextCh = GetNextChar();
     476             :                             }
     477           0 :                             while( HTML_ISDIGIT(nNextCh) );
     478             :                         }
     479             : 
     480           0 :                         if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
     481           0 :                             RTL_TEXTENCODING_UCS2 != eSrcEnc &&
     482           0 :                             RTL_TEXTENCODING_UTF8 != eSrcEnc &&
     483             :                             cChar < 256 )
     484             :                         {
     485             :                             const sal_uInt32 convertFlags =
     486             :                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
     487             :                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
     488           0 :                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
     489             : 
     490           0 :                             sal_Char cEncodedChar = static_cast<sal_Char>(cChar);
     491           0 :                             cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
     492           0 :                             if( 0U == cChar )
     493             :                             {
     494             :                                 // If the character could not be
     495             :                                 // converted, because a conversion is not
     496             :                                 // available, do no conversion at all.
     497           0 :                                 cChar = cEncodedChar;
     498             :                             }
     499             :                         }
     500             :                     }
     501             :                     else
     502           0 :                         nNextCh = 0U;
     503             :                 }
     504           0 :                 else if( HTML_ISALPHA( nNextCh ) )
     505             :                 {
     506           0 :                     OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
     507           0 :                     sal_Int32 nPos = 0L;
     508           0 :                     do
     509             :                     {
     510           0 :                         sEntityBuffer.append( nNextCh );
     511           0 :                         nPos++;
     512           0 :                         nNextCh = GetNextChar();
     513             :                     }
     514           0 :                     while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
     515           0 :                            !rInput.IsEof() );
     516             : 
     517           0 :                     if( IsParserWorking() && !rInput.IsEof() )
     518             :                     {
     519           0 :                         OUString sEntity(sEntityBuffer.getStr(), nPos);
     520           0 :                         cChar = GetHTMLCharName( sEntity );
     521             : 
     522             :                         // not found ( == 0 ): plain text
     523             :                         // or a character which is inserted as attribute
     524           0 :                         if( 0U == cChar && ';' != nNextCh )
     525             :                         {
     526             :                             DBG_ASSERT( rInput.Tell() - nStreamPos ==
     527             :                                         (sal_uLong)(nPos+1L)*GetCharSize(),
     528             :                                         "UTF-8 is failing here" );
     529           0 :                             for( sal_Int32 i = nPos-1; i>1; i-- )
     530             :                             {
     531           0 :                                 nNextCh = sEntityBuffer[i];
     532           0 :                                 sEntityBuffer.setLength( i );
     533           0 :                                 sEntity = OUString(sEntityBuffer.getStr(), i);
     534           0 :                                 cChar = GetHTMLCharName( sEntity );
     535           0 :                                 if( cChar )
     536             :                                 {
     537             :                                     rInput.SeekRel( -(long)
     538           0 :                                             ((nPos-i)*GetCharSize()) );
     539           0 :                                     nlLinePos -= sal_uInt32(nPos-i);
     540           0 :                                     nPos = i;
     541           0 :                                     ClearTxtConvContext();
     542           0 :                                     break;
     543             :                                 }
     544             :                             }
     545             :                         }
     546             : 
     547           0 :                         if( !cChar )        // unknown character?
     548             :                         {
     549             :                             // back in stream, insert '&'
     550             :                             // and restart with next character
     551           0 :                             sTmpBuffer.append( '&' );
     552             : 
     553             :                             DBG_ASSERT( rInput.Tell()-nStreamPos ==
     554             :                                         (sal_uLong)(nPos+1)*GetCharSize(),
     555             :                                         "Wrong stream position" );
     556             :                             DBG_ASSERT( nlLinePos-nLinePos ==
     557             :                                         (sal_uLong)(nPos+1),
     558             :                                         "Wrong line position" );
     559           0 :                             rInput.Seek( nStreamPos );
     560           0 :                             nlLinePos = nLinePos;
     561           0 :                             ClearTxtConvContext();
     562           0 :                             break;
     563             :                         }
     564             : 
     565             :                         assert(cChar != 0);
     566             : 
     567             :                         // 1 == Non Breaking Space
     568             :                         // 2 == SoftHyphen
     569             : 
     570           0 :                         if (cChar == 1 || cChar == 2)
     571             :                         {
     572           0 :                             if( '>' == cBreak )
     573             :                             {
     574             :                                 // When reading the content of a tag we have
     575             :                                 // to change it to ' ' or '-'
     576           0 :                                 if( 1U == cChar )
     577           0 :                                     cChar = ' ';
     578             :                                 else //2U
     579           0 :                                     cChar = '-';
     580             :                             }
     581             :                             else
     582             :                             {
     583             :                                 // If not scanning a tag return token
     584           0 :                                 aToken += sTmpBuffer.makeStringAndClear();
     585             : 
     586           0 :                                 if( !aToken.isEmpty() )
     587             :                                 {
     588             :                                     // restart with character
     589           0 :                                     nNextCh = '&';
     590             :                                     DBG_ASSERT( rInput.Tell()-nStreamPos ==
     591             :                                                 (sal_uLong)(nPos+1)*GetCharSize(),
     592             :                                                 "Wrong stream position" );
     593             :                                     DBG_ASSERT( nlLinePos-nLinePos ==
     594             :                                                 (sal_uLong)(nPos+1),
     595             :                                                 "Wrong line position" );
     596           0 :                                     rInput.Seek( nStreamPos );
     597           0 :                                     nlLinePos = nLinePos;
     598           0 :                                     ClearTxtConvContext();
     599           0 :                                     return HTML_TEXTTOKEN;
     600             :                                 }
     601             : 
     602             :                                 // Hack: _GetNextChar shall not read the
     603             :                                 // next character
     604           0 :                                 if( ';' != nNextCh )
     605           0 :                                     aToken += " ";
     606           0 :                                 if( 1U == cChar )
     607           0 :                                     return HTML_NONBREAKSPACE;
     608             :                                 else //2U
     609           0 :                                     return HTML_SOFTHYPH;
     610             :                             }
     611           0 :                         }
     612             :                     }
     613             :                     else
     614           0 :                         nNextCh = 0U;
     615             :                 }
     616             :                 // &{...};-JavaScript-Macros are not supported any longer.
     617           0 :                 else if( IsParserWorking() )
     618             :                 {
     619           0 :                     sTmpBuffer.append( '&' );
     620           0 :                     bNextCh = false;
     621           0 :                     break;
     622             :                 }
     623             : 
     624           0 :                 bNextCh = (';' == nNextCh);
     625           0 :                 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
     626           0 :                                     cChar=='\"' || cChar==' ') )
     627             :                 {
     628             :                     // ' and " have to be escaped withing tags to separate
     629             :                     // them from ' and " enclosing options.
     630             :                     // \ has to be escaped as well.
     631             :                     // Space is protected because it's not a delimiter between
     632             :                     // options.
     633           0 :                     sTmpBuffer.append( '\\' );
     634           0 :                     if( MAX_LEN == sTmpBuffer.getLength() )
     635           0 :                         aToken += sTmpBuffer.makeStringAndClear();
     636             :                 }
     637           0 :                 if( IsParserWorking() )
     638             :                 {
     639           0 :                     if( cChar )
     640           0 :                         sTmpBuffer.append( cChar );
     641             :                 }
     642           0 :                 else if( SVPAR_PENDING==eState && '>'!=cBreak )
     643             :                 {
     644             :                     // Restart with '&', the remainder is returned as
     645             :                     // text token.
     646           0 :                     if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
     647             :                     {
     648             :                         // _GetNextChar() returns the previous text and
     649             :                         // during the next execution a new character is read.
     650             :                         // Thus we have to position in front of the '&'.
     651           0 :                         nNextCh = 0U;
     652           0 :                         rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
     653           0 :                         nlLinePos = nLinePos-1;
     654           0 :                         ClearTxtConvContext();
     655           0 :                         bReadNextChar = true;
     656             :                     }
     657           0 :                     bNextCh = false;
     658             :                 }
     659             :             }
     660           0 :             break;
     661             :         case '=':
     662        1496 :             if( '>'==cBreak && !cQuote )
     663        1492 :                 bEqSignFound = true;
     664        1496 :             sTmpBuffer.append( nNextCh );
     665        1496 :             break;
     666             : 
     667             :         case '\\':
     668           0 :             if( '>'==cBreak )
     669             :             {
     670             :                 // Innerhalb von Tags kennzeichnen
     671           0 :                 sTmpBuffer.append( '\\' );
     672           0 :                 if( MAX_LEN == sTmpBuffer.getLength() )
     673           0 :                     aToken += sTmpBuffer.makeStringAndClear();
     674             :             }
     675           0 :             sTmpBuffer.append( '\\' );
     676           0 :             break;
     677             : 
     678             :         case '\"':
     679             :         case '\'':
     680        2989 :             if( '>'==cBreak )
     681             :             {
     682        2988 :                 if( bEqSignFound )
     683        1491 :                     cQuote = nNextCh;
     684        1497 :                 else if( cQuote && (cQuote==nNextCh ) )
     685        1491 :                     cQuote = 0U;
     686             :             }
     687        2989 :             sTmpBuffer.append( nNextCh );
     688        2989 :             bEqSignFound = false;
     689        2989 :             break;
     690             : 
     691             :         case sal_Unicode(EOF):
     692           0 :             if( rInput.IsEof() )
     693             :             {
     694           0 :                 bContinue = false;
     695             :             }
     696             :             else
     697             :             {
     698           0 :                 sTmpBuffer.append( nNextCh );
     699             :             }
     700           0 :             break;
     701             : 
     702             :         case '<':
     703        1271 :             bEqSignFound = false;
     704        1271 :             if( '>'==cBreak )
     705           0 :                 sTmpBuffer.append( nNextCh );
     706             :             else
     707        1271 :                 bContinue = false;      // break, String zusammen
     708        1271 :             break;
     709             : 
     710             :         case '\f':
     711           0 :             if( '>' == cBreak )
     712             :             {
     713             :                 // If scanning options treat it like a space, ...
     714           0 :                 sTmpBuffer.append( ' ' );
     715             :             }
     716             :             else
     717             :             {
     718             :                 // otherwise it's a separate token.
     719           0 :                 bContinue = false;
     720             :             }
     721           0 :             break;
     722             : 
     723             :         case '\r':
     724             :         case '\n':
     725        1255 :             if( '>'==cBreak )
     726             :             {
     727             :                 // cr/lf in tag is handled in _GetNextToken()
     728           0 :                 sTmpBuffer.append( nNextCh );
     729           0 :                 break;
     730             :             }
     731        1255 :             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
     732             :             {
     733           0 :                 bContinue = false;
     734           0 :                 break;
     735             :             }
     736             :             // Reduce sequence of CR/LF/BLANK/TAB to a single blank
     737             :             // no break!!
     738             :         case '\t':
     739        1255 :             if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
     740             :             {
     741             :                 // Pass Tabs up in <PRE>
     742           0 :                 bContinue = false;
     743           0 :                 break;
     744             :             }
     745             :             // no break
     746             :         case '\x0b':
     747        1255 :             if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
     748             :                 '>'!=cBreak )
     749             :             {
     750           0 :                 break;
     751             :             }
     752        1255 :             nNextCh = ' ';
     753             :             // no break;
     754             :         case ' ':
     755        5285 :             sTmpBuffer.append( nNextCh );
     756        6562 :             if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
     757        2554 :                                 !bReadPRE && !bReadTextArea) )
     758             :             {
     759             :                 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
     760        4735 :                 do {
     761        4741 :                     if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
     762           3 :                         rInput.IsEof() )
     763             :                     {
     764           3 :                         if( !aToken.isEmpty() || sTmpBuffer.getLength() > 1L )
     765             :                         {
     766             :                             // Have seen s.th. aside from blanks?
     767           0 :                             aToken += sTmpBuffer.makeStringAndClear();
     768           0 :                             return HTML_TEXTTOKEN;
     769             :                         }
     770             :                         else
     771             :                             // Only read blanks: no text must be returned
     772             :                             // and _GetNextToken has to read until EOF
     773           3 :                             return 0;
     774             :                     }
     775       10533 :                 } while ( ' ' == nNextCh || '\t' == nNextCh ||
     776        4344 :                           '\r' == nNextCh || '\n' == nNextCh ||
     777        1274 :                           '\x0b' == nNextCh );
     778        1274 :                 bNextCh = false;
     779             :             }
     780        5282 :             break;
     781             : 
     782             :         default:
     783       13202 :             bEqSignFound = false;
     784       13202 :             if (nNextCh == cBreak && !cQuote)
     785         914 :                 bContinue = false;
     786             :             else
     787             :             {
     788       46800 :                 do {
     789             :                     // All remaining characters make their way into the text.
     790       46800 :                     sTmpBuffer.append( nNextCh );
     791       46800 :                     if( MAX_LEN == sTmpBuffer.getLength() )
     792             :                     {
     793           1 :                         aToken += sTmpBuffer.makeStringAndClear();
     794             :                     }
     795      140400 :                     if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
     796      140400 :                           rInput.IsEof() ) ||
     797       46800 :                         !IsParserWorking() )
     798             :                     {
     799           0 :                         if( !sTmpBuffer.isEmpty() )
     800           0 :                             aToken += sTmpBuffer.makeStringAndClear();
     801           0 :                         return HTML_TEXTTOKEN;
     802             :                     }
     803       46800 :                 } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
     804       12288 :                 bNextCh = false;
     805             :             }
     806             :         }
     807             : 
     808       24240 :         if( MAX_LEN == sTmpBuffer.getLength() )
     809           0 :             aToken += sTmpBuffer.makeStringAndClear();
     810             : 
     811       24240 :         if( bContinue && bNextCh )
     812        8493 :             nNextCh = GetNextChar();
     813             :     }
     814             : 
     815        2185 :     if( !sTmpBuffer.isEmpty() )
     816        2185 :         aToken += sTmpBuffer.makeStringAndClear();
     817             : 
     818        2185 :     return HTML_TEXTTOKEN;
     819             : }
     820             : 
     821          14 : int HTMLParser::_GetNextRawToken()
     822             : {
                           // Reads the next piece of raw (unparsed) data into aToken.
                           // _GetNextToken() calls this while bReadScript or bReadStyle is set
                           // or aEndToken is non-empty.
                           //
                           // Characters are collected until a line end ('\r', "\r\n" or '\n'),
                           // the end of the input, or a '<' that starts a tag accepted as the
                           // end of the raw section.
                           //
                           // Returns HTML_RAWDATA by default. Returns 0 when
                           //  - the previous call already found the end token,
                           //  - the end token was reached while aToken was still empty and
                           //    bReadStyle or bReadScript is set,
                           //  - the end of the input was reached before any text was collected,
                           //  - the parser is no longer working when the loop is left.
                           // After a 0 result _GetNextToken() goes on with its regular scanning,
                           // provided the parser is still working.
     823          14 :     OUStringBuffer sTmpBuffer( MAX_LEN );
     824             : 
     825          14 :     if( bEndTokenFound )
     826             :     {
     827             :         // During the last execution we already found the end token,
     828             :         // thus we don't have to search it again.
     829           2 :         bReadScript = false;
     830           2 :         bReadStyle = false;
     831           2 :         aEndToken = "";
     832           2 :         bEndTokenFound = false;
     833             : 
     834           2 :         return 0;
     835             :     }
     836             : 
     837             :     // Default return value: HTML_RAWDATA
     838          12 :     bool bContinue = true;
     839          12 :     int nToken = HTML_RAWDATA;
     840          12 :     SaveState( 0 );
     841         425 :     while( bContinue && IsParserWorking() )
     842             :     {
                               // bNextCh: fetch a new character at the end of this iteration.
                               // Branches that have already read ahead into nNextCh clear it.
     843         401 :         bool bNextCh = true;
     844         401 :         switch( nNextCh )
     845             :         {
     846             :         case '<':
     847             :             {
     848             :                 // Maybe we've reached the end.
     849             : 
     850             :                 // Save what we have read previously...
     851           2 :                 aToken += sTmpBuffer.makeStringAndClear();
     852             : 
     853             :                 // and remember position in stream.
                                       // (Taken while '<' is still the current character, i.e.
                                       // before anything of the possible tag name is read.)
     854           2 :                 sal_uLong nStreamPos = rInput.Tell();
     855           2 :                 sal_uLong nLineNr = GetLineNr();
     856           2 :                 sal_uLong nLinePos = GetLinePos();
     857             : 
     858             :                 // Start of an end token?
     859           2 :                 bool bOffState = false;
     860           2 :                 if( '/' == (nNextCh = GetNextChar()) )
     861             :                 {
     862           2 :                     bOffState = true;
     863           2 :                     nNextCh = GetNextChar();
     864             :                 }
     865           0 :                 else if( '!' == nNextCh )
     866             :                 {
                                           // Keep the '!' in the buffer so that aTok can be matched
                                           // against OOO_STRING_SVTOOLS_HTML_comment below.
     867           0 :                     sTmpBuffer.append( nNextCh );
     868           0 :                     nNextCh = GetNextChar();
     869             :                 }
     870             : 
     871             :                 // Read following letters
                                       // ('-' is accepted as well, so the dashes of a comment
                                       // start or end become part of aTok; at most MAX_LEN
                                       // characters are buffered.)
     872          38 :                 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
     873          32 :                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
     874             :                 {
     875          10 :                     sTmpBuffer.append( nNextCh );
     876          10 :                     nNextCh = GetNextChar();
     877             :                 }
     878             : 
                                       // aTok: lower-cased copy of the scanned name. sTmpBuffer
                                       // keeps its content; it is only discarded in the bDone
                                       // branch below.
     879           2 :                 OUString aTok( sTmpBuffer.toString() );
     880           2 :                 aTok = aTok.toAsciiLowerCase();
     881           2 :                 bool bDone = false;
     882           2 :                 if( bReadScript || !aEndToken.isEmpty() )
     883             :                 {
                                           // Script or explicit end token: comments are tracked,
                                           // and while bReadComment is set bDone stays false, so
                                           // an end tag inside a comment is not accepted.
     884           0 :                     if( !bReadComment )
     885             :                     {
                                               // compareTo() with a length of 3: only the first
                                               // three characters of aTok have to match.
     886           0 :                         if( aTok.compareTo( OOO_STRING_SVTOOLS_HTML_comment, 3 )
     887             :                                 == 0 )
     888             :                         {
     889           0 :                             bReadComment = true;
     890             :                         }
     891             :                         else
     892             :                         {
     893             :                             // A script has to end with "</SCRIPT>". But
     894             :                             // ">" is optional for security reasons
     895           0 :                             bDone = bOffState &&
     896           0 :                             0 == ( bReadScript
     897           0 :                                 ? aTok.compareTo(OOO_STRING_SVTOOLS_HTML_script)
     898           0 :                                 : aTok.compareTo(aEndToken) );
     899             :                         }
     900             :                     }
     901           0 :                     if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
     902             :                     {
     903             :                         // End of comment of style <!----->
     904           0 :                         bReadComment = false;
     905             :                     }
     906             :                 }
     907             :                 else
     908             :                 {
     909             :                     // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
     910           2 :                     if( bOffState )
     911           8 :                         bDone = aTok.compareTo(OOO_STRING_SVTOOLS_HTML_style)
     912           6 :                                     == 0 ||
     913           2 :                                 aTok.compareTo(OOO_STRING_SVTOOLS_HTML_head)
     914           2 :                                     == 0;
     915             :                     else
     916             :                         bDone =
     917           0 :                             aTok.compareTo(OOO_STRING_SVTOOLS_HTML_body) == 0;
     918             :                 }
     919             : 
     920           2 :                 if( bDone )
     921             :                 {
     922             :                     // Done! Return the previously read string (if requested)
     923             :                     // and continue.
     924             : 
     925           2 :                     bContinue = false;
     926             : 
     927             :                     // nToken==0 means, _GetNextToken continues to read
     928           2 :                     if( aToken.isEmpty() && (bReadStyle || bReadScript) )
     929             :                     {
     930             :                         // Immediately close environment (or context?)
     931             :                         // and parse the end token
     932           0 :                         bReadScript = false;
     933           0 :                         bReadStyle = false;
     934           0 :                         aEndToken = "";
     935           0 :                         nToken = 0;
     936             :                     }
     937             :                     else
     938             :                     {
     939             :                         // Keep bReadScript/bReadStyle alive
     940             :                         // and parse end token during next execution
     941           2 :                         bEndTokenFound = true;
     942             :                     }
     943             : 
     944             :                     // Move backwards in stream to '<'
                                           // The saved position and line counters belong to the
                                           // moment '<' was the current character, so nNextCh is
                                           // set back to '<' by hand as well.
     945           2 :                     rInput.Seek( nStreamPos );
     946           2 :                     SetLineNr( nLineNr );
     947           2 :                     SetLinePos( nLinePos );
     948           2 :                     ClearTxtConvContext();
     949           2 :                     nNextCh = '<';
     950             : 
     951             :                     // Don't append string to token.
     952           2 :                     sTmpBuffer.setLength( 0L );
     953             :                 }
     954             :                 else
     955             :                 {
     956             :                     // remember "</" , everything else we find in the buffer
     957           0 :                     aToken += "<";
     958           0 :                     if( bOffState )
     959           0 :                         aToken += "/";
     960             : 
                                           // nNextCh already holds the first character after the
                                           // scanned name; it has to be examined by the next
                                           // iteration instead of being skipped.
     961           0 :                     bNextCh = false;
     962           2 :                 }
     963             :             }
     964           2 :             break;
     965             :         case '-':
     966          11 :             sTmpBuffer.append( nNextCh );
     967          11 :             if( bReadComment )
     968             :             {
                                   // Inside a comment: collect the whole run of '-' characters.
                                   // bTwoMinus is set once at least one further '-' follows the
                                   // first one; such a run directly followed by '>' ends the
                                   // comment.
     969           0 :                 bool bTwoMinus = false;
     970           0 :                 nNextCh = GetNextChar();
     971           0 :                 while( '-' == nNextCh && IsParserWorking() )
     972             :                 {
     973           0 :                     bTwoMinus = true;
     974             : 
     975           0 :                     if( MAX_LEN == sTmpBuffer.getLength() )
     976           0 :                         aToken += sTmpBuffer.makeStringAndClear();
     977           0 :                     sTmpBuffer.append( nNextCh );
     978           0 :                     nNextCh = GetNextChar();
     979             :                 }
     980             : 
     981           0 :                 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
     982           0 :                     bReadComment = false;
     983             : 
                                   // The character after the dashes has been read ahead already.
     984           0 :                 bNextCh = false;
     985             :             }
     986          11 :             break;
     987             : 
     988             :         case '\r':
     989             :             // \r\n? closes the current text token (even if it's empty)
     990           0 :             nNextCh = GetNextChar();
     991           0 :             if( nNextCh=='\n' )
     992           0 :                 nNextCh = GetNextChar();
     993           0 :             bContinue = false;
     994           0 :             break;
     995             :         case '\n':
     996             :             // \n closes the current text token (even if it's empty)
     997          10 :             nNextCh = GetNextChar();
     998          10 :             bContinue = false;
     999          10 :             break;
    1000             :         case sal_Unicode(EOF):
    1001             :             // eof closes the current text token and behaves like having read
    1002             :             // an end token
    1003           0 :             if( rInput.IsEof() )
    1004             :             {
    1005           0 :                 bContinue = false;
    1006           0 :                 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
    1007             :                 {
                                       // There is text to return first; the raw mode is then
                                       // switched off by the next call.
    1008           0 :                     bEndTokenFound = true;
    1009             :                 }
    1010             :                 else
    1011             :                 {
    1012           0 :                     bReadScript = false;
    1013           0 :                     bReadStyle = false;
    1014           0 :                     aEndToken = "";
    1015           0 :                     nToken = 0;
    1016             :                 }
    1017           0 :                 break;
    1018             :             }
                               // The character value equals EOF but the stream is not at its
                               // end: treat it like any other character.
    1019             :             // no break
    1020             :         default:
    1021             :             // all remaining characters are appended to the buffer
    1022         378 :             sTmpBuffer.append( nNextCh );
    1023         378 :             break;
    1024             :         }
    1025             : 
                               // Move the buffer into aToken when the loop is about to end with
                               // buffered text, or when the buffer has reached MAX_LEN characters.
    1026         794 :         if( (!bContinue && !sTmpBuffer.isEmpty()) ||
    1027         393 :             MAX_LEN == sTmpBuffer.getLength() )
    1028           8 :             aToken += sTmpBuffer.makeStringAndClear();
    1029             : 
    1030         401 :         if( bContinue && bNextCh )
    1031         389 :             nNextCh = GetNextChar();
    1032             :     }
    1033             : 
                           // If the parser stopped working in the meantime, no token is reported.
    1034          12 :     if( IsParserWorking() )
    1035          12 :         SaveState( 0 );
    1036             :     else
    1037           0 :         nToken = 0;
    1038             : 
    1039          12 :     return nToken;
    1040             : }
    1041             : 
    1042             : // Scan next token
    1043        2884 : int HTMLParser::_GetNextToken()
    1044             : {
    1045        2884 :     int nRet = 0;
    1046        2884 :     sSaveToken = "";
    1047             : 
    1048        2884 :     if (mnPendingOffToken)
    1049             :     {
    1050             :         // HTML_<TOKEN>_OFF generated for HTML_<TOKEN>_ON
    1051           0 :         nRet = mnPendingOffToken;
    1052           0 :         mnPendingOffToken = 0;
    1053           0 :         aToken = "";
    1054           0 :         return nRet;
    1055             :     }
    1056             : 
    1057             :     // Delete options
    1058        2884 :     if (!maOptions.empty())
    1059         650 :         maOptions.clear();
    1060             : 
    1061        2884 :     if( !IsParserWorking() )        // Don't continue if already an error occurred
    1062           0 :         return 0;
    1063             : 
    1064        2884 :     bool bReadNextCharSave = bReadNextChar;
    1065        2884 :     if( bReadNextChar )
    1066             :     {
    1067             :         DBG_ASSERT( !bEndTokenFound,
    1068             :                     "Read a character despite </SCRIPT> was read?" );
    1069           0 :         nNextCh = GetNextChar();
    1070           0 :         if( !IsParserWorking() )        // Don't continue if already an error occurred
    1071           0 :             return 0;
    1072           0 :         bReadNextChar = false;
    1073             :     }
    1074             : 
    1075        2884 :     if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
    1076             :     {
    1077          14 :         nRet = _GetNextRawToken();
    1078          14 :         if( nRet || !IsParserWorking() )
    1079          12 :             return nRet;
    1080             :     }
    1081             : 
    1082        2875 :     do {
    1083        2875 :         bool bNextCh = true;
    1084        2875 :         switch( nNextCh )
    1085             :         {
    1086             :         case '<':
    1087             :             {
    1088        1594 :                 sal_uLong nStreamPos = rInput.Tell();
    1089        1594 :                 sal_uLong nLineNr = GetLineNr();
    1090        1594 :                 sal_uLong nLinePos = GetLinePos();
    1091             : 
    1092        1594 :                 bool bOffState = false;
    1093        1594 :                 if( '/' == (nNextCh = GetNextChar()) )
    1094             :                 {
    1095         650 :                     bOffState = true;
    1096         650 :                     nNextCh = GetNextChar();
    1097             :                 }
    1098        1594 :                 if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh )
    1099             :                 {
    1100        1594 :                     OUStringBuffer sTmpBuffer;
    1101        2949 :                     do {
    1102        2949 :                         sTmpBuffer.append( nNextCh );
    1103        2949 :                         if( MAX_LEN == sTmpBuffer.getLength() )
    1104           0 :                             aToken += sTmpBuffer.makeStringAndClear();
    1105        2949 :                         nNextCh = GetNextChar();
    1106        6573 :                     } while( '>' != nNextCh && '/' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
    1107        5659 :                              IsParserWorking() && !rInput.IsEof() );
    1108             : 
    1109        1594 :                     if( !sTmpBuffer.isEmpty() )
    1110        1594 :                         aToken += sTmpBuffer.makeStringAndClear();
    1111             : 
    1112             :                     // Skip blanks
    1113        3843 :                     while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
    1114         655 :                         nNextCh = GetNextChar();
    1115             : 
    1116        1594 :                     if( !IsParserWorking() )
    1117             :                     {
    1118           0 :                         if( SVPAR_PENDING == eState )
    1119           0 :                             bReadNextChar = bReadNextCharSave;
    1120           0 :                         break;
    1121             :                     }
    1122             : 
    1123             :                     // Search token in table:
    1124        1594 :                     sSaveToken = aToken;
    1125        1594 :                     aToken = aToken.toAsciiLowerCase();
    1126        1594 :                     if( 0 == (nRet = GetHTMLToken( aToken )) )
    1127             :                         // Unknown control
    1128           5 :                         nRet = HTML_UNKNOWNCONTROL_ON;
    1129             : 
    1130             :                     // If it's a token which can be switched off...
    1131        1594 :                     if( bOffState )
    1132             :                     {
    1133         650 :                          if( HTML_TOKEN_ONOFF & nRet )
    1134             :                          {
    1135             :                             // and there is an off token, return off token instead
    1136         650 :                             ++nRet;
    1137             :                          }
    1138           0 :                          else if( HTML_LINEBREAK!=nRet )
    1139             :                          {
    1140             :                             // and there is no off token, return unknown token.
    1141             :                             // (except for </BR>, that is treated like <BR>)
    1142           0 :                             nRet = HTML_UNKNOWNCONTROL_OFF;
    1143             :                          }
    1144             :                     }
    1145             : 
    1146        1594 :                     if( nRet == HTML_COMMENT )
    1147             :                     {
    1148             :                         // fix: due to being case sensitive use sSaveToken as start of comment
    1149             :                         //      and append a blank.
    1150           0 :                         aToken = sSaveToken;
    1151           0 :                         if( '>'!=nNextCh )
    1152           0 :                             aToken += " ";
    1153           0 :                         sal_uLong nCStreamPos = 0;
    1154           0 :                         sal_uLong nCLineNr = 0;
    1155           0 :                         sal_uLong nCLinePos = 0;
    1156           0 :                         sal_Int32 nCStrLen = 0;
    1157             : 
    1158           0 :                         bool bDone = false;
    1159             :                         // Read until closing -->. If not found restart at first >
    1160           0 :                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
    1161             :                         {
    1162           0 :                             if( '>'==nNextCh )
    1163             :                             {
    1164           0 :                                 if( !nCStreamPos )
    1165             :                                 {
    1166           0 :                                     nCStreamPos = rInput.Tell();
    1167           0 :                                     nCStrLen = aToken.getLength();
    1168           0 :                                     nCLineNr = GetLineNr();
    1169           0 :                                     nCLinePos = GetLinePos();
    1170             :                                 }
    1171           0 :                                 bDone = aToken.getLength() >= 2 &&
    1172           0 :                                         aToken.copy(aToken.getLength()-2,2).
    1173           0 :                                                         equalsAscii( "--" );
    1174           0 :                                 if( !bDone )
    1175           0 :                                 aToken += OUString(nNextCh);
    1176             :                             }
    1177             :                             else
    1178           0 :                                 aToken += OUString(nNextCh);
    1179           0 :                             if( !bDone )
    1180           0 :                                 nNextCh = GetNextChar();
    1181             :                         }
    1182           0 :                         if( !bDone && IsParserWorking() && nCStreamPos )
    1183             :                         {
    1184           0 :                             rInput.Seek( nCStreamPos );
    1185           0 :                             SetLineNr( nCLineNr );
    1186           0 :                             SetLinePos( nCLinePos );
    1187           0 :                             ClearTxtConvContext();
    1188           0 :                             aToken = aToken.copy(0, nCStrLen);
    1189           0 :                             nNextCh = '>';
    1190             :                         }
    1191             :                     }
    1192             :                     else
    1193             :                     {
    1194             :                         // TokenString not needed anymore
    1195        1594 :                         aToken = "";
    1196             :                     }
    1197             : 
    1198             :                     // Read until closing '>'
    1199        1594 :                     if( '>' != nNextCh && IsParserWorking() )
    1200             :                     {
    1201         914 :                         ScanText( '>' );
    1202             : 
    1203             :                         // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
    1204             :                         // generate pending HTML_<TOKEN>_OFF for HTML_<TOKEN>_ON
    1205             :                         // Do not convert this to a single HTML_<TOKEN>_OFF
    1206             :                         // which lead to fdo#56772.
    1207         914 :                         if ((HTML_TOKEN_ONOFF & nRet) && aToken.endsWith("/"))
    1208             :                         {
    1209           0 :                             mnPendingOffToken = nRet + 1;       // HTML_<TOKEN>_ON -> HTML_<TOKEN>_OFF
    1210           0 :                             aToken = aToken.replaceAt( aToken.getLength()-1, 1, "");   // remove trailing '/'
    1211             :                         }
    1212         914 :                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
    1213             :                         {
    1214             :                             // Move back in front of < and restart there.
    1215             :                             // Return < as text.
    1216           0 :                             rInput.Seek( nStreamPos );
    1217           0 :                             SetLineNr( nLineNr );
    1218           0 :                             SetLinePos( nLinePos );
    1219           0 :                             ClearTxtConvContext();
    1220             : 
    1221           0 :                             aToken = "<";
    1222           0 :                             nRet = HTML_TEXTTOKEN;
    1223           0 :                             nNextCh = GetNextChar();
    1224           0 :                             bNextCh = false;
    1225           0 :                             break;
    1226             :                         }
    1227             :                     }
    1228        1594 :                     if( SVPAR_PENDING == eState )
    1229           0 :                         bReadNextChar = bReadNextCharSave;
    1230             :                 }
    1231             :                 else
    1232             :                 {
    1233           0 :                     if( bOffState )
    1234             :                     {
    1235             :                         // einfach alles wegschmeissen
    1236           0 :                         ScanText( '>' );
    1237           0 :                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
    1238             :                         {
    1239             :                             // Move back in front of < and restart there.
    1240             :                             // Return < as text.
    1241           0 :                             rInput.Seek( nStreamPos );
    1242           0 :                             SetLineNr( nLineNr );
    1243           0 :                             SetLinePos( nLinePos );
    1244           0 :                             ClearTxtConvContext();
    1245             : 
    1246           0 :                             aToken = "<";
    1247           0 :                             nRet = HTML_TEXTTOKEN;
    1248           0 :                             nNextCh = GetNextChar();
    1249           0 :                             bNextCh = false;
    1250           0 :                             break;
    1251             :                         }
    1252           0 :                         if( SVPAR_PENDING == eState )
    1253           0 :                             bReadNextChar = bReadNextCharSave;
    1254           0 :                         aToken = "";
    1255             :                     }
    1256           0 :                     else if( '%' == nNextCh )
    1257             :                     {
    1258           0 :                         nRet = HTML_UNKNOWNCONTROL_ON;
    1259             : 
    1260           0 :                         sal_uLong nCStreamPos = rInput.Tell();
    1261           0 :                         sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
    1262             : 
    1263           0 :                         bool bDone = false;
    1264             :                         // Read until closing %>. If not found restart at first >.
    1265           0 :                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
    1266             :                         {
    1267           0 :                             bDone = '>'==nNextCh && aToken.endsWith("%");
    1268           0 :                             if( !bDone )
    1269             :                             {
    1270           0 :                                 aToken += OUString(nNextCh);
    1271           0 :                                 nNextCh = GetNextChar();
    1272             :                             }
    1273             :                         }
    1274           0 :                         if( !bDone && IsParserWorking() )
    1275             :                         {
    1276           0 :                             rInput.Seek( nCStreamPos );
    1277           0 :                             SetLineNr( nCLineNr );
    1278           0 :                             SetLinePos( nCLinePos );
    1279           0 :                             ClearTxtConvContext();
    1280           0 :                             aToken = "<%";
    1281           0 :                             nRet = HTML_TEXTTOKEN;
    1282           0 :                             break;
    1283             :                         }
    1284           0 :                         if( IsParserWorking() )
    1285             :                         {
    1286           0 :                             sSaveToken = aToken;
    1287           0 :                             aToken = "";
    1288             :                         }
    1289             :                     }
    1290             :                     else
    1291             :                     {
    1292           0 :                         aToken = "<";
    1293           0 :                         nRet = HTML_TEXTTOKEN;
    1294           0 :                         bNextCh = false;
    1295           0 :                         break;
    1296             :                     }
    1297             :                 }
    1298             : 
    1299        1594 :                 if( IsParserWorking() )
    1300             :                 {
    1301        1594 :                     bNextCh = '>' == nNextCh;
    1302        1594 :                     switch( nRet )
    1303             :                     {
    1304             :                     case HTML_TEXTAREA_ON:
    1305           0 :                         bReadTextArea = true;
    1306           0 :                         break;
    1307             :                     case HTML_TEXTAREA_OFF:
    1308           0 :                         bReadTextArea = false;
    1309           0 :                         break;
    1310             :                     case HTML_SCRIPT_ON:
    1311           0 :                         if( !bReadTextArea )
    1312           0 :                             bReadScript = true;
    1313           0 :                         break;
    1314             :                     case HTML_SCRIPT_OFF:
    1315           0 :                         if( !bReadTextArea )
    1316             :                         {
    1317           0 :                             bReadScript = false;
    1318             :                             // JavaScript might modify the stream,
    1319             :                             // thus the last character has to be read again.
    1320           0 :                             bReadNextChar = true;
    1321           0 :                             bNextCh = false;
    1322             :                         }
    1323           0 :                         break;
    1324             : 
    1325             :                     case HTML_STYLE_ON:
    1326           2 :                         bReadStyle = true;
    1327           2 :                         break;
    1328             :                     case HTML_STYLE_OFF:
    1329           2 :                         bReadStyle = false;
    1330           2 :                         break;
    1331             :                     }
    1332             :                 }
    1333             :             }
    1334        1594 :             break;
    1335             : 
    1336             :         case sal_Unicode(EOF):
    1337           7 :             if( rInput.IsEof() )
    1338             :             {
    1339           7 :                 eState = SVPAR_ACCEPTED;
    1340           7 :                 nRet = nNextCh;
    1341             :             }
    1342             :             else
    1343             :             {
    1344             :                 // Read normal text.
    1345           0 :                 goto scan_text;
    1346             :             }
    1347           7 :             break;
    1348             : 
    1349             :         case '\f':
    1350             :             // form feeds are passed upwards separately
    1351           0 :             nRet = HTML_LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
    1352           0 :             break;
    1353             : 
    1354             :         case '\n':
    1355             :         case '\r':
    1356        1255 :             if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
    1357             :             {
    1358           0 :                 sal_Unicode c = GetNextChar();
    1359           0 :                 if( ( '\n' != nNextCh || '\r' != c ) &&
    1360           0 :                     ( '\r' != nNextCh || '\n' != c ) )
    1361             :                 {
    1362           0 :                     bNextCh = false;
    1363           0 :                     nNextCh = c;
    1364             :                 }
    1365           0 :                 nRet = HTML_NEWPARA;
    1366           0 :                 break;
    1367             :             }
    1368             :             // no break !
    1369             :         case '\t':
    1370        1255 :             if( bReadPRE )
    1371             :             {
    1372           0 :                 nRet = HTML_TABCHAR;
    1373           0 :                 break;
    1374             :             }
    1375             :             // no break !
    1376             :         case ' ':
    1377             :             // no break !
    1378             :         default:
    1379             : 
    1380             : scan_text:
    1381             :             // "normal" text to come
    1382        1274 :             nRet = ScanText();
    1383        1274 :             bNextCh = 0 == aToken.getLength();
    1384             : 
    1385             :             // the text should be processed
    1386        1274 :             if( !bNextCh && eState == SVPAR_PENDING )
    1387             :             {
    1388           0 :                 eState = SVPAR_WORKING;
    1389           0 :                 bReadNextChar = true;
    1390             :             }
    1391             : 
    1392        1274 :             break;
    1393             :         }
    1394             : 
    1395        2875 :         if( bNextCh && SVPAR_WORKING == eState )
    1396             :         {
    1397        1597 :             nNextCh = GetNextChar();
    1398        1597 :             if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
    1399             :             {
    1400           0 :                 bReadNextChar = true;
    1401           0 :                 eState = SVPAR_WORKING;
    1402             :             }
    1403             :         }
    1404             : 
    1405           3 :     } while( !nRet && SVPAR_WORKING == eState );
    1406             : 
    1407        2872 :     if( SVPAR_PENDING == eState )
    1408           0 :         nRet = -1;      // s.th. invalid
    1409             : 
    1410        2872 :     return nRet;
    1411             : }
    1412             : 
    1413           0 : void HTMLParser::UnescapeToken()
    1414             : {
                           // Removes backslash escapes from the member string aToken, in place.
                           //
                           // Every backslash that is not itself escaped is deleted; the character
                           // that follows a deleted backslash is always kept literally, so a
                           // doubled backslash collapses to a single one. A lone backslash at the
                           // very end of the token is deleted as well. Takes no arguments and
                           // returns nothing; the only state touched is aToken.
    1415           0 :     sal_Int32 nPos=0;
    1416             : 
                           // bEscape: the previous iteration deleted an escaping backslash, so the
                           // character now sitting at nPos must be taken literally.
    1417           0 :     bool bEscape = false;
    1418           0 :     while( nPos < aToken.getLength() )
    1419             :     {
    1420           0 :         bool bOldEscape = bEscape;
    1421           0 :         bEscape = false;
    1422           0 :         if( '\\'==aToken[nPos] && !bOldEscape )
    1423             :         {
                                   // Delete the escaping backslash. nPos is deliberately not
                                   // advanced: the escaped character has shifted into position
                                   // nPos and is examined (and kept) on the next iteration.
    1424           0 :             aToken = aToken.replaceAt( nPos, 1, "" );
    1425           0 :             bEscape = true;
    1426             :         }
    1427             :         else
    1428             :         {
                                   // Ordinary character, or a character that was escaped: keep it.
    1429           0 :             nPos++;
    1430             :         }
    1431             :     }
    1432           0 : }
    1433             : 
    1434         945 : const HTMLOptions& HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken )
    1435             : {
                           // Splits the current token string (aToken) into name[=value] options,
                           // stores them in maOptions and returns a reference to that container.
                           //
                           // pNoConvertToken: optional (may be null) pointer to an option id; for
                           //                  an option with that id CR/LF characters inside a
                           //                  quoted value are kept instead of being removed.
                           //
                           // Note that aToken is modified in place while parsing: escaping
                           // backslashes and (where applicable) CR/LF are deleted from it.
    1436             :     // If the options for the current token have already been returned,
    1437             :     // return them once again.
    1438         945 :     if (!maOptions.empty())
    1439           7 :         return maOptions;
    1440             : 
    1441         938 :     sal_Int32 nPos = 0;
    1442        4474 :     while( nPos < aToken.getLength() )
    1443             :     {
    1444             :         // A letter? Option beginning here.
    1445        2598 :         if( HTML_ISALPHA( aToken[nPos] ) )
    1446             :         {
    1447             :             int nToken;
    1448        1490 :             OUString aValue;
    1449        1490 :             sal_Int32 nStt = nPos;
    1450        1490 :             sal_Unicode cChar = 0;
    1451             : 
    1452             :             // Actually only certain characters allowed.
    1453             :             // Netscape only looks for "=" and white space (c.f.
    1454             :             // Mozilla: PA_FetchRequestedNameValues in lipparse/pa_mdl.c)
                                   // The name ends at '=', at white space, at a character for
                                   // which HTML_ISPRINTABLE is false, or at the end of the token.
    1455       29258 :             while( nPos < aToken.getLength() && '=' != (cChar=aToken[nPos]) &&
    1456       17022 :                    HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
    1457        7766 :                 nPos++;
    1458             : 
    1459        2980 :             OUString sName( aToken.copy( nStt, nPos-nStt ) );
    1460             : 
    1461             :             // PlugIns require original token name. Convert to lower case only for searching.
    1462        1490 :             nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
    1463             :             DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
    1464             :                         "GetOption: unknown HTML option" );
                                   // CR/LF inside a quoted value are removed unless the option id
                                   // lies in [HTML_OPTION_SCRIPT_START, HTML_OPTION_SCRIPT_END) or
                                   // equals the id the caller passed via pNoConvertToken.
    1465           9 :             bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
    1466        4447 :                                nToken >= HTML_OPTION_SCRIPT_END) &&
    1467        1547 :                               (!pNoConvertToken || nToken != *pNoConvertToken);
    1468             : 
                                   // Skip white space and non-printable characters after the name.
                                   // cChar ends up holding the first character that stopped the
                                   // scan (an '=' if a value follows).
    1469        4470 :             while( nPos < aToken.getLength() &&
    1470        2980 :                    ( !HTML_ISPRINTABLE( (cChar=aToken[nPos]) ) ||
    1471        1490 :                      HTML_ISSPACE(cChar) ) )
    1472           0 :                 nPos++;
    1473             : 
    1474             :             // Option with value?
    1475        1490 :             if( nPos!=aToken.getLength() && '='==cChar )
    1476             :             {
    1477        1490 :                 nPos++;
    1478             : 
                                       // Skip blanks, tabs, line ends and non-printable characters
                                       // between '=' and the value.
    1479        4470 :                 while( nPos < aToken.getLength() &&
    1480        2980 :                         ( !HTML_ISPRINTABLE( (cChar=aToken[nPos]) ) ||
    1481        1490 :                           ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
    1482           0 :                     nPos++;
    1483             : 
    1484        1490 :                 if( nPos != aToken.getLength() )
    1485             :                 {
                                           // nStt marks the first character of the value and nLen
                                           // counts the characters that are kept. Dropped characters
                                           // are deleted from aToken itself, so the value is always
                                           // the contiguous range [nStt, nStt+nLen).
    1486        1490 :                     sal_Int32 nLen = 0;
    1487        1490 :                     nStt = nPos;
    1488        1490 :                     if( ('"'==cChar) || ('\'')==cChar )
    1489             :                     {
                                               // Quoted value: read up to the matching, unescaped
                                               // quote character (cEnd). The other quote character
                                               // is treated as ordinary content.
    1490        1489 :                         sal_Unicode cEnd = cChar;
    1491        1489 :                         nPos++; nStt++;
    1492        1489 :                         bool bDone = false;
    1493        1489 :                         bool bEscape = false;
    1494       46038 :                         while( nPos < aToken.getLength() && !bDone )
    1495             :                         {
                                                   // bOldEscape: the previous iteration removed an
                                                   // escaping backslash, so this character is literal.
    1496       43060 :                             bool bOldEscape = bEscape;
    1497       43060 :                             bEscape = false;
    1498       43060 :                             cChar = aToken[nPos];
    1499       43060 :                             switch( cChar )
    1500             :                             {
    1501             :                             case '\r':
    1502             :                             case '\n':
                                                       // Line ends are deleted from the token or kept,
                                                       // depending on bStripCRLF.
    1503           0 :                                 if( bStripCRLF )
    1504           0 :                                     aToken = aToken.replaceAt( nPos, 1, "" );
    1505             :                                 else
    1506           0 :                                     nPos++, nLen++;
    1507           0 :                                 break;
    1508             :                             case '\\':
                                                       // An escaped backslash is kept; an unescaped one
                                                       // is deleted and escapes the next character.
    1509           0 :                                 if( bOldEscape )
    1510             :                                 {
    1511           0 :                                     nPos++, nLen++;
    1512             :                                 }
    1513             :                                 else
    1514             :                                 {
    1515           0 :                                     aToken = aToken.replaceAt( nPos, 1, "" );
    1516           0 :                                     bEscape = true;
    1517             :                                 }
    1518           0 :                                 break;
    1519             :                             case '"':
    1520             :                             case '\'':
                                                       // Only an unescaped quote equal to the opening
                                                       // one terminates the value.
    1521        1489 :                                 bDone = !bOldEscape && cChar==cEnd;
    1522        1489 :                                 if( !bDone )
    1523           0 :                                     nPos++, nLen++;
    1524        1489 :                                 break;
    1525             :                             default:
    1526       41571 :                                 nPos++, nLen++;
    1527       41571 :                                 break;
    1528             :                             }
    1529             :                         }
                                               // Step over the closing quote, if one was found.
    1530        1489 :                         if( nPos!=aToken.getLength() )
    1531        1489 :                             nPos++;
    1532             :                     }
    1533             :                     else
    1534             :                     {
    1535             :                         // More liberal than the standard: allow all printable characters
                                               // Unquoted value: ends at an unescaped blank, at a tab
                                               // or line end (escaped or not), or at a character for
                                               // which HTML_ISPRINTABLE is false.
    1536           1 :                         bool bEscape = false;
    1537           1 :                         bool bDone = false;
    1538          11 :                         while( nPos < aToken.getLength() && !bDone )
    1539             :                         {
    1540           9 :                             bool bOldEscape = bEscape;
    1541           9 :                             bEscape = false;
    1542           9 :                             sal_Unicode c = aToken[nPos];
    1543           9 :                             switch( c )
    1544             :                             {
    1545             :                             case ' ':
                                                       // A blank preceded by a backslash belongs to the
                                                       // value; any other blank ends it.
    1546           1 :                                 bDone = !bOldEscape;
    1547           1 :                                 if( !bDone )
    1548           0 :                                     nPos++, nLen++;
    1549           1 :                                 break;
    1550             : 
    1551             :                             case '\t':
    1552             :                             case '\r':
    1553             :                             case '\n':
    1554           0 :                                 bDone = true;
    1555           0 :                                 break;
    1556             : 
    1557             :                             case '\\':
                                                       // Same backslash handling as in the quoted case.
    1558           0 :                                 if( bOldEscape )
    1559             :                                 {
    1560           0 :                                     nPos++, nLen++;
    1561             :                                 }
    1562             :                                 else
    1563             :                                 {
    1564           0 :                                     aToken = aToken.replaceAt( nPos, 1, "" );
    1565           0 :                                     bEscape = true;
    1566             :                                 }
    1567           0 :                                 break;
    1568             : 
    1569             :                             default:
    1570           8 :                                 if( HTML_ISPRINTABLE( c ) )
    1571           8 :                                     nPos++, nLen++;
    1572             :                                 else
    1573           0 :                                     bDone = true;
    1574           8 :                                 break;
    1575             :                             }
    1576             :                         }
    1577             :                     }
    1578             : 
                                           // An empty value leaves aValue as the empty string.
    1579        1490 :                     if( nLen )
    1580        1490 :                         aValue = aToken.copy( nStt, nLen );
    1581             :                 }
    1582             :             }
    1583             : 
    1584             :             // Token is known and can be saved
                                   // (The option is stored unconditionally; an unrecognised name
                                   // only triggers the debug warning above.)
    1585             :             std::auto_ptr<HTMLOption> pOption(
    1586        1490 :                 new HTMLOption(sal::static_int_cast<sal_uInt16>(nToken), sName, aValue));
    1587             : 
    1588        2980 :             maOptions.push_back(pOption);
    1589             :         }
    1590             :         else
    1591             :             // Ignore white space and unexpected characters
    1592        1108 :             nPos++;
    1593             :     }
    1594             : 
    1595         938 :     return maOptions;
    1596             : }
    1597             : 
    1598           0 : int HTMLParser::FilterPRE( int nToken )
    1599             : {
    1600           0 :     switch( nToken )
    1601             :     {
    1602             :     // in Netscape they only have impact in not empty paragraphs
    1603             :     case HTML_PARABREAK_ON:
    1604           0 :         nToken = HTML_LINEBREAK;
    1605             :     case HTML_LINEBREAK:
    1606             :     case HTML_NEWPARA:
    1607           0 :         nPre_LinePos = 0;
    1608           0 :         if( bPre_IgnoreNewPara )
    1609           0 :             nToken = 0;
    1610           0 :         break;
    1611             : 
    1612             :     case HTML_TABCHAR:
    1613             :         {
    1614           0 :             sal_Int32 nSpaces = (8 - (nPre_LinePos % 8));
    1615             :             DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
    1616           0 :             if (aToken.getLength() < nSpaces)
    1617             :             {
    1618             :                 using comphelper::string::padToLength;
    1619           0 :                 OUStringBuffer aBuf(aToken);
    1620           0 :                 aToken = padToLength(aBuf, nSpaces, ' ').makeStringAndClear();
    1621             :             }
    1622           0 :             nPre_LinePos += nSpaces;
    1623           0 :             nToken = HTML_TEXTTOKEN;
    1624             :         }
    1625           0 :         break;
    1626             :     // Keep those
    1627             :     case HTML_TEXTTOKEN:
    1628           0 :         nPre_LinePos += aToken.getLength();
    1629           0 :         break;
    1630             : 
    1631             :     case HTML_SELECT_ON:
    1632             :     case HTML_SELECT_OFF:
    1633             :     case HTML_BODY_ON:
    1634             :     case HTML_FORM_ON:
    1635             :     case HTML_FORM_OFF:
    1636             :     case HTML_INPUT:
    1637             :     case HTML_OPTION:
    1638             :     case HTML_TEXTAREA_ON:
    1639             :     case HTML_TEXTAREA_OFF:
    1640             : 
    1641             :     case HTML_IMAGE:
    1642             :     case HTML_APPLET_ON:
    1643             :     case HTML_APPLET_OFF:
    1644             :     case HTML_PARAM:
    1645             :     case HTML_EMBED:
    1646             : 
    1647             :     case HTML_HEAD1_ON:
    1648             :     case HTML_HEAD1_OFF:
    1649             :     case HTML_HEAD2_ON:
    1650             :     case HTML_HEAD2_OFF:
    1651             :     case HTML_HEAD3_ON:
    1652             :     case HTML_HEAD3_OFF:
    1653             :     case HTML_HEAD4_ON:
    1654             :     case HTML_HEAD4_OFF:
    1655             :     case HTML_HEAD5_ON:
    1656             :     case HTML_HEAD5_OFF:
    1657             :     case HTML_HEAD6_ON:
    1658             :     case HTML_HEAD6_OFF:
    1659             :     case HTML_BLOCKQUOTE_ON:
    1660             :     case HTML_BLOCKQUOTE_OFF:
    1661             :     case HTML_ADDRESS_ON:
    1662             :     case HTML_ADDRESS_OFF:
    1663             :     case HTML_HORZRULE:
    1664             : 
    1665             :     case HTML_CENTER_ON:
    1666             :     case HTML_CENTER_OFF:
    1667             :     case HTML_DIVISION_ON:
    1668             :     case HTML_DIVISION_OFF:
    1669             : 
    1670             :     case HTML_SCRIPT_ON:
    1671             :     case HTML_SCRIPT_OFF:
    1672             :     case HTML_RAWDATA:
    1673             : 
    1674             :     case HTML_TABLE_ON:
    1675             :     case HTML_TABLE_OFF:
    1676             :     case HTML_CAPTION_ON:
    1677             :     case HTML_CAPTION_OFF:
    1678             :     case HTML_COLGROUP_ON:
    1679             :     case HTML_COLGROUP_OFF:
    1680             :     case HTML_COL_ON:
    1681             :     case HTML_COL_OFF:
    1682             :     case HTML_THEAD_ON:
    1683             :     case HTML_THEAD_OFF:
    1684             :     case HTML_TFOOT_ON:
    1685             :     case HTML_TFOOT_OFF:
    1686             :     case HTML_TBODY_ON:
    1687             :     case HTML_TBODY_OFF:
    1688             :     case HTML_TABLEROW_ON:
    1689             :     case HTML_TABLEROW_OFF:
    1690             :     case HTML_TABLEDATA_ON:
    1691             :     case HTML_TABLEDATA_OFF:
    1692             :     case HTML_TABLEHEADER_ON:
    1693             :     case HTML_TABLEHEADER_OFF:
    1694             : 
    1695             :     case HTML_ANCHOR_ON:
    1696             :     case HTML_ANCHOR_OFF:
    1697             :     case HTML_BOLD_ON:
    1698             :     case HTML_BOLD_OFF:
    1699             :     case HTML_ITALIC_ON:
    1700             :     case HTML_ITALIC_OFF:
    1701             :     case HTML_STRIKE_ON:
    1702             :     case HTML_STRIKE_OFF:
    1703             :     case HTML_STRIKETHROUGH_ON:
    1704             :     case HTML_STRIKETHROUGH_OFF:
    1705             :     case HTML_UNDERLINE_ON:
    1706             :     case HTML_UNDERLINE_OFF:
    1707             :     case HTML_BASEFONT_ON:
    1708             :     case HTML_BASEFONT_OFF:
    1709             :     case HTML_FONT_ON:
    1710             :     case HTML_FONT_OFF:
    1711             :     case HTML_BLINK_ON:
    1712             :     case HTML_BLINK_OFF:
    1713             :     case HTML_SPAN_ON:
    1714             :     case HTML_SPAN_OFF:
    1715             :     case HTML_SUBSCRIPT_ON:
    1716             :     case HTML_SUBSCRIPT_OFF:
    1717             :     case HTML_SUPERSCRIPT_ON:
    1718             :     case HTML_SUPERSCRIPT_OFF:
    1719             :     case HTML_BIGPRINT_ON:
    1720             :     case HTML_BIGPRINT_OFF:
    1721             :     case HTML_SMALLPRINT_OFF:
    1722             :     case HTML_SMALLPRINT_ON:
    1723             : 
    1724             :     case HTML_EMPHASIS_ON:
    1725             :     case HTML_EMPHASIS_OFF:
    1726             :     case HTML_CITIATION_ON:
    1727             :     case HTML_CITIATION_OFF:
    1728             :     case HTML_STRONG_ON:
    1729             :     case HTML_STRONG_OFF:
    1730             :     case HTML_CODE_ON:
    1731             :     case HTML_CODE_OFF:
    1732             :     case HTML_SAMPLE_ON:
    1733             :     case HTML_SAMPLE_OFF:
    1734             :     case HTML_KEYBOARD_ON:
    1735             :     case HTML_KEYBOARD_OFF:
    1736             :     case HTML_VARIABLE_ON:
    1737             :     case HTML_VARIABLE_OFF:
    1738             :     case HTML_DEFINSTANCE_ON:
    1739             :     case HTML_DEFINSTANCE_OFF:
    1740             :     case HTML_SHORTQUOTE_ON:
    1741             :     case HTML_SHORTQUOTE_OFF:
    1742             :     case HTML_LANGUAGE_ON:
    1743             :     case HTML_LANGUAGE_OFF:
    1744             :     case HTML_AUTHOR_ON:
    1745             :     case HTML_AUTHOR_OFF:
    1746             :     case HTML_PERSON_ON:
    1747             :     case HTML_PERSON_OFF:
    1748             :     case HTML_ACRONYM_ON:
    1749             :     case HTML_ACRONYM_OFF:
    1750             :     case HTML_ABBREVIATION_ON:
    1751             :     case HTML_ABBREVIATION_OFF:
    1752             :     case HTML_INSERTEDTEXT_ON:
    1753             :     case HTML_INSERTEDTEXT_OFF:
    1754             :     case HTML_DELETEDTEXT_ON:
    1755             :     case HTML_DELETEDTEXT_OFF:
    1756             :     case HTML_TELETYPE_ON:
    1757             :     case HTML_TELETYPE_OFF:
    1758             : 
    1759           0 :         break;
    1760             : 
    1761             :     // The remainder is treated as an unknown token.
    1762             :     default:
    1763           0 :         if( nToken )
    1764             :         {
    1765             :             nToken =
    1766           0 :                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
    1767             :                     ? HTML_UNKNOWNCONTROL_OFF
    1768           0 :                     : HTML_UNKNOWNCONTROL_ON );
    1769             :         }
    1770           0 :         break;
    1771             :     }
    1772             : 
    1773           0 :     bPre_IgnoreNewPara = false;
    1774             : 
    1775           0 :     return nToken;
    1776             : }
    1777             : 
    1778           0 : int HTMLParser::FilterXMP( int nToken )
    1779             : {
    1780           0 :     switch( nToken )
    1781             :     {
    1782             :     case HTML_NEWPARA:
    1783           0 :         if( bPre_IgnoreNewPara )
    1784           0 :             nToken = 0;
    1785             :     case HTML_TEXTTOKEN:
    1786             :     case HTML_NONBREAKSPACE:
    1787             :     case HTML_SOFTHYPH:
    1788           0 :         break;              // kept
    1789             : 
    1790             :     default:
    1791           0 :         if( nToken )
    1792             :         {
    1793           0 :             if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
    1794             :             {
    1795           0 :                 sSaveToken = "</" + sSaveToken;
    1796             :             }
    1797             :             else
    1798           0 :                 sSaveToken = "<" + sSaveToken;
    1799           0 :             if( !aToken.isEmpty() )
    1800             :             {
    1801           0 :                 UnescapeToken();
    1802           0 :                 sSaveToken += " ";
    1803           0 :                 aToken = sSaveToken + aToken;
    1804             :             }
    1805             :             else
    1806           0 :                 aToken = sSaveToken;
    1807           0 :             aToken += ">";
    1808           0 :             nToken = HTML_TEXTTOKEN;
    1809             :         }
    1810           0 :         break;
    1811             :     }
    1812             : 
    1813           0 :     bPre_IgnoreNewPara = false;
    1814             : 
    1815           0 :     return nToken;
    1816             : }
    1817             : 
    1818           0 : int HTMLParser::FilterListing( int nToken )
    1819             : {
    1820           0 :     switch( nToken )
    1821             :     {
    1822             :     case HTML_NEWPARA:
    1823           0 :         if( bPre_IgnoreNewPara )
    1824           0 :             nToken = 0;
    1825             :     case HTML_TEXTTOKEN:
    1826             :     case HTML_NONBREAKSPACE:
    1827             :     case HTML_SOFTHYPH:
    1828           0 :         break;      // kept
    1829             : 
    1830             :     default:
    1831           0 :         if( nToken )
    1832             :         {
    1833             :             nToken =
    1834           0 :                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
    1835             :                     ? HTML_UNKNOWNCONTROL_OFF
    1836           0 :                     : HTML_UNKNOWNCONTROL_ON );
    1837             :         }
    1838           0 :         break;
    1839             :     }
    1840             : 
    1841           0 :     bPre_IgnoreNewPara = false;
    1842             : 
    1843           0 :     return nToken;
    1844             : }
    1845             : 
    1846           0 : bool HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
    1847             :                                bool bSwitchToUCS2,
    1848             :                                rtl_TextEncoding eEnc )
    1849             : {
    1850             :     // If the string matches one of the following regular expressions then
    1851             :     // the document is a HTML document.
    1852             : 
    1853             :     // ^[^<]*<[^ \t]*[> \t]
    1854             :     //        -------
    1855             :     // ^<!
    1856             : 
    1857             :     // where the underlined subexpression has to be a HTML token
    1858           0 :     OString sCmp;
    1859           0 :     bool bUCS2B = false;
    1860           0 :     if( bSwitchToUCS2 )
    1861             :     {
    1862           0 :         if( 0xfeU == (unsigned char)pHeader[0] &&
    1863           0 :             0xffU == (unsigned char)pHeader[1] )
    1864             :         {
    1865           0 :             eEnc = RTL_TEXTENCODING_UCS2;
    1866           0 :             bUCS2B = true;
    1867             :         }
    1868           0 :         else if( 0xffU == (unsigned char)pHeader[0] &&
    1869           0 :                  0xfeU == (unsigned char)pHeader[1] )
    1870             :         {
    1871           0 :             eEnc = RTL_TEXTENCODING_UCS2;
    1872             :         }
    1873             :     }
    1874           0 :     if
    1875             :        (
    1876           0 :         RTL_TEXTENCODING_UCS2 == eEnc &&
    1877             :         (
    1878           0 :          (0xfe == (unsigned char)pHeader[0] && 0xff == (unsigned char)pHeader[1]) ||
    1879           0 :          (0xff == (unsigned char)pHeader[0] && 0xfe == (unsigned char)pHeader[1])
    1880             :         )
    1881             :        )
    1882             :     {
    1883           0 :         if( 0xfe == (unsigned char)pHeader[0] )
    1884           0 :             bUCS2B = true;
    1885             : 
    1886           0 :         sal_Int32 nLen = 2;
    1887           0 :         while( pHeader[nLen] != 0 || pHeader[nLen+1] != 0 )
    1888           0 :              nLen += 2;
    1889             : 
    1890           0 :         OStringBuffer sTmp( (nLen - 2)/2 );
    1891           0 :         for( sal_Int32 nPos = 2; nPos < nLen; nPos += 2 )
    1892             :         {
    1893             :             sal_Unicode cUC;
    1894           0 :             if( bUCS2B )
    1895           0 :                 cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
    1896             :             else
    1897           0 :                 cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
    1898           0 :             if( 0U == cUC )
    1899           0 :                 break;
    1900             : 
    1901           0 :             sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
    1902             :         }
    1903           0 :         sCmp = sTmp.makeStringAndClear();
    1904             :     }
    1905             :     else
    1906             :     {
    1907           0 :         sCmp = pHeader;
    1908             :     }
    1909             : 
    1910           0 :     sCmp = sCmp.toAsciiLowerCase();
    1911             : 
    1912             :     // A HTML document must have a '<' in the first line
    1913           0 :     sal_Int32 nStart = sCmp.indexOf('<');
    1914           0 :     if (nStart == -1)
    1915           0 :         return false;
    1916           0 :     nStart++;
    1917             : 
    1918             :     // followed by arbitrary characters followed by a blank or '>'
    1919             :     sal_Char c;
    1920             :     sal_Int32 nPos;
    1921           0 :     for( nPos = nStart; nPos < sCmp.getLength(); ++nPos )
    1922             :     {
    1923           0 :         if( '>'==(c=sCmp[nPos]) || HTML_ISSPACE(c) )
    1924           0 :             break;
    1925             :     }
    1926             : 
    1927             :     // If the document ends after < it's no HTML
    1928           0 :     if( nPos==nStart )
    1929           0 :         return false;
    1930             : 
    1931             :     // the string following '<' has to be a known HTML token.
    1932             :     // <DIR> is not interpreted as HTML. Otherwise the output of the DOS command "DIR"
    1933             :     // could be interpreted as HTML.
    1934           0 :     OUString sTest(OStringToOUString(sCmp.copy(nStart, nPos-nStart), RTL_TEXTENCODING_ASCII_US));
    1935           0 :     int nTok = GetHTMLToken( sTest );
    1936           0 :     if( 0 != nTok && HTML_DIRLIST_ON != nTok )
    1937           0 :         return true;
    1938             : 
    1939             :     // "<!" at the very beginning of the file?
    1940           0 :     if( nStart == 1 && '!' == sCmp[1] )
    1941           0 :         return true;
    1942             : 
    1943             :     // <HTML> somewhere in the first 80 characters of the document
    1944           0 :     nStart = sCmp.indexOf(OOO_STRING_SVTOOLS_HTML_html);
    1945           0 :     if( nStart>0 && '<'==sCmp[nStart-1] &&
    1946           0 :         nStart+4 < sCmp.getLength() && '>'==sCmp[nStart+4] )
    1947           0 :         return true;
    1948             : 
    1949             :     // Else it's rather not a HTML document
    1950           0 :     return false;
    1951             : }
    1952             : 
    1953           1 : bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
    1954             : {
    1955           3 :     if( rURL.getLength() < 19 || 'i' != rURL[0] ||
    1956           1 :         rURL.compareTo( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != 0 )
    1957           1 :         return false;
    1958             : 
    1959           0 :     bool bFound = false;
    1960             : 
    1961           0 :     if( rURL.compareTo( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == 0 )
    1962             :     {
    1963           0 :         OUString aName( rURL.copy(16) );
    1964           0 :         switch( aName[0] )
    1965             :         {
    1966             :         case 'b':
    1967           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary;
    1968           0 :             break;
    1969             :         case 'i':
    1970           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ||
    1971           0 :                      aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index;
    1972           0 :             break;
    1973             :         case 'm':
    1974           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ||
    1975           0 :                      aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie;
    1976           0 :             break;
    1977             :         case 's':
    1978           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound;
    1979           0 :             break;
    1980             :         case 't':
    1981           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ||
    1982           0 :                      aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text;
    1983           0 :             break;
    1984             :         case 'u':
    1985           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown;
    1986           0 :             break;
    1987           0 :         }
    1988             :     }
    1989           0 :     else if( rURL.compareTo( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == 0 )
    1990             :     {
    1991           0 :         OUString aName( rURL.copy(14) );
    1992           0 :         switch( aName[0] )
    1993             :         {
    1994             :         case 'b':
    1995           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
    1996           0 :             break;
    1997             :         case 'd':
    1998           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
    1999           0 :             break;
    2000             :         case 'e':
    2001           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
    2002           0 :             break;
    2003             :         case 'i':
    2004           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
    2005           0 :             break;
    2006             :         case 'n':
    2007           0 :             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
    2008           0 :             break;
    2009           0 :         }
    2010             :     }
    2011           0 :     if( bFound )
    2012             :     {
    2013           0 :         OUString sTmp ( rURL );
    2014           0 :         rURL =  OOO_STRING_SVTOOLS_HTML_private_image;
    2015           0 :         rURL += sTmp;
    2016             :     }
    2017             : 
    2018           0 :     return bFound;
    2019             : }
    2020             : 
    // Action codes for the recognised <META> names. The values are used as
    // the second column of aHTMLMetaNameTable and as the selector of the
    // switch in HTMLParser::ParseMetaOptionsImpl. Numbering is explicit and
    // matches the implicit numbering of a plain enumerator list.
    enum eHtmlMetas
    {
        HTML_META_NONE           = 0,   // no (known) name seen
        HTML_META_AUTHOR         = 1,
        HTML_META_DESCRIPTION    = 2,
        HTML_META_KEYWORDS       = 3,
        HTML_META_REFRESH        = 4,
        HTML_META_CLASSIFICATION = 5,
        HTML_META_CREATED        = 6,
        HTML_META_CHANGEDBY      = 7,
        HTML_META_CHANGED        = 8,
        HTML_META_GENERATOR      = 9,
        HTML_META_SDFOOTNOTE     = 10,
        HTML_META_SDENDNOTE      = 11,
        HTML_META_CONTENT_TYPE   = 12
    };
    2036             : 
    2037             : // <META NAME=xxx>
    2038             : static HTMLOptionEnum const aHTMLMetaNameTable[] = // passed to HTMLOption::GetEnum() in ParseMetaOptionsImpl
    2039             : { // each row: META name string, matching eHtmlMetas action code
    2040             :     { OOO_STRING_SVTOOLS_HTML_META_author,        HTML_META_AUTHOR        },
    2041             :     { OOO_STRING_SVTOOLS_HTML_META_changed,       HTML_META_CHANGED       },
    2042             :     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HTML_META_CHANGEDBY     },
    2043             :     { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
    2044             :     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HTML_META_CONTENT_TYPE  },
    2045             :     { OOO_STRING_SVTOOLS_HTML_META_created,       HTML_META_CREATED       },
    2046             :     { OOO_STRING_SVTOOLS_HTML_META_description,   HTML_META_DESCRIPTION   },
    2047             :     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HTML_META_KEYWORDS      },
    2048             :     { OOO_STRING_SVTOOLS_HTML_META_generator,     HTML_META_GENERATOR     },
    2049             :     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HTML_META_REFRESH       },
    2050             :     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HTML_META_SDENDNOTE     },
    2051             :     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HTML_META_SDFOOTNOTE    },
    2052             :     { 0,                                          0                       } // NOTE(review): presumably the end-of-table marker expected by GetEnum() - confirm
    2053             : };
    2054             : 
    2055             : 
    2056           0 : void HTMLParser::AddMetaUserDefined( OUString const & )
    2057             : {
    2058           0 : }
    2059             : 
    2060          20 : bool HTMLParser::ParseMetaOptionsImpl(
    2061             :         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
    2062             :         SvKeyValueIterator *i_pHTTPHeader,
    2063             :         const HTMLOptions& aOptions,
    2064             :         rtl_TextEncoding& o_rEnc )
    2065             : {
    2066          40 :     OUString aName, aContent;
    2067          20 :     sal_uInt16 nAction = HTML_META_NONE;
    2068          20 :     bool bHTTPEquiv = false, bChanged = false;
    2069             : 
    2070          80 :     for ( size_t i = aOptions.size(); i; )
    2071             :     {
    2072          40 :         const HTMLOption& aOption = aOptions[--i];
    2073          40 :         switch ( aOption.GetToken() )
    2074             :         {
    2075             :             case HTML_O_NAME:
    2076          17 :                 aName = aOption.GetString();
    2077          17 :                 if ( HTML_META_NONE==nAction )
    2078             :                 {
    2079          17 :                     aOption.GetEnum( nAction, aHTMLMetaNameTable );
    2080             :                 }
    2081          17 :                 break;
    2082             :             case HTML_O_HTTPEQUIV:
    2083           3 :                 aName = aOption.GetString();
    2084           3 :                 aOption.GetEnum( nAction, aHTMLMetaNameTable );
    2085           3 :                 bHTTPEquiv = true;
    2086           3 :                 break;
    2087             :             case HTML_O_CONTENT:
    2088          20 :                 aContent = aOption.GetString();
    2089          20 :                 break;
    2090             :         }
    2091             :     }
    2092             : 
    2093          20 :     if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
    2094             :     {
    2095             :         // if it is not a Description, remove CRs and LFs from CONTENT
    2096          20 :         aContent = comphelper::string::remove(aContent, '\r');
    2097          20 :         aContent = comphelper::string::remove(aContent, '\n');
    2098             :     }
    2099             :     else
    2100             :     {
    2101             :         // convert line endings for Description
    2102           0 :         aContent = convertLineEnd(aContent, GetSystemLineEnd());
    2103             :     }
    2104             : 
    2105             : 
    2106          20 :     if ( bHTTPEquiv && i_pHTTPHeader )
    2107             :     {
    2108             :         // Netscape seems to just ignore a closing ", so we do too
    2109           3 :         if ( aContent.endsWith("\"") )
    2110             :         {
    2111           0 :             aContent = aContent.copy( 0, aContent.getLength() - 1 );
    2112             :         }
    2113           3 :         SvKeyValue aKeyValue( aName, aContent );
    2114           3 :         i_pHTTPHeader->Append( aKeyValue );
    2115             :     }
    2116             : 
    2117          20 :     switch ( nAction )
    2118             :     {
    2119             :         case HTML_META_AUTHOR:
    2120           1 :             if (i_xDocProps.is()) {
    2121           1 :                 i_xDocProps->setAuthor( aContent );
    2122           1 :                 bChanged = true;
    2123             :             }
    2124           1 :             break;
    2125             :         case HTML_META_DESCRIPTION:
    2126           0 :             if (i_xDocProps.is()) {
    2127           0 :                 i_xDocProps->setDescription( aContent );
    2128           0 :                 bChanged = true;
    2129             :             }
    2130           0 :             break;
    2131             :         case HTML_META_KEYWORDS:
    2132           0 :             if (i_xDocProps.is()) {
    2133           0 :                 i_xDocProps->setKeywords(
    2134           0 :                     ::comphelper::string::convertCommaSeparated(aContent));
    2135           0 :                 bChanged = true;
    2136             :             }
    2137           0 :             break;
    2138             :         case HTML_META_CLASSIFICATION:
    2139           0 :             if (i_xDocProps.is()) {
    2140           0 :                 i_xDocProps->setSubject( aContent );
    2141           0 :                 bChanged = true;
    2142             :             }
    2143           0 :             break;
    2144             : 
    2145             :         case HTML_META_CHANGEDBY:
    2146           1 :             if (i_xDocProps.is()) {
    2147           1 :                 i_xDocProps->setModifiedBy( aContent );
    2148             :             }
    2149           1 :             break;
    2150             : 
    2151             :         case HTML_META_CREATED:
    2152             :         case HTML_META_CHANGED:
    2153          12 :             if ( i_xDocProps.is() && !aContent.isEmpty() &&
    2154           6 :                  comphelper::string::getTokenCount(aContent, ';') == 2 )
    2155             :             {
    2156           6 :                 Date aDate( (sal_uLong)aContent.getToken(0, ';').toInt32() );
    2157           6 :                 Time aTime( (sal_uLong)aContent.getToken(1, ';').toInt32() );
    2158           6 :                 DateTime aDateTime( aDate, aTime );
    2159           6 :                 ::util::DateTime uDT(aDateTime.GetNanoSec(),
    2160          12 :                     aDateTime.GetSec(), aDateTime.GetMin(),
    2161          12 :                     aDateTime.GetHour(), aDateTime.GetDay(),
    2162          12 :                     aDateTime.GetMonth(), aDateTime.GetYear(),
    2163          48 :                     false);
    2164           6 :                 if ( HTML_META_CREATED==nAction )
    2165           3 :                     i_xDocProps->setCreationDate( uDT );
    2166             :                 else
    2167           3 :                     i_xDocProps->setModificationDate( uDT );
    2168           6 :                 bChanged = true;
    2169             :             }
    2170           6 :             break;
    2171             : 
    2172             :         case HTML_META_REFRESH:
    2173             :             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
    2174             :         "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
    2175           0 :             break;
    2176             : 
    2177             :         case HTML_META_CONTENT_TYPE:
    2178           3 :             if ( !aContent.isEmpty() )
    2179             :             {
    2180           3 :                 o_rEnc = GetEncodingByMIME( aContent );
    2181             :             }
    2182           3 :             break;
    2183             : 
    2184             :         case HTML_META_NONE:
    2185           6 :             if ( !bHTTPEquiv )
    2186             :             {
    2187           6 :                 if (i_xDocProps.is())
    2188             :                 {
    2189             :                     uno::Reference<beans::XPropertyContainer> xUDProps
    2190           6 :                         = i_xDocProps->getUserDefinedProperties();
    2191             :                     try {
    2192           6 :                         xUDProps->addProperty(aName,
    2193             :                             beans::PropertyAttribute::REMOVABLE,
    2194           6 :                             uno::makeAny(OUString(aContent)));
    2195           6 :                         AddMetaUserDefined(aName);
    2196           6 :                         bChanged = true;
    2197           0 :                     } catch (uno::Exception &) {
    2198             :                         // ignore
    2199           6 :                     }
    2200             :                 }
    2201             :             }
    2202           6 :             break;
    2203             :         default:
    2204           3 :             break;
    2205             :     }
    2206             : 
    2207          40 :     return bChanged;
    2208             : }
    2209             : 
    2210          20 : bool HTMLParser::ParseMetaOptions(
    2211             :         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
    2212             :         SvKeyValueIterator *i_pHeader )
    2213             : {
    2214          20 :     sal_uInt16 nContentOption = HTML_O_CONTENT;
    2215          20 :     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
    2216             : 
    2217             :     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
    2218          20 :                       GetOptions(&nContentOption),
    2219          20 :                       eEnc );
    2220             : 
    2221             :     // If the encoding is set by a META tag, it may only overwrite the
    2222             :     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
    2223             :     // encodings. Everything else cannot lead to reasonable results.
    2224          43 :     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
    2225          23 :         rtl_isOctetTextEncoding( eEnc ) &&
    2226           3 :         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
    2227             :     {
    2228           3 :         eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
    2229           3 :         SetSrcEncoding( eEnc );
    2230             :     }
    2231             : 
    2232          20 :     return bRet;
    2233             : }
    2234             : 
    2235           3 : rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
    2236             : {
    2237           3 :     OUString sType;
    2238           6 :     OUString sSubType;
    2239           6 :     INetContentTypeParameterList aParameters;
    2240           3 :     if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
    2241             :     {
    2242           3 :         const INetContentTypeParameter * pCharset = aParameters.find("charset");
    2243           3 :         if (pCharset != 0)
    2244             :         {
    2245           3 :             OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
    2246           3 :             return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
    2247             :         }
    2248             :     }
    2249           3 :     return RTL_TEXTENCODING_DONTKNOW;
    2250             : }
    2251             : 
    2252           7 : rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
    2253             : {
    2254           7 :     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
    2255           7 :     if( pHTTPHeader )
    2256             :     {
    2257           7 :         SvKeyValue aKV;
    2258          14 :         for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
    2259           7 :              bCont = pHTTPHeader->GetNext( aKV ) )
    2260             :         {
    2261           7 :             if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
    2262             :             {
    2263           7 :                 if( !aKV.GetValue().isEmpty() )
    2264             :                 {
    2265           0 :                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
    2266             :                 }
    2267             :             }
    2268           7 :         }
    2269             :     }
    2270           7 :     return eRet;
    2271             : }
    2272             : 
    2273           7 : bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
    2274             : {
    2275           7 :     bool bRet = false;
    2276           7 :     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
    2277           7 :     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
    2278             :     {
    2279           0 :         SetSrcEncoding( eEnc );
    2280           0 :         bRet = true;
    2281             :     }
    2282           7 :     return bRet;
    2283             : }
    2284             : 
    2285             : 
    2286             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10