Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #ifndef INCLUDED_SVTOOLS_PARHTML_HXX
21 : #define INCLUDED_SVTOOLS_PARHTML_HXX
22 :
23 : #include <svtools/svtdllapi.h>
24 : #include <svtools/svparser.hxx>
25 :
26 : #include <boost/ptr_container/ptr_vector.hpp>
27 :
28 : namespace com { namespace sun { namespace star {
29 : namespace document {
30 : class XDocumentProperties;
31 : }
32 : } } }
33 :
34 : class Color;
35 : class SvNumberFormatter;
36 : class SvKeyValueIterator;
37 :
38 : #define HTMLFONTSZ1_DFLT 7 // presumably the default sizes for HTML font sizes 1..7 (unit not visible here) - TODO confirm
39 : #define HTMLFONTSZ2_DFLT 10
40 : #define HTMLFONTSZ3_DFLT 12
41 : #define HTMLFONTSZ4_DFLT 14
42 : #define HTMLFONTSZ5_DFLT 18
43 : #define HTMLFONTSZ6_DFLT 24
44 : #define HTMLFONTSZ7_DFLT 36
45 :
46 : enum HTMLTableFrame { HTML_TF_VOID, HTML_TF_ABOVE, HTML_TF_BELOW, // values of <TABLE FRAME=...>; result type of HTMLOption::GetTableFrame()
47 : HTML_TF_HSIDES, HTML_TF_LHS, HTML_TF_RHS, HTML_TF_VSIDES, HTML_TF_BOX };
48 :
49 : enum HTMLTableRules { HTML_TR_NONE, HTML_TR_GROUPS, HTML_TR_ROWS, // values of <TABLE RULES=...>; result type of HTMLOption::GetTableRules()
50 : HTML_TR_COLS, HTML_TR_ALL };
51 :
52 : enum HTMLInputType // values of <INPUT TYPE=...>; result type of HTMLOption::GetInputType()
53 : {
54 : HTML_IT_TEXT = 0x01, // numbering starts at 1, no enumerator has the value 0
55 : HTML_IT_PASSWORD = 0x02,
56 : HTML_IT_CHECKBOX = 0x03,
57 : HTML_IT_RADIO = 0x04,
58 : HTML_IT_RANGE = 0x05,
59 : HTML_IT_SCRIBBLE = 0x06,
60 : HTML_IT_FILE = 0x07,
61 : HTML_IT_HIDDEN = 0x08,
62 : HTML_IT_SUBMIT = 0x09,
63 : HTML_IT_IMAGE = 0x0a,
64 : HTML_IT_RESET = 0x0b,
65 : HTML_IT_BUTTON = 0x0c
66 : };
67 :
68 : enum HTMLScriptLanguage // script language; passed by reference (rLang) to HTMLParser::ParseScriptOptions()
69 : {
70 : HTML_SL_STARBASIC,
71 : HTML_SL_JAVASCRIPT,
72 : HTML_SL_UNKNOWN
73 : };
74 :
75 : struct HTMLOptionEnum // one entry of a name -> enum value table; arrays of these are passed to HTMLOption::GetEnum()
76 : {
77 : const sal_Char *pName; // string form of an HTML option value
78 : sal_uInt16 nValue; // enum value corresponding to pName
79 : };
80 :
81 : /** Representation of an HTML option (= attribute in a start tag).
82 : * The value of an option is always stored as a string.
83 : * The converting accessors (GetNumber, ...) may only be called if the
84 : * value of the option actually is numerical, ...
85 : */
86 1455 : class SVT_DLLPUBLIC HTMLOption
87 : {
88 : OUString aValue; // value of the option (always stored as a string)
89 : OUString aToken; // name of the option as a string
90 : sal_uInt16 nToken; // name of the option as a token
91 :
92 : public:
93 :
94 : HTMLOption( sal_uInt16 nTyp, const OUString& rToken, const OUString& rValue ); // presumably initializes nToken, aToken and aValue in that order - TODO confirm
95 :
96 : // name of the option...
97 1520 : sal_uInt16 GetToken() const { return nToken; } // ... as token
98 7 : const OUString& GetTokenString() const { return aToken; } // ... as string
99 :
100 : // value of the option ...
101 1424 : const OUString& GetString() const { return aValue; } // ... as string
102 :
103 : sal_uInt32 GetNumber() const; // ... as unsigned number
104 : sal_Int32 GetSNumber() const; // ... as signed number
105 : void GetNumbers( std::vector<sal_uInt32> &rNumbers, // ... as a list of numbers, stored in rNumbers
106 : bool bSpaceDelim=false ) const;
107 : void GetColor( Color& ) const; // ... as color, stored in the passed Color
108 :
109 : // ... as enum; pOptEnums is an HTMLOptionEnum array
110 : sal_uInt16 GetEnum( const HTMLOptionEnum *pOptEnums,
111 : sal_uInt16 nDflt=0 ) const;
112 : bool GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const; // variant writing to rEnum; the result presumably tells whether a match was found - TODO confirm
113 :
114 : // ... and as a few special enums
115 : HTMLInputType GetInputType() const; // <INPUT TYPE=...>
116 : HTMLTableFrame GetTableFrame() const; // <TABLE FRAME=...>
117 : HTMLTableRules GetTableRules() const; // <TABLE RULES=...>
118 : //SvxAdjust GetAdjust() const; // <P,TH,TD ALIGN=>
119 : };
120 :
121 : typedef ::boost::ptr_vector<HTMLOption> HTMLOptions; // list of HTMLOption objects, as returned by HTMLParser::GetOptions()
122 :
123 : class SVT_DLLPUBLIC HTMLParser : public SvParser // HTML parser on top of SvParser; overrides _GetNextToken(), CallParser() and Continue()
124 : {
125 : private:
126 : mutable HTMLOptions maOptions; // options of the start tag
127 :
128 : bool bNewDoc : 1; // read new Doc?
129 : bool bIsInHeader : 1; // scan header section
130 : bool bIsInBody : 1; // scan body section
131 : bool bReadListing : 1; // read listings
132 : bool bReadXMP : 1; // read XMP
133 : bool bReadPRE : 1; // read preformatted text
134 : bool bReadTextArea : 1; // read TEXTAREA
135 : bool bReadScript : 1; // read <SCRIPT>
136 : bool bReadStyle : 1; // read <STYLE>
137 : bool bEndTokenFound : 1; // found </SCRIPT> or </STYLE>
138 :
139 : bool bPre_IgnoreNewPara : 1; // flags for reading of PRE paragraphs
140 : bool bReadNextChar : 1; // true: read NextChar again (JavaScript!)
141 : bool bReadComment : 1; // presumably true while a comment is being read - TODO confirm against the implementation
142 :
143 : sal_uInt32 nPre_LinePos; // Pos in the line in the PRE-Tag
144 :
145 : int mnPendingOffToken; ///< OFF token pending for a <XX.../> ON/OFF ON token
146 :
147 : OUString aEndToken; // end token for reading raw data, set by ReadRawData()
148 :
149 : protected:
150 : OUString sSaveToken; // the read tag as string
151 :
152 : int ScanText( const sal_Unicode cBreak = 0U );
153 :
154 : int _GetNextRawToken();
155 :
156 : // scan next token
157 : virtual int _GetNextToken() SAL_OVERRIDE;
158 :
159 : virtual ~HTMLParser();
160 :
161 16 : void FinishHeader( bool bBody ) { bIsInHeader = false; bIsInBody = bBody; } // leave the header section; bBody tells whether the body section is scanned now
162 :
163 : public:
164 : HTMLParser( SvStream& rIn, bool bReadNewDoc = true );
165 :
166 : virtual SvParserState CallParser() SAL_OVERRIDE;
167 :
168 272 : bool IsNewDoc() const { return bNewDoc; }
169 43 : bool IsInHeader() const { return bIsInHeader; }
170 : bool IsInBody() const { return bIsInBody; }
171 292 : bool IsReadListing() const { return bReadListing; }
172 292 : bool IsReadXMP() const { return bReadXMP; }
173 1373 : bool IsReadPRE() const { return bReadPRE; }
174 106 : bool IsReadScript() const { return bReadScript; }
175 106 : bool IsReadStyle() const { return bReadStyle; }
176 :
177 : void SetReadNextChar() { bReadNextChar = true; }
178 :
179 : // start PRE-/LISTING or XMP mode or filter tags respectively
180 : inline void StartPRE( bool bRestart=false );
181 0 : void FinishPRE() { bReadPRE = false; }
182 : int FilterPRE( int nToken );
183 :
184 : inline void StartListing( bool bRestart=false );
185 0 : void FinishListing() { bReadListing = false; }
186 : int FilterListing( int nToken );
187 :
188 : inline void StartXMP( bool bRestart=false );
189 0 : void FinishXMP() { bReadXMP = false; }
190 : int FilterXMP( int nToken );
191 :
192 0 : void FinishTextArea() { bReadTextArea = false; }
193 :
194 : // finish PRE-/LISTING- and XMP mode
195 584 : void FinishPREListingXMP() { bReadPRE = bReadListing = bReadXMP = false; }
196 :
197 : // Filter the current token according to the current mode
198 : // (PRE, XMP, ...) and set the flags. Is called by Continue before
199 : // NextToken is called. If you implement own loops or call
200 : // NextToken yourself, you should call this method beforehand.
201 : int FilterToken( int nToken );
202 :
203 : // end scanning of a script (should only be called right after
204 : // reading of a <SCRIPT>)
205 : void EndScanScript() { bReadScript = false; }
206 :
207 0 : void ReadRawData( const OUString &rEndToken ) { aEndToken = rEndToken; } // remember rEndToken as the end token
208 :
209 : // Token without \-sequences
210 : void UnescapeToken();
211 :
212 : // Determine the options. pNoConvertToken is the optional token
213 : // of an option, for which the CR/LFs are not deleted from the value
214 : // of the option.
215 : const HTMLOptions& GetOptions( sal_uInt16 *pNoConvertToken=0 );
216 :
217 : // for asynchronous reading from the SvStream
218 : virtual void Continue( int nToken ) SAL_OVERRIDE;
219 :
220 :
221 : protected:
222 :
223 : static rtl_TextEncoding GetEncodingByMIME( const OUString& rMime );
224 :
225 : /// template method: called when ParseMetaOptions adds a user-defined meta
226 : virtual void AddMetaUserDefined( OUString const & i_rMetaName );
227 :
228 : private:
229 : /// parse meta options into XDocumentProperties and encoding
230 : bool ParseMetaOptionsImpl( const ::com::sun::star::uno::Reference<
231 : ::com::sun::star::document::XDocumentProperties>&,
232 : SvKeyValueIterator*,
233 : const HTMLOptions&,
234 : rtl_TextEncoding& rEnc );
235 :
236 : public:
237 : /// overriding method must call this implementation!
238 : virtual bool ParseMetaOptions( const ::com::sun::star::uno::Reference<
239 : ::com::sun::star::document::XDocumentProperties>&,
240 : SvKeyValueIterator* );
241 :
242 : bool ParseScriptOptions( OUString& rLangString, const OUString&, HTMLScriptLanguage& rLang,
243 : OUString& rSrc, OUString& rLibrary, OUString& rModule );
244 :
245 : // remove a comment around the content of <SCRIPT> or <STYLE>
246 : // In case of 'bFull', the whole line behind a "<!--" might
247 : // be deleted (for JavaScript)
248 : static void RemoveSGMLComment( OUString &rString, bool bFull );
249 :
250 : static bool InternalImgToPrivateURL( OUString& rURL );
251 : static rtl_TextEncoding GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader );
252 : bool SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader );
253 : };
254 :
255 0 : inline void HTMLParser::StartPRE( bool bRestart ) // switch on PRE mode and reset the PRE reading state
256 : {
257 0 : bReadPRE = true;
258 0 : bPre_IgnoreNewPara = !bRestart; // flag is set only when this is not a restart
259 0 : nPre_LinePos = 0UL; // position in the line starts at 0 again
260 0 : }
261 :
262 0 : inline void HTMLParser::StartListing( bool bRestart ) // switch on LISTING mode and reset the PRE reading state
263 : {
264 0 : bReadListing = true;
265 0 : bPre_IgnoreNewPara = !bRestart; // flag is set only when this is not a restart
266 0 : nPre_LinePos = 0UL; // position in the line starts at 0 again
267 0 : }
268 :
269 0 : inline void HTMLParser::StartXMP( bool bRestart ) // switch on XMP mode and reset the PRE reading state
270 : {
271 0 : bReadXMP = true;
272 0 : bPre_IgnoreNewPara = !bRestart; // flag is set only when this is not a restart
273 0 : nPre_LinePos = 0UL; // position in the line starts at 0 again
274 0 : }
275 :
276 : #endif
277 :
278 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|