Line data Source code
1 : /*************************************************************************
2 : *
3 : * Copyright (c) 2012 Kohei Yoshida
4 : *
5 : * Permission is hereby granted, free of charge, to any person
6 : * obtaining a copy of this software and associated documentation
7 : * files (the "Software"), to deal in the Software without
8 : * restriction, including without limitation the rights to use,
9 : * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 : * copies of the Software, and to permit persons to whom the
11 : * Software is furnished to do so, subject to the following
12 : * conditions:
13 : *
14 : * The above copyright notice and this permission notice shall be
15 : * included in all copies or substantial portions of the Software.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 : * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 : * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 : * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 : * OTHER DEALINGS IN THE SOFTWARE.
25 : *
26 : ************************************************************************/
27 :
28 : #include "orcus/dom_tree.hpp"
29 : #include "orcus/exception.hpp"
30 : #include "orcus/xml_namespace.hpp"
31 :
32 : #include "string_pool.hpp"
33 :
34 : #include <iostream>
35 : #include <sstream>
36 :
37 : using namespace std;
38 :
39 : namespace orcus {
40 :
41 : namespace {
42 :
43 : /**
44 : * Escape certain characters with backslash (\).
45 : */
46 0 : void escape(ostream& os, const pstring& val)
47 : {
48 0 : if (val.empty())
49 0 : return;
50 :
51 : const char* p = &val[0];
52 0 : const char* p_end = p + val.size();
53 0 : for (; p != p_end; ++p)
54 : {
55 0 : if (*p == '"')
56 0 : os << "\\\"";
57 0 : else if (*p == '\\')
58 0 : os << "\\\\";
59 : else
60 0 : os << *p;
61 : }
62 : }
63 :
64 : }
65 :
66 : struct dom_tree_impl
67 : {
68 : xmlns_context& m_ns_cxt;
69 : string_pool m_pool;
70 :
71 : dom_tree::attrs_type m_doc_attrs;
72 : dom_tree::attrs_type m_cur_attrs;
73 : dom_tree::element_stack_type m_elem_stack;
74 : dom_tree::element* m_root;
75 :
76 0 : dom_tree_impl(xmlns_context& cxt) : m_ns_cxt(cxt), m_root(NULL) {}
77 :
78 0 : ~dom_tree_impl()
79 0 : {
80 0 : delete m_root;
81 0 : }
82 : };
83 :
84 0 : dom_tree::entity_name::entity_name() : ns(XMLNS_UNKNOWN_ID) {}
85 :
86 0 : dom_tree::entity_name::entity_name(xmlns_id_t _ns, const pstring& _name) :
87 0 : ns(_ns), name(_name) {}
88 :
89 0 : void dom_tree::entity_name::print(std::ostream& os, const xmlns_context& cxt) const
90 : {
91 0 : if (ns)
92 : {
93 0 : size_t index = cxt.get_index(ns);
94 0 : if (index != xmlns_context::index_not_found)
95 0 : os << "ns" << index << ':';
96 : }
97 0 : os << name;
98 0 : }
99 :
100 0 : dom_tree::attr::attr(xmlns_id_t _ns, const pstring& _name, const pstring& _value) :
101 0 : name(_ns, _name), value(_value) {}
102 :
103 0 : void dom_tree::attr::print(std::ostream& os, const xmlns_context& cxt) const
104 : {
105 0 : name.print(os, cxt);
106 0 : os << "=\"";
107 0 : escape(os, value);
108 0 : os << '"';
109 0 : }
110 :
111 0 : dom_tree::node::~node() {}
112 :
113 0 : dom_tree::element::element(xmlns_id_t _ns, const pstring& _name) : node(node_element), name(_ns, _name) {}
114 :
115 0 : void dom_tree::element::print(ostream& os, const xmlns_context& cxt) const
116 : {
117 0 : name.print(os, cxt);
118 0 : }
119 :
120 0 : dom_tree::element::~element() {}
121 :
122 0 : dom_tree::content::content(const pstring& _value) : node(node_content), value(_value) {}
123 :
124 0 : void dom_tree::content::print(ostream& os, const xmlns_context& /*cxt*/) const
125 : {
126 0 : os << '"';
127 0 : escape(os, value);
128 0 : os << '"';
129 0 : }
130 :
131 0 : dom_tree::content::~content() {}
132 :
133 0 : dom_tree::dom_tree(xmlns_context& cxt) : mp_impl(new dom_tree_impl(cxt)) {}
134 :
135 0 : dom_tree::~dom_tree() { delete mp_impl; }
136 :
137 0 : void dom_tree::end_declaration()
138 : {
139 0 : mp_impl->m_doc_attrs.swap(mp_impl->m_cur_attrs);
140 0 : }
141 :
142 0 : void dom_tree::start_element(xmlns_id_t ns, const pstring& name)
143 : {
144 : // These strings must be persistent.
145 0 : pstring name_safe = mp_impl->m_pool.intern(name).first;
146 :
147 0 : element* p = NULL;
148 0 : if (!mp_impl->m_root)
149 : {
150 : // This must be the root element!
151 0 : mp_impl->m_root = new element(ns, name_safe);
152 0 : mp_impl->m_elem_stack.push_back(mp_impl->m_root);
153 0 : p = mp_impl->m_elem_stack.back();
154 : p->attrs.swap(mp_impl->m_cur_attrs);
155 0 : return;
156 : }
157 :
158 : // Append new element as a child element of the current element.
159 0 : p = mp_impl->m_elem_stack.back();
160 0 : p->child_nodes.push_back(new element(ns, name_safe));
161 0 : p = static_cast<element*>(&p->child_nodes.back());
162 0 : p->attrs.swap(mp_impl->m_cur_attrs);
163 0 : mp_impl->m_elem_stack.push_back(p);
164 : }
165 :
166 0 : void dom_tree::end_element(xmlns_id_t ns, const pstring& name)
167 : {
168 0 : const element* p = mp_impl->m_elem_stack.back();
169 0 : if (p->name.ns != ns || p->name.name != name)
170 0 : throw general_error("non-matching end element.");
171 :
172 0 : mp_impl->m_elem_stack.pop_back();
173 0 : }
174 :
175 0 : void dom_tree::set_characters(const pstring& val)
176 : {
177 0 : if (mp_impl->m_elem_stack.empty())
178 : // No root element has been encountered. Ignore this.
179 : return;
180 :
181 0 : pstring val2 = val.trim();
182 0 : if (val2.empty())
183 : return;
184 :
185 0 : element* p = mp_impl->m_elem_stack.back();
186 0 : val2 = mp_impl->m_pool.intern(val2).first; // Make sure the string is persistent.
187 0 : p->child_nodes.push_back(new content(val2));
188 : }
189 :
190 0 : void dom_tree::set_attribute(xmlns_id_t ns, const pstring& name, const pstring& val)
191 : {
192 : // These strings must be persistent.
193 0 : pstring name2 = mp_impl->m_pool.intern(name).first;
194 0 : pstring val2 = mp_impl->m_pool.intern(val).first;
195 :
196 0 : mp_impl->m_cur_attrs.push_back(attr(ns, name2, val2));
197 0 : }
198 :
199 : namespace {
200 :
201 0 : struct scope : boost::noncopyable
202 : {
203 : typedef std::vector<const dom_tree::node*> nodes_type;
204 : string name;
205 : nodes_type nodes;
206 : nodes_type::const_iterator current_pos;
207 :
208 0 : scope(const string& _name, dom_tree::node* _node) :
209 0 : name(_name)
210 : {
211 0 : nodes.push_back(_node);
212 0 : current_pos = nodes.begin();
213 0 : }
214 :
215 0 : scope(const string& _name) : name(_name) {}
216 : };
217 :
218 : typedef boost::ptr_vector<scope> scopes_type;
219 :
220 0 : void print_scope(ostream& os, const scopes_type& scopes)
221 : {
222 0 : if (scopes.empty())
223 0 : throw general_error("scope stack shouldn't be empty while dumping tree.");
224 :
225 : // Skip the first scope which is root.
226 : scopes_type::const_iterator it = scopes.begin(), it_end = scopes.end();
227 0 : for (++it; it != it_end; ++it)
228 0 : os << "/" << it->name;
229 0 : }
230 :
231 : struct sort_by_name : std::binary_function<dom_tree::attr, dom_tree::attr, bool>
232 : {
233 0 : bool operator() (const dom_tree::attr& left, const dom_tree::attr& right) const
234 : {
235 0 : return left.name.name < right.name.name;
236 : }
237 : };
238 :
239 : }
240 :
241 0 : void dom_tree::dump_compact(ostream& os) const
242 : {
243 0 : if (!mp_impl->m_root)
244 0 : return;
245 :
246 : // Dump namespaces first.
247 0 : mp_impl->m_ns_cxt.dump(os);
248 :
249 : scopes_type scopes;
250 :
251 0 : scopes.push_back(new scope(string(), mp_impl->m_root));
252 0 : while (!scopes.empty())
253 : {
254 : bool new_scope = false;
255 :
256 : // Iterate through all elements in the current scope.
257 0 : scope& cur_scope = scopes.back();
258 0 : for (; cur_scope.current_pos != cur_scope.nodes.end(); ++cur_scope.current_pos)
259 : {
260 0 : const node* this_node = *cur_scope.current_pos;
261 0 : assert(this_node);
262 0 : print_scope(os, scopes);
263 0 : if (this_node->type == node_content)
264 : {
265 : // This is a text content.
266 0 : this_node->print(os, mp_impl->m_ns_cxt);
267 : os << endl;
268 0 : continue;
269 : }
270 :
271 0 : assert(this_node->type == node_element);
272 : const element* elem = static_cast<const element*>(this_node);
273 0 : os << "/";
274 0 : elem->print(os, mp_impl->m_ns_cxt);
275 : os << endl;
276 :
277 : {
278 : // Dump attributes.
279 0 : attrs_type attrs = elem->attrs;
280 0 : sort(attrs.begin(), attrs.end(), sort_by_name());
281 : attrs_type::const_iterator it = attrs.begin(), it_end = attrs.end();
282 0 : for (; it != it_end; ++it)
283 : {
284 0 : print_scope(os, scopes);
285 0 : os << "/";
286 0 : elem->print(os, mp_impl->m_ns_cxt);
287 0 : os << "@";
288 0 : it->print(os, mp_impl->m_ns_cxt);
289 : os << endl;
290 0 : }
291 : }
292 :
293 0 : if (elem->child_nodes.empty())
294 0 : continue;
295 :
296 : // This element has child nodes. Push a new scope and populate it
297 : // with all child elements, but skip content nodes.
298 : dom_tree::nodes_type::const_iterator it = elem->child_nodes.begin(), it_end = elem->child_nodes.end();
299 : scope::nodes_type nodes;
300 0 : for (; it != it_end; ++it)
301 0 : nodes.push_back(&(*it));
302 :
303 0 : assert(!nodes.empty());
304 :
305 : // Push a new scope, and restart the loop with the new scope.
306 : ++cur_scope.current_pos;
307 0 : ostringstream elem_name;
308 0 : elem->print(elem_name, mp_impl->m_ns_cxt);
309 0 : scopes.push_back(new scope(elem_name.str()));
310 0 : scope& child_scope = scopes.back();
311 : child_scope.nodes.swap(nodes);
312 0 : child_scope.current_pos = child_scope.nodes.begin();
313 :
314 : new_scope = true;
315 : break;
316 0 : }
317 :
318 0 : if (new_scope)
319 0 : continue;
320 :
321 0 : scopes.pop_back();
322 0 : }
323 : }
324 :
325 0 : }
|