Line data Source code
1 : /*************************************************************************
2 : *
3 : * Copyright (c) 2012 Kohei Yoshida
4 : *
5 : * Permission is hereby granted, free of charge, to any person
6 : * obtaining a copy of this software and associated documentation
7 : * files (the "Software"), to deal in the Software without
8 : * restriction, including without limitation the rights to use,
9 : * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 : * copies of the Software, and to permit persons to whom the
11 : * Software is furnished to do so, subject to the following
12 : * conditions:
13 : *
14 : * The above copyright notice and this permission notice shall be
15 : * included in all copies or substantial portions of the Software.
16 : *
17 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 : * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 : * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 : * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 : * OTHER DEALINGS IN THE SOFTWARE.
25 : *
26 : ************************************************************************/
27 :
28 : #include "orcus/xml_structure_tree.hpp"
29 : #include "orcus/sax_ns_parser.hpp"
30 : #include "orcus/xml_namespace.hpp"
31 : #include "orcus/global.hpp"
32 : #include "orcus/exception.hpp"
33 :
34 : #include "string_pool.hpp"
35 :
36 : #include <iostream>
37 : #include <sstream>
38 : #include <vector>
39 : #include <cstdio>
40 :
41 : #include <boost/noncopyable.hpp>
42 : #include <boost/unordered_map.hpp>
43 : #include <boost/unordered_set.hpp>
44 : #include <boost/ptr_container/ptr_vector.hpp>
45 :
46 : using namespace std;
47 :
48 : namespace orcus {
49 :
50 : namespace {
51 :
52 : struct elem_prop;
53 : typedef boost::unordered_map<xml_structure_tree::entity_name, elem_prop*, xml_structure_tree::entity_name::hash> element_store_type;
54 : typedef boost::unordered_set<xml_structure_tree::entity_name, xml_structure_tree::entity_name::hash> attribute_names_type;
55 :
56 : /** Element properties. */
57 : struct elem_prop : boost::noncopyable
58 : {
59 : element_store_type child_elements;
60 : attribute_names_type attributes;
61 :
62 : /** Store child element names in order of appearance. */
63 : xml_structure_tree::entity_names_type child_element_names;
64 :
65 : /** Store attribute names in order of appearance. */
66 : xml_structure_tree::entity_names_type attribute_names;
67 :
68 : size_t appearance_order;
69 :
70 : size_t in_scope_count;
71 :
72 : /**
73 : * When true, this element is the base element of repeated structures.
74 : * This flag is set only with the base element; none of the child
75 : * elements below the base element have this flag set.
76 : */
77 : bool repeat:1;
78 :
79 0 : elem_prop() : appearance_order(0), in_scope_count(1), repeat(false) {}
80 0 : elem_prop(size_t _appearance_order) : appearance_order(_appearance_order), in_scope_count(1), repeat(false) {}
81 0 : ~elem_prop()
82 0 : {
83 0 : for_each(child_elements.begin(), child_elements.end(), map_object_deleter<element_store_type>());
84 0 : };
85 : };
86 :
87 0 : struct root
88 : {
89 : xml_structure_tree::entity_name name;
90 : elem_prop prop;
91 : };
92 :
93 0 : struct element_ref
94 : {
95 : xml_structure_tree::entity_name name;
96 : elem_prop* prop;
97 :
98 0 : element_ref() : prop(NULL) {}
99 : element_ref(xml_structure_tree::entity_name _name, elem_prop* _prop) :
100 0 : name(_name), prop(_prop) {}
101 : };
102 :
103 : typedef std::vector<element_ref> elements_type;
104 :
105 0 : class xml_sax_handler
106 : {
107 : string_pool& m_pool;
108 : unique_ptr<root> mp_root;
109 : elements_type m_stack;
110 : xml_structure_tree::entity_names_type m_attrs;
111 :
112 : private:
113 0 : void merge_attributes(elem_prop& prop)
114 : {
115 : xml_structure_tree::entity_names_type::const_iterator it = m_attrs.begin(), it_end = m_attrs.end();
116 0 : for (; it != it_end; ++it)
117 : {
118 0 : if (prop.attributes.find(*it) == prop.attributes.end())
119 : {
120 : // New attribute. Insert it.
121 0 : prop.attributes.insert(*it);
122 0 : prop.attribute_names.push_back(*it);
123 : }
124 : }
125 :
126 : m_attrs.clear();
127 0 : }
128 :
129 : public:
130 : xml_sax_handler(string_pool& pool) :
131 0 : m_pool(pool), mp_root(NULL) {}
132 :
133 : void declaration()
134 : {
135 : m_attrs.clear();
136 : }
137 :
138 0 : void start_element(const sax_ns_parser_element& elem)
139 : {
140 0 : if (!mp_root)
141 : {
142 : // This is a root element.
143 0 : mp_root.reset(new root);
144 0 : mp_root->name.ns = elem.ns;
145 0 : mp_root->name.name = m_pool.intern(elem.name).first;
146 0 : element_ref ref(mp_root->name, &mp_root->prop);
147 0 : merge_attributes(mp_root->prop);
148 0 : m_stack.push_back(ref);
149 : return;
150 : }
151 :
152 : // See if the current element already has a child element of the same name.
153 0 : assert(!m_stack.empty());
154 : element_ref& current = m_stack.back();
155 0 : xml_structure_tree::entity_name key(elem.ns, elem.name);
156 0 : element_store_type::const_iterator it = current.prop->child_elements.find(key);
157 0 : if (it != current.prop->child_elements.end())
158 : {
159 : // Recurring element. Set its repeat flag only when it occurs
160 : // multiple times in the same scope.
161 0 : ++it->second->in_scope_count;
162 0 : if (it->second->in_scope_count > 1)
163 0 : it->second->repeat = true;
164 :
165 0 : element_ref ref(it->first, it->second);
166 0 : merge_attributes(*it->second);
167 0 : m_stack.push_back(ref);
168 : return;
169 : }
170 :
171 : // New element.
172 0 : size_t order = current.prop->child_elements.size();
173 0 : key.name = m_pool.intern(key.name).first;
174 : pair<element_store_type::const_iterator,bool> r =
175 : current.prop->child_elements.insert(
176 0 : element_store_type::value_type(key, new elem_prop(order)));
177 :
178 0 : if (!r.second)
179 0 : throw general_error("Insertion failed");
180 :
181 0 : current.prop->child_element_names.push_back(key);
182 :
183 : it = r.first;
184 0 : element_ref ref(it->first, it->second);
185 0 : merge_attributes(*it->second);
186 0 : m_stack.push_back(ref);
187 : }
188 :
189 0 : void end_element(const sax_ns_parser_element& elem)
190 : {
191 0 : if (m_stack.empty())
192 0 : throw general_error("Element stack is empty.");
193 :
194 : const element_ref& current = m_stack.back();
195 :
196 : // Reset the in-scope count of all child elements to 0 before ending
197 : // the current scope.
198 0 : element_store_type::iterator it = current.prop->child_elements.begin(), it_end = current.prop->child_elements.end();
199 0 : for (; it != it_end; ++it)
200 0 : it->second->in_scope_count = 0;
201 :
202 : m_stack.pop_back();
203 0 : }
204 :
205 0 : void characters(const pstring&) {}
206 :
207 0 : void attribute(const pstring&, const pstring&)
208 : {
209 : // Attribute for declaration. We don't handle this.
210 0 : }
211 :
212 0 : void attribute(const sax_ns_parser_attribute& attr)
213 : {
214 0 : m_attrs.push_back(xml_structure_tree::entity_name(attr.ns, attr.name));
215 0 : }
216 :
217 : root* release_root_element()
218 : {
219 : return mp_root.release();
220 : }
221 : };
222 :
223 : struct sort_by_appearance : std::binary_function<element_ref, element_ref, bool>
224 : {
225 0 : bool operator() (const element_ref& left, const element_ref& right) const
226 : {
227 0 : return left.prop->appearance_order < right.prop->appearance_order;
228 : }
229 : };
230 :
231 0 : struct scope : boost::noncopyable
232 : {
233 : xml_structure_tree::entity_name name;
234 : elements_type elements;
235 : elements_type::const_iterator current_pos;
236 : bool repeat:1;
237 :
238 0 : scope(const xml_structure_tree::entity_name& _name, bool _repeat, const element_ref& _elem) :
239 0 : name(_name), repeat(_repeat)
240 : {
241 0 : elements.push_back(_elem);
242 0 : current_pos = elements.begin();
243 0 : }
244 :
245 : scope(const xml_structure_tree::entity_name& _name, bool _repeat) :
246 0 : name(_name), repeat(_repeat) {}
247 : };
248 :
249 : typedef boost::ptr_vector<scope> scopes_type;
250 :
251 0 : void print_scope(ostream& os, const scopes_type& scopes, const xmlns_context& cxt)
252 : {
253 0 : if (scopes.empty())
254 0 : throw general_error("scope stack shouldn't be empty while dumping tree.");
255 :
256 : // Skip the first scope which is root.
257 : scopes_type::const_iterator it = scopes.begin(), it_end = scopes.end();
258 0 : for (++it; it != it_end; ++it)
259 : {
260 0 : os << "/";
261 0 : size_t num_id = cxt.get_index(it->name.ns);
262 0 : if (num_id != xmlns_context::index_not_found)
263 0 : os << "ns" << num_id << ":";
264 0 : os << it->name.name;
265 0 : if (it->repeat)
266 0 : os << "[*]";
267 : }
268 0 : }
269 :
270 : }
271 :
272 : struct xml_structure_tree_impl : boost::noncopyable
273 : {
274 : string_pool m_pool;
275 : xmlns_context& m_xmlns_cxt;
276 : root* mp_root;
277 :
278 0 : xml_structure_tree_impl(xmlns_context& xmlns_cxt) :
279 0 : m_xmlns_cxt(xmlns_cxt), mp_root(NULL) {}
280 :
281 0 : ~xml_structure_tree_impl()
282 0 : {
283 0 : delete mp_root;
284 0 : }
285 : };
286 :
287 0 : struct xml_structure_tree::walker_impl : boost::noncopyable
288 : {
289 : const xml_structure_tree_impl& m_parent_impl;
290 : root* mp_root; /// Root element of the authoritative tree.
291 : element_ref m_cur_elem;
292 : std::vector<element_ref> m_scopes;
293 :
294 0 : walker_impl(const xml_structure_tree_impl& parent_impl) :
295 0 : m_parent_impl(parent_impl), mp_root(parent_impl.mp_root) {}
296 :
297 0 : walker_impl(const walker_impl& r) :
298 0 : m_parent_impl(r.m_parent_impl), mp_root(r.mp_root), m_cur_elem(r.m_cur_elem), m_scopes(r.m_scopes) {}
299 : };
300 :
301 0 : xml_structure_tree::entity_name::entity_name() :
302 0 : ns(XMLNS_UNKNOWN_ID) {}
303 :
304 0 : xml_structure_tree::entity_name::entity_name(xmlns_id_t _ns, const pstring& _name) :
305 0 : ns(_ns), name(_name) {}
306 :
307 0 : bool xml_structure_tree::entity_name::operator< (const entity_name& r) const
308 : {
309 0 : if (ns != r.ns)
310 0 : return ns < r.ns;
311 :
312 0 : return name < r.name;
313 : }
314 :
315 0 : bool xml_structure_tree::entity_name::operator== (const entity_name& r) const
316 : {
317 0 : return ns == r.ns && name == r.name;
318 : }
319 :
320 0 : size_t xml_structure_tree::entity_name::hash::operator() (const entity_name& val) const
321 : {
322 : static pstring::hash hasher;
323 0 : size_t n = reinterpret_cast<size_t>(val.ns);
324 0 : n += hasher(val.name);
325 0 : return n;
326 : }
327 :
328 0 : xml_structure_tree::element::element() :
329 0 : repeat(false) {}
330 :
331 0 : xml_structure_tree::element::element(const entity_name& _name, bool _repeat) :
332 0 : name(_name), repeat(_repeat) {}
333 :
334 8 : size_t xml_structure_tree::walker::index_not_found = xmlns_context::index_not_found;
335 :
336 0 : xml_structure_tree::walker::walker(const xml_structure_tree_impl& parent_impl) :
337 0 : mp_impl(new walker_impl(parent_impl))
338 : {
339 0 : }
340 :
341 0 : xml_structure_tree::walker::walker(const walker& r) :
342 0 : mp_impl(new walker_impl(*r.mp_impl))
343 : {
344 0 : }
345 :
346 0 : xml_structure_tree::walker::~walker()
347 : {
348 0 : delete mp_impl;
349 0 : }
350 :
351 0 : xml_structure_tree::walker& xml_structure_tree::walker::operator= (const walker& r)
352 : {
353 0 : mp_impl->mp_root = r.mp_impl->mp_root;
354 0 : return *this;
355 : }
356 :
357 0 : xml_structure_tree::element xml_structure_tree::walker::root()
358 : {
359 0 : if (!mp_impl->mp_root)
360 0 : throw general_error("Tree is empty.");
361 :
362 : mp_impl->m_scopes.clear();
363 :
364 : // Set the current element to root.
365 0 : element_ref ref(mp_impl->mp_root->name, &mp_impl->mp_root->prop);
366 : mp_impl->m_cur_elem = ref;
367 0 : mp_impl->m_scopes.push_back(ref);
368 0 : return xml_structure_tree::element(ref.name, false);
369 : }
370 :
371 0 : xml_structure_tree::element xml_structure_tree::walker::descend(const entity_name& name)
372 : {
373 0 : if (mp_impl->m_scopes.empty())
374 0 : throw general_error("Scope is empty.");
375 :
376 0 : assert(mp_impl->m_scopes.back().prop);
377 0 : const element_store_type& child_elems = mp_impl->m_scopes.back().prop->child_elements;
378 0 : element_store_type::const_iterator it = child_elems.find(name);
379 :
380 0 : if (it == child_elems.end())
381 0 : throw general_error("Specified child element does not exist.");
382 :
383 : // Push this new child element onto the stack.
384 0 : element_ref ref(name, it->second);
385 0 : mp_impl->m_scopes.push_back(ref);
386 :
387 0 : return element(name, it->second->repeat);
388 : }
389 :
390 0 : xml_structure_tree::element xml_structure_tree::walker::ascend()
391 : {
392 0 : if (mp_impl->m_scopes.empty())
393 0 : throw general_error("Scope is empty.");
394 :
395 0 : if (mp_impl->m_scopes.size() == 1)
396 0 : throw general_error("You can't ascend from the root element.");
397 :
398 : mp_impl->m_scopes.pop_back();
399 0 : const element_ref& ref = mp_impl->m_scopes.back();
400 0 : return element(ref.name, ref.prop->repeat);
401 : }
402 :
403 0 : void xml_structure_tree::walker::get_children(entity_names_type& names)
404 : {
405 0 : if (mp_impl->m_scopes.empty())
406 0 : throw general_error("Scope is empty.");
407 :
408 0 : assert(mp_impl->m_scopes.back().prop);
409 : const elem_prop& prop = *mp_impl->m_scopes.back().prop;
410 : names.assign(prop.child_element_names.begin(), prop.child_element_names.end());
411 0 : }
412 :
413 0 : void xml_structure_tree::walker::get_attributes(entity_names_type& names)
414 : {
415 0 : if (mp_impl->m_scopes.empty())
416 0 : throw general_error("Scope is empty.");
417 :
418 0 : assert(mp_impl->m_scopes.back().prop);
419 : const elem_prop& prop = *mp_impl->m_scopes.back().prop;
420 : names.assign(prop.attribute_names.begin(), prop.attribute_names.end());
421 0 : }
422 :
423 0 : size_t xml_structure_tree::walker::get_xmlns_index(xmlns_id_t ns) const
424 : {
425 0 : return mp_impl->m_parent_impl.m_xmlns_cxt.get_index(ns);
426 : }
427 :
428 0 : string xml_structure_tree::walker::get_xmlns_short_name(xmlns_id_t ns) const
429 : {
430 0 : return mp_impl->m_parent_impl.m_xmlns_cxt.get_short_name(ns);
431 : }
432 :
433 0 : xml_structure_tree::xml_structure_tree(xmlns_context& xmlns_cxt) :
434 0 : mp_impl(new xml_structure_tree_impl(xmlns_cxt)) {}
435 :
436 0 : xml_structure_tree::~xml_structure_tree()
437 : {
438 0 : delete mp_impl;
439 0 : }
440 :
441 0 : void xml_structure_tree::parse(const char* p, size_t n)
442 : {
443 0 : xml_sax_handler hdl(mp_impl->m_pool);
444 0 : sax_ns_parser<xml_sax_handler> parser(p, n, mp_impl->m_xmlns_cxt, hdl);
445 0 : parser.parse();
446 0 : mp_impl->mp_root = hdl.release_root_element();
447 0 : }
448 :
449 0 : void xml_structure_tree::dump_compact(ostream& os) const
450 : {
451 0 : if (!mp_impl->mp_root)
452 0 : return;
453 :
454 : scopes_type scopes;
455 0 : const xmlns_context& cxt = mp_impl->m_xmlns_cxt;
456 :
457 : // Dump all namespaces first.
458 0 : cxt.dump(os);
459 :
460 0 : element_ref ref(mp_impl->mp_root->name, &mp_impl->mp_root->prop);
461 0 : scopes.push_back(new scope(entity_name(), false, ref));
462 0 : while (!scopes.empty())
463 : {
464 : bool new_scope = false;
465 :
466 : // Iterate through all elements in the current scope.
467 0 : scope& cur_scope = scopes.back();
468 0 : for (; cur_scope.current_pos != cur_scope.elements.end(); ++cur_scope.current_pos)
469 : {
470 : const element_ref& this_elem = *cur_scope.current_pos;
471 0 : ostringstream ss;
472 0 : print_scope(ss, scopes, cxt);
473 :
474 0 : ss << "/";
475 0 : size_t num_id = cxt.get_index(this_elem.name.ns);
476 0 : if (num_id != xmlns_context::index_not_found)
477 0 : ss << "ns" << num_id << ":";
478 0 : ss << this_elem.name.name;
479 0 : if (this_elem.prop->repeat)
480 0 : ss << "[*]";
481 :
482 0 : string elem_name = ss.str();
483 0 : os << elem_name << endl;
484 :
485 : // Print all attributes that belong to this element.
486 : {
487 0 : const entity_names_type& attrs = this_elem.prop->attribute_names;
488 : entity_names_type::const_iterator it = attrs.begin(), it_end = attrs.end();
489 0 : for (; it != it_end; ++it)
490 0 : os << elem_name << '@' << it->name << endl;
491 : }
492 :
493 0 : const element_store_type& child_elements = this_elem.prop->child_elements;
494 0 : if (child_elements.empty())
495 0 : continue;
496 :
497 : // This element has child elements. Push a new scope and populate
498 : // it with all child elements.
499 : element_store_type::const_iterator it = child_elements.begin(), it_end = child_elements.end();
500 : elements_type elems;
501 0 : for (; it != it_end; ++it)
502 : {
503 : ref.name = it->first;
504 0 : ref.prop = it->second;
505 0 : elems.push_back(ref);
506 : }
507 :
508 : // Sort the elements by order of appearance.
509 0 : std::sort(elems.begin(), elems.end(), sort_by_appearance());
510 :
511 0 : assert(!elems.empty());
512 :
513 : // Push a new scope, and restart the loop with the new scope.
514 : ++cur_scope.current_pos;
515 0 : scopes.push_back(new scope(this_elem.name, this_elem.prop->repeat));
516 0 : scope& child_scope = scopes.back();
517 : child_scope.elements.swap(elems);
518 0 : child_scope.current_pos = child_scope.elements.begin();
519 :
520 : new_scope = true;
521 : break;
522 0 : }
523 :
524 0 : if (new_scope)
525 0 : continue;
526 :
527 0 : scopes.pop_back();
528 0 : }
529 : }
530 :
531 0 : xml_structure_tree::walker xml_structure_tree::get_walker() const
532 : {
533 0 : return walker(*mp_impl);
534 : }
535 :
536 24 : }
|