forked from nostrademons/gumbo-libxml
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgumbo_libxml.c
137 lines (122 loc) · 4.49 KB
/
gumbo_libxml.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Copyright 2015 Jonathan Tang ([email protected]). All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gumbo_libxml.h"
#include <assert.h>
#include <string.h>
#include "gumbo.h"
#include "libxml/tree.h"
// Namespace URIs, indexed by GumboNamespaceEnum. The entry order (XHTML, SVG,
// MathML) must stay in step with that enum because convert_node uses the
// element's tag_namespace value directly as the index. Both the strings and
// the table itself are const: this is a read-only lookup table.
static const char* const kLegalXmlns[] = {
  "http://www.w3.org/1999/xhtml",
  "http://www.w3.org/2000/svg",
  "http://www.w3.org/1998/Math/MathML"
};
// Recursively converts a Gumbo node into a newly created libxml2 node. For
// element and template nodes the attributes and all children are converted
// as well.
//
//   doc             - document handed to xmlNewCDataBlock for CDATA nodes and
//                     forwarded unchanged to the recursive calls.
//   node            - Gumbo node to convert. Must not be the document node.
//   attach_original - when true, the _private field of every node produced is
//                     set to the GumboNode it was built from.
//
// Returns the new node, or NULL when libxml2 could not create it. In builds
// where assert() is compiled out (NDEBUG), an unsupported node type also
// yields NULL instead of an indeterminate pointer.
static xmlNodePtr convert_node(
    xmlDocPtr doc, GumboNode* node, bool attach_original) {
  // Start from NULL so every path through the switch leaves a defined value.
  xmlNodePtr result = NULL;
  switch (node->type) {
    case GUMBO_NODE_DOCUMENT:
      assert(false &&
          "convert_node cannot be used on the document node.  "
          "Doctype information is automatically added to the xmlDocPtr.");
      break;
    case GUMBO_NODE_ELEMENT:
    case GUMBO_NODE_TEMPLATE:
      {
        GumboElement* elem = &node->v.element;
        // Tag name.
        const char* element_name = gumbo_normalized_tagname(elem->tag);
        if (element_name[0] != '\0') {
          result = xmlNewNode(NULL, BAD_CAST element_name);
        } else {
          // No normalized name is available for this tag (presumably an
          // unknown tag), so take the name from the original source text.
          GumboStringPiece gsp = elem->original_tag;
          gumbo_tag_from_original_text(&gsp);
          // xmlCharStrndup takes an int length; the narrowing is explicit.
          xmlChar* unknown_tag_name =
              xmlCharStrndup(gsp.data, (int) gsp.length);
          result = xmlNewNode(NULL, unknown_tag_name);
          xmlFree(unknown_tag_name);
        }
        if (result == NULL) {
          // Node creation failed; there is nothing to decorate or populate.
          break;
        }
        // Namespace: only declared where it differs from the parent element.
        // Direct children of the document never get a declaration.
        if (node->parent != NULL &&
            node->parent->type != GUMBO_NODE_DOCUMENT &&
            elem->tag_namespace != node->parent->v.element.tag_namespace) {
          xmlNsPtr ns = xmlNewNs(
              result, BAD_CAST kLegalXmlns[elem->tag_namespace], NULL);
          xmlSetNs(result, ns);
        }
        // Attributes. The vector length is unsigned, so the index is too.
        for (unsigned int i = 0; i < elem->attributes.length; ++i) {
          GumboAttribute* attr = elem->attributes.data[i];
          xmlNewProp(result, BAD_CAST attr->name, BAD_CAST attr->value);
        }
        // Children, converted recursively in document order.
        for (unsigned int i = 0; i < elem->children.length; ++i) {
          xmlAddChild(result, convert_node(
              doc, elem->children.data[i], attach_original));
        }
      }
      break;
    case GUMBO_NODE_TEXT:
    case GUMBO_NODE_WHITESPACE:
      result = xmlNewText(BAD_CAST node->v.text.text);
      break;
    case GUMBO_NODE_COMMENT:
      result = xmlNewComment(BAD_CAST node->v.text.text);
      break;
    case GUMBO_NODE_CDATA:
      {
        // TODO: probably would be faster to use some calculation on
        // original_text.length rather than strlen, but I haven't verified that
        // that's correct in all cases.
        const char* node_text = node->v.text.text;
        // xmlNewCDataBlock takes an int length; the narrowing is explicit.
        result = xmlNewCDataBlock(
            doc, BAD_CAST node_text, (int) strlen(node_text));
      }
      break;
    default:
      assert(false && "unknown node type");
      break;
  }
  // Only attach the back-pointer when a node was actually produced.
  if (attach_original && result != NULL) {
    result->_private = node;
  }
  return result;
}
// Parses a buffer with Gumbo and builds a libxml2 document from the result.
//
//   options       - Gumbo options used both for parsing and for destroying
//                   the intermediate Gumbo output.
//   buffer        - input passed to gumbo_parse_with_options.
//   buffer_length - length of buffer, passed through unchanged.
//
// Returns the new xmlDocPtr, or NULL if either the Gumbo output or the libxml2
// document could not be created. The Gumbo output is destroyed before
// returning, so the result holds no pointers into the Gumbo tree.
xmlDocPtr gumbo_libxml_parse_with_options(
    GumboOptions* options, const char* buffer, size_t buffer_length) {
  GumboOutput* output = gumbo_parse_with_options(options, buffer, buffer_length);
  if (output == NULL) {
    return NULL;
  }
  xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
  if (doc == NULL) {
    gumbo_destroy_output(options, output);
    return NULL;
  }
  GumboDocument* document = &output->document->v.document;
  // NOTE(review): the internal subset is created unconditionally, i.e. also
  // for input that carried no doctype - confirm that this is intended.
  xmlCreateIntSubset(
      doc,
      BAD_CAST document->name,
      BAD_CAST document->public_identifier,
      BAD_CAST document->system_identifier);
  // The document node's children live in the `document` member of the node
  // union; read them through that member rather than through `element`.
  GumboVector* children = &document->children;
  for (unsigned int i = 0; i < children->length; i++) {
    GumboNode* child = (GumboNode*) children->data[i];
    switch (child->type) {
      case GUMBO_NODE_COMMENT:
        // Top-level comments are appended to the document in source order.
        xmlAddChild(
            (xmlNodePtr) doc, xmlNewDocComment(doc, BAD_CAST child->v.text.text));
        break;
      case GUMBO_NODE_ELEMENT:
        // attach_original is false: the Gumbo tree is destroyed below, before
        // the document is returned to the caller.
        xmlDocSetRootElement(doc, convert_node(doc, output->root, false));
        break;
      default:
        // Other top-level node types are not carried over.
        break;
    }
  }
  gumbo_destroy_output(options, output);
  return doc;
}
// Convenience wrapper around gumbo_libxml_parse_with_options: parses a
// NUL-terminated buffer using a copy of kGumboDefaultOptions. The buffer
// length is taken with strlen, so buffer must be a valid C string.
xmlDocPtr gumbo_libxml_parse(const char* buffer) {
  const size_t buffer_length = strlen(buffer);
  GumboOptions default_options = kGumboDefaultOptions;
  return gumbo_libxml_parse_with_options(
      &default_options, buffer, buffer_length);
}