397 lines
8.4 KiB
Go
397 lines
8.4 KiB
Go
package xmlpath
|
|
|
|
import (
|
|
"golang.org/x/net/html"
|
|
"encoding/xml"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// Node is an item in an xml tree that was compiled to
|
|
// be processed via xml paths. A node may represent:
|
|
//
|
|
// - An element in the xml document (<body>)
|
|
// - An attribute of an element in the xml document (href="...")
|
|
// - A comment in the xml document (<!--...-->)
|
|
// - A processing instruction in the xml document (<?...?>)
|
|
// - Some text within the xml document
|
|
//
|
|
type Node struct {
|
|
kind nodeKind
|
|
name xml.Name
|
|
attr string
|
|
text []byte
|
|
|
|
nodes []Node
|
|
pos int
|
|
end int
|
|
|
|
up *Node
|
|
down []*Node
|
|
}
|
|
|
|
type nodeKind int
|
|
|
|
const (
|
|
anyNode nodeKind = iota
|
|
startNode
|
|
endNode
|
|
attrNode
|
|
textNode
|
|
commentNode
|
|
procInstNode
|
|
)
|
|
|
|
// String returns the string value of node.
|
|
//
|
|
// The string value of a node is:
|
|
//
|
|
// - For element nodes, the concatenation of all text nodes within the element.
|
|
// - For text nodes, the text itself.
|
|
// - For attribute nodes, the attribute value.
|
|
// - For comment nodes, the text within the comment delimiters.
|
|
// - For processing instruction nodes, the content of the instruction.
|
|
//
|
|
func (node *Node) String() string {
|
|
if node.kind == attrNode {
|
|
return node.attr
|
|
}
|
|
return string(node.Bytes())
|
|
}
|
|
|
|
// Bytes returns the string value of node as a byte slice.
|
|
// See Node.String for a description of what the string value of a node is.
|
|
func (node *Node) Bytes() []byte {
|
|
if node.kind == attrNode {
|
|
return []byte(node.attr)
|
|
}
|
|
if node.kind != startNode {
|
|
return node.text
|
|
}
|
|
size := 0
|
|
for i := node.pos; i < node.end; i++ {
|
|
if node.nodes[i].kind == textNode {
|
|
size += len(node.nodes[i].text)
|
|
}
|
|
}
|
|
text := make([]byte, 0, size)
|
|
for i := node.pos; i < node.end; i++ {
|
|
if node.nodes[i].kind == textNode {
|
|
text = append(text, node.nodes[i].text...)
|
|
}
|
|
}
|
|
return text
|
|
}
|
|
|
|
// equals returns whether the string value of node is equal to s,
|
|
// without allocating memory.
|
|
func (node *Node) equals(s string) bool {
|
|
if node.kind == attrNode {
|
|
return s == node.attr
|
|
}
|
|
if node.kind != startNode {
|
|
if len(s) != len(node.text) {
|
|
return false
|
|
}
|
|
for i := range s {
|
|
if s[i] != node.text[i] {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
si := 0
|
|
for i := node.pos; i < node.end; i++ {
|
|
if node.nodes[i].kind == textNode {
|
|
for _, c := range node.nodes[i].text {
|
|
if si > len(s) {
|
|
return false
|
|
}
|
|
if s[si] != c {
|
|
return false
|
|
}
|
|
si++
|
|
}
|
|
}
|
|
}
|
|
return si == len(s)
|
|
}
|
|
|
|
// contains returns whether the string value of node contains s,
|
|
// without allocating memory.
|
|
func (node *Node) contains(s string) (ok bool) {
|
|
if len(s) == 0 {
|
|
return true
|
|
}
|
|
if node.kind == attrNode {
|
|
return strings.Contains(node.attr, s)
|
|
}
|
|
s0 := s[0]
|
|
for i := node.pos; i < node.end; i++ {
|
|
if node.nodes[i].kind == textNode {
|
|
text := node.nodes[i].text
|
|
NextTry:
|
|
for ci, c := range text {
|
|
if c != s0 {
|
|
continue
|
|
}
|
|
si := 1
|
|
for ci++; ci < len(text) && si < len(s); ci++ {
|
|
if s[si] != text[ci] {
|
|
continue NextTry
|
|
}
|
|
si++
|
|
}
|
|
if si == len(s) {
|
|
return true
|
|
}
|
|
for j := i + 1; j < node.end; j++ {
|
|
if node.nodes[j].kind == textNode {
|
|
for _, c := range node.nodes[j].text {
|
|
if s[si] != c {
|
|
continue NextTry
|
|
}
|
|
si++
|
|
if si == len(s) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Parse reads an xml document from r, parses it, and returns its root node.
|
|
func Parse(r io.Reader) (*Node, error) {
|
|
return ParseDecoder(xml.NewDecoder(r))
|
|
}
|
|
|
|
// ParseDecoder parses the xml document being decoded by d and returns
|
|
// its root node.
|
|
func ParseDecoder(d *xml.Decoder) (*Node, error) {
|
|
var nodes []Node
|
|
var text []byte
|
|
|
|
// The root node.
|
|
nodes = append(nodes, Node{kind: startNode})
|
|
|
|
for {
|
|
t, err := d.Token()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
switch t := t.(type) {
|
|
case xml.EndElement:
|
|
nodes = append(nodes, Node{
|
|
kind: endNode,
|
|
})
|
|
case xml.StartElement:
|
|
nodes = append(nodes, Node{
|
|
kind: startNode,
|
|
name: t.Name,
|
|
})
|
|
for _, attr := range t.Attr {
|
|
nodes = append(nodes, Node{
|
|
kind: attrNode,
|
|
name: attr.Name,
|
|
attr: attr.Value,
|
|
})
|
|
}
|
|
case xml.CharData:
|
|
texti := len(text)
|
|
text = append(text, t...)
|
|
nodes = append(nodes, Node{
|
|
kind: textNode,
|
|
text: text[texti : texti+len(t)],
|
|
})
|
|
case xml.Comment:
|
|
texti := len(text)
|
|
text = append(text, t...)
|
|
nodes = append(nodes, Node{
|
|
kind: commentNode,
|
|
text: text[texti : texti+len(t)],
|
|
})
|
|
case xml.ProcInst:
|
|
texti := len(text)
|
|
text = append(text, t.Inst...)
|
|
nodes = append(nodes, Node{
|
|
kind: procInstNode,
|
|
name: xml.Name{Local: t.Target},
|
|
text: text[texti : texti+len(t.Inst)],
|
|
})
|
|
}
|
|
}
|
|
|
|
// Close the root node.
|
|
nodes = append(nodes, Node{kind: endNode})
|
|
|
|
stack := make([]*Node, 0, len(nodes))
|
|
downs := make([]*Node, len(nodes))
|
|
downCount := 0
|
|
|
|
for pos := range nodes {
|
|
|
|
switch nodes[pos].kind {
|
|
|
|
case startNode, attrNode, textNode, commentNode, procInstNode:
|
|
node := &nodes[pos]
|
|
node.nodes = nodes
|
|
node.pos = pos
|
|
if len(stack) > 0 {
|
|
node.up = stack[len(stack)-1]
|
|
}
|
|
if node.kind == startNode {
|
|
stack = append(stack, node)
|
|
} else {
|
|
node.end = pos + 1
|
|
}
|
|
|
|
case endNode:
|
|
node := stack[len(stack)-1]
|
|
node.end = pos
|
|
stack = stack[:len(stack)-1]
|
|
|
|
// Compute downs. Doing that here is what enables the
|
|
// use of a slice of a contiguous pre-allocated block.
|
|
node.down = downs[downCount:downCount]
|
|
for i := node.pos + 1; i < node.end; i++ {
|
|
if nodes[i].up == node {
|
|
switch nodes[i].kind {
|
|
case startNode, textNode, commentNode, procInstNode:
|
|
node.down = append(node.down, &nodes[i])
|
|
downCount++
|
|
}
|
|
}
|
|
}
|
|
if len(stack) == 0 {
|
|
return node, nil
|
|
}
|
|
}
|
|
}
|
|
return nil, io.EOF
|
|
}
|
|
|
|
// ParseHTML reads an HTML document from r, parses it using a proper HTML
|
|
// parser, and returns its root node.
|
|
//
|
|
// The document will be processed as a properly structured HTML document,
|
|
// emulating the behavior of a browser when processing it. This includes
|
|
// putting the content inside proper <html> and <body> tags, if the
|
|
// provided text misses them.
|
|
func ParseHTML(r io.Reader) (*Node, error) {
|
|
ns, err := html.ParseFragment(r, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var nodes []Node
|
|
var text []byte
|
|
|
|
n := ns[0]
|
|
|
|
// The root node.
|
|
nodes = append(nodes, Node{kind: startNode})
|
|
|
|
for n != nil {
|
|
switch n.Type {
|
|
case html.DocumentNode:
|
|
case html.ElementNode:
|
|
nodes = append(nodes, Node{
|
|
kind: startNode,
|
|
name: xml.Name{Local: n.Data, Space: n.Namespace},
|
|
})
|
|
for _, attr := range n.Attr {
|
|
nodes = append(nodes, Node{
|
|
kind: attrNode,
|
|
name: xml.Name{Local: attr.Key, Space: attr.Namespace},
|
|
attr: attr.Val,
|
|
})
|
|
}
|
|
case html.TextNode:
|
|
texti := len(text)
|
|
text = append(text, n.Data...)
|
|
nodes = append(nodes, Node{
|
|
kind: textNode,
|
|
text: text[texti : texti+len(n.Data)],
|
|
})
|
|
case html.CommentNode:
|
|
texti := len(text)
|
|
text = append(text, n.Data...)
|
|
nodes = append(nodes, Node{
|
|
kind: commentNode,
|
|
text: text[texti : texti+len(n.Data)],
|
|
})
|
|
}
|
|
|
|
if n.FirstChild != nil {
|
|
n = n.FirstChild
|
|
continue
|
|
}
|
|
|
|
for n != nil {
|
|
if n.Type == html.ElementNode {
|
|
nodes = append(nodes, Node{kind: endNode})
|
|
}
|
|
if n.NextSibling != nil {
|
|
n = n.NextSibling
|
|
break
|
|
}
|
|
n = n.Parent
|
|
}
|
|
}
|
|
|
|
// Close the root node.
|
|
nodes = append(nodes, Node{kind: endNode})
|
|
|
|
stack := make([]*Node, 0, len(nodes))
|
|
downs := make([]*Node, len(nodes))
|
|
downCount := 0
|
|
|
|
for pos := range nodes {
|
|
|
|
switch nodes[pos].kind {
|
|
|
|
case startNode, attrNode, textNode, commentNode, procInstNode:
|
|
node := &nodes[pos]
|
|
node.nodes = nodes
|
|
node.pos = pos
|
|
if len(stack) > 0 {
|
|
node.up = stack[len(stack)-1]
|
|
}
|
|
if node.kind == startNode {
|
|
stack = append(stack, node)
|
|
} else {
|
|
node.end = pos + 1
|
|
}
|
|
|
|
case endNode:
|
|
node := stack[len(stack)-1]
|
|
node.end = pos
|
|
stack = stack[:len(stack)-1]
|
|
|
|
// Compute downs. Doing that here is what enables the
|
|
// use of a slice of a contiguous pre-allocated block.
|
|
node.down = downs[downCount:downCount]
|
|
for i := node.pos + 1; i < node.end; i++ {
|
|
if nodes[i].up == node {
|
|
switch nodes[i].kind {
|
|
case startNode, textNode, commentNode, procInstNode:
|
|
node.down = append(node.down, &nodes[i])
|
|
downCount++
|
|
}
|
|
}
|
|
}
|
|
if len(stack) == 0 {
|
|
return node, nil
|
|
}
|
|
}
|
|
}
|
|
return nil, io.EOF
|
|
}
|