303 lines
7.0 KiB
Go
303 lines
7.0 KiB
Go
package xmlquery
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html/charset"
|
|
)
|
|
|
|
// A NodeType is the type of a Node.
|
|
type NodeType uint
|
|
|
|
const (
|
|
// DocumentNode is a document object that, as the root of the document tree,
|
|
// provides access to the entire XML document.
|
|
DocumentNode NodeType = iota
|
|
// DeclarationNode is the document type declaration, indicated by the following
|
|
// tag (for example, <!DOCTYPE...> ).
|
|
DeclarationNode
|
|
// ElementNode is an element (for example, <item> ).
|
|
ElementNode
|
|
// TextNode is the text content of a node.
|
|
TextNode
|
|
// CommentNode a comment (for example, <!-- my comment --> ).
|
|
CommentNode
|
|
// AttributeNode is an attribute of element.
|
|
AttributeNode
|
|
)
|
|
|
|
// A Node consists of a NodeType and some Data (tag name for
|
|
// element nodes, content for text) and are part of a tree of Nodes.
|
|
type Node struct {
|
|
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
|
|
|
|
Type NodeType
|
|
Data string
|
|
Prefix string
|
|
NamespaceURI string
|
|
Attr []xml.Attr
|
|
|
|
level int // node level in the tree
|
|
}
|
|
|
|
// InnerText returns the text between the start and end tags of the object.
|
|
func (n *Node) InnerText() string {
|
|
var output func(*bytes.Buffer, *Node)
|
|
output = func(buf *bytes.Buffer, n *Node) {
|
|
switch n.Type {
|
|
case TextNode:
|
|
buf.WriteString(n.Data)
|
|
return
|
|
case CommentNode:
|
|
return
|
|
}
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
output(buf, child)
|
|
}
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
output(&buf, n)
|
|
return buf.String()
|
|
}
|
|
|
|
func outputXML(buf *bytes.Buffer, n *Node) {
|
|
if n.Type == TextNode {
|
|
xml.EscapeText(buf, []byte(strings.TrimSpace(n.Data)))
|
|
return
|
|
}
|
|
if n.Type == CommentNode {
|
|
buf.WriteString("<!--")
|
|
buf.WriteString(n.Data)
|
|
buf.WriteString("-->")
|
|
return
|
|
}
|
|
if n.Type == DeclarationNode {
|
|
buf.WriteString("<?" + n.Data)
|
|
} else {
|
|
if n.Prefix == "" {
|
|
buf.WriteString("<" + n.Data)
|
|
} else {
|
|
buf.WriteString("<" + n.Prefix + ":" + n.Data)
|
|
}
|
|
}
|
|
|
|
for _, attr := range n.Attr {
|
|
if attr.Name.Space != "" {
|
|
buf.WriteString(fmt.Sprintf(` %s:%s="%s"`, attr.Name.Space, attr.Name.Local, attr.Value))
|
|
} else {
|
|
buf.WriteString(fmt.Sprintf(` %s="%s"`, attr.Name.Local, attr.Value))
|
|
}
|
|
}
|
|
if n.Type == DeclarationNode {
|
|
buf.WriteString("?>")
|
|
} else {
|
|
buf.WriteString(">")
|
|
}
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
outputXML(buf, child)
|
|
}
|
|
if n.Type != DeclarationNode {
|
|
if n.Prefix == "" {
|
|
buf.WriteString(fmt.Sprintf("</%s>", n.Data))
|
|
} else {
|
|
buf.WriteString(fmt.Sprintf("</%s:%s>", n.Prefix, n.Data))
|
|
}
|
|
}
|
|
}
|
|
|
|
// OutputXML returns the text that including tags name.
|
|
func (n *Node) OutputXML(self bool) string {
|
|
var buf bytes.Buffer
|
|
if self {
|
|
outputXML(&buf, n)
|
|
} else {
|
|
for n := n.FirstChild; n != nil; n = n.NextSibling {
|
|
outputXML(&buf, n)
|
|
}
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
func addAttr(n *Node, key, val string) {
|
|
var attr xml.Attr
|
|
if i := strings.Index(key, ":"); i > 0 {
|
|
attr = xml.Attr{
|
|
Name: xml.Name{Space: key[:i], Local: key[i+1:]},
|
|
Value: val,
|
|
}
|
|
} else {
|
|
attr = xml.Attr{
|
|
Name: xml.Name{Local: key},
|
|
Value: val,
|
|
}
|
|
}
|
|
|
|
n.Attr = append(n.Attr, attr)
|
|
}
|
|
|
|
func addChild(parent, n *Node) {
|
|
n.Parent = parent
|
|
if parent.FirstChild == nil {
|
|
parent.FirstChild = n
|
|
} else {
|
|
parent.LastChild.NextSibling = n
|
|
n.PrevSibling = parent.LastChild
|
|
}
|
|
|
|
parent.LastChild = n
|
|
}
|
|
|
|
func addSibling(sibling, n *Node) {
|
|
for t := sibling.NextSibling; t != nil; t = t.NextSibling {
|
|
sibling = t
|
|
}
|
|
n.Parent = sibling.Parent
|
|
sibling.NextSibling = n
|
|
n.PrevSibling = sibling
|
|
if sibling.Parent != nil {
|
|
sibling.Parent.LastChild = n
|
|
}
|
|
}
|
|
|
|
// LoadURL loads the XML document from the specified URL.
|
|
func LoadURL(url string) (*Node, error) {
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
return parse(resp.Body)
|
|
}
|
|
|
|
func parse(r io.Reader) (*Node, error) {
|
|
var (
|
|
decoder = xml.NewDecoder(r)
|
|
doc = &Node{Type: DocumentNode}
|
|
space2prefix = make(map[string]string)
|
|
level = 0
|
|
)
|
|
// http://www.w3.org/XML/1998/namespace is bound by definition to the prefix xml.
|
|
space2prefix["http://www.w3.org/XML/1998/namespace"] = "xml"
|
|
decoder.CharsetReader = charset.NewReaderLabel
|
|
prev := doc
|
|
for {
|
|
tok, err := decoder.Token()
|
|
switch {
|
|
case err == io.EOF:
|
|
goto quit
|
|
case err != nil:
|
|
return nil, err
|
|
}
|
|
|
|
switch tok := tok.(type) {
|
|
case xml.StartElement:
|
|
if level == 0 {
|
|
// mising XML declaration
|
|
node := &Node{Type: DeclarationNode, Data: "xml", level: 1}
|
|
addChild(prev, node)
|
|
level = 1
|
|
prev = node
|
|
}
|
|
// https://www.w3.org/TR/xml-names/#scoping-defaulting
|
|
for _, att := range tok.Attr {
|
|
if att.Name.Local == "xmlns" {
|
|
space2prefix[att.Value] = ""
|
|
} else if att.Name.Space == "xmlns" {
|
|
space2prefix[att.Value] = att.Name.Local
|
|
}
|
|
}
|
|
|
|
if tok.Name.Space != "" {
|
|
if _, found := space2prefix[tok.Name.Space]; !found {
|
|
return nil, errors.New("xmlquery: invalid XML document, namespace is missing")
|
|
}
|
|
}
|
|
|
|
for i := 0; i < len(tok.Attr); i++ {
|
|
att := &tok.Attr[i]
|
|
if prefix, ok := space2prefix[att.Name.Space]; ok {
|
|
att.Name.Space = prefix
|
|
}
|
|
}
|
|
|
|
node := &Node{
|
|
Type: ElementNode,
|
|
Data: tok.Name.Local,
|
|
Prefix: space2prefix[tok.Name.Space],
|
|
NamespaceURI: tok.Name.Space,
|
|
Attr: tok.Attr,
|
|
level: level,
|
|
}
|
|
//fmt.Println(fmt.Sprintf("start > %s : %d", node.Data, level))
|
|
if level == prev.level {
|
|
addSibling(prev, node)
|
|
} else if level > prev.level {
|
|
addChild(prev, node)
|
|
} else if level < prev.level {
|
|
for i := prev.level - level; i > 1; i-- {
|
|
prev = prev.Parent
|
|
}
|
|
addSibling(prev.Parent, node)
|
|
}
|
|
prev = node
|
|
level++
|
|
case xml.EndElement:
|
|
level--
|
|
case xml.CharData:
|
|
node := &Node{Type: TextNode, Data: string(tok), level: level}
|
|
if level == prev.level {
|
|
addSibling(prev, node)
|
|
} else if level > prev.level {
|
|
addChild(prev, node)
|
|
}
|
|
case xml.Comment:
|
|
node := &Node{Type: CommentNode, Data: string(tok), level: level}
|
|
if level == prev.level {
|
|
addSibling(prev, node)
|
|
} else if level > prev.level {
|
|
addChild(prev, node)
|
|
} else if level < prev.level {
|
|
for i := prev.level - level; i > 1; i-- {
|
|
prev = prev.Parent
|
|
}
|
|
addSibling(prev.Parent, node)
|
|
}
|
|
case xml.ProcInst: // Processing Instruction
|
|
if prev.Type != DeclarationNode {
|
|
level++
|
|
}
|
|
node := &Node{Type: DeclarationNode, Data: tok.Target, level: level}
|
|
pairs := strings.Split(string(tok.Inst), " ")
|
|
for _, pair := range pairs {
|
|
pair = strings.TrimSpace(pair)
|
|
if i := strings.Index(pair, "="); i > 0 {
|
|
addAttr(node, pair[:i], strings.Trim(pair[i+1:], `"`))
|
|
}
|
|
}
|
|
if level == prev.level {
|
|
addSibling(prev, node)
|
|
} else if level > prev.level {
|
|
addChild(prev, node)
|
|
}
|
|
prev = node
|
|
case xml.Directive:
|
|
}
|
|
|
|
}
|
|
quit:
|
|
return doc, nil
|
|
}
|
|
|
|
// Parse returns the parse tree for the XML from the given Reader.
|
|
func Parse(r io.Reader) (*Node, error) {
|
|
return parse(r)
|
|
}
|