package xmlpath import ( "golang.org/x/net/html" "encoding/xml" "io" "strings" ) // Node is an item in an xml tree that was compiled to // be processed via xml paths. A node may represent: // // - An element in the xml document () // - An attribute of an element in the xml document (href="...") // - A comment in the xml document () // - A processing instruction in the xml document () // - Some text within the xml document // type Node struct { kind nodeKind name xml.Name attr string text []byte nodes []Node pos int end int up *Node down []*Node } type nodeKind int const ( anyNode nodeKind = iota startNode endNode attrNode textNode commentNode procInstNode ) // String returns the string value of node. // // The string value of a node is: // // - For element nodes, the concatenation of all text nodes within the element. // - For text nodes, the text itself. // - For attribute nodes, the attribute value. // - For comment nodes, the text within the comment delimiters. // - For processing instruction nodes, the content of the instruction. // func (node *Node) String() string { if node.kind == attrNode { return node.attr } return string(node.Bytes()) } // Bytes returns the string value of node as a byte slice. // See Node.String for a description of what the string value of a node is. func (node *Node) Bytes() []byte { if node.kind == attrNode { return []byte(node.attr) } if node.kind != startNode { return node.text } size := 0 for i := node.pos; i < node.end; i++ { if node.nodes[i].kind == textNode { size += len(node.nodes[i].text) } } text := make([]byte, 0, size) for i := node.pos; i < node.end; i++ { if node.nodes[i].kind == textNode { text = append(text, node.nodes[i].text...) } } return text } // equals returns whether the string value of node is equal to s, // without allocating memory. func (node *Node) equals(s string) bool { if node.kind == attrNode { return s == node.attr } if node.kind != startNode { if len(s) != len(node.text) { return false } for i := range s { if s[i] != node.text[i] { return false } } return true } si := 0 for i := node.pos; i < node.end; i++ { if node.nodes[i].kind == textNode { for _, c := range node.nodes[i].text { if si > len(s) { return false } if s[si] != c { return false } si++ } } } return si == len(s) } // contains returns whether the string value of node contains s, // without allocating memory. func (node *Node) contains(s string) (ok bool) { if len(s) == 0 { return true } if node.kind == attrNode { return strings.Contains(node.attr, s) } s0 := s[0] for i := node.pos; i < node.end; i++ { if node.nodes[i].kind == textNode { text := node.nodes[i].text NextTry: for ci, c := range text { if c != s0 { continue } si := 1 for ci++; ci < len(text) && si < len(s); ci++ { if s[si] != text[ci] { continue NextTry } si++ } if si == len(s) { return true } for j := i + 1; j < node.end; j++ { if node.nodes[j].kind == textNode { for _, c := range node.nodes[j].text { if s[si] != c { continue NextTry } si++ if si == len(s) { return true } } } } } } } return false } // Parse reads an xml document from r, parses it, and returns its root node. func Parse(r io.Reader) (*Node, error) { return ParseDecoder(xml.NewDecoder(r)) } // ParseDecoder parses the xml document being decoded by d and returns // its root node. func ParseDecoder(d *xml.Decoder) (*Node, error) { var nodes []Node var text []byte // The root node. nodes = append(nodes, Node{kind: startNode}) for { t, err := d.Token() if err == io.EOF { break } if err != nil { return nil, err } switch t := t.(type) { case xml.EndElement: nodes = append(nodes, Node{ kind: endNode, }) case xml.StartElement: nodes = append(nodes, Node{ kind: startNode, name: t.Name, }) for _, attr := range t.Attr { nodes = append(nodes, Node{ kind: attrNode, name: attr.Name, attr: attr.Value, }) } case xml.CharData: texti := len(text) text = append(text, t...) nodes = append(nodes, Node{ kind: textNode, text: text[texti : texti+len(t)], }) case xml.Comment: texti := len(text) text = append(text, t...) nodes = append(nodes, Node{ kind: commentNode, text: text[texti : texti+len(t)], }) case xml.ProcInst: texti := len(text) text = append(text, t.Inst...) nodes = append(nodes, Node{ kind: procInstNode, name: xml.Name{Local: t.Target}, text: text[texti : texti+len(t.Inst)], }) } } // Close the root node. nodes = append(nodes, Node{kind: endNode}) stack := make([]*Node, 0, len(nodes)) downs := make([]*Node, len(nodes)) downCount := 0 for pos := range nodes { switch nodes[pos].kind { case startNode, attrNode, textNode, commentNode, procInstNode: node := &nodes[pos] node.nodes = nodes node.pos = pos if len(stack) > 0 { node.up = stack[len(stack)-1] } if node.kind == startNode { stack = append(stack, node) } else { node.end = pos + 1 } case endNode: node := stack[len(stack)-1] node.end = pos stack = stack[:len(stack)-1] // Compute downs. Doing that here is what enables the // use of a slice of a contiguous pre-allocated block. node.down = downs[downCount:downCount] for i := node.pos + 1; i < node.end; i++ { if nodes[i].up == node { switch nodes[i].kind { case startNode, textNode, commentNode, procInstNode: node.down = append(node.down, &nodes[i]) downCount++ } } } if len(stack) == 0 { return node, nil } } } return nil, io.EOF } // ParseHTML reads an HTML document from r, parses it using a proper HTML // parser, and returns its root node. // // The document will be processed as a properly structured HTML document, // emulating the behavior of a browser when processing it. This includes // putting the content inside proper and tags, if the // provided text misses them. func ParseHTML(r io.Reader) (*Node, error) { ns, err := html.ParseFragment(r, nil) if err != nil { return nil, err } var nodes []Node var text []byte n := ns[0] // The root node. nodes = append(nodes, Node{kind: startNode}) for n != nil { switch n.Type { case html.DocumentNode: case html.ElementNode: nodes = append(nodes, Node{ kind: startNode, name: xml.Name{Local: n.Data, Space: n.Namespace}, }) for _, attr := range n.Attr { nodes = append(nodes, Node{ kind: attrNode, name: xml.Name{Local: attr.Key, Space: attr.Namespace}, attr: attr.Val, }) } case html.TextNode: texti := len(text) text = append(text, n.Data...) nodes = append(nodes, Node{ kind: textNode, text: text[texti : texti+len(n.Data)], }) case html.CommentNode: texti := len(text) text = append(text, n.Data...) nodes = append(nodes, Node{ kind: commentNode, text: text[texti : texti+len(n.Data)], }) } if n.FirstChild != nil { n = n.FirstChild continue } for n != nil { if n.Type == html.ElementNode { nodes = append(nodes, Node{kind: endNode}) } if n.NextSibling != nil { n = n.NextSibling break } n = n.Parent } } // Close the root node. nodes = append(nodes, Node{kind: endNode}) stack := make([]*Node, 0, len(nodes)) downs := make([]*Node, len(nodes)) downCount := 0 for pos := range nodes { switch nodes[pos].kind { case startNode, attrNode, textNode, commentNode, procInstNode: node := &nodes[pos] node.nodes = nodes node.pos = pos if len(stack) > 0 { node.up = stack[len(stack)-1] } if node.kind == startNode { stack = append(stack, node) } else { node.end = pos + 1 } case endNode: node := stack[len(stack)-1] node.end = pos stack = stack[:len(stack)-1] // Compute downs. Doing that here is what enables the // use of a slice of a contiguous pre-allocated block. node.down = downs[downCount:downCount] for i := node.pos + 1; i < node.end; i++ { if nodes[i].up == node { switch nodes[i].kind { case startNode, textNode, commentNode, procInstNode: node.down = append(node.down, &nodes[i]) downCount++ } } } if len(stack) == 0 { return node, nil } } } return nil, io.EOF }