2019-06-06 10:40:39 -04:00
|
|
|
package robotstxt
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"fmt"
|
|
|
|
"go/token"
|
|
|
|
"os"
|
2019-08-21 06:28:34 -04:00
|
|
|
"sync"
|
2019-06-06 10:40:39 -04:00
|
|
|
"unicode/utf8"
|
|
|
|
)
|
|
|
|
|
|
|
|
// byteScanner splits a robots.txt byte buffer into string tokens.
//
// It keeps a one-rune look-ahead in ch; the value -1 means that no character
// is available (nothing read yet, or the end of the buffer was reached).
type byteScanner struct {
	// pos is the scanner's position inside buf; Filename holds the source
	// name given to newByteScanner.
	pos token.Position
	// buf is the chunk currently being scanned (set by feed).
	buf []byte
	// ErrorCount is incremented for every problem reported through error.
	ErrorCount int
	// ch is the look-ahead rune, or -1 when none is available.
	ch rune
	// Quiet suppresses the diagnostics that error prints to stderr.
	Quiet bool
	// keyTokenFound records that the "key:" token of the current line has
	// been emitted, so later ':' characters stay inside the value token.
	// lastChunk is the end flag passed to the most recent feed call.
	keyTokenFound, lastChunk bool
}
|
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
// tokEOL is the token scan emits for a line break. A run of consecutive
// newline characters is collapsed into a single tokEOL.
const tokEOL = "\n"
|
|
|
|
|
2019-06-06 10:40:39 -04:00
|
|
|
// WhitespaceChars lists the runes the scanner treats as in-line whitespace,
// i.e. token separators that are not line breaks: space, horizontal tab and
// vertical tab.
var WhitespaceChars = []rune{' ', '\t', '\v'}
|
2019-08-21 06:28:34 -04:00
|
|
|
// tokBuffers recycles the scratch buffers in which scan assembles a token.
//
// New buffers start empty with room for 32 bytes. The slice handed to
// bytes.NewBuffer becomes the buffer's initial *contents*, so it must have
// length zero (and the desired capacity); otherwise a fresh buffer would
// already hold 32 NUL bytes.
var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 0, 32)) }}
|
2019-06-06 10:40:39 -04:00
|
|
|
|
|
|
|
func newByteScanner(srcname string, quiet bool) *byteScanner {
|
|
|
|
return &byteScanner{
|
|
|
|
Quiet: quiet,
|
|
|
|
ch: -1,
|
|
|
|
pos: token.Position{Filename: srcname},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
func (s *byteScanner) feed(input []byte, end bool) {
|
2019-06-06 10:40:39 -04:00
|
|
|
s.buf = input
|
|
|
|
s.pos.Offset = 0
|
|
|
|
s.pos.Line = 1
|
|
|
|
s.pos.Column = 1
|
|
|
|
s.lastChunk = end
|
|
|
|
|
|
|
|
// Read first char into look-ahead buffer `s.ch`.
|
2019-08-21 06:28:34 -04:00
|
|
|
if !s.nextChar() {
|
|
|
|
return
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Skip UTF-8 byte order mark
|
|
|
|
if s.ch == 65279 {
|
|
|
|
s.nextChar()
|
|
|
|
s.pos.Column = 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) GetPosition() token.Position {
|
|
|
|
return s.pos
|
|
|
|
}
|
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
func (s *byteScanner) scan() string {
|
|
|
|
// Note Offset > len, not >=, so we can scan last character.
|
|
|
|
if s.lastChunk && s.pos.Offset > len(s.buf) {
|
|
|
|
return ""
|
|
|
|
}
|
2019-06-06 10:40:39 -04:00
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
s.skipSpace()
|
2019-06-06 10:40:39 -04:00
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
if s.ch == -1 {
|
|
|
|
return ""
|
|
|
|
}
|
2019-06-06 10:40:39 -04:00
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
// EOL
|
|
|
|
if s.isEol() {
|
|
|
|
s.keyTokenFound = false
|
|
|
|
// skip subsequent newline chars
|
|
|
|
for s.ch != -1 && s.isEol() {
|
|
|
|
s.nextChar()
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
2019-08-21 06:28:34 -04:00
|
|
|
// emit newline as separate token
|
|
|
|
return tokEOL
|
|
|
|
}
|
2019-06-06 10:40:39 -04:00
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
// skip comments
|
|
|
|
if s.ch == '#' {
|
|
|
|
s.keyTokenFound = false
|
|
|
|
s.skipUntilEol()
|
|
|
|
if s.ch == -1 {
|
|
|
|
return ""
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
2019-08-21 06:28:34 -04:00
|
|
|
// emit newline as separate token
|
|
|
|
return tokEOL
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
// else we found something
|
|
|
|
tok := tokBuffers.Get().(*bytes.Buffer)
|
|
|
|
defer tokBuffers.Put(tok)
|
|
|
|
tok.Reset()
|
2019-06-06 10:40:39 -04:00
|
|
|
tok.WriteRune(s.ch)
|
|
|
|
s.nextChar()
|
|
|
|
for s.ch != -1 && !s.isSpace() && !s.isEol() {
|
|
|
|
// Do not consider ":" to be a token separator if a first key token
|
|
|
|
// has already been found on this line (avoid cutting an absolute URL
|
|
|
|
// after the "http:")
|
|
|
|
if s.ch == ':' && !s.keyTokenFound {
|
|
|
|
s.nextChar()
|
|
|
|
s.keyTokenFound = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
tok.WriteRune(s.ch)
|
|
|
|
s.nextChar()
|
|
|
|
}
|
2019-08-21 06:28:34 -04:00
|
|
|
return tok.String()
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
|
|
|
|
2019-08-21 06:28:34 -04:00
|
|
|
func (s *byteScanner) scanAll() []string {
|
|
|
|
results := make([]string, 0, 64) // random guess of average tokens length
|
2019-06-06 10:40:39 -04:00
|
|
|
for {
|
2019-08-21 06:28:34 -04:00
|
|
|
token := s.scan()
|
|
|
|
if token != "" {
|
|
|
|
results = append(results, token)
|
|
|
|
} else {
|
2019-06-06 10:40:39 -04:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
2019-08-21 06:28:34 -04:00
|
|
|
return results
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) error(pos token.Position, msg string) {
|
|
|
|
s.ErrorCount++
|
|
|
|
if !s.Quiet {
|
|
|
|
fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) isEol() bool {
|
|
|
|
return s.ch == '\n' || s.ch == '\r'
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) isSpace() bool {
|
|
|
|
for _, r := range WhitespaceChars {
|
|
|
|
if s.ch == r {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) skipSpace() {
|
|
|
|
for s.ch != -1 && s.isSpace() {
|
|
|
|
s.nextChar()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *byteScanner) skipUntilEol() {
|
|
|
|
for s.ch != -1 && !s.isEol() {
|
|
|
|
s.nextChar()
|
|
|
|
}
|
|
|
|
// skip subsequent newline chars
|
|
|
|
for s.ch != -1 && s.isEol() {
|
|
|
|
s.nextChar()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reads next Unicode char.
|
2019-08-21 06:28:34 -04:00
|
|
|
func (s *byteScanner) nextChar() bool {
|
2019-06-06 10:40:39 -04:00
|
|
|
if s.pos.Offset >= len(s.buf) {
|
|
|
|
s.ch = -1
|
2019-08-21 06:28:34 -04:00
|
|
|
return false
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|
|
|
|
s.pos.Column++
|
|
|
|
if s.ch == '\n' {
|
|
|
|
s.pos.Line++
|
|
|
|
s.pos.Column = 1
|
|
|
|
}
|
|
|
|
r, w := rune(s.buf[s.pos.Offset]), 1
|
|
|
|
if r >= 0x80 {
|
|
|
|
r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
|
|
|
|
if r == utf8.RuneError && w == 1 {
|
|
|
|
s.error(s.pos, "illegal UTF-8 encoding")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
s.pos.Column++
|
|
|
|
s.pos.Offset += w
|
|
|
|
s.ch = r
|
2019-08-21 06:28:34 -04:00
|
|
|
return true
|
2019-06-06 10:40:39 -04:00
|
|
|
}
|