// TOML lexer. // // Written using the principles developed by Rob Pike in // http://www.youtube.com/watch?v=HxaD_trXwRE package toml import ( "bytes" "errors" "fmt" "regexp" "strconv" "strings" ) var dateRegexp *regexp.Regexp // Define state functions type tomlLexStateFn func() tomlLexStateFn // Define lexer type tomlLexer struct { inputIdx int input []rune // Textual source currentTokenStart int currentTokenStop int tokens []token brackets []rune line int col int endbufferLine int endbufferCol int } // Basic read operations on input func (l *tomlLexer) read() rune { r := l.peek() if r == '\n' { l.endbufferLine++ l.endbufferCol = 1 } else { l.endbufferCol++ } l.inputIdx++ return r } func (l *tomlLexer) next() rune { r := l.read() if r != eof { l.currentTokenStop++ } return r } func (l *tomlLexer) ignore() { l.currentTokenStart = l.currentTokenStop l.line = l.endbufferLine l.col = l.endbufferCol } func (l *tomlLexer) skip() { l.next() l.ignore() } func (l *tomlLexer) fastForward(n int) { for i := 0; i < n; i++ { l.next() } } func (l *tomlLexer) emitWithValue(t tokenType, value string) { l.tokens = append(l.tokens, token{ Position: Position{l.line, l.col}, typ: t, val: value, }) l.ignore() } func (l *tomlLexer) emit(t tokenType) { l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) } func (l *tomlLexer) peek() rune { if l.inputIdx >= len(l.input) { return eof } return l.input[l.inputIdx] } func (l *tomlLexer) peekString(size int) string { maxIdx := len(l.input) upperIdx := l.inputIdx + size // FIXME: potential overflow if upperIdx > maxIdx { upperIdx = maxIdx } return string(l.input[l.inputIdx:upperIdx]) } func (l *tomlLexer) follow(next string) bool { return next == l.peekString(len(next)) } // Error management func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { l.tokens = append(l.tokens, token{ Position: Position{l.line, l.col}, typ: tokenError, val: fmt.Sprintf(format, args...), }) return nil } // State functions func (l *tomlLexer) lexVoid() tomlLexStateFn { for { next := l.peek() switch next { case '}': // after '{' return l.lexRightCurlyBrace case '[': return l.lexTableKey case '#': return l.lexComment(l.lexVoid) case '=': return l.lexEqual case '\r': fallthrough case '\n': l.skip() continue } if isSpace(next) { l.skip() } if isKeyStartChar(next) { return l.lexKey } if next == eof { l.next() break } } l.emit(tokenEOF) return nil } func (l *tomlLexer) lexRvalue() tomlLexStateFn { for { next := l.peek() switch next { case '.': return l.errorf("cannot start float with a dot") case '=': return l.lexEqual case '[': return l.lexLeftBracket case ']': return l.lexRightBracket case '{': return l.lexLeftCurlyBrace case '}': return l.lexRightCurlyBrace case '#': return l.lexComment(l.lexRvalue) case '"': return l.lexString case '\'': return l.lexLiteralString case ',': return l.lexComma case '\r': fallthrough case '\n': l.skip() if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' { return l.lexRvalue } return l.lexVoid } if l.follow("true") { return l.lexTrue } if l.follow("false") { return l.lexFalse } if l.follow("inf") { return l.lexInf } if l.follow("nan") { return l.lexNan } if isSpace(next) { l.skip() continue } if next == eof { l.next() break } possibleDate := l.peekString(35) dateSubmatches := dateRegexp.FindStringSubmatch(possibleDate) if dateSubmatches != nil && dateSubmatches[0] != "" { l.fastForward(len(dateSubmatches[0])) if dateSubmatches[2] == "" { // no timezone information => local date return l.lexLocalDate } return l.lexDate } if next == '+' || next == '-' || isDigit(next) { return l.lexNumber } return l.errorf("no value can start with %c", next) } l.emit(tokenEOF) return nil } func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { l.next() l.emit(tokenLeftCurlyBrace) l.brackets = append(l.brackets, '{') return l.lexVoid } func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { l.next() l.emit(tokenRightCurlyBrace) if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' { return l.errorf("cannot have '}' here") } l.brackets = l.brackets[:len(l.brackets)-1] return l.lexRvalue } func (l *tomlLexer) lexDate() tomlLexStateFn { l.emit(tokenDate) return l.lexRvalue } func (l *tomlLexer) lexLocalDate() tomlLexStateFn { l.emit(tokenLocalDate) return l.lexRvalue } func (l *tomlLexer) lexTrue() tomlLexStateFn { l.fastForward(4) l.emit(tokenTrue) return l.lexRvalue } func (l *tomlLexer) lexFalse() tomlLexStateFn { l.fastForward(5) l.emit(tokenFalse) return l.lexRvalue } func (l *tomlLexer) lexInf() tomlLexStateFn { l.fastForward(3) l.emit(tokenInf) return l.lexRvalue } func (l *tomlLexer) lexNan() tomlLexStateFn { l.fastForward(3) l.emit(tokenNan) return l.lexRvalue } func (l *tomlLexer) lexEqual() tomlLexStateFn { l.next() l.emit(tokenEqual) return l.lexRvalue } func (l *tomlLexer) lexComma() tomlLexStateFn { l.next() l.emit(tokenComma) if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' { return l.lexVoid } return l.lexRvalue } // Parse the key and emits its value without escape sequences. // bare keys, basic string keys and literal string keys are supported. func (l *tomlLexer) lexKey() tomlLexStateFn { var sb strings.Builder for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { if r == '"' { l.next() str, err := l.lexStringAsString(`"`, false, true) if err != nil { return l.errorf(err.Error()) } sb.WriteString("\"") sb.WriteString(str) sb.WriteString("\"") l.next() continue } else if r == '\'' { l.next() str, err := l.lexLiteralStringAsString(`'`, false) if err != nil { return l.errorf(err.Error()) } sb.WriteString("'") sb.WriteString(str) sb.WriteString("'") l.next() continue } else if r == '\n' { return l.errorf("keys cannot contain new lines") } else if isSpace(r) { var str strings.Builder str.WriteString(" ") // skip trailing whitespace l.next() for r = l.peek(); isSpace(r); r = l.peek() { str.WriteRune(r) l.next() } // break loop if not a dot if r != '.' { break } str.WriteString(".") // skip trailing whitespace after dot l.next() for r = l.peek(); isSpace(r); r = l.peek() { str.WriteRune(r) l.next() } sb.WriteString(str.String()) continue } else if r == '.' { // skip } else if !isValidBareChar(r) { return l.errorf("keys cannot contain %c character", r) } sb.WriteRune(r) l.next() } l.emitWithValue(tokenKey, sb.String()) return l.lexVoid } func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { return func() tomlLexStateFn { for next := l.peek(); next != '\n' && next != eof; next = l.peek() { if next == '\r' && l.follow("\r\n") { break } l.next() } l.ignore() return previousState } } func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { l.next() l.emit(tokenLeftBracket) l.brackets = append(l.brackets, '[') return l.lexRvalue } func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { var sb strings.Builder if discardLeadingNewLine { if l.follow("\r\n") { l.skip() l.skip() } else if l.peek() == '\n' { l.skip() } } // find end of string for { if l.follow(terminator) { return sb.String(), nil } next := l.peek() if next == eof { break } sb.WriteRune(l.next()) } return "", errors.New("unclosed string") } func (l *tomlLexer) lexLiteralString() tomlLexStateFn { l.skip() // handle special case for triple-quote terminator := "'" discardLeadingNewLine := false if l.follow("''") { l.skip() l.skip() terminator = "'''" discardLeadingNewLine = true } str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) if err != nil { return l.errorf(err.Error()) } l.emitWithValue(tokenString, str) l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } // Lex a string and return the results as a string. // Terminator is the substring indicating the end of the token. // The resulting string does not include the terminator. func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { var sb strings.Builder if discardLeadingNewLine { if l.follow("\r\n") { l.skip() l.skip() } else if l.peek() == '\n' { l.skip() } } for { if l.follow(terminator) { return sb.String(), nil } if l.follow("\\") { l.next() switch l.peek() { case '\r': fallthrough case '\n': fallthrough case '\t': fallthrough case ' ': // skip all whitespace chars following backslash for strings.ContainsRune("\r\n\t ", l.peek()) { l.next() } case '"': sb.WriteString("\"") l.next() case 'n': sb.WriteString("\n") l.next() case 'b': sb.WriteString("\b") l.next() case 'f': sb.WriteString("\f") l.next() case '/': sb.WriteString("/") l.next() case 't': sb.WriteString("\t") l.next() case 'r': sb.WriteString("\r") l.next() case '\\': sb.WriteString("\\") l.next() case 'u': l.next() var code strings.Builder for i := 0; i < 4; i++ { c := l.peek() if !isHexDigit(c) { return "", errors.New("unfinished unicode escape") } l.next() code.WriteRune(c) } intcode, err := strconv.ParseInt(code.String(), 16, 32) if err != nil { return "", errors.New("invalid unicode escape: \\u" + code.String()) } sb.WriteRune(rune(intcode)) case 'U': l.next() var code strings.Builder for i := 0; i < 8; i++ { c := l.peek() if !isHexDigit(c) { return "", errors.New("unfinished unicode escape") } l.next() code.WriteRune(c) } intcode, err := strconv.ParseInt(code.String(), 16, 64) if err != nil { return "", errors.New("invalid unicode escape: \\U" + code.String()) } sb.WriteRune(rune(intcode)) default: return "", errors.New("invalid escape sequence: \\" + string(l.peek())) } } else { r := l.peek() if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) { return "", fmt.Errorf("unescaped control character %U", r) } l.next() sb.WriteRune(r) } if l.peek() == eof { break } } return "", errors.New("unclosed string") } func (l *tomlLexer) lexString() tomlLexStateFn { l.skip() // handle special case for triple-quote terminator := `"` discardLeadingNewLine := false acceptNewLines := false if l.follow(`""`) { l.skip() l.skip() terminator = `"""` discardLeadingNewLine = true acceptNewLines = true } str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) if err != nil { return l.errorf(err.Error()) } l.emitWithValue(tokenString, str) l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } func (l *tomlLexer) lexTableKey() tomlLexStateFn { l.next() if l.peek() == '[' { // token '[[' signifies an array of tables l.next() l.emit(tokenDoubleLeftBracket) return l.lexInsideTableArrayKey } // vanilla table key l.emit(tokenLeftBracket) return l.lexInsideTableKey } // Parse the key till "]]", but only bare keys are supported func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { for r := l.peek(); r != eof; r = l.peek() { switch r { case ']': if l.currentTokenStop > l.currentTokenStart { l.emit(tokenKeyGroupArray) } l.next() if l.peek() != ']' { break } l.next() l.emit(tokenDoubleRightBracket) return l.lexVoid case '[': return l.errorf("table array key cannot contain ']'") default: l.next() } } return l.errorf("unclosed table array key") } // Parse the key till "]" but only bare keys are supported func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { for r := l.peek(); r != eof; r = l.peek() { switch r { case ']': if l.currentTokenStop > l.currentTokenStart { l.emit(tokenKeyGroup) } l.next() l.emit(tokenRightBracket) return l.lexVoid case '[': return l.errorf("table key cannot contain ']'") default: l.next() } } return l.errorf("unclosed table key") } func (l *tomlLexer) lexRightBracket() tomlLexStateFn { l.next() l.emit(tokenRightBracket) if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' { return l.errorf("cannot have ']' here") } l.brackets = l.brackets[:len(l.brackets)-1] return l.lexRvalue } type validRuneFn func(r rune) bool func isValidHexRune(r rune) bool { return r >= 'a' && r <= 'f' || r >= 'A' && r <= 'F' || r >= '0' && r <= '9' || r == '_' } func isValidOctalRune(r rune) bool { return r >= '0' && r <= '7' || r == '_' } func isValidBinaryRune(r rune) bool { return r == '0' || r == '1' || r == '_' } func (l *tomlLexer) lexNumber() tomlLexStateFn { r := l.peek() if r == '0' { follow := l.peekString(2) if len(follow) == 2 { var isValidRune validRuneFn switch follow[1] { case 'x': isValidRune = isValidHexRune case 'o': isValidRune = isValidOctalRune case 'b': isValidRune = isValidBinaryRune default: if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) } } if isValidRune != nil { l.next() l.next() digitSeen := false for { next := l.peek() if !isValidRune(next) { break } digitSeen = true l.next() } if !digitSeen { return l.errorf("number needs at least one digit") } l.emit(tokenInteger) return l.lexRvalue } } } if r == '+' || r == '-' { l.next() if l.follow("inf") { return l.lexInf } if l.follow("nan") { return l.lexNan } } pointSeen := false expSeen := false digitSeen := false for { next := l.peek() if next == '.' { if pointSeen { return l.errorf("cannot have two dots in one float") } l.next() if !isDigit(l.peek()) { return l.errorf("float cannot end with a dot") } pointSeen = true } else if next == 'e' || next == 'E' { expSeen = true l.next() r := l.peek() if r == '+' || r == '-' { l.next() } } else if isDigit(next) { digitSeen = true l.next() } else if next == '_' { l.next() } else { break } if pointSeen && !digitSeen { return l.errorf("cannot start float with a dot") } } if !digitSeen { return l.errorf("no digit in that number") } if pointSeen || expSeen { l.emit(tokenFloat) } else { l.emit(tokenInteger) } return l.lexRvalue } func (l *tomlLexer) run() { for state := l.lexVoid; state != nil; { state = state() } } func init() { // Regexp for all date/time formats supported by TOML. // Group 1: nano precision // Group 2: timezone // // /!\ also matches the empty string // // Example matches: // 1979-05-27T07:32:00Z // 1979-05-27T00:32:00-07:00 // 1979-05-27T00:32:00.999999-07:00 // 1979-05-27 07:32:00Z // 1979-05-27 00:32:00-07:00 // 1979-05-27 00:32:00.999999-07:00 // 1979-05-27T07:32:00 // 1979-05-27T00:32:00.999999 // 1979-05-27 07:32:00 // 1979-05-27 00:32:00.999999 // 1979-05-27 // 07:32:00 // 00:32:00.999999 dateRegexp = regexp.MustCompile(`^(?:\d{1,4}-\d{2}-\d{2})?(?:[T ]?\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})?)?`) } // Entry point func lexToml(inputBytes []byte) []token { runes := bytes.Runes(inputBytes) l := &tomlLexer{ input: runes, tokens: make([]token, 0, 256), line: 1, col: 1, endbufferLine: 1, endbufferCol: 1, } l.run() return l.tokens }