310 lines
7.3 KiB
Go
310 lines
7.3 KiB
Go
// Copyright 2017 The Bazel Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package syntax
|
|
|
|
// Starlark quoted string utilities.
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// unesc maps the single character that follows a backslash in a
// one-character escape sequence to the byte that the escape denotes.
// It is indexed by that character; entries for every other byte are zero.
var unesc = [256]byte{
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}
|
|
|
|
// esc is the inverse of the one-character escape table: it maps each byte
// that has a one-character escape to the character that should follow the
// backslash. It is indexed by the byte to be escaped; entries for every
// other byte are zero.
//
// NOTE(review): nothing in the visible part of this file reads esc;
// confirm whether another file in the package uses it.
var esc = [256]byte{
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
	'\\': '\\',
	'\'': '\'',
	'"':  '"',
}
|
|
|
|
// unquote unquotes the quoted string, returning the actual
|
|
// string value, whether the original was triple-quoted,
|
|
// whether it was a byte string, and an error describing invalid input.
|
|
func unquote(quoted string) (s string, triple, isByte bool, err error) {
|
|
// Check for raw prefix: means don't interpret the inner \.
|
|
raw := false
|
|
if strings.HasPrefix(quoted, "r") {
|
|
raw = true
|
|
quoted = quoted[1:]
|
|
}
|
|
// Check for bytes prefix.
|
|
if strings.HasPrefix(quoted, "b") {
|
|
isByte = true
|
|
quoted = quoted[1:]
|
|
}
|
|
|
|
if len(quoted) < 2 {
|
|
err = fmt.Errorf("string literal too short")
|
|
return
|
|
}
|
|
|
|
if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
|
|
err = fmt.Errorf("string literal has invalid quotes")
|
|
return
|
|
}
|
|
|
|
// Check for triple quoted string.
|
|
quote := quoted[0]
|
|
if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
|
|
triple = true
|
|
quoted = quoted[3 : len(quoted)-3]
|
|
} else {
|
|
quoted = quoted[1 : len(quoted)-1]
|
|
}
|
|
|
|
// Now quoted is the quoted data, but no quotes.
|
|
// If we're in raw mode or there are no escapes or
|
|
// carriage returns, we're done.
|
|
var unquoteChars string
|
|
if raw {
|
|
unquoteChars = "\r"
|
|
} else {
|
|
unquoteChars = "\\\r"
|
|
}
|
|
if !strings.ContainsAny(quoted, unquoteChars) {
|
|
s = quoted
|
|
return
|
|
}
|
|
|
|
// Otherwise process quoted string.
|
|
// Each iteration processes one escape sequence along with the
|
|
// plain text leading up to it.
|
|
buf := new(strings.Builder)
|
|
for {
|
|
// Remove prefix before escape sequence.
|
|
i := strings.IndexAny(quoted, unquoteChars)
|
|
if i < 0 {
|
|
i = len(quoted)
|
|
}
|
|
buf.WriteString(quoted[:i])
|
|
quoted = quoted[i:]
|
|
|
|
if len(quoted) == 0 {
|
|
break
|
|
}
|
|
|
|
// Process carriage return.
|
|
if quoted[0] == '\r' {
|
|
buf.WriteByte('\n')
|
|
if len(quoted) > 1 && quoted[1] == '\n' {
|
|
quoted = quoted[2:]
|
|
} else {
|
|
quoted = quoted[1:]
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Process escape sequence.
|
|
if len(quoted) == 1 {
|
|
err = fmt.Errorf(`truncated escape sequence \`)
|
|
return
|
|
}
|
|
|
|
switch quoted[1] {
|
|
default:
|
|
// In Starlark, like Go, a backslash must escape something.
|
|
// (Python still treats unnecessary backslashes literally,
|
|
// but since 3.6 has emitted a deprecation warning.)
|
|
err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
|
|
return
|
|
|
|
case '\n':
|
|
// Ignore the escape and the line break.
|
|
quoted = quoted[2:]
|
|
|
|
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
|
|
// One-char escape.
|
|
// Escapes are allowed for both kinds of quotation
|
|
// mark, not just the kind in use.
|
|
buf.WriteByte(unesc[quoted[1]])
|
|
quoted = quoted[2:]
|
|
|
|
case '0', '1', '2', '3', '4', '5', '6', '7':
|
|
// Octal escape, up to 3 digits, \OOO.
|
|
n := int(quoted[1] - '0')
|
|
quoted = quoted[2:]
|
|
for i := 1; i < 3; i++ {
|
|
if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
|
|
break
|
|
}
|
|
n = n*8 + int(quoted[0]-'0')
|
|
quoted = quoted[1:]
|
|
}
|
|
if !isByte && n > 127 {
|
|
err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
|
|
return
|
|
}
|
|
if n >= 256 {
|
|
// NOTE: Python silently discards the high bit,
|
|
// so that '\541' == '\141' == 'a'.
|
|
// Let's see if we can avoid doing that in BUILD files.
|
|
err = fmt.Errorf(`invalid escape sequence \%03o`, n)
|
|
return
|
|
}
|
|
buf.WriteByte(byte(n))
|
|
|
|
case 'x':
|
|
// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
|
|
if len(quoted) < 4 {
|
|
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
|
|
return
|
|
}
|
|
n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
|
|
if err1 != nil {
|
|
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
|
|
return
|
|
}
|
|
if !isByte && n > 127 {
|
|
err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
|
|
quoted[:4], n, n)
|
|
return
|
|
}
|
|
buf.WriteByte(byte(n))
|
|
quoted = quoted[4:]
|
|
|
|
case 'u', 'U':
|
|
// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
|
|
sz := 6
|
|
if quoted[1] == 'U' {
|
|
sz = 10
|
|
}
|
|
if len(quoted) < sz {
|
|
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
|
|
return
|
|
}
|
|
n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
|
|
if err1 != nil {
|
|
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
|
|
return
|
|
}
|
|
if n > unicode.MaxRune {
|
|
err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
|
|
quoted[:sz], n)
|
|
return
|
|
}
|
|
// As in Go, surrogates are disallowed.
|
|
if 0xD800 <= n && n < 0xE000 {
|
|
err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
|
|
return
|
|
}
|
|
buf.WriteRune(rune(n))
|
|
quoted = quoted[sz:]
|
|
}
|
|
}
|
|
|
|
s = buf.String()
|
|
return
|
|
}
|
|
|
|
// indexByte returns the index of the first occurrence of the byte b in s,
// or -1 if b does not occur in s.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
|
|
|
|
// Quote returns a Starlark literal that denotes s.
// If b is true the result is a bytes literal (prefixed with b);
// otherwise it is a string literal. The result is always double-quoted.
//
// Encoding rules, applied to each rune of s in order:
//   - A byte that is not part of a valid UTF-8 encoding is written as \xXX.
//     String (!b) literals accept \xXX escapes only for ASCII, so for such
//     input with b false the result is not a legal literal.
//   - A double quote or backslash is preceded by a backslash.
//   - Any other rune for which strconv.IsPrint reports true is copied as is.
//   - \a, \b, \f, \n, \r, \t and \v use their one-character escapes.
//   - Other runes below space, and DEL (0x7f), are written as \xXX.
//   - Remaining runes are written as \uXXXX if below 0x10000,
//     otherwise as \UXXXXXXXX.
func Quote(s string, b bool) string {
	const hex = "0123456789abcdef"

	buf := make([]byte, 0, 3*len(s)/2)
	if b {
		buf = append(buf, 'b')
	}
	buf = append(buf, '"')
	for width := 0; len(s) > 0; s = s[width:] {
		// Decode the next rune; ASCII takes the fast path.
		r := rune(s[0])
		width = 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRuneInString(s)
		}

		switch {
		case width == 1 && r == utf8.RuneError:
			// Invalid UTF-8: emit the raw byte as a hex escape.
			buf = append(buf, '\\', 'x', hex[s[0]>>4], hex[s[0]&0xF])

		case r == '"' || r == '\\': // always backslashed
			buf = append(buf, '\\', byte(r))

		case strconv.IsPrint(r):
			// s[:width] is already the valid UTF-8 encoding of r.
			buf = append(buf, s[:width]...)

		case r == '\a':
			buf = append(buf, `\a`...)
		case r == '\b':
			buf = append(buf, `\b`...)
		case r == '\f':
			buf = append(buf, `\f`...)
		case r == '\n':
			buf = append(buf, `\n`...)
		case r == '\r':
			buf = append(buf, `\r`...)
		case r == '\t':
			buf = append(buf, `\t`...)
		case r == '\v':
			buf = append(buf, `\v`...)

		case r < ' ' || r == 0x7f:
			// Remaining ASCII control characters.
			buf = append(buf, '\\', 'x', hex[byte(r)>>4], hex[byte(r)&0xF])

		case r < 0x10000:
			// Non-printable rune that fits in four hex digits.
			buf = append(buf, `\u`...)
			for shift := 12; shift >= 0; shift -= 4 {
				buf = append(buf, hex[r>>uint(shift)&0xF])
			}

		default:
			// Non-printable rune that needs eight hex digits.
			buf = append(buf, `\U`...)
			for shift := 28; shift >= 0; shift -= 4 {
				buf = append(buf, hex[r>>uint(shift)&0xF])
			}
		}
	}
	buf = append(buf, '"')
	return string(buf)
}
|