Skip to content

Commit 974236b

Browse files
committed
os, syscall: support ill-formed UTF-16 strings on Windows
Windows UTF-16 strings can contain unpaired surrogates, which can't be decoded into a valid UTF-8 string. This change defines a set of functions that can be used to encode and decode potentially ill-formed UTF-16 strings by using [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). WTF-8 is a strict superset of UTF-8, i.e. any string that is well-formed in UTF-8 is also well-formed in WTF-8 and the content is unchanged. Also, the conversion never fails and is lossless. The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string is that the conversion is lossless even for ill-formed UTF-16 strings. This property makes it possible to read an ill-formed UTF-16 string, convert it to a Go string, and convert it back to the same original UTF-16 string. Fixes #59971 Change-Id: Id6007f6e537844913402b233e73d698688cd5ba6 Reviewed-on: https://go-review.googlesource.com/c/go/+/493036 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Bryan Mills <[email protected]> Run-TryBot: Quim Muntal <[email protected]> Reviewed-by: Cherry Mui <[email protected]> Reviewed-by: Paul Hampson <[email protected]>
1 parent 91b8cc0 commit 974236b

12 files changed

+402
-36
lines changed

src/internal/syscall/execenv/execenv_windows.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ package execenv
99
import (
1010
"internal/syscall/windows"
1111
"syscall"
12-
"unicode/utf16"
1312
"unsafe"
1413
)
1514

@@ -41,7 +40,7 @@ func Default(sys *syscall.SysProcAttr) (env []string, err error) {
4140
}
4241

4342
entry := unsafe.Slice(blockp, (uintptr(end)-uintptr(unsafe.Pointer(blockp)))/2)
44-
env = append(env, string(utf16.Decode(entry)))
43+
env = append(env, syscall.UTF16ToString(entry))
4544
blockp = (*uint16)(unsafe.Add(end, size))
4645
}
4746
return

src/internal/syscall/windows/registry/value.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func (k Key) GetStringsValue(name string) (val []string, valtype uint32, err err
217217
from := 0
218218
for i, c := range p {
219219
if c == 0 {
220-
val = append(val, string(utf16.Decode(p[from:i])))
220+
val = append(val, syscall.UTF16ToString(p[from:i]))
221221
from = i + 1
222222
}
223223
}

src/internal/syscall/windows/syscall_windows.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ package windows
77
import (
88
"sync"
99
"syscall"
10-
"unicode/utf16"
1110
"unsafe"
1211
)
1312

@@ -17,17 +16,13 @@ func UTF16PtrToString(p *uint16) string {
1716
if p == nil {
1817
return ""
1918
}
20-
// Find NUL terminator.
2119
end := unsafe.Pointer(p)
2220
n := 0
2321
for *(*uint16)(end) != 0 {
2422
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
2523
n++
2624
}
27-
// Turn *uint16 into []uint16.
28-
s := unsafe.Slice(p, n)
29-
// Decode []uint16 into string.
30-
return string(utf16.Decode(s))
25+
return syscall.UTF16ToString(unsafe.Slice(p, n))
3126
}
3227

3328
const (

src/os/dir_windows.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"runtime"
1212
"sync"
1313
"syscall"
14-
"unicode/utf16"
1514
"unsafe"
1615
)
1716

@@ -104,7 +103,7 @@ func (file *File) readdir(n int, mode readdirMode) (names []string, dirents []Di
104103
d.bufp = 0
105104
}
106105
nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2)
107-
name := string(utf16.Decode(nameslice))
106+
name := syscall.UTF16ToString(nameslice)
108107
if name == "." || name == ".." { // Useless names
109108
continue
110109
}

src/os/exec/lp_windows_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,6 @@ package main
587587
import (
588588
"os"
589589
"syscall"
590-
"unicode/utf16"
591590
"unsafe"
592591
)
593592
@@ -599,7 +598,7 @@ func getMyName() (string, error) {
599598
if n == 0 {
600599
return "", err
601600
}
602-
return string(utf16.Decode(b[0:n])), nil
601+
return syscall.UTF16ToString(b[0:n]), nil
603602
}
604603
605604
func main() {

src/os/file_windows.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"runtime"
1212
"sync"
1313
"syscall"
14-
"unicode/utf16"
1514
"unsafe"
1615
)
1716

@@ -259,7 +258,7 @@ func tempDir() string {
259258
// Otherwise remove terminating \.
260259
n--
261260
}
262-
return string(utf16.Decode(b[:n]))
261+
return syscall.UTF16ToString(b[:n])
263262
}
264263
}
265264

src/os/os_windows_test.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"path/filepath"
1919
"reflect"
2020
"runtime"
21+
"slices"
2122
"sort"
2223
"strings"
2324
"syscall"
@@ -1377,3 +1378,71 @@ func TestAppExecLinkStat(t *testing.T) {
13771378
t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath)
13781379
}
13791380
}
1381+
1382+
func TestIllformedUTF16FileName(t *testing.T) {
1383+
dir := t.TempDir()
1384+
const sep = string(os.PathSeparator)
1385+
if !strings.HasSuffix(dir, sep) {
1386+
dir += sep
1387+
}
1388+
1389+
// This UTF-16 file name is ill-formed as it contains low surrogates that are not preceded by high surrogates ([1:5]).
1390+
namew := []uint16{0x2e, 0xdc6d, 0xdc73, 0xdc79, 0xdc73, 0x30, 0x30, 0x30, 0x31, 0}
1391+
1392+
// Create a file whose name contains unpaired surrogates.
1393+
// Use syscall.CreateFile instead of os.Create to simulate a file that is created by
1394+
// a non-Go program so the file name hasn't gone through syscall.UTF16FromString.
1395+
dirw := utf16.Encode([]rune(dir))
1396+
pathw := append(dirw, namew...)
1397+
fd, err := syscall.CreateFile(&pathw[0], syscall.GENERIC_ALL, 0, nil, syscall.CREATE_NEW, 0, 0)
1398+
if err != nil {
1399+
t.Fatal(err)
1400+
}
1401+
syscall.CloseHandle(fd)
1402+
1403+
name := syscall.UTF16ToString(namew)
1404+
path := filepath.Join(dir, name)
1405+
// Verify that os.Lstat can query the file.
1406+
fi, err := os.Lstat(path)
1407+
if err != nil {
1408+
t.Fatal(err)
1409+
}
1410+
if got := fi.Name(); got != name {
1411+
t.Errorf("got %q, want %q", got, name)
1412+
}
1413+
// Verify that File.Readdirnames lists the file.
1414+
f, err := os.Open(dir)
1415+
if err != nil {
1416+
t.Fatal(err)
1417+
}
1418+
files, err := f.Readdirnames(0)
1419+
f.Close()
1420+
if err != nil {
1421+
t.Fatal(err)
1422+
}
1423+
if !slices.Contains(files, name) {
1424+
t.Error("file not listed")
1425+
}
1426+
// Verify that os.RemoveAll can remove the directory
1427+
// and that it doesn't hang.
1428+
err = os.RemoveAll(dir)
1429+
if err != nil {
1430+
t.Error(err)
1431+
}
1432+
}
1433+
1434+
func TestUTF16Alloc(t *testing.T) {
1435+
allowsPerRun := func(want int, f func()) {
1436+
t.Helper()
1437+
got := int(testing.AllocsPerRun(5, f))
1438+
if got != want {
1439+
t.Errorf("got %d allocs, want %d", got, want)
1440+
}
1441+
}
1442+
allowsPerRun(1, func() {
1443+
syscall.UTF16ToString([]uint16{'a', 'b', 'c'})
1444+
})
1445+
allowsPerRun(1, func() {
1446+
syscall.UTF16FromString("abc")
1447+
})
1448+
}

src/syscall/env_windows.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
package syscall
88

99
import (
10-
"unicode/utf16"
1110
"unsafe"
1211
)
1312

@@ -24,7 +23,7 @@ func Getenv(key string) (value string, found bool) {
2423
return "", false
2524
}
2625
if n <= uint32(len(b)) {
27-
return string(utf16.Decode(b[:n])), true
26+
return UTF16ToString(b[:n]), true
2827
}
2928
}
3029
}
@@ -90,7 +89,7 @@ func Environ() []string {
9089
}
9190

9291
entry := unsafe.Slice(envp, (uintptr(end)-uintptr(unsafe.Pointer(envp)))/size)
93-
r = append(r, string(utf16.Decode(entry)))
92+
r = append(r, UTF16ToString(entry))
9493
envp = (*uint16)(unsafe.Add(end, size))
9594
}
9695
return r

src/syscall/export_windows_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ var UpdateProcThreadAttribute = updateProcThreadAttribute
99
var DeleteProcThreadAttributeList = deleteProcThreadAttributeList
1010

1111
const PROC_THREAD_ATTRIBUTE_HANDLE_LIST = _PROC_THREAD_ATTRIBUTE_HANDLE_LIST
12+
13+
var EncodeWTF16 = encodeWTF16
14+
var DecodeWTF16 = decodeWTF16

src/syscall/syscall_windows.go

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"internal/race"
1515
"runtime"
1616
"sync"
17-
"unicode/utf16"
1817
"unsafe"
1918
)
2019

@@ -37,7 +36,8 @@ func StringToUTF16(s string) []uint16 {
3736

3837
// UTF16FromString returns the UTF-16 encoding of the UTF-8 string
3938
// s, with a terminating NUL added. If s contains a NUL byte at any
40-
// location, it returns (nil, EINVAL).
39+
// location, it returns (nil, EINVAL). Unpaired surrogates
40+
// are encoded using WTF-8.
4141
func UTF16FromString(s string) ([]uint16, error) {
4242
if bytealg.IndexByteString(s, 0) != -1 {
4343
return nil, EINVAL
@@ -49,22 +49,37 @@ func UTF16FromString(s string) ([]uint16, error) {
4949
// equal than the number of UTF-16 code units.
5050
// Also account for the terminating NUL character.
5151
buf := make([]uint16, 0, len(s)+1)
52-
for _, r := range s {
53-
buf = utf16.AppendRune(buf, r)
54-
}
55-
return utf16.AppendRune(buf, '\x00'), nil
52+
buf = encodeWTF16(s, buf)
53+
return append(buf, 0), nil
5654
}
5755

5856
// UTF16ToString returns the UTF-8 encoding of the UTF-16 sequence s,
59-
// with a terminating NUL removed.
57+
// with a terminating NUL removed. Unpaired surrogates are decoded
58+
// using WTF-8 instead of UTF-8 encoding.
6059
func UTF16ToString(s []uint16) string {
60+
maxLen := 0
6161
for i, v := range s {
6262
if v == 0 {
6363
s = s[0:i]
6464
break
6565
}
66+
switch {
67+
case v <= rune1Max:
68+
maxLen += 1
69+
case v <= rune2Max:
70+
maxLen += 2
71+
default:
72+
// v is a non-surrogate that decodes to 3 bytes,
73+
// or is an unpaired surrogate (also 3 bytes in WTF-8),
74+
// or is one half of a valid surrogate pair.
75+
// If it is half of a pair, we will add 3 for the second surrogate
76+
// (total of 6) and overestimate by 2 bytes for the pair,
77+
// since the resulting rune only requires 4 bytes.
78+
maxLen += 3
79+
}
6680
}
67-
return string(utf16.Decode(s))
81+
buf := decodeWTF16(s, make([]byte, 0, maxLen))
82+
return unsafe.String(unsafe.SliceData(buf), len(buf))
6883
}
6984

7085
// utf16PtrToString is like UTF16ToString, but takes *uint16
@@ -73,17 +88,13 @@ func utf16PtrToString(p *uint16) string {
7388
if p == nil {
7489
return ""
7590
}
76-
// Find NUL terminator.
7791
end := unsafe.Pointer(p)
7892
n := 0
7993
for *(*uint16)(end) != 0 {
8094
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
8195
n++
8296
}
83-
// Turn *uint16 into []uint16.
84-
s := unsafe.Slice(p, n)
85-
// Decode []uint16 into string.
86-
return string(utf16.Decode(s))
97+
return UTF16ToString(unsafe.Slice(p, n))
8798
}
8899

89100
// StringToUTF16Ptr returns pointer to the UTF-16 encoding of
@@ -97,6 +108,7 @@ func StringToUTF16Ptr(s string) *uint16 { return &StringToUTF16(s)[0] }
97108
// UTF16PtrFromString returns pointer to the UTF-16 encoding of
98109
// the UTF-8 string s, with a terminating NUL added. If s
99110
// contains a NUL byte at any location, it returns (nil, EINVAL).
111+
// Unpaired surrogates are encoded using WTF-8.
100112
func UTF16PtrFromString(s string) (*uint16, error) {
101113
a, err := UTF16FromString(s)
102114
if err != nil {
@@ -143,7 +155,7 @@ func (e Errno) Error() string {
143155
// trim terminating \r and \n
144156
for ; n > 0 && (b[n-1] == '\n' || b[n-1] == '\r'); n-- {
145157
}
146-
return string(utf16.Decode(b[:n]))
158+
return UTF16ToString(b[:n])
147159
}
148160

149161
const (
@@ -525,7 +537,7 @@ func Getwd() (wd string, err error) {
525537
if e != nil {
526538
return "", e
527539
}
528-
return string(utf16.Decode(b[0:n])), nil
540+
return UTF16ToString(b[0:n]), nil
529541
}
530542

531543
func Chdir(path string) (err error) {
@@ -573,13 +585,13 @@ func Rename(oldpath, newpath string) (err error) {
573585
}
574586

575587
func ComputerName() (name string, err error) {
576-
var n uint32 = MAX_COMPUTERNAME_LENGTH + 1
577-
b := make([]uint16, n)
588+
b := make([]uint16, MAX_COMPUTERNAME_LENGTH+1)
589+
var n uint32
578590
e := GetComputerName(&b[0], &n)
579591
if e != nil {
580592
return "", e
581593
}
582-
return string(utf16.Decode(b[0:n])), nil
594+
return UTF16ToString(b[:n]), nil
583595
}
584596

585597
func Ftruncate(fd Handle, length int64) (err error) {

0 commit comments

Comments
 (0)