Skip to content

Commit a97f8bd

Browse files
committed
bytes, strings: add Lines, SplitSeq, SplitAfterSeq, FieldsSeq, FieldsFuncSeq
Fixes #61901.
1 parent ca17bda commit a97f8bd

File tree

8 files changed

+417
-0
lines changed

8 files changed

+417
-0
lines changed

api/next/61901.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
pkg bytes, func FieldsFuncSeq([]uint8, func(int32) bool) iter.Seq[[]uint8] #61901
2+
pkg bytes, func FieldsSeq([]uint8) iter.Seq[[]uint8] #61901
3+
pkg bytes, func Lines([]uint8) iter.Seq[[]uint8] #61901
4+
pkg bytes, func SplitAfterSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
5+
pkg bytes, func SplitSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
6+
pkg strings, func FieldsFuncSeq(string, func(int32) bool) iter.Seq[string] #61901
7+
pkg strings, func FieldsSeq(string) iter.Seq[string] #61901
8+
pkg strings, func Lines(string) iter.Seq[string] #61901
9+
pkg strings, func SplitAfterSeq(string, string) iter.Seq[string] #61901
10+
pkg strings, func SplitSeq(string, string) iter.Seq[string] #61901

doc/next/6-stdlib/3-iter.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,29 @@ The [maps] package adds several functions that work with iterators:
2727
- [Values](/pkg/maps#Values) returns an iterator over values in m.
2828
- [Insert](/pkg/maps#Insert) adds the key-value pairs from seq to m.
2929
- [Collect](/pkg/maps#Collect) collects key-value pairs from seq into a new map and returns it.
30+
31+
The [bytes] package adds several functions that work with iterators:
32+
- [Lines](/pkg/bytes#Lines) returns an iterator over the
33+
newline-terminated lines in the byte slice s.
34+
- [SplitSeq](/pkg/bytes#SplitSeq) returns an iterator over
35+
all substrings of s separated by sep.
36+
- [SplitAfterSeq](/pkg/bytes#SplitAfterSeq) returns an iterator
37+
over substrings of s split after each instance of sep.
38+
- [FieldsSeq](/pkg/bytes#FieldsSeq) returns an iterator over
39+
substrings of s split around runs of whitespace characters,
40+
as defined by unicode.IsSpace.
41+
- [FieldsFuncSeq](/pkg/bytes#FieldsFuncSeq) returns an iterator
42+
over substrings of s split around runs of Unicode code points satisfying f(c).
43+
44+
The [strings] package adds several functions that work with iterators:
45+
- [Lines](/pkg/strings#Lines) returns an iterator over
46+
the newline-terminated lines in the string s.
47+
- [SplitSeq](/pkg/strings#SplitSeq) returns an iterator over
48+
all substrings of s separated by sep.
49+
- [SplitAfterSeq](/pkg/strings#SplitAfterSeq) returns an iterator
50+
over substrings of s split after each instance of sep.
51+
- [FieldsSeq](/pkg/strings#FieldsSeq) returns an iterator over
52+
substrings of s split around runs of whitespace characters,
53+
as defined by unicode.IsSpace.
54+
- [FieldsFuncSeq](/pkg/strings#FieldsFuncSeq) returns an iterator
55+
over substrings of s split around runs of Unicode code points satisfying f(c).
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!-- see ../../3-iter.md -->
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!-- see ../../3-iter.md -->

src/bytes/bytes.go

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package bytes
88

99
import (
1010
"internal/bytealg"
11+
"iter"
1112
"unicode"
1213
"unicode/utf8"
1314
)
@@ -318,6 +319,28 @@ func LastIndexAny(s []byte, chars string) int {
318319
return -1
319320
}
320321

322+
// Lines returns an iterator over the newline-terminated lines in the byte slice s.
323+
// The lines yielded by the iterator include their terminating newlines.
324+
// If s is empty, the iterator yields no lines at all.
325+
// If s does not end in a newline, the final yielded line will not end in a newline.
326+
// It returns a single-use iterator.
327+
func Lines(s []byte) iter.Seq[[]byte] {
328+
return func(yield func([]byte) bool) {
329+
for len(s) > 0 {
330+
var line []byte
331+
if i := IndexByte(s, '\n'); i >= 0 {
332+
line, s = s[:i+1], s[i+1:]
333+
} else {
334+
line, s = s, nil
335+
}
336+
if !yield(line) {
337+
return
338+
}
339+
}
340+
return
341+
}
342+
}
343+
321344
// Generic split: splits after each instance of sep,
322345
// including sepSave bytes of sep in the subslices.
323346
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
@@ -390,6 +413,57 @@ func SplitAfter(s, sep []byte) [][]byte {
390413
return genSplit(s, sep, len(sep), -1)
391414
}
392415

416+
// explodeSeq returns an iterator over the runes in s.
417+
func explodeSeq(s []byte) iter.Seq[[]byte] {
418+
return func(yield func([]byte) bool) {
419+
for len(s) > 0 {
420+
_, size := utf8.DecodeRune(s)
421+
if !yield(s[:size]) {
422+
return
423+
}
424+
s = s[size:]
425+
}
426+
}
427+
}
428+
429+
// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
430+
// bytes of sep to include in the results (none or all).
431+
func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
432+
if len(sep) == 0 {
433+
return explodeSeq(s)
434+
}
435+
return func(yield func([]byte) bool) {
436+
for {
437+
i := Index(s, sep)
438+
if i < 0 {
439+
break
440+
}
441+
frag := s[:i+sepSave]
442+
if !yield(frag) {
443+
return
444+
}
445+
s = s[i+len(sep):]
446+
}
447+
yield(s)
448+
}
449+
}
450+
451+
// SplitSeq returns an iterator over all substrings of s separated by sep.
452+
// The iterator yields the same strings that would be returned by Split(s, sep),
453+
// but without constructing the slice.
454+
// It returns a single-use iterator.
455+
func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
456+
return splitSeq(s, sep, 0)
457+
}
458+
459+
// SplitAfterSeq returns an iterator over substrings of s split after each instance of sep.
460+
// The iterator yields the same strings that would be returned by SplitAfter(s, sep),
461+
// but without constructing the slice.
462+
// It returns a single-use iterator.
463+
func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
464+
return splitSeq(s, sep, len(sep))
465+
}
466+
393467
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
394468

395469
// Fields interprets s as a sequence of UTF-8-encoded code points.
@@ -446,6 +520,40 @@ func Fields(s []byte) [][]byte {
446520
return a
447521
}
448522

523+
// FieldsSeq returns an iterator over substrings of s split around runs of
524+
// whitespace characters, as defined by unicode.IsSpace.
525+
// The iterator yields the same strings that would be returned by Fields(s),
526+
// but without constructing the slice.
527+
func FieldsSeq(s []byte) iter.Seq[[]byte] {
528+
return func(yield func([]byte) bool) {
529+
s := s
530+
start := -1
531+
for i := 0; i < len(s); {
532+
size := 1
533+
r := rune(s[i])
534+
isSpace := asciiSpace[s[i]] != 0
535+
if r >= utf8.RuneSelf {
536+
r, size = utf8.DecodeRune(s[i:])
537+
isSpace = unicode.IsSpace(r)
538+
}
539+
if isSpace {
540+
if start >= 0 {
541+
if !yield(s[start:i]) {
542+
return
543+
}
544+
start = -1
545+
}
546+
} else if start < 0 {
547+
start = i
548+
}
549+
i += size
550+
}
551+
if start >= 0 {
552+
yield(s[start:])
553+
}
554+
}
555+
}
556+
449557
// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
450558
// It splits the slice s at each run of code points c satisfying f(c) and
451559
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
@@ -500,6 +608,38 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
500608
return a
501609
}
502610

611+
// FieldsFuncSeq returns an iterator over substrings of s split around runs of
612+
// Unicode code points satisfying f(c).
613+
// The iterator yields the same strings that would be returned by FieldsFunc(s),
614+
// but without constructing the slice.
615+
func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
616+
return func(yield func([]byte) bool) {
617+
s := s
618+
start := -1
619+
for i := 0; i < len(s); {
620+
size := 1
621+
r := rune(s[i])
622+
if r >= utf8.RuneSelf {
623+
r, size = utf8.DecodeRune(s[i:])
624+
}
625+
if f(r) {
626+
if start >= 0 {
627+
if !yield(s[start:i]) {
628+
return
629+
}
630+
start = -1
631+
}
632+
} else if start < 0 {
633+
start = i
634+
}
635+
i += size
636+
}
637+
if start >= 0 {
638+
yield(s[start:])
639+
}
640+
}
641+
}
642+
503643
// Join concatenates the elements of s to create a new byte slice. The separator
504644
// sep is placed between elements in the resulting slice.
505645
func Join(s [][]byte, sep []byte) []byte {

src/bytes/bytes_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ import (
88
. "bytes"
99
"fmt"
1010
"internal/testenv"
11+
"iter"
1112
"math"
1213
"math/rand"
1314
"reflect"
15+
"slices"
1416
"strings"
1517
"testing"
1618
"unicode"
@@ -769,6 +771,22 @@ func BenchmarkCountSingle(b *testing.B) {
769771
})
770772
}
771773

774+
var LinesTest = []string{
775+
"abc\nabc\n",
776+
"abc\r\nabc",
777+
"abc\r\n",
778+
"abc\n",
779+
}
780+
781+
func TestLines(t *testing.T) {
782+
for _, s := range LinesTest {
783+
result := Join(slices.Collect(Lines([]byte(s))), []byte(""))
784+
if string(result) != s {
785+
t.Errorf(`Join(collect(Lines(%q)), "") = %q`, s, result)
786+
}
787+
}
788+
}
789+
772790
type SplitTest struct {
773791
s string
774792
sep string
@@ -812,6 +830,14 @@ func TestSplit(t *testing.T) {
812830
t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
813831
continue
814832
}
833+
834+
if tt.n < 0 {
835+
result2 := sliceOfString(slices.Collect(SplitSeq([]byte(tt.s), []byte(tt.sep))))
836+
if !eq(result2, tt.a) {
837+
t.Errorf(`collect(SplitSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
838+
}
839+
}
840+
815841
if tt.n == 0 || len(a) == 0 {
816842
continue
817843
}
@@ -871,6 +897,13 @@ func TestSplitAfter(t *testing.T) {
871897
continue
872898
}
873899

900+
if tt.n < 0 {
901+
result2 := sliceOfString(slices.Collect(SplitAfterSeq([]byte(tt.s), []byte(tt.sep))))
902+
if !eq(result2, tt.a) {
903+
t.Errorf(`collect(SplitAfterSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
904+
}
905+
}
906+
874907
if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
875908
t.Errorf("last appended result was %s; want %s", x, want)
876909
}
@@ -924,6 +957,11 @@ func TestFields(t *testing.T) {
924957
continue
925958
}
926959

960+
result2 := sliceOfString(collect(t, FieldsSeq([]byte(tt.s))))
961+
if !eq(result2, tt.a) {
962+
t.Errorf(`collect(FieldsSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
963+
}
964+
927965
if string(b) != tt.s {
928966
t.Errorf("slice changed to %s; want %s", string(b), tt.s)
929967
}
@@ -966,6 +1004,11 @@ func TestFieldsFunc(t *testing.T) {
9661004
t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
9671005
}
9681006

1007+
result2 := sliceOfString(collect(t, FieldsFuncSeq([]byte(tt.s), pred)))
1008+
if !eq(result2, tt.a) {
1009+
t.Errorf(`collect(FieldsFuncSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
1010+
}
1011+
9691012
if string(b) != tt.s {
9701013
t.Errorf("slice changed to %s; want %s", b, tt.s)
9711014
}
@@ -2278,3 +2321,12 @@ func TestClone(t *testing.T) {
22782321
}
22792322
}
22802323
}
2324+
2325+
func collect(t *testing.T, seq iter.Seq[[]byte]) [][]byte {
2326+
out := slices.Collect(seq)
2327+
out1 := slices.Collect(seq)
2328+
if !reflect.DeepEqual(out, out1) {
2329+
t.Fatalf("inconsistent seq:\n%s\n%s", out, out1)
2330+
}
2331+
return out
2332+
}

0 commit comments

Comments
 (0)