Skip to content

Commit 22da091

Browse files
lukesandberg cgdecker
authored and committed
Rollforward [] which was rolled back in [] because it
caused some tests using App Engine to fail with IOExceptions when calling FileInputStream.available(). The workaround in the rollforward is to not call the method. In the one case where we were calling it, we had actually already called a similar method, so trying again doesn't seem valuable anyway. *** Original change description *** Implement ByteSource.asCharSource(charset).read() using the decoding string constructor instead of streaming the contents into a StringBuilder. This allows us to avoid a number of copies that are currently happening for each character (1. into a temporary CharBuffer, 2. into a StringBuilder, 3. into the String char[]) and replace them with simply whatever is required by ByteSource.read() and the String(byte[], charset) constructor. For certain ByteSource implementations (like FileByteSource) ByteSource.read() can often presize the byte[] correctly and the string constructor can also do some things to guess at the correct array sizes and avoid copies in the common case. Benchmarks have shown that this should improve the speed of Files.toString significantly. *** ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=162275181
1 parent b87e1f1 commit 22da091

File tree

6 files changed

+316
-8
lines changed

6 files changed

+316
-8
lines changed
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* Copyright (C) 2017 The Guava Authors
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5+
* in compliance with the License. You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software distributed under the License
10+
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11+
* or implied. See the License for the specific language governing permissions and limitations under
12+
* the License.
13+
*/
14+
15+
package com.google.common.io;
16+
17+
import com.google.caliper.BeforeExperiment;
18+
import com.google.caliper.Benchmark;
19+
import com.google.caliper.Param;
20+
import com.google.caliper.api.VmOptions;
21+
import com.google.common.base.Optional;
22+
import java.io.IOException;
23+
import java.io.InputStreamReader;
24+
import java.nio.charset.Charset;
25+
import java.util.Random;
26+
27+
/**
28+
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
29+
*/
30+
// These benchmarks allocate a lot of data so use a large heap
31+
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
32+
public class ByteSourceAsCharSourceReadBenchmark {
33+
enum ReadStrategy {
34+
TO_BYTE_ARRAY_NEW_STRING {
35+
@Override
36+
String read(ByteSource byteSource, Charset cs) throws IOException {
37+
return new String(byteSource.read(), cs);
38+
}
39+
},
40+
USING_CHARSTREAMS_COPY {
41+
@Override
42+
String read(ByteSource byteSource, Charset cs) throws IOException {
43+
StringBuilder sb = new StringBuilder();
44+
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
45+
CharStreams.copy(reader, sb);
46+
}
47+
return sb.toString();
48+
}
49+
},
50+
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
51+
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
52+
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
53+
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
54+
// come close.
55+
USING_DECODER_WITH_SIZE_HINT {
56+
@Override
57+
String read(ByteSource byteSource, Charset cs) throws IOException {
58+
Optional<Long> size = byteSource.sizeIfKnown();
59+
// if we know the size and it fits in an int
60+
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
61+
// presize a char[] to the decoder's worst-case output length for that many bytes
62+
// it is kind of lame that we need to construct a decoder to access this value.
63+
// if this is a concern we could add special cases for some known charsets (like utf8)
64+
// or we could avoid inputstreamreader and use the decoder api directly
65+
// TODO(lukes): in a real implementation we would need to handle overflow conditions
66+
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
67+
char[] buffer = new char[maxChars];
68+
int bufIndex = 0;
69+
int remaining = buffer.length;
70+
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
71+
int nRead = 0;
72+
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
73+
bufIndex += nRead;
74+
remaining -= nRead;
75+
}
76+
if (nRead == -1) {
77+
// we reached EOF
78+
return new String(buffer, 0, bufIndex);
79+
}
80+
// otherwise we got the size wrong. This can happen if the size changes between when
81+
// we called sizeIfKnown and when we started reading the file (or i guess if
82+
// maxCharsPerByte is wrong)
83+
// Fallback to an incremental approach
84+
StringBuilder builder = new StringBuilder(bufIndex + 32);
85+
builder.append(buffer, 0, bufIndex);
86+
buffer = null; // release for gc
87+
CharStreams.copy(reader, builder);
88+
return builder.toString();
89+
}
90+
91+
} else {
92+
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
93+
}
94+
}
95+
};
96+
97+
abstract String read(ByteSource byteSource, Charset cs) throws IOException;
98+
}
99+
100+
@Param({"UTF-8"})
101+
String charsetName;
102+
103+
@Param ReadStrategy strategy;
104+
105+
@Param({"10", "1024", "1048576"})
106+
int size;
107+
108+
Charset charset;
109+
ByteSource data;
110+
111+
@BeforeExperiment
112+
public void setUp() {
113+
charset = Charset.forName(charsetName);
114+
StringBuilder sb = new StringBuilder();
115+
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
116+
sb.ensureCapacity(size);
117+
for (int k = 0; k < size; k++) {
118+
// [9, 127) covers every printable ascii character plus the control characters from tab (9) upward
119+
sb.append((char) (random.nextInt(127 - 9) + 9));
120+
}
121+
String string = sb.toString();
122+
sb.setLength(0);
123+
data = ByteSource.wrap(string.getBytes(charset));
124+
}
125+
126+
@Benchmark
127+
public int timeCopy(int reps) throws IOException {
128+
int r = 0;
129+
final Charset localCharset = charset;
130+
final ByteSource localData = data;
131+
final ReadStrategy localStrategy = strategy;
132+
for (int i = 0; i < reps; i++) {
133+
r += localStrategy.read(localData, localCharset).hashCode();
134+
}
135+
return r;
136+
}
137+
}

android/guava/src/com/google/common/io/ByteSource.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
455455
return new InputStreamReader(ByteSource.this.openStream(), charset);
456456
}
457457

458+
@Override
459+
public String read() throws IOException {
460+
// Reading all the data as a byte array is more efficient than the default read()
461+
// implementation because:
462+
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
463+
// internal char array (hard to avoid using StringBuilder)
464+
// 2. we avoid extra copies into temporary buffers altogether
465+
// The downside is that this will cause us to store the file bytes in memory twice for a short
466+
// amount of time.
467+
return new String(ByteSource.this.read(), charset);
468+
}
469+
458470
@Override
459471
public String toString() {
460472
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";

android/guava/src/com/google/common/io/Files.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,15 @@ static byte[] readFile(InputStream in, long expectedSize) throws IOException {
176176
}
177177

178178
// some special files may return size 0 but have content, so read
179-
// the file normally in that case
180-
return expectedSize == 0
181-
? ByteStreams.toByteArray(in)
182-
: ByteStreams.toByteArray(in, (int) expectedSize);
179+
// the file normally in that case guessing at the buffer size to use. Note, there is no point
180+
// in calling the 'toByteArray' overload that doesn't take a size because that calls
181+
// InputStream.available(), but our caller has already done that. So instead just guess that
182+
// the file is 4K bytes long and rely on the fallback in toByteArray to expand the buffer if
183+
// needed.
184+
// This also works around an app-engine bug where FileInputStream.available() consistently
185+
// throws an IOException for certain files, even though FileInputStream.getChannel().size() does
186+
// not!
187+
return ByteStreams.toByteArray(in, expectedSize == 0 ? 4096 : (int) expectedSize);
183188
}
184189

185190
/**
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* Copyright (C) 2017 The Guava Authors
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5+
* in compliance with the License. You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software distributed under the License
10+
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11+
* or implied. See the License for the specific language governing permissions and limitations under
12+
* the License.
13+
*/
14+
15+
package com.google.common.io;
16+
17+
import com.google.caliper.BeforeExperiment;
18+
import com.google.caliper.Benchmark;
19+
import com.google.caliper.Param;
20+
import com.google.caliper.api.VmOptions;
21+
import com.google.common.base.Optional;
22+
import java.io.IOException;
23+
import java.io.InputStreamReader;
24+
import java.nio.charset.Charset;
25+
import java.util.Random;
26+
27+
/**
28+
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
29+
*/
30+
// These benchmarks allocate a lot of data so use a large heap
31+
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
32+
public class ByteSourceAsCharSourceReadBenchmark {
33+
enum ReadStrategy {
34+
TO_BYTE_ARRAY_NEW_STRING {
35+
@Override
36+
String read(ByteSource byteSource, Charset cs) throws IOException {
37+
return new String(byteSource.read(), cs);
38+
}
39+
},
40+
USING_CHARSTREAMS_COPY {
41+
@Override
42+
String read(ByteSource byteSource, Charset cs) throws IOException {
43+
StringBuilder sb = new StringBuilder();
44+
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
45+
CharStreams.copy(reader, sb);
46+
}
47+
return sb.toString();
48+
}
49+
},
50+
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
51+
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
52+
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
53+
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
54+
// come close.
55+
USING_DECODER_WITH_SIZE_HINT {
56+
@Override
57+
String read(ByteSource byteSource, Charset cs) throws IOException {
58+
Optional<Long> size = byteSource.sizeIfKnown();
59+
// if we know the size and it fits in an int
60+
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
61+
// presize a char[] to the decoder's worst-case output length for that many bytes
62+
// it is kind of lame that we need to construct a decoder to access this value.
63+
// if this is a concern we could add special cases for some known charsets (like utf8)
64+
// or we could avoid inputstreamreader and use the decoder api directly
65+
// TODO(lukes): in a real implementation we would need to handle overflow conditions
66+
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
67+
char[] buffer = new char[maxChars];
68+
int bufIndex = 0;
69+
int remaining = buffer.length;
70+
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
71+
int nRead = 0;
72+
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
73+
bufIndex += nRead;
74+
remaining -= nRead;
75+
}
76+
if (nRead == -1) {
77+
// we reached EOF
78+
return new String(buffer, 0, bufIndex);
79+
}
80+
// otherwise we got the size wrong. This can happen if the size changes between when
81+
// we called sizeIfKnown and when we started reading the file (or i guess if
82+
// maxCharsPerByte is wrong)
83+
// Fallback to an incremental approach
84+
StringBuilder builder = new StringBuilder(bufIndex + 32);
85+
builder.append(buffer, 0, bufIndex);
86+
buffer = null; // release for gc
87+
CharStreams.copy(reader, builder);
88+
return builder.toString();
89+
}
90+
91+
} else {
92+
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
93+
}
94+
}
95+
};
96+
97+
abstract String read(ByteSource byteSource, Charset cs) throws IOException;
98+
}
99+
100+
@Param({"UTF-8"})
101+
String charsetName;
102+
103+
@Param ReadStrategy strategy;
104+
105+
@Param({"10", "1024", "1048576"})
106+
int size;
107+
108+
Charset charset;
109+
ByteSource data;
110+
111+
@BeforeExperiment
112+
public void setUp() {
113+
charset = Charset.forName(charsetName);
114+
StringBuilder sb = new StringBuilder();
115+
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
116+
sb.ensureCapacity(size);
117+
for (int k = 0; k < size; k++) {
118+
// [9, 127) covers every printable ascii character plus the control characters from tab (9) upward
119+
sb.append((char) (random.nextInt(127 - 9) + 9));
120+
}
121+
String string = sb.toString();
122+
sb.setLength(0);
123+
data = ByteSource.wrap(string.getBytes(charset));
124+
}
125+
126+
@Benchmark
127+
public int timeCopy(int reps) throws IOException {
128+
int r = 0;
129+
final Charset localCharset = charset;
130+
final ByteSource localData = data;
131+
final ReadStrategy localStrategy = strategy;
132+
for (int i = 0; i < reps; i++) {
133+
r += localStrategy.read(localData, localCharset).hashCode();
134+
}
135+
return r;
136+
}
137+
}

guava/src/com/google/common/io/ByteSource.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
455455
return new InputStreamReader(ByteSource.this.openStream(), charset);
456456
}
457457

458+
@Override
459+
public String read() throws IOException {
460+
// Reading all the data as a byte array is more efficient than the default read()
461+
// implementation because:
462+
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
463+
// internal char array (hard to avoid using StringBuilder)
464+
// 2. we avoid extra copies into temporary buffers altogether
465+
// The downside is that this will cause us to store the file bytes in memory twice for a short
466+
// amount of time.
467+
return new String(ByteSource.this.read(), charset);
468+
}
469+
458470
@Override
459471
public String toString() {
460472
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";

guava/src/com/google/common/io/Files.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,15 @@ static byte[] readFile(InputStream in, long expectedSize) throws IOException {
176176
}
177177

178178
// some special files may return size 0 but have content, so read
179-
// the file normally in that case
180-
return expectedSize == 0
181-
? ByteStreams.toByteArray(in)
182-
: ByteStreams.toByteArray(in, (int) expectedSize);
179+
// the file normally in that case guessing at the buffer size to use. Note, there is no point
180+
// in calling the 'toByteArray' overload that doesn't take a size because that calls
181+
// InputStream.available(), but our caller has already done that. So instead just guess that
182+
// the file is 4K bytes long and rely on the fallback in toByteArray to expand the buffer if
183+
// needed.
184+
// This also works around an app-engine bug where FileInputStream.available() consistently
185+
// throws an IOException for certain files, even though FileInputStream.getChannel().size() does
186+
// not!
187+
return ByteStreams.toByteArray(in, expectedSize == 0 ? 4096 : (int) expectedSize);
183188
}
184189

185190
/**

0 commit comments

Comments
 (0)