diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
index 90009ecb61bb5..8762bedb17ab8 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
@@ -26,6 +26,9 @@
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 
+import static org.apache.hadoop.io.Sizes.S_16K;
+import static org.apache.hadoop.io.Sizes.S_1M;
+
 /**
  * Stream that permits positional reading.
  *
@@ -95,7 +98,7 @@ void readFully(long position, byte[] buffer, int offset, int length)
    * @return the minimum number of bytes
    */
   default int minSeekForVectorReads() {
-    return 4 * 1024;
+    return S_16K;
   }
 
   /**
@@ -103,7 +106,7 @@ default int minSeekForVectorReads() {
    * @return the number of bytes to read at once
    */
   default int maxReadSizeForVectorReads() {
-    return 1024 * 1024;
+    return S_1M;
   }
 
   /**
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Sizes.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Sizes.java
new file mode 100644
index 0000000000000..bf2dc78741f51
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Sizes.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Sizes of binary values and some other common sizes.
+ * This avoids having to remember the larger binary values,
+ * and stops IDEs/style checkers complaining about numeric
+ * values in source code.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public final class Sizes {
+
+  /** 2^8 bytes: {@value}. */
+  public static final int S_256 = 256;
+
+  /** 2^9 bytes: {@value}. */
+  public static final int S_512 = S_256 << 1;
+
+  /** 2^10 bytes - 1 KiB: {@value}. */
+  public static final int S_1K = S_512 << 1;
+
+  /** 2^11 bytes - 2 KiB: {@value}. */
+  public static final int S_2K = S_1K << 1;
+
+  /** 2^12 bytes - 4 KiB: {@value}. */
+  public static final int S_4K = S_2K << 1;
+
+  /** 2^13 bytes: {@value}. */
+  public static final int S_8K = S_4K << 1;
+
+  /** 2^14 bytes: {@value}. */
+  public static final int S_16K = S_8K << 1;
+
+  /** 2^15 bytes: {@value}. */
+  public static final int S_32K = S_16K << 1;
+
+  /** 2^16 bytes: {@value}. */
+  public static final int S_64K = S_32K << 1;
+
+  /** 2^17 bytes, 128 KiB: {@value}. */
+  public static final int S_128K = S_64K << 1;
+
+  /** 2^18 bytes, 256 KiB: {@value}. */
+  public static final int S_256K = S_128K << 1;
+
+  /** 2^19 bytes, 512 KiB: {@value}. */
+  public static final int S_512K = S_256K << 1;
+
+  /** 2^20 bytes, 1 MiB: {@value}. */
+  public static final int S_1M = S_512K << 1;
+
+  /** 2^21 bytes, 2 MiB: {@value}. */
+  public static final int S_2M = S_1M << 1;
+
+  /** 2^22 bytes, 4 MiB: {@value}. */
+  public static final int S_4M = S_2M << 1;
+
+  /** 2^23 bytes, 8 MiB: {@value}. */
+  public static final int S_8M = S_4M << 1;
+
+  /** 2^24 bytes, 16 MiB: {@value}. */
+  public static final int S_16M = S_8M << 1;
+
+  /** 2^25 bytes, 32 MiB: {@value}. */
+  public static final int S_32M = S_16M << 1;
+
+  /** 5 MiB: {@value}. */
+  public static final int S_5M = 5 * S_1M;
+
+  /** 10 MiB: {@value}. */
+  public static final int S_10M = 10 * S_1M;
+
+}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
index 0a6944c6d293f..e695e918c953d 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
@@ -26,6 +26,9 @@
 import java.time.Duration;
 import java.util.concurrent.TimeUnit;
 
+import static org.apache.hadoop.io.Sizes.S_128K;
+import static org.apache.hadoop.io.Sizes.S_2M;
+
 /**
  * Constants used with the {@link S3AFileSystem}.
  *
@@ -1545,14 +1548,14 @@ private Constants() {
       "fs.s3a.vectored.read.max.merged.size";
 
   /**
-   * Default minimum seek in bytes during vectored reads : {@value}.
+   * Default minimum seek in bytes during vectored reads: {@value}.
    */
-  public static final int DEFAULT_AWS_S3_VECTOR_READS_MIN_SEEK_SIZE = 4096; // 4K
+  public static final int DEFAULT_AWS_S3_VECTOR_READS_MIN_SEEK_SIZE = S_128K;
 
   /**
    * Default maximum read size in bytes during vectored reads : {@value}.
    */
-  public static final int DEFAULT_AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE = 1048576; //1M
+  public static final int DEFAULT_AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE = S_2M;
 
   /**
    * Maximum number of range reads a single input stream can have
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md
index 8fddddf79c74f..53239b1295d18 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/performance.md
@@ -67,7 +67,7 @@ on the client requirements.
 ```xml
 <property>
   <name>fs.s3a.vectored.read.min.seek.size</name>
-  <value>4K</value>
+  <value>128K</value>
   <description>
     What is the smallest reasonable seek in bytes such
     that we group ranges together during vectored
@@ -76,7 +76,7 @@ on the client requirements.
 
 <property>
   <name>fs.s3a.vectored.read.max.merged.size</name>
-  <value>1M</value>
+  <value>2M</value>
   <description>
     What is the largest merged read size in bytes such
     that we group ranges together during vectored read.
@@ -282,7 +282,7 @@ Fix: Use one of the dedicated [S3A Committers](committers.md).
 
 ## Options to Tune
 
-### Performance Flags: `fs.s3a.performance.flag`
+### Performance Flags: `fs.s3a.performance.flags`
 
 This option takes a comma separated list of performance flags.
 View it as the equivalent of the `-O` compiler optimization list C/C++ compilers offer.
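As an aside, not part of the patch: the two options documented above can also be set programmatically per job. A minimal sketch, assuming the suffix-parsing behaviour the test below relies on ("2K", "10M"); the 256K/4M values are invented examples, not recommendations from this change:

```java
import org.apache.hadoop.conf.Configuration;

public class VectoredReadTuning {

  /**
   * Build a configuration overriding the vectored read thresholds
   * for a single job. Keys are the S3A options documented above;
   * the values here are illustrative assumptions only.
   */
  public static Configuration tunedConf() {
    Configuration conf = new Configuration();
    // Ranges separated by less than this many bytes may be merged into one read.
    conf.set("fs.s3a.vectored.read.min.seek.size", "256K");
    // Upper bound in bytes on the size of any single merged read.
    conf.set("fs.s3a.vectored.read.max.merged.size", "4M");
    return conf;
  }
}
```
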
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java
index 8096f55bcd54c..fbb6d5a04d27a 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java
@@ -61,8 +61,12 @@
 import static org.apache.hadoop.fs.contract.ContractTestUtils.range;
 import static org.apache.hadoop.fs.contract.ContractTestUtils.returnBuffersToPoolPostRead;
 import static org.apache.hadoop.fs.contract.ContractTestUtils.validateVectoredReadResult;
+import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE;
+import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_VECTOR_READS_MIN_SEEK_SIZE;
 import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;
 import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.io.Sizes.S_1M;
+import static org.apache.hadoop.io.Sizes.S_4K;
 import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture;
 import static org.apache.hadoop.test.MoreAsserts.assertEqual;
 
@@ -139,13 +143,13 @@ public void testEOFRanges416Handling() throws Exception {
   public void testMinSeekAndMaxSizeConfigsPropagation() throws Exception {
     Configuration conf = getFileSystem().getConf();
     S3ATestUtils.removeBaseAndBucketOverrides(conf,
-        Constants.AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE,
-        Constants.AWS_S3_VECTOR_READS_MIN_SEEK_SIZE);
+        AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE,
+        AWS_S3_VECTOR_READS_MIN_SEEK_SIZE);
     S3ATestUtils.disableFilesystemCaching(conf);
     final int configuredMinSeek = 2 * 1024;
     final int configuredMaxSize = 10 * 1024 * 1024;
-    conf.set(Constants.AWS_S3_VECTOR_READS_MIN_SEEK_SIZE, "2K");
-    conf.set(Constants.AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE, "10M");
+    conf.set(AWS_S3_VECTOR_READS_MIN_SEEK_SIZE, "2K");
+    conf.set(AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE, "10M");
     try (S3AFileSystem fs = S3ATestUtils.createTestFileSystem(conf)) {
       try (FSDataInputStream fis = openVectorFile(fs)) {
         int newMinSeek = fis.minSeekForVectorReads();
@@ -162,8 +166,8 @@ public void testMinSeekAndMaxSizeConfigsPropagation() throws Exception {
   public void testMinSeekAndMaxSizeDefaultValues() throws Exception {
     Configuration conf = getFileSystem().getConf();
     S3ATestUtils.removeBaseAndBucketOverrides(conf,
-        Constants.AWS_S3_VECTOR_READS_MIN_SEEK_SIZE,
-        Constants.AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE);
+        AWS_S3_VECTOR_READS_MIN_SEEK_SIZE,
+        AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE);
     try (S3AFileSystem fs = S3ATestUtils.createTestFileSystem(conf)) {
       try (FSDataInputStream fis = openVectorFile(fs)) {
         int minSeek = fis.minSeekForVectorReads();
@@ -400,16 +404,25 @@ public void testMultiVectoredReadStatsCollection() throws Exception {
     }
   }
 
+  /**
+   * Create a test fs with no readahead.
+   * The vector IO ranges are set to the original small values,
+   * so ranges on small files are not coalesced.
+   * @return a filesystem
+   * @throws IOException failure to instantiate.
+   */
   private S3AFileSystem getTestFileSystemWithReadAheadDisabled() throws IOException {
     Configuration conf = getFileSystem().getConf();
     // also resetting the min seek and max size values is important
     // as this same test suite has test which overrides these params.
     S3ATestUtils.removeBaseAndBucketOverrides(conf, Constants.READAHEAD_RANGE,
-        Constants.AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE,
-        Constants.AWS_S3_VECTOR_READS_MIN_SEEK_SIZE);
+        AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE,
+        AWS_S3_VECTOR_READS_MIN_SEEK_SIZE);
     S3ATestUtils.disableFilesystemCaching(conf);
     conf.setInt(Constants.READAHEAD_RANGE, 0);
+    conf.setInt(AWS_S3_VECTOR_READS_MIN_SEEK_SIZE, S_4K);
+    conf.setInt(AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE, S_1M);
     return S3ATestUtils.createTestFileSystem(conf);
   }
 
 }
diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsInputStream.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsInputStream.java
index cacd3b092eb3f..12bc471b50d6f 100644
--- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsInputStream.java
+++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsInputStream.java
@@ -26,6 +26,7 @@
 import org.apache.commons.lang3.StringUtils;
 
 import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.fs.PositionedReadable;
 import org.apache.hadoop.fs.impl.BackReference;
 import org.apache.hadoop.util.Preconditions;
 
@@ -53,6 +54,8 @@
 import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_KB;
 import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.STREAM_ID_LEN;
 import static org.apache.hadoop.fs.azurebfs.constants.InternalConstants.CAPABILITY_SAFE_READAHEAD;
+import static org.apache.hadoop.io.Sizes.S_128K;
+import static org.apache.hadoop.io.Sizes.S_2M;
 import static org.apache.hadoop.util.StringUtils.toLowerCase;
 
 /**
@@ -891,4 +894,15 @@ long getLimit() {
   BackReference getFsBackRef() {
     return fsBackRef;
   }
+
+  @Override
+  public int minSeekForVectorReads() {
+    return S_128K;
+  }
+
+  @Override
+  public int maxReadSizeForVectorReads() {
+    return S_2M;
+  }
+
 }
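To make the effect of the retuned thresholds concrete, here is a minimal sketch (not part of the patch) of the `PositionedReadable.readVectored()` API that `minSeekForVectorReads()` and `maxReadSizeForVectorReads()` govern. The bucket path, offsets, and class name are invented for illustration:

```java
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileRange;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class VectoredReadSketch {

  public static void main(String[] args) throws Exception {
    Path path = new Path("s3a://example-bucket/data.bin");  // assumed path
    try (FileSystem fs = path.getFileSystem(new Configuration());
         FSDataInputStream in = fs.open(path)) {

      // Two 4K ranges with a 64K gap. Under the old 4K min seek these were
      // issued as two separate GETs; under the new 128K S3A default the gap
      // is small enough that both ranges may merge into one 72K read.
      List<FileRange> ranges = Arrays.asList(
          FileRange.createFileRange(0, 4096),
          FileRange.createFileRange(68 * 1024, 4096));

      // Asynchronous scatter read; one buffer per range, allocated on demand.
      in.readVectored(ranges, ByteBuffer::allocate);

      for (FileRange range : ranges) {
        ByteBuffer data = range.getData().join();  // block for this range
        System.out.printf("offset %d: %d bytes%n",
            range.getOffset(), data.remaining());
      }
    }
  }
}
```

The merged request stays well under the 2M `maxReadSizeForVectorReads()` cap, so coalescing here costs 64K of discarded gap bytes in exchange for one fewer round trip, which is the trade-off motivating the larger defaults.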