diff --git a/CHANGELOG.md b/CHANGELOG.md index cb8f4af4698..e8203a5a5c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,16 @@ ## master / unreleased +* [FEATURE] Fan out parallelizable queries to backend queriers concurrently. #1878 + * `querier.parallelise-shardable-queries` (bool) + * Requires a shard-compatible schema (v10+) + * This causes the number of traces to increase accordingly. + * The query-frontend now requires a schema config to determine how/when to shard queries, either from a file or from flags (i.e. by the `config-yaml` CLI flag). This is the same schema config the queriers consume. The schema config is only required when this option is enabled. + * It's also advised to increase downstream concurrency controls: + * `querier.max-outstanding-requests-per-tenant` + * `querier.max-query-parallelism` + * `querier.max-concurrent` + * `server.grpc-max-concurrent-streams` (for both query-frontends and queriers) * [CHANGE] The frontend http server will now send 502 in case of deadline exceeded and 499 if the user requested cancellation. #2156 * [CHANGE] Config file changed to remove top level `config_store` field in favor of a nested `configdb` field. #2125 * [CHANGE] Removed unnecessary `frontend.cache-split-interval` in favor of `querier.split-queries-by-interval` both to reduce configuration complexity and guarantee alignment of these two configs. Starting from now, `-querier.cache-results` may only be enabled in conjunction with `-querier.split-queries-by-interval` (previously the cache interval default was `24h` so if you want to preserve the same behaviour you should set `-querier.split-queries-by-interval=24h`). #2040 diff --git a/docs/configuration/arguments.md b/docs/configuration/arguments.md index 737a52f727d..94acfd86878 100644 --- a/docs/configuration/arguments.md +++ b/docs/configuration/arguments.md @@ -68,6 +68,30 @@ The ingester query API was improved over time, but defaults to the old behaviour ## Query Frontend +- `-querier.parallelise-shardable-queries`
+
+  If set to true, will cause the query frontend to mutate incoming queries when possible by turning `sum` operations into sharded `sum` operations. This requires a shard-compatible schema (v10+). An abridged example:
+  `sum by (foo) (rate(bar{baz="blip"}[1m]))` ->
+  ```
+  sum by (foo) (
+    sum by (foo) (rate(bar{baz="blip",__cortex_shard__="0of16"}[1m])) or
+    sum by (foo) (rate(bar{baz="blip",__cortex_shard__="1of16"}[1m])) or
+    ...
+    sum by (foo) (rate(bar{baz="blip",__cortex_shard__="15of16"}[1m]))
+  )
+  ```
+  When enabled, the query-frontend requires a schema config to determine how/when to shard queries, either from a file or from flags (i.e. by the `config-yaml` CLI flag). This is the same schema config the queriers consume.
+  It's also advised to increase downstream concurrency controls to account for more queries of smaller sizes:
+
+  - `querier.max-outstanding-requests-per-tenant`
+  - `querier.max-query-parallelism`
+  - `querier.max-concurrent`
+  - `server.grpc-max-concurrent-streams` (for both query-frontends and queriers)
+
+  Furthermore, both querier and query-frontend components require the `querier.query-ingesters-within` parameter to know when to start sharding requests (ingester queries are not sharded). It's recommended to align this with `ingester.max-chunk-age`.
+
+  Instrumentation (traces) also scales with the number of sharded queries, so it's suggested to account for increased throughput there as well (for instance via `JAEGER_REPORTER_MAX_QUEUE_SIZE`).
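+  For illustration only, a hypothetical pair of invocations combining the flags above might look like the sketch below; the schema file path and all numeric values are placeholder assumptions to tune per workload, not recommendations:
+
+  ```
+  # Sketch only: paths and values are illustrative placeholders.
+  cortex -target=query-frontend \
+    -config-yaml=/etc/cortex/schema.yaml \
+    -querier.parallelise-shardable-queries=true \
+    -querier.query-ingesters-within=12h \
+    -server.grpc-max-concurrent-streams=1024
+
+  # Downstream queriers consume the same schema config and would raise
+  # their own concurrency limits accordingly (again, placeholder values).
+  cortex -target=querier \
+    -config-yaml=/etc/cortex/schema.yaml \
+    -querier.query-ingesters-within=12h \
+    -querier.max-concurrent=20 \
+    -server.grpc-max-concurrent-streams=1024
+  ```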
+ - `-querier.align-querier-with-step` If set to true, will cause the query frontend to mutate incoming queries and align their start and end parameters to the step parameter of the query. This improves the cacheability of the query results. diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index f5476173763..1201fe0e839 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -641,6 +641,11 @@ results_cache: # error is returned. # CLI flag: -querier.max-retries-per-request [max_retries: <int> | default = 5] +
+# Perform query parallelisations based on storage sharding configuration and
+# query ASTs. This feature is supported only by the chunks storage engine.
+# CLI flag: -querier.parallelise-shardable-queries
+[parallelise_shardable_queries: <boolean> | default = false] ``` ## `ruler_config` diff --git a/docs/operations/query-auditor.md b/docs/operations/query-auditor.md new file mode 100644 index 00000000000..32828cef159 --- /dev/null +++ b/docs/operations/query-auditor.md @@ -0,0 +1,140 @@ +---
+title: "Query Auditor (tool)"
+linkTitle: "Query Auditor (tool)"
+weight: 2
+slug: query-auditor
+---
+
+The query auditor is a tool bundled in the Cortex repository, but **not** included in Docker images -- it must be built from source. It's primarily useful for those _developing_ Cortex, but can also be helpful to operators in certain scenarios (backend migrations come to mind).
+
+## How it works
+
+The `query-audit` tool performs a set of queries against two backends that expose the Prometheus read API. This is generally the `query-frontend` component of two Cortex deployments. It then compares the responses to determine the average difference for each query. It does this by:
+
+ - Ensuring the resulting label sets match.
+ - For each label set, ensuring it contains the same number of samples as its pair from the other backend.
+ - For each sample, calculating its difference against its pair from the other backend/label set.
+ - Calculating the average diff per query from the above diffs.
+
+### Limitations
+
+It currently only supports queries with `Matrix` response types.
+
+### Use cases
+
+- Correctness testing when working on the read path.
+- Comparing results from different backends.
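For illustration, here is a minimal Go sketch of the averaging step described under "How it works", using simplified, hypothetical types; it is not the tool's actual implementation:

```go
package main

import (
	"fmt"
	"math"
)

// sample is a simplified (timestamp, value) pair. The real tool compares
// Prometheus Matrix responses; this sketch only illustrates the averaging step.
type sample struct {
	ts    int64
	value float64
}

// avgDiff returns the average relative difference (as a percentage) between
// two aligned series, assuming the label-set and sample-count checks described
// above have already passed.
func avgDiff(control, test []sample) (float64, error) {
	if len(control) != len(test) {
		return 0, fmt.Errorf("sample count mismatch: %d vs %d", len(control), len(test))
	}
	if len(control) == 0 {
		return 0, nil
	}
	var total float64
	for i := range control {
		if control[i].ts != test[i].ts {
			return 0, fmt.Errorf("timestamp mismatch at index %d", i)
		}
		denom := math.Abs(control[i].value)
		if denom == 0 {
			// Avoid dividing by zero; fall back to the absolute difference.
			denom = 1
		}
		total += math.Abs(control[i].value-test[i].value) / denom
	}
	return total / float64(len(control)) * 100, nil
}

func main() {
	control := []sample{{ts: 1, value: 10}, {ts: 2, value: 20}}
	test := []sample{{ts: 1, value: 10}, {ts: 2, value: 21}}
	diff, err := avgDiff(control, test)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%f%% avg diff\n", diff)
}
```

In practice the tool performs this comparison per matching label set over `Matrix` responses and reports one average per query, as in the example output below.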
+ +### Example Configuration + +```yaml +control: + host: http://localhost:8080/api/prom + headers: + "X-Scope-OrgID": 1234 + +test: + host: http://localhost:8081/api/prom + headers: + "X-Scope-OrgID": 1234 + +queries: + - query: 'sum(rate(container_cpu_usage_seconds_total[5m]))' + start: 2019-11-25T00:00:00Z + end: 2019-11-28T00:00:00Z + step_size: 15m + - query: 'sum(rate(container_cpu_usage_seconds_total[5m])) by (container_name)' + start: 2019-11-25T00:00:00Z + end: 2019-11-28T00:00:00Z + step_size: 15m + - query: 'sum(rate(container_cpu_usage_seconds_total[5m])) without (container_name)' + start: 2019-11-25T00:00:00Z + end: 2019-11-26T00:00:00Z + step_size: 15m + - query: 'histogram_quantile(0.9, sum(rate(cortex_cache_value_size_bytes_bucket[5m])) by (le, job))' + start: 2019-11-25T00:00:00Z + end: 2019-11-25T06:00:00Z + step_size: 15m + # two shardable legs + - query: 'sum without (instance, job) (rate(cortex_query_frontend_queue_length[5m])) or sum by (job) (rate(cortex_query_frontend_queue_length[5m]))' + start: 2019-11-25T00:00:00Z + end: 2019-11-25T06:00:00Z + step_size: 15m + # one shardable leg + - query: 'sum without (instance, job) (rate(cortex_cache_request_duration_seconds_count[5m])) or rate(cortex_cache_request_duration_seconds_count[5m])' + start: 2019-11-25T00:00:00Z + end: 2019-11-25T06:00:00Z + step_size: 15m +``` + +### Example Output + +Under ideal circumstances, you'll see output like the following: + +``` +$ go run ./tools/query-audit/ -f config.yaml + +0.000000% avg diff for: + query: sum(rate(container_cpu_usage_seconds_total[5m])) + series: 1 + samples: 289 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-28 00:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum(rate(container_cpu_usage_seconds_total[5m])) by (container_name) + series: 95 + samples: 25877 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-28 00:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum(rate(container_cpu_usage_seconds_total[5m])) without (container_name) + series: 4308 + samples: 374989 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-26 00:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: histogram_quantile(0.9, sum(rate(cortex_cache_value_size_bytes_bucket[5m])) by (le, job)) + series: 13 + samples: 325 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-25 06:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum without (instance, job) (rate(cortex_query_frontend_queue_length[5m])) or sum by (job) (rate(cortex_query_frontend_queue_length[5m])) + series: 21 + samples: 525 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-25 06:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum without (instance, job) (rate(cortex_cache_request_duration_seconds_count[5m])) or rate(cortex_cache_request_duration_seconds_count[5m]) + series: 942 + samples: 23550 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-25 06:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum by (namespace) (predict_linear(container_cpu_usage_seconds_total[5m], 10)) + series: 16 + samples: 400 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-25 06:00:00 +0000 UTC + step: 15m0s + +0.000000% avg diff for: + query: sum by (namespace) (avg_over_time((rate(container_cpu_usage_seconds_total[5m]))[10m:]) > 1) + series: 4 + samples: 52 + start: 2019-11-25 00:00:00 +0000 UTC + end: 2019-11-25 01:00:00 +0000 UTC + step: 5m0s +``` diff --git a/go.mod b/go.mod index 3f247cd7477..35c28decf1b 100644 --- 
a/go.mod +++ b/go.mod @@ -72,6 +72,7 @@ require ( google.golang.org/api v0.14.0 google.golang.org/grpc v1.25.1 gopkg.in/yaml.v2 v2.2.5 + sigs.k8s.io/yaml v1.1.0 ) replace github.com/Azure/azure-sdk-for-go => github.com/Azure/azure-sdk-for-go v36.2.0+incompatible diff --git a/pkg/chunk/chunk_store.go b/pkg/chunk/chunk_store.go index a3dd5fcad19..566c0e89f55 100644 --- a/pkg/chunk/chunk_store.go +++ b/pkg/chunk/chunk_store.go @@ -429,6 +429,9 @@ func (c *store) lookupChunksByMetricName(ctx context.Context, userID string, fro } func (c *store) lookupEntriesByQueries(ctx context.Context, queries []IndexQuery) ([]IndexEntry, error) { + log, ctx := spanlogger.New(ctx, "store.lookupEntriesByQueries") + defer log.Span.Finish() + var lock sync.Mutex var entries []IndexEntry err := c.index.QueryPages(ctx, queries, func(query IndexQuery, resp ReadBatch) bool { diff --git a/pkg/chunk/chunk_store_test.go b/pkg/chunk/chunk_store_test.go index 41cd0944deb..f7f96ae9271 100644 --- a/pkg/chunk/chunk_store_test.go +++ b/pkg/chunk/chunk_store_test.go @@ -77,6 +77,8 @@ func newTestChunkStoreConfig(t require.TestingT, schemaName string, storeCfg Sto tbmConfig TableManagerConfig schemaCfg = DefaultSchemaConfig("", schemaName, 0) ) + err := schemaCfg.Validate() + require.NoError(t, err) flagext.DefaultValues(&tbmConfig) storage := NewMockStorage() tableManager, err := NewTableManager(tbmConfig, schemaCfg, maxChunkAge, storage, nil) diff --git a/pkg/chunk/chunk_store_utils.go b/pkg/chunk/chunk_store_utils.go index eb6ced986d1..27a5a84fe97 100644 --- a/pkg/chunk/chunk_store_utils.go +++ b/pkg/chunk/chunk_store_utils.go @@ -146,13 +146,13 @@ func (c *Fetcher) worker() { // FetchChunks fetches a set of chunks from cache and store. Note that the keys passed in must be // lexicographically sorted, while the returned chunks are not in the same order as the passed in chunks. func (c *Fetcher) FetchChunks(ctx context.Context, chunks []Chunk, keys []string) ([]Chunk, error) { - log, ctx := spanlogger.New(ctx, "ChunkStore.fetchChunks") + log, ctx := spanlogger.New(ctx, "ChunkStore.FetchChunks") defer log.Span.Finish() // Now fetch the actual chunk data from Memcache / S3 cacheHits, cacheBufs, _ := c.cache.Fetch(ctx, keys) - fromCache, missing, err := c.processCacheResponse(chunks, cacheHits, cacheBufs) + fromCache, missing, err := c.processCacheResponse(ctx, chunks, cacheHits, cacheBufs) if err != nil { level.Warn(log).Log("msg", "error fetching from cache", "err", err) } @@ -199,12 +199,14 @@ func (c *Fetcher) writeBackCache(ctx context.Context, chunks []Chunk) error { // ProcessCacheResponse decodes the chunks coming back from the cache, separating // hits and misses. 
-func (c *Fetcher) processCacheResponse(chunks []Chunk, keys []string, bufs [][]byte) ([]Chunk, []Chunk, error) { +func (c *Fetcher) processCacheResponse(ctx context.Context, chunks []Chunk, keys []string, bufs [][]byte) ([]Chunk, []Chunk, error) { var ( requests = make([]decodeRequest, 0, len(keys)) responses = make(chan decodeResponse) missing []Chunk ) + log, _ := spanlogger.New(ctx, "Fetcher.processCacheResponse") + defer log.Span.Finish() i, j := 0, 0 for i < len(chunks) && j < len(keys) { @@ -229,6 +231,7 @@ func (c *Fetcher) processCacheResponse(chunks []Chunk, keys []string, bufs [][]b for ; i < len(chunks); i++ { missing = append(missing, chunks[i]) } + level.Debug(log).Log("chunks", len(chunks), "decodeRequests", len(requests), "missing", len(missing)) go func() { for _, request := range requests { diff --git a/pkg/chunk/schema.go b/pkg/chunk/schema.go index e52a5eed115..7a9441e0d73 100644 --- a/pkg/chunk/schema.go +++ b/pkg/chunk/schema.go @@ -5,11 +5,16 @@ import ( "encoding/hex" "errors" "fmt" + "strconv" "strings" + "github.com/go-kit/kit/log/level" jsoniter "github.com/json-iterator/go" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" + + "github.com/cortexproject/cortex/pkg/querier/astmapper" + "github.com/cortexproject/cortex/pkg/util" ) const ( @@ -48,6 +53,7 @@ type Schema interface { GetReadQueriesForMetric(from, through model.Time, userID string, metricName string) ([]IndexQuery, error) GetReadQueriesForMetricLabel(from, through model.Time, userID string, metricName string, labelName string) ([]IndexQuery, error) GetReadQueriesForMetricLabelValue(from, through model.Time, userID string, metricName string, labelName string, labelValue string) ([]IndexQuery, error) + FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery // If the query resulted in series IDs, use this method to find chunks. GetChunksForSeries(from, through model.Time, userID string, seriesID []byte) ([]IndexQuery, error) @@ -218,6 +224,10 @@ func (s schema) GetLabelNamesForSeries(from, through model.Time, userID string, return result, nil } +func (s schema) FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery { + return s.entries.FilterReadQueries(queries, shard) +} + type entries interface { GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) GetLabelWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) @@ -228,6 +238,7 @@ type entries interface { GetReadMetricLabelValueQueries(bucket Bucket, metricName string, labelName string, labelValue string) ([]IndexQuery, error) GetChunksForSeries(bucket Bucket, seriesID []byte) ([]IndexQuery, error) GetLabelNamesForSeries(bucket Bucket, seriesID []byte) ([]IndexQuery, error) + FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery } // original entries: @@ -303,6 +314,10 @@ func (originalEntries) GetLabelNamesForSeries(_ Bucket, _ []byte) ([]IndexQuery, return nil, ErrNotSupported } +func (originalEntries) FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery { + return queries +} + // v3Schema went to base64 encoded label values & a version ID // - range key: