elastic · csoulios · Apr 28, 2020 · May 12, 2020 · May 14, 2020 · May 14, 2020
diff --git a/docs/reference/mapping/fields.asciidoc b/docs/reference/mapping/fields.asciidoc
@@ -59,6 +59,11 @@ some of these metadata fields can be customized when a mapping type is created.
 
     Application specific metadata.
 
+=== Doc count metadata field
+
+<<mapping-doc-count-field,`_doc_count`>>::
+
+    A custom field used for storing doc counts when a document represents pre-aggregated data.
 
 include::fields/field-names-field.asciidoc[]
 
@@ -76,3 +81,4 @@ include::fields/source-field.asciidoc[]
 
 include::fields/type-field.asciidoc[]
 
+include::fields/doc-count-field.asciidoc[]
diff --git a/docs/reference/mapping/fields/doc-count-field.asciidoc b/docs/reference/mapping/fields/doc-count-field.asciidoc
@@ -0,0 +1,118 @@
+[[mapping-doc-count-field]]
+=== `_doc_count` field
+++++
+<titleabbrev>_doc_count</titleabbrev>
+++++
+
+Bucket aggregations always return a field named `doc_count` showing the number of documents that were aggregated and partitioned
+in each bucket. Computation of the value of `doc_count` is very simple. `doc_count` is incremented by 1 for every document collected
+in each bucket.
+
+While this simple approach is effective when computing aggregations over individual documents, it fails to accurately represent
+documents that store pre-aggregated data (such as `histogram` or `aggregate_metric_double` fields), because one summary field may
+represent multiple documents.
+
+To allow for correct computation of the number of documents when working with pre-aggregated data, we have introduced a
+metadata field type named `_doc_count`. `_doc_count` must always be a positive integer representing the number of documents
+aggregated in a single summary field.
+
+When field `_doc_count` is added to a document, all bucket aggregations will respect its value and increment the bucket `doc_count`
+by the value of the field. If a document does not contain any `_doc_count` field, `_doc_count = 1` is implied by default.
+
+[IMPORTANT]
+========
+* A `_doc_count` field can only store a single positive integer per document. Nested arrays are not allowed.
+* If a document contains no `_doc_count` fields, aggregators will increment by 1, which is the default behavior.
+========
+
+[[mapping-doc-count-field-example]]
+==== Example
+
+The following <<indices-create-index, create index>> API request creates a new index with the following field mappings:
+
+* `my_histogram`, a `histogram` field used to store percentile data
+* `my_text`, a `keyword` field used to store a title for the histogram
+
+[source,console]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings" : {
+    "properties" : {
+      "my_histogram" : {
+        "type" : "histogram"
+      },
+      "my_text" : {
+        "type" : "keyword"
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+The following <<docs-index_,index>> API requests store pre-aggregated data for
+two histograms: `histogram_1` and `histogram_2`.
+
+[source,console]
+--------------------------------------------------
+PUT my_index/_doc/1
+{
+  "my_text" : "histogram_1",
+  "my_histogram" : {
+      "values" : [0.1, 0.2, 0.3, 0.4, 0.5],
+      "counts" : [3, 7, 23, 12, 6]
+   },
+  "_doc_count": 45 <1>
+}
+
+PUT my_index/_doc/2
+{
+  "my_text" : "histogram_2",
+  "my_histogram" : {
+      "values" : [0.1, 0.25, 0.35, 0.4, 0.45, 0.5],
+      "counts" : [8, 17, 8, 7, 6, 2]
+   },
+  "_doc_count": 62 <1>
+}
+--------------------------------------------------
+<1> Field `_doc_count` must be a positive integer storing the number of documents aggregated to produce each histogram.
+
+If we run the following <<search-aggregations-bucket-terms-aggregation, terms aggregation>> on `my_index`:
+
+[source,console]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "histogram_titles" : {
+            "terms" : { "field" : "my_text" }
+        }
+    }
+}
+--------------------------------------------------
+
+We will get the following response:
+
+[source,console-result]
+--------------------------------------------------
+{
+    ...
+    "aggregations" : {
+        "histogram_titles" : {
+            "doc_count_error_upper_bound": 0,
+            "sum_other_doc_count": 0,
+            "buckets" : [
+                {
+                    "key" : "histogram_2",
+                    "doc_count" : 62
+                },
+                {
+                    "key" : "histogram_1",
+                    "doc_count" : 45
+                }
+            ]
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[skip:test not setup]
diff --git a/...api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml b/...api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml
@@ -0,0 +1,95 @@
+setup:
+  - do:
+      indices.create:
+        index: test_1
+        body:
+          settings:
+            number_of_replicas: 0
+          mappings:
+            properties:
+              str:
+                type: keyword
+              number:
+                type: integer
+
+  - do:
+      bulk:
+        index: test_1
+        refresh: true
+        body:
+          - '{"index": {}}'
+          - '{"_doc_count": 10, "str": "abc", "number" : 500, "unmapped": "abc" }'
+          - '{"index": {}}'
+          - '{"_doc_count": 5, "str": "xyz", "number" : 100, "unmapped": "xyz" }'
+          - '{"index": {}}'
+          - '{"_doc_count": 7, "str": "foo", "number" : 100, "unmapped": "foo" }'
+          - '{"index": {}}'
+          - '{"_doc_count": 1, "str": "foo", "number" : 200, "unmapped": "foo" }'
+          - '{"index": {}}'
+          - '{"str": "abc", "number" : 500, "unmapped": "abc" }'
+
+---
+"Test numeric terms agg with doc_count":
+  - skip:
+      version: " - 7.99.99"
+      reason: "Doc count fields are only implemented in 8.0"
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body: { "size" : 0, "aggs" : { "num_terms" : { "terms" : { "field" : "number" } } } }
+
+  - match: { hits.total: 5 }
+  - length: { aggregations.num_terms.buckets: 3 }
+  - match: { aggregations.num_terms.buckets.0.key: 100 }
+  - match: { aggregations.num_terms.buckets.0.doc_count: 12 }
+  - match: { aggregations.num_terms.buckets.1.key: 500 }
+  - match: { aggregations.num_terms.buckets.1.doc_count: 11 }
+  - match: { aggregations.num_terms.buckets.2.key: 200 }
+  - match: { aggregations.num_terms.buckets.2.doc_count: 1 }
+
+
+---
+"Test keyword terms agg with doc_count":
+  - skip:
+      version: " - 7.99.99"
+      reason: "Doc count fields are only implemented in 8.0"
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str" } } } }
+
+  - match: { hits.total: 5 }
+  - length: { aggregations.str_terms.buckets: 3 }
+  - match: { aggregations.str_terms.buckets.0.key: "abc" }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 11 }
+  - match: { aggregations.str_terms.buckets.1.key: "foo" }
+  - match: { aggregations.str_terms.buckets.1.doc_count: 8 }
+  - match: { aggregations.str_terms.buckets.2.key: "xyz" }
+  - match: { aggregations.str_terms.buckets.2.doc_count: 5 }
+
+---
+
+"Test unmapped string terms agg with doc_count":
+  - skip:
+      version: " - 7.99.99"
+      reason: "Doc count fields are only implemented in 8.0"
+  - do:
+      bulk:
+        index: test_2
+        refresh: true
+        body:
+          - '{"index": {}}'
+          - '{"_doc_count": 10, "str": "abc" }'
+          - '{"index": {}}'
+          - '{"str": "abc" }'
+  - do:
+      search:
+        index: test_2
+        rest_total_hits_as_int: true
+        body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str.keyword" } } } }
+
+  - match: { hits.total: 2 }
+  - length: { aggregations.str_terms.buckets: 1 }
+  - match: { aggregations.str_terms.buckets.0.key: "abc" }
+  - match: { aggregations.str_terms.buckets.0.doc_count: 11 }
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.Query;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.index.query.QueryShardException;
+import org.elasticsearch.search.lookup.SearchLookup;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Metadata field mapper for the {@code _doc_count} field.
+ *
+ * <p>When a document carries a {@code _doc_count} value, this mapper validates that the value
+ * is a number greater than zero and adds it to the Lucene document as a
+ * {@link NumericDocValuesField} named {@link #NAME}. The field takes no mapping parameters.
+ */
+public class DocCountFieldMapper extends MetadataFieldMapper {
+
+    /** Name of the metadata field as it appears in documents. */
+    public static final String NAME = "_doc_count";
+    /** Type name reported by {@link DocCountFieldType#typeName()} and {@link #contentType()}. */
+    public static final String CONTENT_TYPE = "_doc_count";
+
+    /** Parser registered for this metadata field; both callbacks produce a parameterless mapper. */
+    public static final TypeParser PARSER = new ConfigurableTypeParser(
+        c -> new DocCountFieldMapper(),
+        c -> new DocCountFieldMapper.Builder());
+
+    /** Builder with no configurable parameters; always builds a plain {@link DocCountFieldMapper}. */
+    static class Builder extends MetadataFieldMapper.Builder {
+
+        Builder() {
+            super(NAME);
+        }
+
+        @Override
+        protected List<Parameter<?>> getParameters() {
+            // The field has nothing to configure.
+            return Collections.emptyList();
+        }
+
+        @Override
+        public DocCountFieldMapper build(BuilderContext context) {
+            return new DocCountFieldMapper();
+        }
+    }
+
+    /**
+     * Field type of {@code _doc_count}. Exists queries are answered from doc values;
+     * term queries are rejected.
+     */
+    public static final class DocCountFieldType extends MappedFieldType {
+
+        /** Shared instance used by the mapper. */
+        public static final DocCountFieldType INSTANCE = new DocCountFieldType();
+
+        public DocCountFieldType() {
+            // NOTE(review): the boolean flags are presumably (isSearchable, isStored, hasDocValues),
+            // i.e. doc-values only -- confirm against the MappedFieldType constructor.
+            super(NAME, false, false, true, TextSearchInfo.NONE,  Collections.emptyMap());
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            return new DocValuesFieldExistsQuery(NAME);
+        }
+
+        /**
+         * Always fails: the field cannot be queried by term.
+         *
+         * @throws QueryShardException on every call
+         */
+        @Override
+        public Query termQuery(Object value, QueryShardContext context) {
+            throw new QueryShardException(context, "Field [" + name() + "] of type [" + CONTENT_TYPE + "] is not searchable");
+        }
+    }
+
+    private DocCountFieldMapper() {
+        super(DocCountFieldType.INSTANCE);
+    }
+
+    /**
+     * Reads the current token as the document count and adds it to the document as doc values.
+     *
+     * @param context parse context positioned on the {@code _doc_count} value
+     * @throws IllegalArgumentException if the current token is not a number, or the number is not greater than zero
+     * @throws IOException if the parser fails to read the value
+     */
+    @Override
+    protected void parseCreateField(ParseContext context) throws IOException {
+        final XContentParser parser = context.parser();
+        if (parser.currentToken() != XContentParser.Token.VALUE_NUMBER) {
+            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer.");
+        }
+
+        // NOTE(review): coerce=false presumably makes the parser reject non-integral numbers -- confirm.
+        final long value = parser.longValue(false);
+        if (value <= 0) {
+            throw new IllegalArgumentException("Field [" + fieldType().name() + "] must be a positive integer.");
+        }
+
+        final Field docCount = new NumericDocValuesField(NAME, value);
+        context.doc().add(docCount);
+    }
+
+    /** Intentionally empty: this mapper does nothing before the document is parsed. */
+    @Override
+    public void preParse(ParseContext context) { }
+
+    /**
+     * Returns a fetcher that reads the value from the document source and returns it unchanged.
+     *
+     * @throws IllegalArgumentException if a {@code format} is supplied
+     */
+    @Override
+    public ValueFetcher valueFetcher(MapperService mapperService, SearchLookup lookup, String format) {
+        if (format != null) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
+        }
+        return new SourceValueFetcher(name(), mapperService, parsesArrayValue()) {
+            @Override
+            protected Object parseSourceValue(Object value) {
+                return value;
+            }
+        };
+    }
+
+    @Override
+    public DocCountFieldType fieldType() {
+        return (DocCountFieldType) super.fieldType();
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+
+}
diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesModule.java b/server/src/main/java/org/elasticsearch/indices/IndicesModule.java
@@ -32,6 +32,7 @@
 import org.elasticsearch.index.mapper.BooleanFieldMapper;
 import org.elasticsearch.index.mapper.CompletionFieldMapper;
 import org.elasticsearch.index.mapper.DateFieldMapper;
+import org.elasticsearch.index.mapper.DocCountFieldMapper;
 import org.elasticsearch.index.mapper.FieldAliasMapper;
 import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
 import org.elasticsearch.index.mapper.GeoPointFieldMapper;
@@ -154,6 +155,7 @@ private static Map<String, MetadataFieldMapper.TypeParser> initBuiltInMetadataMa
         builtInMetadataMappers.put(NestedPathFieldMapper.NAME, NestedPathFieldMapper.PARSER);
         builtInMetadataMappers.put(VersionFieldMapper.NAME, VersionFieldMapper.PARSER);
         builtInMetadataMappers.put(SeqNoFieldMapper.NAME, SeqNoFieldMapper.PARSER);
+        builtInMetadataMappers.put(DocCountFieldMapper.NAME, DocCountFieldMapper.PARSER);
         //_field_names must be added last so that it has a chance to see all the other mappers
         builtInMetadataMappers.put(FieldNamesFieldMapper.NAME, FieldNamesFieldMapper.PARSER);
         return Collections.unmodifiableMap(builtInMetadataMappers);

diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java b/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java
@@ -180,7 +180,7 @@ public Map<String, Object> metadata() {
 
     @Override
     public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException {
-        preGetSubLeafCollectors();
+        preGetSubLeafCollectors(ctx);
         final LeafBucketCollector sub = collectableSubAggregators.getLeafCollector(ctx);
         return getLeafCollector(ctx, sub);
     }
@@ -189,7 +189,7 @@ public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws
      * Can be overridden by aggregator implementations that like to perform an operation before the leaf collectors
      * of children aggregators are instantiated for the next segment.
      */
-    protected void preGetSubLeafCollectors() throws IOException {
+    protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException {
     }
 
     /**