|  | 
|  | 1 | +[[search-aggregations-bucket-rare-terms-aggregation]] | 
|  | 2 | +=== Rare Terms Aggregation | 
|  | 3 | + | 
|  | 4 | +A multi-bucket value source based aggregation which finds "rare" terms -- terms that are at the long-tail | 
|  | 5 | +of the distribution and are not frequent.  Conceptually, this is like a `terms` aggregation that is | 
|  | 6 | +sorted by `_count` ascending.  As noted in the <<search-aggregations-bucket-terms-aggregation-order,terms aggregation docs>>, | 
|  | 7 | +actually ordering a `terms` agg by count ascending has unbounded error.  Instead, you should use the `rare_terms` | 
|  | 8 | +aggregation. | 
|  | 9 | + | 
|  | 10 | +////////////////////////// | 
|  | 11 | +
 | 
|  | 12 | +[source,js] | 
|  | 13 | +-------------------------------------------------- | 
|  | 14 | +PUT /products | 
|  | 15 | +{ | 
|  | 16 | +    "mappings": { | 
|  | 17 | +        "properties": { | 
|  | 18 | +            "genre": { | 
|  | 19 | +                "type": "keyword" | 
|  | 20 | +            }, | 
|  | 21 | +            "product": { | 
|  | 22 | +                "type": "keyword" | 
|  | 23 | +            } | 
|  | 24 | +        } | 
|  | 25 | +    } | 
|  | 26 | +} | 
|  | 27 | +
 | 
|  | 28 | +POST /products/_doc/_bulk?refresh | 
|  | 29 | +{"index":{"_id":0}} | 
|  | 30 | +{"genre": "rock", "product": "Product A"} | 
|  | 31 | +{"index":{"_id":1}} | 
|  | 32 | +{"genre": "rock"} | 
|  | 33 | +{"index":{"_id":2}} | 
|  | 34 | +{"genre": "rock"} | 
|  | 35 | +{"index":{"_id":3}} | 
|  | 36 | +{"genre": "jazz", "product": "Product Z"} | 
|  | 37 | +{"index":{"_id":4}} | 
|  | 38 | +{"genre": "jazz"} | 
|  | 39 | +{"index":{"_id":5}} | 
|  | 40 | +{"genre": "electronic"} | 
|  | 41 | +{"index":{"_id":6}} | 
|  | 42 | +{"genre": "electronic"} | 
|  | 43 | +{"index":{"_id":7}} | 
|  | 44 | +{"genre": "electronic"} | 
|  | 45 | +{"index":{"_id":8}} | 
|  | 46 | +{"genre": "electronic"} | 
|  | 47 | +{"index":{"_id":9}} | 
|  | 48 | +{"genre": "electronic"} | 
|  | 49 | +{"index":{"_id":10}} | 
|  | 50 | +{"genre": "swing"} | 
|  | 51 | +
 | 
|  | 52 | +------------------------------------------------- | 
|  | 53 | +// NOTCONSOLE | 
|  | 54 | +// TESTSETUP | 
|  | 55 | +
 | 
|  | 56 | +////////////////////////// | 
|  | 57 | + | 
|  | 58 | +==== Syntax | 
|  | 59 | + | 
|  | 60 | +A `rare_terms` aggregation looks like this in isolation: | 
|  | 61 | + | 
|  | 62 | +[source,js] | 
|  | 63 | +-------------------------------------------------- | 
|  | 64 | +{ | 
|  | 65 | +    "rare_terms": { | 
|  | 66 | +        "field": "the_field", | 
|  | 67 | +        "max_doc_count": 1 | 
|  | 68 | +    } | 
|  | 69 | +} | 
|  | 70 | +-------------------------------------------------- | 
|  | 71 | +// NOTCONSOLE | 
|  | 72 | + | 
|  | 73 | +.`rare_terms` Parameters | 
|  | 74 | +|=== | 
|  | 75 | +|Parameter Name |Description |Required |Default Value | 
|  | 76 | +|`field` |The field we wish to find rare terms in |Required | | 
|  | 77 | +|`max_doc_count` |The maximum number of documents a term should appear in. |Optional |`1` | 
|  | 78 | +|`precision` |The precision of the internal CuckooFilters.  Smaller precision leads to | 
|  | 79 | +better approximation, but higher memory usage. Cannot be smaller than `0.00001` |Optional |`0.001` | 
|  | 80 | +|`include` |Terms that should be included in the aggregation|Optional | | 
|  | 81 | +|`exclude` |Terms that should be excluded from the aggregation|Optional | | 
|  | 82 | +|`missing` |The value that should be used if a document does not have the field being aggregated|Optional | | 
|  | 83 | +|=== | 
|  | 84 | + | 
|  | 85 | + | 
|  | 86 | +Example: | 
|  | 87 | + | 
|  | 88 | +[source,js] | 
|  | 89 | +-------------------------------------------------- | 
|  | 90 | +GET /_search | 
|  | 91 | +{ | 
|  | 92 | +    "aggs" : { | 
|  | 93 | +        "genres" : { | 
|  | 94 | +            "rare_terms" : { | 
|  | 95 | +                "field" : "genre" | 
|  | 96 | +            } | 
|  | 97 | +        } | 
|  | 98 | +    } | 
|  | 99 | +} | 
|  | 100 | +-------------------------------------------------- | 
|  | 101 | +// CONSOLE | 
|  | 102 | +// TEST[s/_search/_search\?filter_path=aggregations/] | 
|  | 103 | + | 
|  | 104 | +Response: | 
|  | 105 | + | 
|  | 106 | +[source,js] | 
|  | 107 | +-------------------------------------------------- | 
|  | 108 | +{ | 
|  | 109 | +    ... | 
|  | 110 | +    "aggregations" : { | 
|  | 111 | +        "genres" : { | 
|  | 112 | +            "buckets" : [ | 
|  | 113 | +                { | 
|  | 114 | +                    "key" : "swing", | 
|  | 115 | +                    "doc_count" : 1 | 
|  | 116 | +                } | 
|  | 117 | +            ] | 
|  | 118 | +        } | 
|  | 119 | +    } | 
|  | 120 | +} | 
|  | 121 | +-------------------------------------------------- | 
|  | 122 | +// TESTRESPONSE[s/\.\.\.//] | 
|  | 123 | + | 
|  | 124 | +In this example, the only bucket that we see is the "swing" bucket, because it is the only term that appears in | 
|  | 125 | +one document.  If we increase the `max_doc_count` to `2`, we'll see some more buckets: | 
|  | 126 | + | 
|  | 127 | +[source,js] | 
|  | 128 | +-------------------------------------------------- | 
|  | 129 | +GET /_search | 
|  | 130 | +{ | 
|  | 131 | +    "aggs" : { | 
|  | 132 | +        "genres" : { | 
|  | 133 | +            "rare_terms" : { | 
|  | 134 | +                "field" : "genre", | 
|  | 135 | +                "max_doc_count": 2 | 
|  | 136 | +            } | 
|  | 137 | +        } | 
|  | 138 | +    } | 
|  | 139 | +} | 
|  | 140 | +-------------------------------------------------- | 
|  | 141 | +// CONSOLE | 
|  | 142 | +// TEST[s/_search/_search\?filter_path=aggregations/] | 
|  | 143 | + | 
|  | 144 | +This now shows the "jazz" term, which has a `doc_count` of 2: | 
|  | 145 | + | 
|  | 146 | +[source,js] | 
|  | 147 | +-------------------------------------------------- | 
|  | 148 | +{ | 
|  | 149 | +    ... | 
|  | 150 | +    "aggregations" : { | 
|  | 151 | +        "genres" : { | 
|  | 152 | +            "buckets" : [ | 
|  | 153 | +                { | 
|  | 154 | +                    "key" : "swing", | 
|  | 155 | +                    "doc_count" : 1 | 
|  | 156 | +                }, | 
|  | 157 | +                { | 
|  | 158 | +                    "key" : "jazz", | 
|  | 159 | +                    "doc_count" : 2 | 
|  | 160 | +                } | 
|  | 161 | +            ] | 
|  | 162 | +        } | 
|  | 163 | +    } | 
|  | 164 | +} | 
|  | 165 | +-------------------------------------------------- | 
|  | 166 | +// TESTRESPONSE[s/\.\.\.//] | 
|  | 167 | + | 
|  | 168 | +[[search-aggregations-bucket-rare-terms-aggregation-max-doc-count]] | 
|  | 169 | +==== Maximum document count | 
|  | 170 | + | 
|  | 171 | +The `max_doc_count` parameter is used to control the upper bound of document counts that a term can have.  There | 
|  | 172 | +is not a size limitation on the `rare_terms` agg like the `terms` agg has.  This means that all terms | 
|  | 173 | +which match the `max_doc_count` criteria will be returned.  The aggregation functions in this manner to avoid | 
|  | 174 | +the order-by-ascending issues that afflict the `terms` aggregation. | 
|  | 175 | + | 
|  | 176 | +This does, however, mean that a large number of results can be returned if `max_doc_count` is chosen incorrectly. | 
|  | 177 | +To limit the danger of this setting, the maximum `max_doc_count` is 100. | 
|  | 178 | + | 
|  | 179 | +[[search-aggregations-bucket-rare-terms-aggregation-max-buckets]] | 
|  | 180 | +==== Max Bucket Limit | 
|  | 181 | + | 
|  | 182 | +The Rare Terms aggregation is more liable to trip the `search.max_buckets` soft limit than other aggregations due | 
|  | 183 | +to how it works.  The `max_buckets` soft-limit is evaluated on a per-shard basis while the aggregation is collecting | 
|  | 184 | +results.  It is possible for a term to be "rare" on a shard but become "not rare" once all the shard results are | 
|  | 185 | +merged together.  This means that individual shards tend to collect more buckets than are truly rare, because | 
|  | 186 | +they only have their own local view.  This list is ultimately pruned to the correct, smaller list of rare | 
|  | 187 | +terms on the coordinating node... but a shard may have already tripped the `max_buckets` soft limit and aborted | 
|  | 188 | +the request. | 
|  | 189 | + | 
|  | 190 | +When aggregating on fields that have potentially many "rare" terms, you may need to increase the `max_buckets` soft | 
|  | 191 | +limit.  Alternatively, you might need to find a way to filter the results to return fewer rare values (smaller time | 
|  | 192 | +span, filter by category, etc.), or re-evaluate your definition of "rare" (e.g. if something | 
|  | 193 | +appears 100,000 times, is it truly "rare"?). | 
|  | 194 | + | 
|  | 195 | +[[search-aggregations-bucket-rare-terms-aggregation-approximate-counts]] | 
|  | 196 | +==== Document counts are approximate | 
|  | 197 | + | 
|  | 198 | +The naive way to determine the "rare" terms in a dataset is to place all the values in a map, incrementing counts | 
|  | 199 | +as each document is visited, then return the bottom `n` rows.  This does not scale beyond even modestly sized data | 
|  | 200 | +sets.  A sharded approach where only the "top n" values are retained from each shard (ala the `terms` aggregation) | 
|  | 201 | +fails because the long-tail nature of the problem means it is impossible to find the "top n" bottom values without | 
|  | 202 | +simply collecting all the values from all shards. | 
|  | 203 | + | 
|  | 204 | +Instead, the Rare Terms aggregation uses a different approximate algorithm: | 
|  | 205 | + | 
|  | 206 | +1. Values are placed in a map the first time they are seen. | 
|  | 207 | +2. Each additional occurrence of the term increments a counter in the map. | 
|  | 208 | +3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a | 
|  | 209 | +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[CuckooFilter]. | 
|  | 210 | +4. The CuckooFilter is consulted on each term.  If the value is inside the filter, it is known to be above the | 
|  | 211 | +threshold already and skipped. | 
|  | 212 | + | 
|  | 213 | +After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold.  This map and CuckooFilter | 
|  | 214 | +are then merged with all other shards.  If there are terms that are greater than the threshold (or appear in | 
|  | 215 | +a different shard's CuckooFilter) the term is removed from the merged list.  The final map of values is returned | 
|  | 216 | +to the user as the "rare" terms. | 
|  | 217 | + | 
|  | 218 | +CuckooFilters have the possibility of returning false positives (they can say a value exists in their collection when | 
|  | 219 | +it actually does not).  Since the CuckooFilter is being used to see if a term is over threshold, this means a false positive | 
|  | 220 | +from the CuckooFilter will mistakenly say a value is common when it is not (and thus exclude it from the final list of buckets). | 
|  | 221 | +Practically, this means the aggregation exhibits false-negative behavior since the filter is being used "in reverse" | 
|  | 222 | +of how people generally think of approximate set membership sketches. | 
|  | 223 | + | 
|  | 224 | +CuckooFilters are described in more detail in the paper: | 
|  | 225 | + | 
|  | 226 | +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[Fan, Bin, et al. "Cuckoo filter: Practically better than bloom."] | 
|  | 227 | +Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. ACM, 2014. | 
|  | 228 | + | 
|  | 229 | +==== Precision | 
|  | 230 | + | 
|  | 231 | +Although the internal CuckooFilter is approximate in nature, the false-negative rate can be controlled with a | 
|  | 232 | +`precision` parameter.  This allows the user to trade more runtime memory for more accurate results. | 
|  | 233 | + | 
|  | 234 | +The default precision is `0.001`, and the smallest (i.e. most accurate and largest memory overhead) is `0.00001`. | 
|  | 235 | +Below are some charts which demonstrate how the accuracy of the aggregation is affected by precision and number | 
|  | 236 | +of distinct terms. | 
|  | 237 | + | 
|  | 238 | +The X-axis shows the number of distinct values the aggregation has seen, and the Y-axis shows the percent error. | 
|  | 239 | +Each line series represents one "rarity" condition (ranging from one rare item to 100,000 rare items).  For example, | 
|  | 240 | +the orange "10" line means ten of the values were "rare" (`doc_count == 1`), out of 1-20m distinct values (where the | 
|  | 241 | +rest of the values had `doc_count > 1`). | 
|  | 242 | + | 
|  | 243 | +This first chart shows precision `0.01`: | 
|  | 244 | + | 
|  | 245 | +image:images/rare_terms/accuracy_01.png[] | 
|  | 246 | + | 
|  | 247 | +And precision `0.001` (the default): | 
|  | 248 | + | 
|  | 249 | +image:images/rare_terms/accuracy_001.png[] | 
|  | 250 | + | 
|  | 251 | +And finally precision `0.0001`: | 
|  | 252 | + | 
|  | 253 | +image:images/rare_terms/accuracy_0001.png[] | 
|  | 254 | + | 
|  | 255 | +The default precision of `0.001` keeps the error below 2.5% for the tested conditions, and accuracy slowly | 
|  | 256 | +degrades in a controlled, linear fashion as the number of distinct values increases. | 
|  | 257 | + | 
|  | 258 | +The default precision of `0.001` has a memory profile of `1.748 * n` bytes, where `n` is the number | 
|  | 259 | +of distinct values the aggregation has seen (it can also be roughly eyeballed, e.g. 20 million unique values is about | 
|  | 260 | +30mb of memory).  The memory usage is linear to the number of distinct values regardless of which precision is chosen; | 
|  | 261 | +the precision only affects the slope of the memory profile, as seen in this chart: | 
|  | 262 | + | 
|  | 263 | +image:images/rare_terms/memory.png[] | 
|  | 264 | + | 
|  | 265 | +For comparison, an equivalent terms aggregation at 20 million buckets would be roughly | 
|  | 266 | +`20m * 69b == ~1.38gb` (with 69 bytes being a very optimistic estimate of an empty bucket cost, far lower than what | 
|  | 267 | +the circuit breaker accounts for).  So although the `rare_terms` agg is relatively heavy, it is still orders of | 
|  | 268 | +magnitude smaller than the equivalent terms aggregation. | 
|  | 269 | + | 
|  | 270 | +==== Filtering Values | 
|  | 271 | + | 
|  | 272 | +It is possible to filter the values for which buckets will be created. This can be done using the `include` and | 
|  | 273 | +`exclude` parameters which are based on regular expression strings or arrays of exact values. Additionally, | 
|  | 274 | +`include` clauses can filter using `partition` expressions. | 
|  | 275 | + | 
|  | 276 | +===== Filtering Values with regular expressions | 
|  | 277 | + | 
|  | 278 | +[source,js] | 
|  | 279 | +-------------------------------------------------- | 
|  | 280 | +GET /_search | 
|  | 281 | +{ | 
|  | 282 | +    "aggs" : { | 
|  | 283 | +        "genres" : { | 
|  | 284 | +            "rare_terms" : { | 
|  | 285 | +                "field" : "genre", | 
|  | 286 | +                "include" : "swi*", | 
|  | 287 | +                "exclude" : "electro*" | 
|  | 288 | +            } | 
|  | 289 | +        } | 
|  | 290 | +    } | 
|  | 291 | +} | 
|  | 292 | +-------------------------------------------------- | 
|  | 293 | +// CONSOLE | 
|  | 294 | + | 
|  | 295 | +In the above example, buckets will be created for all the genres that start with `swi`, except those starting | 
|  | 296 | +with `electro` (so the genre `swing` will be aggregated but not `electro_swing`). The `include` regular expression will determine what | 
|  | 297 | +values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When | 
|  | 298 | +both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`. | 
|  | 299 | + | 
|  | 300 | +The syntax is the same as <<regexp-syntax,regexp queries>>. | 
|  | 301 | + | 
|  | 302 | +===== Filtering Values with exact values | 
|  | 303 | + | 
|  | 304 | +For matching based on exact values the `include` and `exclude` parameters can simply take an array of | 
|  | 305 | +strings that represent the terms as they are found in the index: | 
|  | 306 | + | 
|  | 307 | +[source,js] | 
|  | 308 | +-------------------------------------------------- | 
|  | 309 | +GET /_search | 
|  | 310 | +{ | 
|  | 311 | +    "aggs" : { | 
|  | 312 | +        "genres" : { | 
|  | 313 | +             "rare_terms" : { | 
|  | 314 | +                 "field" : "genre", | 
|  | 315 | +                 "include" : ["swing", "rock"], | 
|  | 316 | +                 "exclude" : ["jazz"] | 
|  | 317 | +             } | 
|  | 318 | +         } | 
|  | 319 | +    } | 
|  | 320 | +} | 
|  | 321 | +-------------------------------------------------- | 
|  | 322 | +// CONSOLE | 
|  | 323 | + | 
|  | 324 | + | 
|  | 325 | +==== Missing value | 
|  | 326 | + | 
|  | 327 | +The `missing` parameter defines how documents that are missing a value should be treated. | 
|  | 328 | +By default they will be ignored but it is also possible to treat them as if they | 
|  | 329 | +had a value. | 
|  | 330 | + | 
|  | 331 | +[source,js] | 
|  | 332 | +-------------------------------------------------- | 
|  | 333 | +GET /_search | 
|  | 334 | +{ | 
|  | 335 | +    "aggs" : { | 
|  | 336 | +        "genres" : { | 
|  | 337 | +             "rare_terms" : { | 
|  | 338 | +                 "field" : "genre", | 
|  | 339 | +                 "missing": "N/A" <1> | 
|  | 340 | +             } | 
|  | 341 | +         } | 
|  | 342 | +    } | 
|  | 343 | +} | 
|  | 344 | +-------------------------------------------------- | 
|  | 345 | +// CONSOLE | 
|  | 346 | + | 
|  | 347 | +<1> Documents without a value in the `genre` field will fall into the same bucket as documents that have the value `N/A`. | 
|  | 348 | + | 
|  | 349 | +==== Nested, RareTerms, and scoring sub-aggregations | 
|  | 350 | + | 
|  | 351 | +The RareTerms aggregation has to operate in `breadth_first` mode, since it needs to prune terms as doc count thresholds | 
|  | 352 | +are breached.  This requirement means the RareTerms aggregation is incompatible with certain combinations of aggregations | 
|  | 353 | +that require `depth_first`. In particular, scoring sub-aggregations that are inside a `nested` force the entire aggregation tree to run | 
|  | 354 | +in `depth_first` mode.  This will throw an exception since RareTerms is unable to process `depth_first`. | 
|  | 355 | + | 
|  | 356 | +As a concrete example, if a `rare_terms` aggregation is the child of a `nested` aggregation, and one of the child aggregations of `rare_terms` | 
|  | 357 | +needs document scores (like a `top_hits` aggregation), this will throw an exception. | 
0 commit comments