Merge pull request #200 from facebookresearch/datasets

[datasets] Add a Datasets class for managing datasets.
facebookresearch · Apr 26, 2021 · 145d450 · 145d450
2 parents cd31a34 + df5dcdb
commit 145d450
Show file tree

Hide file tree

Showing 10 changed files with 510 additions and 213 deletions.
diff --git a/compiler_gym/datasets/BUILD b/compiler_gym/datasets/BUILD
@@ -10,6 +10,7 @@ py_library(
         "__init__.py",
         "benchmark.py",
         "dataset.py",
+        "datasets.py",
         "files_dataset.py",
         "tar_dataset.py",
     ],

diff --git a/compiler_gym/datasets/__init__.py b/compiler_gym/datasets/__init__.py
@@ -17,6 +17,7 @@
     delete,
     require,
 )
+from compiler_gym.datasets.datasets import Datasets
 from compiler_gym.datasets.files_dataset import FilesDataset
 from compiler_gym.datasets.tar_dataset import TarDataset, TarDatasetWithManifest
 
@@ -27,6 +28,7 @@
     "BenchmarkSource",
     "Dataset",
     "DatasetInitError",
+    "Datasets",
     "deactivate",
     "delete",
     "FilesDataset",

diff --git a/compiler_gym/datasets/dataset.py b/compiler_gym/datasets/dataset.py
@@ -13,7 +13,6 @@
 from typing import Dict, Iterable, List, NamedTuple, Optional, Union
 
 import fasteners
-import numpy as np
 from deprecated.sphinx import deprecated
 
 from compiler_gym.datasets.benchmark import DATASET_NAME_RE, Benchmark
@@ -45,7 +44,6 @@ def __init__(
         site_data_base: Path,
         benchmark_class=Benchmark,
         references: Optional[Dict[str, str]] = None,
-        random: Optional[np.random.Generator] = None,
         hidden: bool = False,
         sort_order: int = 0,
         logger: Optional[logging.Logger] = None,
@@ -70,8 +68,6 @@ def __init__(
         :param references: A dictionary containing URLs for this dataset, keyed
             by their name. E.g. :code:`references["Paper"] = "https://..."`.
 
-        :param random: A source of randomness for selecting benchmarks.
-
         :param hidden: Whether the dataset should be excluded from the
             :meth:`datasets() <compiler_gym.datasets.Datasets.datasets>` iterator
             of any :class:`Datasets <compiler_gym.datasets.Datasets>` container.
@@ -103,7 +99,6 @@ def __init__(
         self._hidden = hidden
         self._validatable = validatable
 
-        self.random = random or np.random.default_rng()
         self._logger = logger
         self.sort_order = sort_order
         self.benchmark_class = benchmark_class
@@ -115,17 +110,6 @@ def __init__(
     def __repr__(self):
         return self.name
 
-    def seed(self, seed: int):
-        """Set the random state.
-
-        Setting a random state will fix the order that
-        :meth:`dataset.benchmark() <compiler_gym.datasets.Dataset.benchmark>`
-        returns benchmarks when called without arguments.
-
-        :param seed: A number.
-        """
-        self.random = np.random.default_rng(seed)
-
     @property
     def logger(self) -> logging.Logger:
         """The logger for this dataset.
@@ -337,23 +321,15 @@ def benchmark_uris(self) -> Iterable[str]:
         """
         raise NotImplementedError("abstract class")
 
-    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
+    def benchmark(self, uri: str) -> Benchmark:
         """Select a benchmark.
 
-        If a URI is given, the corresponding :class:`Benchmark
-        <compiler_gym.datasets.Benchmark>` is returned. Otherwise, a benchmark
-        is selected uniformly randomly.
-
-        Use :meth:`seed() <compiler_gym.datasets.Dataset.seed>` to force a
-        reproducible order for randomly selected benchmarks.
-
-        :param uri: The URI of the benchmark to return. If :code:`None`, select
-            a benchmark randomly using :code:`self.random`.
+        :param uri: The URI of the benchmark to return.
 
         :return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
             instance.
 
-        :raise LookupError: If :code:`uri` is provided but does not exist.
+        :raise LookupError: If :code:`uri` is not found.
         """
         raise NotImplementedError("abstract class")
 

diff --git a/compiler_gym/datasets/datasets.py b/compiler_gym/datasets/datasets.py
@@ -0,0 +1,245 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from collections import deque
+from typing import Dict, Iterable, Set, TypeVar
+
+from compiler_gym.datasets.benchmark import (
+    BENCHMARK_URI_RE,
+    Benchmark,
+    resolve_uri_protocol,
+)
+from compiler_gym.datasets.dataset import Dataset
+
+T = TypeVar("T")
+
+
+def round_robin_iterables(iters: Iterable[Iterable[T]]) -> Iterable[T]:
+    """Yield from the given iterators in round robin order."""
+    # Use a queue of iterators to iterate over. Repeatedly pop an iterator from
+    # the queue, yield the next value from it, then put it at the back of the
+    # queue. The iterator is discarded once exhausted.
+    iters = deque(iters)
+    while len(iters) > 1:
+        it = iters.popleft()
+        try:
+            yield next(it)
+            iters.append(it)
+        except StopIteration:
+            pass
+    # Once we have only a single iterator left, yield from it directly rather
+    # than continuing with the round robin.
+    if len(iters) == 1:
+        yield from iters.popleft()
+
+
+class Datasets(object):
+    """A collection of datasets.
+
+    This class provides a dictionary-like interface for indexing and iterating
+    over multiple :class:`Dataset <compiler_gym.datasets.Dataset>` objects.
+    Select a dataset by URI using:
+
+        >>> env.datasets["benchmark://cbench-v1"]
+
+    Check whether a dataset exists using:
+
+        >>> "benchmark://cbench-v1" in env.datasets
+        True
+
+    Or iterate over the datasets using:
+
+        >>> for dataset in env.datasets:
+        ...     print(dataset.name)
+        benchmark://cbench-v1
+        benchmark://github-v0
+        benchmark://npb-v0
+
+    To select a benchmark from the datasets, use :meth:`benchmark()`:
+
+        >>> env.datasets.benchmark("benchmark://a-v0/a")
+
+    Use the :meth:`benchmarks()` method to iterate over every benchmark in the
+    datasets in a stable round robin order:
+
+        >>> for benchmark in env.datasets.benchmarks():
+        ...     print(benchmark)
+        benchmark://cbench-v1/1
+        benchmark://github-v0/1
+        benchmark://npb-v0/1
+        benchmark://cbench-v1/2
+        ...
+
+    If you want to exclude a dataset, delete it:
+
+        >>> del env.datasets["benchmark://b-v0"]
+    """
+
+    def __init__(
+        self,
+        datasets: Iterable[Dataset],
+    ):
+        self._datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
+        self._visible_datasets: Set[str] = set(
+            name for name, dataset in self._datasets.items() if not dataset.hidden
+        )
+
+    def datasets(self, with_deprecated: bool = False) -> Iterable[Dataset]:
+        """Enumerate the datasets.
+
+        Dataset order is consistent across runs.
+
+        :param with_deprecated: If :code:`True`, include datasets that are
+            marked as hidden (for example, deprecated datasets).
+
+        :return: An iterable sequence of :class:`Dataset
+            <compiler_gym.datasets.Dataset>` instances.
+        """
+        datasets = self._datasets.values()
+        if not with_deprecated:
+            datasets = (d for d in datasets if not d.hidden)
+        yield from sorted(datasets, key=lambda d: (d.sort_order, d.name))
+
+    def __iter__(self) -> Iterable[Dataset]:
+        """Iterate over the datasets.
+
+        Dataset order is consistent across runs.
+
+        Equivalent to :meth:`datasets.datasets()
+        <compiler_gym.datasets.Datasets.datasets>`, but without the ability to
+        iterate over the deprecated datasets.
+
+        :return: An iterable sequence of :class:`Dataset
+            <compiler_gym.datasets.Dataset>` instances.
+        """
+        return self.datasets()
+
+    def dataset(self, dataset: str) -> Dataset:
+        """Get a dataset.
+
+        Return the corresponding :class:`Dataset
+        <compiler_gym.datasets.Dataset>`. Name lookup will succeed whether or
+        not the dataset is deprecated.
+
+        :param dataset: A dataset name.
+
+        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.
+
+        :raises LookupError: If :code:`dataset` is not found.
+        """
+        dataset_name = resolve_uri_protocol(dataset)
+
+        if dataset_name not in self._datasets:
+            raise LookupError(f"Dataset not found: {dataset_name}")
+
+        return self._datasets[dataset_name]
+
+    def __getitem__(self, dataset: str) -> Dataset:
+        """Lookup a dataset.
+
+        :param dataset: A dataset name.
+
+        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.
+
+        :raises LookupError: If :code:`dataset` is not found.
+        """
+        return self.dataset(dataset)
+
+    def __setitem__(self, key: str, dataset: Dataset):
+        dataset_name = resolve_uri_protocol(key)
+
+        self._datasets[dataset_name] = dataset
+        if not dataset.hidden:
+            self._visible_datasets.add(dataset_name)
+
+    def __delitem__(self, dataset: str):
+        """Remove a dataset from the collection.
+
+        This does not affect any underlying storage used by dataset. See
+        :meth:`uninstall() <compiler_gym.datasets.Datasets.uninstall>` to clean
+        up.
+
+        :param dataset: The name of a dataset.
+
+        :raises KeyError: If :code:`dataset` is not contained in the
+            collection.
+        """
+        dataset_name = resolve_uri_protocol(dataset)
+        if dataset_name in self._visible_datasets:
+            self._visible_datasets.remove(dataset_name)
+        del self._datasets[dataset_name]
+
+    def __contains__(self, dataset: str) -> bool:
+        """Returns whether the dataset is contained."""
+        try:
+            self.dataset(dataset)
+            return True
+        except LookupError:
+            return False
+
+    def benchmarks(self, with_deprecated: bool = False) -> Iterable[Benchmark]:
+        """Enumerate the (possibly infinite) benchmarks lazily.
+
+        Benchmark order is consistent across runs. One benchmark from each
+        dataset is returned in round robin order until all datasets have been
+        fully enumerated. The order of :meth:`benchmarks()
+        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
+        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.
+
+        :param with_deprecated: If :code:`True`, include benchmarks from
+            datasets that have been marked deprecated.
+
+        :return: An iterable sequence of :class:`Benchmark
+            <compiler_gym.datasets.Benchmark>` instances.
+        """
+        return round_robin_iterables(
+            (d.benchmarks() for d in self.datasets(with_deprecated=with_deprecated))
+        )
+
+    def benchmark_uris(self, with_deprecated: bool = False) -> Iterable[str]:
+        """Enumerate the (possibly infinite) benchmark URIs.
+
+        Benchmark URI order is consistent across runs. URIs from datasets are
+        returned in round robin order. The order of :meth:`benchmarks()
+        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
+        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.
+
+        :param with_deprecated: If :code:`True`, include benchmarks from
+            datasets that have been marked deprecated.
+
+        :return: An iterable sequence of benchmark URI strings.
+        """
+        return round_robin_iterables(
+            (d.benchmark_uris() for d in self.datasets(with_deprecated=with_deprecated))
+        )
+
+    def benchmark(self, uri: str) -> Benchmark:
+        """Select a benchmark.
+
+        Returns the corresponding :class:`Benchmark
+        <compiler_gym.datasets.Benchmark>`, regardless of whether the containing
+        dataset is installed or deprecated.
+
+        :param uri: The URI of the benchmark to return.
+
+        :return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
+            instance.
+        """
+        uri = resolve_uri_protocol(uri)
+
+        match = BENCHMARK_URI_RE.match(uri)
+        if not match:
+            raise ValueError(f"Invalid benchmark URI: '{uri}'")
+
+        dataset_name = match.group("dataset")
+        dataset = self._datasets[dataset_name]
+
+        return dataset.benchmark(uri)
+
+    @property
+    def size(self) -> int:
+        return len(self._visible_datasets)
+
+    def __len__(self) -> int:
+        return self.size