Skip to content

Commit

Permalink
Merge pull request #200 from facebookresearch/datasets
Browse files Browse the repository at this point in the history
[datasets] Add a Datasets class for managing datasets.
  • Loading branch information
ChrisCummins authored Apr 26, 2021
2 parents cd31a34 + df5dcdb commit 145d450
Show file tree
Hide file tree
Showing 10 changed files with 510 additions and 213 deletions.
1 change: 1 addition & 0 deletions compiler_gym/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ py_library(
"__init__.py",
"benchmark.py",
"dataset.py",
"datasets.py",
"files_dataset.py",
"tar_dataset.py",
],
Expand Down
2 changes: 2 additions & 0 deletions compiler_gym/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
delete,
require,
)
from compiler_gym.datasets.datasets import Datasets
from compiler_gym.datasets.files_dataset import FilesDataset
from compiler_gym.datasets.tar_dataset import TarDataset, TarDatasetWithManifest

Expand All @@ -27,6 +28,7 @@
"BenchmarkSource",
"Dataset",
"DatasetInitError",
"Datasets",
"deactivate",
"delete",
"FilesDataset",
Expand Down
30 changes: 3 additions & 27 deletions compiler_gym/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from typing import Dict, Iterable, List, NamedTuple, Optional, Union

import fasteners
import numpy as np
from deprecated.sphinx import deprecated

from compiler_gym.datasets.benchmark import DATASET_NAME_RE, Benchmark
Expand Down Expand Up @@ -45,7 +44,6 @@ def __init__(
site_data_base: Path,
benchmark_class=Benchmark,
references: Optional[Dict[str, str]] = None,
random: Optional[np.random.Generator] = None,
hidden: bool = False,
sort_order: int = 0,
logger: Optional[logging.Logger] = None,
Expand All @@ -70,8 +68,6 @@ def __init__(
:param references: A dictionary containing URLs for this dataset, keyed
by their name. E.g. :code:`references["Paper"] = "https://..."`.
:param random: A source of randomness for selecting benchmarks.
:param hidden: Whether the dataset should be excluded from the
:meth:`datasets() <compiler_gym.datasets.Datasets.dataset>` iterator
of any :class:`Datasets <compiler_gym.datasets.Datasets>` container.
Expand Down Expand Up @@ -103,7 +99,6 @@ def __init__(
self._hidden = hidden
self._validatable = validatable

self.random = random or np.random.default_rng()
self._logger = logger
self.sort_order = sort_order
self.benchmark_class = benchmark_class
Expand All @@ -115,17 +110,6 @@ def __init__(
def __repr__(self):
return self.name

def seed(self, seed: int):
"""Set the random state.
Setting a random state will fix the order that
:meth:`dataset.benchmark() <compiler_gym.datasets.Dataset.benchmark>`
returns benchmarks when called without arguments.
:param seed: A number.
"""
self.random = np.random.default_rng(seed)

@property
def logger(self) -> logging.Logger:
"""The logger for this dataset.
Expand Down Expand Up @@ -337,23 +321,15 @@ def benchmark_uris(self) -> Iterable[str]:
"""
raise NotImplementedError("abstract class")

def benchmark(self, uri: Optional[str] = None) -> Benchmark:
def benchmark(self, uri: str) -> Benchmark:
"""Select a benchmark.
If a URI is given, the corresponding :class:`Benchmark
<compiler_gym.datasets.Benchmark>` is returned. Otherwise, a benchmark
is selected uniformly randomly.
Use :meth:`seed() <compiler_gym.datasets.Dataset.seed>` to force a
reproducible order for randomly selected benchmarks.
:param uri: The URI of the benchmark to return. If :code:`None`, select
a benchmark randomly using :code:`self.random`.
:param uri: The URI of the benchmark to return.
:return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
instance.
:raise LookupError: If :code:`uri` is provided but does not exist.
:raise LookupError: If :code:`uri` is not found.
"""
raise NotImplementedError("abstract class")

Expand Down
245 changes: 245 additions & 0 deletions compiler_gym/datasets/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import deque
from typing import Dict, Iterable, Set, TypeVar

from compiler_gym.datasets.benchmark import (
BENCHMARK_URI_RE,
Benchmark,
resolve_uri_protocol,
)
from compiler_gym.datasets.dataset import Dataset

T = TypeVar("T")


def round_robin_iterables(iters: Iterable[Iterable[T]]) -> Iterable[T]:
    """Yield from the given iterables in round robin order.

    One value is taken from each iterable in turn. An iterable is dropped from
    the rotation once it is exhausted, and the remaining iterables keep their
    relative order.

    :param iters: The iterables to interleave. Both iterators and plain
        iterables (such as lists) are accepted.

    :return: A lazily evaluated sequence of the interleaved values.
    """
    # Normalise every input to an iterator so that next() also works on plain
    # iterables such as lists, matching the declared parameter type.
    queue = deque(iter(it) for it in iters)

    # Repeatedly pop an iterator from the front of the queue, yield its next
    # value, then put it at the back of the queue.
    while len(queue) > 1:
        it = queue.popleft()
        try:
            value = next(it)
        except StopIteration:
            # Exhausted: discard the iterator by not re-queueing it.
            continue
        yield value
        queue.append(it)

    # Once only a single iterator is left, drain it directly rather than
    # continuing with the round robin.
    if queue:
        yield from queue.popleft()


class Datasets(object):
    """A collection of datasets.

    This class provides a dictionary-like interface for indexing and iterating
    over multiple :class:`Dataset <compiler_gym.datasets.Dataset>` objects.
    Select a dataset by URI using:

        >>> env.datasets["benchmark://cbench-v1"]

    Check whether a dataset exists using:

        >>> "benchmark://cbench-v1" in env.datasets
        True

    Or iterate over the datasets using:

        >>> for dataset in env.datasets:
        ...     print(dataset.name)
        benchmark://cbench-v1
        benchmark://github-v0
        benchmark://npb-v0

    To select a benchmark from the datasets, use :meth:`benchmark()`:

        >>> env.datasets.benchmark("benchmark://a-v0/a")

    Use the :meth:`benchmarks()` method to iterate over every benchmark in the
    datasets in a stable round robin order:

        >>> for benchmark in env.datasets.benchmarks():
        ...     print(benchmark)
        benchmark://cbench-v1/1
        benchmark://github-v0/1
        benchmark://npb-v0/1
        benchmark://cbench-v1/2
        ...

    If you want to exclude a dataset, delete it:

        >>> del env.datasets["benchmark://b-v0"]
    """

    def __init__(
        self,
        datasets: Iterable[Dataset],
    ):
        """Constructor.

        :param datasets: The datasets to add to the collection. Each dataset is
            keyed by its :code:`name` attribute.
        """
        # Every known dataset, keyed by name.
        self._datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
        # The names of the datasets whose `hidden` attribute is false. This is
        # what size and len() report.
        self._visible_datasets: Set[str] = {
            name for name, dataset in self._datasets.items() if not dataset.hidden
        }

    def datasets(self, with_deprecated: bool = False) -> Iterable[Dataset]:
        """Enumerate the datasets.

        Dataset order is consistent across runs: datasets are sorted by their
        :code:`sort_order` attribute, then by name.

        :param with_deprecated: If :code:`True`, include datasets that have been
            marked as deprecated. Deprecation is determined by the
            :code:`hidden` attribute of each dataset.

        :return: An iterable sequence of :class:`Dataset
            <compiler_gym.datasets.Dataset>` instances.
        """
        datasets = self._datasets.values()
        if not with_deprecated:
            datasets = (d for d in datasets if not d.hidden)
        yield from sorted(datasets, key=lambda d: (d.sort_order, d.name))

    def __iter__(self) -> Iterable[Dataset]:
        """Iterate over the datasets.

        Dataset order is consistent across runs.

        Equivalent to :meth:`datasets.datasets()
        <compiler_gym.datasets.Datasets.datasets>`, but without the ability to
        iterate over the deprecated datasets.

        :return: An iterable sequence of :class:`Dataset
            <compiler_gym.datasets.Dataset>` instances.
        """
        return self.datasets()

    def dataset(self, dataset: str) -> Dataset:
        """Get a dataset.

        Return the corresponding :class:`Dataset
        <compiler_gym.datasets.Dataset>`. Name lookup will succeed whether or
        not the dataset is deprecated.

        :param dataset: A dataset name.

        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.

        :raises LookupError: If :code:`dataset` is not found.
        """
        dataset_name = resolve_uri_protocol(dataset)

        if dataset_name not in self._datasets:
            raise LookupError(f"Dataset not found: {dataset_name}")

        return self._datasets[dataset_name]

    def __getitem__(self, dataset: str) -> Dataset:
        """Lookup a dataset.

        :param dataset: A dataset name.

        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.

        :raises LookupError: If :code:`dataset` is not found.
        """
        return self.dataset(dataset)

    def __setitem__(self, key: str, dataset: Dataset):
        """Add a dataset to the collection, or replace an existing one.

        :param key: The name to store the dataset under.

        :param dataset: The dataset to store.
        """
        dataset_name = resolve_uri_protocol(key)

        self._datasets[dataset_name] = dataset
        if dataset.hidden:
            # The key may previously have referred to a visible dataset. Drop
            # any stale entry so that size and len() stay accurate.
            self._visible_datasets.discard(dataset_name)
        else:
            self._visible_datasets.add(dataset_name)

    def __delitem__(self, dataset: str):
        """Remove a dataset from the collection.

        Only the entry in this collection is removed. Any underlying storage
        used by the dataset is not touched.

        :param dataset: The name of a dataset.

        :raises KeyError: If :code:`dataset` is not in the collection.
        """
        dataset_name = resolve_uri_protocol(dataset)
        # Hidden datasets are never in the visible set, so use discard().
        self._visible_datasets.discard(dataset_name)
        del self._datasets[dataset_name]

    def __contains__(self, dataset: str) -> bool:
        """Returns whether the dataset is contained."""
        try:
            self.dataset(dataset)
        except LookupError:
            return False
        return True

    def benchmarks(self, with_deprecated: bool = False) -> Iterable[Benchmark]:
        """Enumerate the (possibly infinite) benchmarks lazily.

        Benchmarks order is consistent across runs. One benchmark from each
        dataset is returned in round robin order until all datasets have been
        fully enumerated. The order of :meth:`benchmarks()
        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.

        :param with_deprecated: If :code:`True`, include benchmarks from
            datasets that have been marked deprecated.

        :return: An iterable sequence of :class:`Benchmark
            <compiler_gym.datasets.Benchmark>` instances.
        """
        return round_robin_iterables(
            (d.benchmarks() for d in self.datasets(with_deprecated=with_deprecated))
        )

    def benchmark_uris(self, with_deprecated: bool = False) -> Iterable[str]:
        """Enumerate the (possibly infinite) benchmark URIs.

        Benchmark URI order is consistent across runs. URIs from datasets are
        returned in round robin order. The order of :meth:`benchmarks()
        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.

        :param with_deprecated: If :code:`True`, include benchmarks from
            datasets that have been marked deprecated.

        :return: An iterable sequence of benchmark URI strings.
        """
        return round_robin_iterables(
            (d.benchmark_uris() for d in self.datasets(with_deprecated=with_deprecated))
        )

    def benchmark(self, uri: str) -> Benchmark:
        """Select a benchmark.

        Returns the corresponding :class:`Benchmark
        <compiler_gym.datasets.Benchmark>`, regardless of whether the containing
        dataset is installed or deprecated.

        :param uri: The URI of the benchmark to return.

        :return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
            instance.

        :raises ValueError: If :code:`uri` is not a valid benchmark URI.

        :raises LookupError: If the dataset named in :code:`uri` is not in the
            collection. This is raised as a :code:`KeyError`.
        """
        uri = resolve_uri_protocol(uri)

        match = BENCHMARK_URI_RE.match(uri)
        if not match:
            raise ValueError(f"Invalid benchmark URI: '{uri}'")

        dataset_name = match.group("dataset")
        # Direct indexing is kept deliberately: an unknown dataset surfaces as
        # a KeyError, which is a LookupError.
        dataset = self._datasets[dataset_name]

        return dataset.benchmark(uri)

    @property
    def size(self) -> int:
        """The number of datasets in the collection that are not hidden."""
        return len(self._visible_datasets)

    def __len__(self) -> int:
        """Return the number of datasets that are not hidden."""
        return self.size
Loading

0 comments on commit 145d450

Please sign in to comment.