Skip to content

Commit

Permalink
Optimize dataset first/head and last/tail (#5407)
Browse files Browse the repository at this point in the history
* don't iterate to get first and last

* update dataset and include head tail

* handle no results

* fix field spec

* updates

* must use existing implementation for views

---------

Co-authored-by: brimoor <brimoor@umich.edu>
kaixi-wang and brimoor authored Jan 21, 2025
1 parent d228b88 commit d306a36
Showing 5 changed files with 239 additions and 75 deletions.
2 changes: 1 addition & 1 deletion fiftyone/__public__.py
Original file line number Diff line number Diff line change
@@ -163,7 +163,7 @@
Run,
RunResults,
)
from .core.sample import Sample
from .core.sample import Sample, SampleView
from .core.threed import (
BoxGeometry,
CylinderGeometry,
68 changes: 11 additions & 57 deletions fiftyone/core/collections.py
Original file line number Diff line number Diff line change
@@ -923,10 +923,7 @@ def first(self):
a :class:`fiftyone.core.sample.Sample` or
:class:`fiftyone.core.sample.SampleView`
"""
try:
return next(iter(self))
except StopIteration:
raise ValueError("%s is empty" % self.__class__.__name__)
raise NotImplementedError("Subclass must implement first()")

def last(self):
"""Returns the last sample in the collection.
@@ -935,7 +932,7 @@ def last(self):
a :class:`fiftyone.core.sample.Sample` or
:class:`fiftyone.core.sample.SampleView`
"""
return self[-1:].first()
raise NotImplementedError("Subclass must implement last()")

def head(self, num_samples=3):
"""Returns a list of the first few samples in the collection.
@@ -947,9 +944,10 @@ def head(self, num_samples=3):
num_samples (3): the number of samples
Returns:
a list of :class:`fiftyone.core.sample.Sample` objects
a list of :class:`fiftyone.core.sample.Sample` or
:class:`fiftyone.core.sample.SampleView` objects
"""
return [s for s in self[:num_samples]]
raise NotImplementedError("Subclass must implement head()")

def tail(self, num_samples=3):
"""Returns a list of the last few samples in the collection.
@@ -961,41 +959,14 @@ def tail(self, num_samples=3):
num_samples (3): the number of samples
Returns:
a list of :class:`fiftyone.core.sample.Sample` objects
a list of :class:`fiftyone.core.sample.Sample` or
:class:`fiftyone.core.sample.SampleView` objects
"""
return [s for s in self[-num_samples:]]
raise NotImplementedError("Subclass must implement tail()")

def one(self, expr, exact=False):
"""Returns a single sample in this collection matching the expression.
Examples::
import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F
dataset = foz.load_zoo_dataset("quickstart")
#
# Get a sample by filepath
#
# A random filepath in the dataset
filepath = dataset.take(1).first().filepath
# Get sample by filepath
sample = dataset.one(F("filepath") == filepath)
#
# Dealing with multiple matches
#
# Get a sample whose image is JPEG
sample = dataset.one(F("filepath").ends_with(".jpg"))
# Raises an error since there are multiple JPEGs
dataset.one(F("filepath").ends_with(".jpg"), exact=True)
Args:
expr: a :class:`fiftyone.core.expressions.ViewExpression` or
`MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
@@ -1008,27 +979,10 @@ def one(self, expr, exact=False):
and multiple samples match the expression
Returns:
a :class:`fiftyone.core.sample.SampleView`
a :class:`fiftyone.core.sample.Sample` or
:class:`fiftyone.core.sample.SampleView`
"""
view = self.match(expr)
matches = iter(view)

try:
sample = next(matches)
except StopIteration:
raise ValueError("No samples match the given expression")

if exact:
try:
next(matches)
raise ValueError(
"Expected one matching sample, but found %d matches"
% len(view)
)
except StopIteration:
pass

return sample
raise NotImplementedError("Subclass must implement one()")

def view(self):
"""Returns a :class:`fiftyone.core.view.DatasetView` containing the
53 changes: 36 additions & 17 deletions fiftyone/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1210,26 +1210,43 @@ def _frame_collstats(self):

return _get_collstats(self._frame_collection)

def _get_first(self, limit=1, reverse=False):
direction = -1 if reverse else 1
pipeline = [{"$sort": {"_id": direction}}, {"$limit": limit}]

return self._aggregate(
pipeline=pipeline, detach_frames=True, detach_groups=True
)

def first(self):
"""Returns the first sample in the dataset.
Returns:
a :class:`fiftyone.core.sample.Sample`
"""
return super().first()
cursor = self._get_first()

try:
d = next(cursor)
except StopIteration:
raise ValueError("%s is empty" % self.__class__.__name__)

return self._make_sample(d)

def last(self):
"""Returns the last sample in the dataset.
Returns:
a :class:`fiftyone.core.sample.Sample`
"""
cursor = self._get_first(reverse=True)

try:
sample_view = self[-1:].first()
except ValueError:
d = next(cursor)
except StopIteration:
raise ValueError("%s is empty" % self.__class__.__name__)

return fos.Sample.from_doc(sample_view._doc, dataset=self)
return self._make_sample(d)

def head(self, num_samples=3):
"""Returns a list of the first few samples in the dataset.
@@ -1243,10 +1260,8 @@ def head(self, num_samples=3):
Returns:
a list of :class:`fiftyone.core.sample.Sample` objects
"""
return [
fos.Sample.from_doc(sv._doc, dataset=self)
for sv in self[:num_samples]
]
cursor = self._get_first(limit=num_samples)
return [self._make_sample(d) for d in cursor]

def tail(self, num_samples=3):
"""Returns a list of the last few samples in the dataset.
@@ -1260,10 +1275,10 @@ def tail(self, num_samples=3):
Returns:
a list of :class:`fiftyone.core.sample.Sample` objects
"""
return [
fos.Sample.from_doc(sv._doc, dataset=self)
for sv in self[-num_samples:]
]
cursor = self._get_first(reverse=True, limit=num_samples)
samples = [self._make_sample(d) for d in cursor]
samples.reverse()
return samples

def one(self, expr, exact=False):
"""Returns a single sample in this dataset matching the expression.
@@ -1294,7 +1309,7 @@ def one(self, expr, exact=False):
sample = dataset.one(F("filepath").ends_with(".jpg"))
# Raises an error since there are multiple JPEGs
dataset.one(F("filepath").ends_with(".jpg"), exact=True)
_ = dataset.one(F("filepath").ends_with(".jpg"), exact=True)
Args:
expr: a :class:`fiftyone.core.expressions.ViewExpression` or
@@ -1303,11 +1318,16 @@ def one(self, expr, exact=False):
exact (False): whether to raise an error if multiple samples match
the expression
Raises:
ValueError: if no samples match the expression or if ``exact=True``
and multiple samples match the expression
Returns:
a :class:`fiftyone.core.sample.Sample`
"""
view = self.match(expr)
matches = iter(view._aggregate())
limit = 2 if exact else 1
view = self.match(expr).limit(limit)
matches = iter(view._aggregate(detach_frames=True, detach_groups=True))

try:
d = next(matches)
@@ -1318,8 +1338,7 @@ def one(self, expr, exact=False):
try:
next(matches)
raise ValueError(
"Expected one matching sample, but found %d matches"
% len(view)
"Expected one matching sample, but found multiple"
)
except StopIteration:
pass
113 changes: 113 additions & 0 deletions fiftyone/core/view.py
Original file line number Diff line number Diff line change
@@ -439,6 +439,119 @@ def _make_view_stages_str(self):
]
)

def first(self):
    """Returns the first sample in the view.

    Raises:
        ValueError: if the view is empty

    Returns:
        a :class:`fiftyone.core.sample.SampleView`
    """
    for sample in self:
        # Only the first element produced by iteration is needed
        return sample

    raise ValueError("%s is empty" % self.__class__.__name__)

def last(self):
    """Returns the last sample in the view.

    Takes a slice containing only the final sample and delegates to that
    slice's ``first()``.

    Returns:
        a :class:`fiftyone.core.sample.SampleView`
    """
    final_slice = self[-1:]
    return final_slice.first()

def head(self, num_samples=3):
    """Returns a list of the first few samples in the view.

    If fewer than ``num_samples`` samples are in the view, only the
    available samples are returned.

    Args:
        num_samples (3): the number of samples

    Returns:
        a list of :class:`fiftyone.core.sample.SampleView` objects
    """
    samples = []

    # Plain iteration (rather than ``list(...)``) so that only ``iter()`` is
    # invoked on the slice and no length hint is requested from it
    for sample in self[:num_samples]:
        samples.append(sample)

    return samples

def tail(self, num_samples=3):
    """Returns a list of the last few samples in the view.

    If fewer than ``num_samples`` samples are in the view, only the
    available samples are returned.

    Args:
        num_samples (3): the number of samples. If ``0``, an empty list is
            returned

    Returns:
        a list of :class:`fiftyone.core.sample.SampleView` objects
    """
    if num_samples == 0:
        # ``self[-0:]`` is ``self[0:]``, which would select the entire view
        # rather than nothing
        return []

    return [s for s in self[-num_samples:]]

def one(self, expr, exact=False):
    """Returns a single sample in this view matching the expression.

    Examples::

        import fiftyone as fo
        import fiftyone.zoo as foz
        from fiftyone import ViewField as F

        dataset = foz.load_zoo_dataset("quickstart")
        view = dataset.select_fields()

        #
        # Get a sample by filepath
        #

        # A random filepath in the view
        filepath = view.take(1).first().filepath

        # Get sample by filepath
        sample = view.one(F("filepath") == filepath)

        #
        # Dealing with multiple matches
        #

        # Get a sample whose image is JPEG
        sample = view.one(F("filepath").ends_with(".jpg"))

        # Raises an error since there are multiple JPEGs
        _ = view.one(F("filepath").ends_with(".jpg"), exact=True)

    Args:
        expr: a :class:`fiftyone.core.expressions.ViewExpression` or
            `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
            that evaluates to ``True`` for the sample to match
        exact (False): whether to raise an error if multiple samples match
            the expression

    Raises:
        ValueError: if no samples match the expression or if ``exact=True``
            and multiple samples match the expression

    Returns:
        a :class:`fiftyone.core.sample.SampleView`
    """
    # One extra document is enough to detect an ambiguous match, so the
    # query never needs to return more than two results
    max_docs = 1
    if exact:
        max_docs = 2

    candidates = self.match(expr).limit(max_docs)
    docs = iter(
        candidates._aggregate(detach_frames=True, detach_groups=True)
    )

    try:
        doc = next(docs)
    except StopIteration:
        raise ValueError("No samples match the given expression")

    if exact:
        try:
            next(docs)
        except StopIteration:
            # Exactly one match, as required
            pass
        else:
            raise ValueError(
                "Expected one matching sample, but found multiple"
            )

    return self._make_sample(doc)

def view(self):
"""Returns a copy of this view.
78 changes: 78 additions & 0 deletions tests/unittests/dataset_tests.py
Original file line number Diff line number Diff line change
@@ -968,6 +968,84 @@ def test_summary_fields(self):
self.assertFalse("frames_gt_confidence_by_label.min" in db_indexes)
self.assertFalse("frames_gt_confidence_by_label.max" in db_indexes)

@drop_datasets
def test_first_last_head_tail(self):
    """Checks ``first()``, ``last()``, ``head()`` and ``tail()`` on a
    51-sample dataset and on a ``select_fields("index")`` view of it,
    asserting both the returned sample type and the ``index`` ordering.
    """
    dataset = fo.Dataset()
    dataset.add_samples(
        [
            fo.Sample(filepath="image%d.jpg" % i, index=i)
            for i in range(1, 52)
        ]
    )

    def _check(collection, sample_cls):
        # Single-sample accessors
        first_sample = collection.first()
        self.assertIsInstance(first_sample, sample_cls)
        self.assertEqual(first_sample.index, 1)

        last_sample = collection.last()
        self.assertIsInstance(last_sample, sample_cls)
        self.assertEqual(last_sample.index, 51)

        # List accessors, using the default of three samples
        leading = collection.head()
        self.assertIsInstance(leading, list)
        self.assertIsInstance(leading[0], sample_cls)
        self.assertListEqual([s.index for s in leading], [1, 2, 3])

        trailing = collection.tail()
        self.assertIsInstance(trailing, list)
        self.assertIsInstance(trailing[0], sample_cls)
        self.assertListEqual([s.index for s in trailing], [49, 50, 51])

    # Datasets are expected to return samples, views to return sample views
    _check(dataset, fo.Sample)
    _check(dataset.select_fields("index"), fo.SampleView)

@drop_datasets
def test_first_last_head_tail_empty(self):
    """Checks that, on an empty dataset and an empty view of it,
    ``first()``/``last()`` raise ``ValueError`` while ``head()``/``tail()``
    return empty lists.
    """
    dataset = fo.Dataset()

    def _check_empty(collection):
        # Single-sample accessors have nothing to return
        with self.assertRaises(ValueError):
            _ = collection.first()

        with self.assertRaises(ValueError):
            _ = collection.last()

        # List accessors degrade to empty lists
        leading = collection.head()
        self.assertListEqual(leading, [])

        trailing = collection.tail()
        self.assertListEqual(trailing, [])

    _check_empty(dataset)
    _check_empty(dataset.select_fields())

@drop_datasets
def test_iter_samples(self):
dataset = fo.Dataset()

0 comments on commit d306a36

Please sign in to comment.