Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvement/revamp cursor iteration #44

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
27 changes: 9 additions & 18 deletions appnexus/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def __init__(self, client, service_name, representation, **specs):
self.service_name = service_name
self.representation = representation
self.specs = specs
self.retrieved = 0
self._skip = 0
self._limit = float('inf')

Expand All @@ -39,21 +38,11 @@ def __getitem__(self, idx):

def __iter__(self):
"""Iterate over all AppNexus objects matching the specifications"""
retrieved = 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment from @rambobinator
The retrieved variable is not used anymore, remove it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable removed in last commits

for page in self.iter_pages():
data = self.extract_data(page)
if self._skip >= len(data):
self._skip -= len(data)
continue
elif self._skip:
self._skip = 0
data = data[self._skip:]
lasting = self._limit - self.retrieved
if not lasting:
break
elif lasting < len(data):
data = data[:lasting]
for entity in data:
self.retrieved += 1
retrieved += 1
yield entity

def extract_data(self, page):
Expand Down Expand Up @@ -86,15 +75,17 @@ def get_page(self, start_element=0, num_elements=None):
specs.update(start_element=start_element, num_elements=num_elements)
return self.client.get(self.service_name, **specs)

def iter_pages(self, skip_elements=0):
def iter_pages(self):
"""Iterate as much as needed to get all available pages"""
start_element = skip_elements
start_element = self._skip
num_elements = min(self._limit, self.batch_size)
count = -1
while start_element < count or count == -1:
page = self.get_page(start_element)
page = self.get_page(start_element, num_elements)
shnups marked this conversation as resolved.
Show resolved Hide resolved
yield page
start_element = page["start_element"] + page["num_elements"]
count = page["count"]
start_element = start_element + page["num_elements"]
num_elements = min(page["count"] - num_elements, self.batch_size)
count = min(page["count"], self._skip + self._limit)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we just keep num_elements, and entirely remove count from the method?

Copy link
Member Author

@shnups shnups Sep 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't address this comment.
I remember trying hard to handle all corners cases and also avoiding all these intermediate variables and always bumping into a problem or having a pretty unreadable code.
I think the code is both functional and maintanable as is so I would keep it that way.


def count(self):
"""Returns the number of elements matching the specifications"""
Expand Down
144 changes: 142 additions & 2 deletions tests/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from appnexus.client import AppNexusClient
from appnexus.cursor import Cursor

from .helpers import gen_random_collection
from .helpers import gen_ordered_collection, gen_random_collection

COLLECTION_SIZE = 324


@pytest.fixture
Expand Down Expand Up @@ -55,7 +57,12 @@ def response_dict2():

@pytest.fixture
def random_response_dict():
return gen_random_collection(count=324)
return gen_random_collection(count=COLLECTION_SIZE)


@pytest.fixture
def ordered_response_dict():
return gen_ordered_collection(start_element=0, count=COLLECTION_SIZE)


@pytest.fixture
Expand All @@ -74,6 +81,31 @@ def random_cursor(mocker, random_response_dict):
return Cursor(client, "campaign", representations.raw)


@pytest.fixture
def ordered_cursor(mocker, ordered_response_dict):
client = AppNexusClient("test", "test")
mocker.patch.object(client, "get")
client.get.side_effect = ordered_response_dict
return Cursor(client, "campaign", representations.raw)


def mock_ordered_cursor(mocker, start=0, count=COLLECTION_SIZE, factor=1):
client = AppNexusClient("test", "test")
mocker.patch.object(client, "get")
client.get.side_effect = gen_ordered_collection(start, count) * factor
cursor = Cursor(client, "campaign", representations.raw)
mocker.patch.object(cursor, "get_page", wraps=cursor.get_page)
return cursor


@pytest.fixture
def double_ordered_cursor(mocker, ordered_response_dict):
client = AppNexusClient("test", "test")
mocker.patch.object(client, "get")
client.get.side_effect = ordered_response_dict * 2
return Cursor(client, "campaign", representations.raw)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixture doesn't seem used anywhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed



def test_cursor_count(cursor, response_dict):
assert cursor.count() == response_dict["count"]

Expand Down Expand Up @@ -164,3 +196,111 @@ def test_uncallable_representation():
def test_requests_volume_on_iteration(cursor):
_ = [r for r in cursor]
assert cursor.client.get.call_count == 1


def test_skip_none(mocker):
cursor = mock_ordered_cursor(mocker, start=0, count=COLLECTION_SIZE)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you use this rather than your ordered_cursor fixture?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No good reason in this case, updated

results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE
assert results[0]['id'] == 0
assert results[-1]['id'] == COLLECTION_SIZE - 1
assert cursor.get_page.call_count == 4


def test_skip_ten(mocker):
skip = 10
cursor = mock_ordered_cursor(mocker, start=skip, count=COLLECTION_SIZE)
cursor.skip(skip)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure to understand why you mock_ordered_cursor(..., start=skip, ...) and then cursor.skip(skip), can you explain?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The goal of this test and the ones following is to validate that the cursor is iterating over pages properly taking skip and limit parmeters into account. These parameters are setting up what is going to be asked of the API, not doing some post-request manipulation of the results.
The start, count or factor arguments given to mock_ordered_cursor are used to simulate the expected response from the API. Therefore, we need to align what we want (ie. setting up the cursor) with what is expected to be returned by the client.get method that we are patching, feeding gen_collection results to it.

I think the tests are ok as is (used them to validate the algo changes) though I admit that they could be a bit better.
One potentially better approach would be to actually monkey patch request.get and similar to change the raw results based on the query params of the url requested. I felt this was a bit overkill for what I wanted to achieve here.

results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE - skip
assert results[0]['id'] == skip
assert results[-1]['id'] == COLLECTION_SIZE - 1
assert cursor.get_page.call_count == 4


def test_skip_hundred_ten(mocker):
skip = 110
cursor = mock_ordered_cursor(mocker, start=skip, count=COLLECTION_SIZE)
cursor.skip(skip)
results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE - skip
assert results[0]['id'] == skip
assert results[-1]['id'] == COLLECTION_SIZE - 1
assert cursor.get_page.call_count == 3


def test_skip_twice(mocker):
skip = 10
cursor = mock_ordered_cursor(mocker, start=skip, count=COLLECTION_SIZE,
factor=2)
cursor.skip(skip)
results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE - skip
assert results[0]['id'] == skip
assert cursor.get_page.call_count == 4
results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE - skip
assert results[0]['id'] == skip
assert cursor.get_page.call_count == 8


def test_limit_ten(mocker):
limit = 10
cursor = mock_ordered_cursor(mocker, start=0, count=limit)
cursor.limit(limit)
results = [r for r in cursor]
assert len(results) == limit
assert results[0]['id'] == 0
assert results[-1]['id'] == limit - 1
assert cursor.get_page.call_count == 1


def test_limit_hundred_ten(mocker):
limit = 110
cursor = mock_ordered_cursor(mocker, start=0, count=limit)
cursor.limit(limit)
results = [r for r in cursor]
assert len(results) == limit
assert results[0]['id'] == 0
assert results[-1]['id'] == limit - 1
assert cursor.get_page.call_count == 2


def test_limit_thousand(mocker):
limit = 1000
cursor = mock_ordered_cursor(mocker, start=0, count=COLLECTION_SIZE)
cursor.limit(limit)
results = [r for r in cursor]
assert len(results) == COLLECTION_SIZE
assert results[0]['id'] == 0
assert results[-1]['id'] == COLLECTION_SIZE - 1
assert cursor.get_page.call_count == 4


def test_limit_twice(mocker):
limit = 50
cursor = mock_ordered_cursor(mocker, start=0, count=limit, factor=2)
cursor.limit(limit)
results = [r for r in cursor]
assert len(results) == limit
assert results[0]['id'] == 0
assert results[-1]['id'] == limit - 1
assert cursor.get_page.call_count == 1
results = [r for r in cursor]
assert len(results) == limit
assert results[0]['id'] == 0
assert results[-1]['id'] == limit - 1
assert cursor.get_page.call_count == 2


def test_skip_and_limit(mocker):
skip = 10
limit = 150
cursor = mock_ordered_cursor(mocker, start=skip, count=skip + limit)
cursor.skip(skip)
cursor.limit(limit)
results = [r for r in cursor]
assert len(results) == limit
assert results[0]['id'] == skip
assert results[-1]['id'] == limit + skip - 1
assert cursor.get_page.call_count == 2
57 changes: 37 additions & 20 deletions tests/helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
import random


def gen_random_object():
return {"id": random.randrange(1000000)}
def gen_collection(object_generator_func, start_element=0, count=None,
object_type="campaigns"):
if count is None:
random.randrange(10000)
result = []
i = 0
volume = count - start_element
for i in range(volume // 100):
page = gen_page(object_generator_func, count=count,
object_type=object_type,
start_element=start_element + i * 100)
result.append(page)
if volume % 100 != 0:
i = i + 1 if len(result) else 0
page = gen_page(object_generator_func, count=count,
object_type=object_type,
start_element=start_element + i * 100,
num_elements=volume % 100)
result.append(page)
return result
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you keep gen_collection under gen_page so that we can see the actual diff? FYI, the functions are ordered that way because we usually order the functions "à la C", i.e. with functions used in other functions at the top, although this is not a strict convention nor something important.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done



def gen_random_page(num_elements=None, start_element=0, count=None,
object_type="campaigns"):
def gen_page(object_generator_func, start_element=0, num_elements=None,
count=None, object_type="campaigns"):
if not object_generator_func or not callable(object_generator_func):
raise ValueError("object_generator_func has to be set and callable")
if count is None:
count = random.randrange(10000)
if num_elements is None:
Expand All @@ -17,22 +37,19 @@ def gen_random_page(num_elements=None, start_element=0, count=None,
"start_element": start_element,
"num_elements": num_elements,
"count": count,
object_type: [gen_random_object() for _ in range(num_elements)]
object_type: [object_generator_func(start_element + index)
for index in range(num_elements)]
}


def gen_random_collection(count=None, object_type="campaigns"):
if count is None:
count = random.randrange(10000)
result = []
i = 0
for i in range(count // 100):
random_page = gen_random_page(count=count, object_type=object_type,
start_element=i * 100)
result.append(random_page)
if count % 100 != 0:
random_page = gen_random_page(count=count, object_type=object_type,
start_element=i * 100,
num_elements=count % 100)
result.append(random_page)
return result
def gen_random_collection(start_element=0, count=None,
object_type="campaigns"):
return gen_collection(
object_generator_func=lambda index: {"id": random.randrange(1000000)},
start_element=start_element, count=count, object_type=object_type)


def gen_ordered_collection(start_element, count, object_type="campaigns"):
return gen_collection(
object_generator_func=lambda index: {"id": index},
start_element=start_element, count=count, object_type=object_type)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it would be more readable/maintainable to add a random: bool parameter to gen_collection and gen_page than having these two very similar functions, what do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, changes made