
Commit e3b51ee

feat: Use ScrapyProcessProtocol instead of Job (from #359)

1 parent 9451b71 · commit e3b51ee

11 files changed: +74 -82 lines

docs/news.rst (+3 -2)

@@ -42,7 +42,6 @@ Web UI
 API
 ^^^

-- The ``Access-Control-Allow-Methods`` response header contains only the HTTP methods to which webservices respond.
 - Clarify error messages, for example:

   - ``'project' parameter is required``, instead of ``'project'`` (KeyError)
@@ -79,6 +78,7 @@ Library
 - ``sorted_versions`` to ``scrapyd.eggstorage``
 - ``get_crawl_args`` to ``scrapyd.launcher``

+- :ref:`jobstorage` uses the ``ScrapyProcessProtocol`` class, by default. If :ref:`jobstorage` is set to ``scrapyd.jobstorage.SqliteJobStorage``, Scrapyd 1.3.0 uses a ``Job`` class, instead.
 - Move the ``activate_egg`` function from the ``scrapyd.eggutils`` module to its caller, the ``scrapyd.runner`` module.
 - Move the ``job_items_url`` and ``job_log_url`` functions from the ``scrapyd.jobstorage`` module to the ``scrapyd.utils`` module. :ref:`jobstorage` is not responsible for URLs.
 - Change the ``get_crawl_args`` function to no longer convert ``bytes`` to ``str``, as already done by its caller.
@@ -100,7 +100,8 @@ Fixed
 API
 ^^^

-- The Content-Length header counts the number of bytes, instead of the number of characters.
+- The ``Content-Length`` header counts the number of bytes, instead of the number of characters.
+- The ``Access-Control-Allow-Methods`` response header contains only the HTTP methods to which webservices respond.
 - The :ref:`schedule.json` webservice sets the ``node_name`` field in error responses.
 - The next pending job for all but one project was unreported by the :ref:`daemonstatus.json` and :ref:`listjobs.json` webservices, and was not cancellable by the :ref:`cancel.json` webservice.
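
The ``Content-Length`` entry above is worth a one-line illustration: for non-ASCII response bodies, the character count and the byte count differ, and only the byte count is a valid Content-Length. A minimal sketch in plain Python (the JSON body is made up for illustration, not taken from Scrapyd):

    # Hypothetical JSON body containing a non-ASCII spider name.
    body = '{"spider": "café"}'

    print(len(body))                  # 18 characters
    print(len(body.encode("utf-8")))  # 19 bytes -- the value Content-Length should report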

scrapyd/jobstorage.py (+6 -30)

@@ -3,38 +3,11 @@
 Job storage was previously in-memory only and managed by the launcher.
 """

-import datetime
-
 from zope.interface import implementer

 from scrapyd import sqlite
 from scrapyd.interfaces import IJobStorage
-
-
-class Job:
-    def __init__(self, project, spider, job=None, start_time=None, end_time=None):
-        self.project = project
-        self.spider = spider
-        self.job = job
-        self.start_time = start_time if start_time else datetime.datetime.now()
-        self.end_time = end_time if end_time else datetime.datetime.now()
-
-    # For equality assertions in tests.
-    def __eq__(self, other):
-        return (
-            self.project == other.project
-            and self.spider == other.spider
-            and self.job == other.job
-            and self.start_time == other.start_time
-            and self.end_time == other.end_time
-        )
-
-    # For error messsages in tests.
-    def __repr__(self):
-        return (
-            f"Job(project={self.project}, spider={self.spider}, job={self.job}, "
-            f"start_time={self.start_time}, end_time={self.end_time})"
-        )
+from scrapyd.launcher import ScrapyProcessProtocol


 @implementer(IJobStorage)
@@ -74,5 +47,8 @@ def __len__(self):
         return len(self.jobs)

     def __iter__(self):
-        for project, spider, job, start_time, end_time in self.jobs:
-            yield Job(project=project, spider=spider, job=job, start_time=start_time, end_time=end_time)
+        for project, spider, jobid, start_time, end_time in self.jobs:
+            job = ScrapyProcessProtocol(project, spider, jobid, env={}, args=[])
+            job.start_time = start_time
+            job.end_time = end_time
+            yield job
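
After this change, iterating a job storage yields ``ScrapyProcessProtocol`` objects for finished jobs, the same class used for running jobs, so callers can read the same attributes everywhere. A rough sketch of a consumer, assuming the storage is constructed with a ``Config`` instance and exposes ``add()``, as the ``IJobStorage`` interface used in Scrapyd's tests suggests (neither signature is shown in this diff):

    import datetime

    from scrapyd.config import Config
    from scrapyd.jobstorage import MemoryJobStorage
    from scrapyd.launcher import ScrapyProcessProtocol

    storage = MemoryJobStorage(Config())

    # Record a finished job.
    finished = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])
    finished.end_time = datetime.datetime.now()
    storage.add(finished)

    for job in storage:  # each item is a ScrapyProcessProtocol
        print(job.project, job.spider, job.job, job.start_time, job.end_time)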

scrapyd/launcher.py (+17 -4)

@@ -88,21 +88,34 @@ def _get_max_proc(self, config):
 # https://docs.twisted.org/en/stable/api/twisted.internet.protocol.ProcessProtocol.html
 class ScrapyProcessProtocol(protocol.ProcessProtocol):
     def __init__(self, project, spider, job, env, args):
-        self.pid = None
         self.project = project
         self.spider = spider
         self.job = job
+        self.pid = None
         self.start_time = datetime.datetime.now()
         self.end_time = None
-        self.env = env
         self.args = args
+        self.env = env
         self.deferred = defer.Deferred()

+    # For equality assertions in tests.
+    def __eq__(self, other):
+        return (
+            self.project == other.project
+            and self.spider == other.spider
+            and self.job == other.job
+            and self.pid == other.pid
+            and self.start_time == other.start_time
+            and self.end_time == other.end_time
+            and self.args == other.args
+            and self.env == other.env
+        )
+
     # For error messsages in tests.
     def __repr__(self):
         return (
-            f"ScrapyProcessProtocol(pid={self.pid} project={self.project} spider={self.spider} job={self.job} "
-            f"start_time={self.start_time} end_time={self.end_time} env={self.env} args={self.args})"
+            f"ScrapyProcessProtocol(project={self.project} spider={self.spider} job={self.job} pid={self.pid} "
+            f"start_time={self.start_time} end_time={self.end_time} args={self.args} env={self.env})"
         )

     def outReceived(self, data):
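
The new ``__eq__`` compares every field, including ``start_time`` (which ``__init__`` stamps with ``datetime.datetime.now()``), so two protocols only compare equal once their timestamps are aligned, which is what the tests do. A small sketch using only what this diff shows:

    import datetime

    from scrapyd.launcher import ScrapyProcessProtocol

    a = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])
    b = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])

    # Align the timestamps set by __init__ before comparing.
    a.start_time = b.start_time = datetime.datetime(2001, 2, 3, 4, 5, 6)

    assert a == b
    assert repr(a).startswith("ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid=None start_time=")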

scrapyd/sqlite.py (+2)

@@ -29,6 +29,8 @@ def __init__(self, database, table):
     def __len__(self):
         return self.conn.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()[0]

+    # SQLite JSON is enabled by default since 3.38.0 (2022-02-22), and JSONB is available since 3.45.0 (2024-01-15).
+    # https://sqlite.org/json1.html
     def encode(self, obj):
         return sqlite3.Binary(json.dumps(obj).encode("ascii"))

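
The comment added here documents the storage format rather than changing it: objects are serialized to JSON text and stored as bytes. A standalone round-trip of the same idea (the matching ``decode`` is assumed to mirror ``encode``; it is not part of this diff):

    import json
    import sqlite3

    obj = {"name": "quotesbot", "priority": 0.0}

    # Store as bytes, as the encode method above does.
    blob = sqlite3.Binary(json.dumps(obj).encode("ascii"))

    # Read back: bytes -> str -> Python object.
    assert json.loads(bytes(blob).decode("ascii")) == obj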

tests/__init__.py (+14)

@@ -1,9 +1,12 @@
+import datetime
 import io
 import os.path
 import pkgutil

 from twisted.logger import eventAsText

+from scrapyd.launcher import ScrapyProcessProtocol
+

 def get_egg_data(basename):
     return pkgutil.get_data("tests", f"fixtures/{basename}.egg")
@@ -19,3 +22,14 @@ def root_add_version(root, project, version, basename):

 def get_message(captured):
     return eventAsText(captured[0]).split(" ", 1)[1]
+
+
+def get_finished_job(project="p1", spider="s1", job="j1", start_time=None, end_time=None):
+    if start_time is None:
+        start_time = datetime.datetime.now()
+    if end_time is None:
+        end_time = datetime.datetime.now()
+    process = ScrapyProcessProtocol(project, spider, job, {}, [])
+    process.start_time = start_time
+    process.end_time = end_time
+    return process

tests/test_job.py (-16)

This file was deleted.

tests/test_jobstorage.py (+5 -4)

@@ -4,11 +4,12 @@

 from scrapyd.config import Config
 from scrapyd.interfaces import IJobStorage
-from scrapyd.jobstorage import Job, MemoryJobStorage, SqliteJobStorage
+from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage
+from tests import get_finished_job

-job1 = Job("p1", "s1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))
-job2 = Job("p2", "s2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))
-job3 = Job("p3", "s3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))
+job1 = get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))
+job2 = get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))
+job3 = get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))


 def pytest_generate_tests(metafunc):

tests/test_launcher.py (+1 -1)

@@ -183,4 +183,4 @@ def test_process_ended_terminated(environ, process):


 def test_repr(process):
-    assert repr(process).startswith(f"ScrapyProcessProtocol(pid={process.pid} project=p1 spider=s1 job=j1 start_time=")
+    assert repr(process).startswith(f"ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid={process.pid} start_time=")

tests/test_sqlite.py (+4 -4)

@@ -2,8 +2,8 @@

 import pytest

-from scrapyd.jobstorage import Job
 from scrapyd.sqlite import JsonSqlitePriorityQueue, SqliteFinishedJobs
+from tests import get_finished_job


 @pytest.fixture()
@@ -14,9 +14,9 @@ def jsonsqlitepriorityqueue():
 @pytest.fixture()
 def sqlitefinishedjobs():
     q = SqliteFinishedJobs(":memory:")
-    q.add(Job("p1", "s1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)))
-    q.add(Job("p2", "s2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)))
-    q.add(Job("p3", "s3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)))
+    q.add(get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)))
+    q.add(get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)))
+    q.add(get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)))
     return q


tests/test_webservice.py (+14 -13)

@@ -4,20 +4,19 @@
 import os
 import re
 import sys
-from unittest.mock import MagicMock, call
+from unittest.mock import MagicMock, PropertyMock, call

 import pytest
 from twisted.logger import LogLevel, capturedLogs
 from twisted.web import error

 from scrapyd.exceptions import DirectoryTraversalError, RunnerError
 from scrapyd.interfaces import IEggStorage
-from scrapyd.jobstorage import Job
 from scrapyd.launcher import ScrapyProcessProtocol
 from scrapyd.webservice import spider_list
-from tests import get_egg_data, get_message, has_settings, root_add_version
+from tests import get_egg_data, get_finished_job, get_message, has_settings, root_add_version

-job1 = Job(
+job1 = get_finished_job(
     project="p1",
     spider="s1",
     job="j1",
@@ -30,7 +29,9 @@
 def scrapy_process():
     process = ScrapyProcessProtocol(project="p1", spider="s1", job="j1", env={}, args=[])
     process.start_time = datetime.datetime(2001, 2, 3, 4, 5, 6, 9)
+    process.end_time = datetime.datetime(2001, 2, 3, 4, 5, 6, 10)
     process.transport = MagicMock()
+    type(process.transport).pid = PropertyMock(return_value=12345)
     return process


@@ -290,8 +291,8 @@ def test_status(txrequest, root, scrapy_process, args):
     root.update_projects()

     if args:
-        root.launcher.finished.add(Job(project="p2", spider="s2", job="j1"))
-        root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j1", {}, [])
+        root.launcher.finished.add(get_finished_job("p2", "s2", "j1"))
+        root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j1", env={}, args=[])
         root.poller.queues["p2"].add("s2", _job="j1")

     expected = {"currstate": None}
@@ -325,8 +326,8 @@ def test_list_jobs(txrequest, root, scrapy_process, args):
     root.update_projects()

     if args:
-        root.launcher.finished.add(Job(project="p2", spider="s2", job="j2"))
-        root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2", {}, [])
+        root.launcher.finished.add(get_finished_job("p2", "s2", "j2"))
+        root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2", env={}, args=[])
         root.poller.queues["p2"].add("s2", _job="j2")

     expected = {"pending": [], "running": [], "finished": []}
@@ -336,9 +337,9 @@ def test_list_jobs(txrequest, root, scrapy_process, args):

     expected["finished"].append(
         {
-            "id": "j1",
             "project": "p1",
             "spider": "s1",
+            "id": "j1",
             "start_time": "2001-02-03 04:05:06.000007",
             "end_time": "2001-02-03 04:05:06.000008",
             "items_url": "/items/p1/s1/j1.jl",
@@ -351,11 +352,11 @@ def test_list_jobs(txrequest, root, scrapy_process, args):

     expected["running"].append(
         {
-            "id": "j1",
             "project": "p1",
             "spider": "s1",
-            "start_time": "2001-02-03 04:05:06.000009",
+            "id": "j1",
             "pid": None,
+            "start_time": "2001-02-03 04:05:06.000009",
         }
     )
     assert_content(txrequest, root, "GET", "listjobs", args, expected)
@@ -371,9 +372,9 @@ def test_list_jobs(txrequest, root, scrapy_process, args):

     expected["pending"].append(
         {
-            "id": "j1",
             "project": "p1",
             "spider": "s1",
+            "id": "j1",
             "version": "0.1",
             "settings": {"DOWNLOAD_DELAY=2": "TRACK=Cause = Time"},
             "args": {"other": "one"},
@@ -645,7 +646,7 @@ def test_cancel(txrequest, root, scrapy_process, args):

     root.launcher.processes[0] = scrapy_process
     root.launcher.processes[1] = scrapy_process
-    root.launcher.processes[2] = ScrapyProcessProtocol("p2", "s2", "j2", {}, [])
+    root.launcher.processes[2] = ScrapyProcessProtocol("p2", "s2", "j2", env={}, args=[])

     expected["prevstate"] = "running"
     assert_content(txrequest, root, "POST", "cancel", args, expected)

tests/test_website.py (+8 -8)

@@ -7,10 +7,9 @@
 from twisted.web.test.requesthelper import DummyRequest

 from scrapyd.app import application
-from scrapyd.jobstorage import Job
 from scrapyd.launcher import ScrapyProcessProtocol
 from scrapyd.website import Root
-from tests import has_settings, root_add_version
+from tests import get_finished_job, has_settings, root_add_version


 def assert_headers(txrequest):
@@ -33,7 +32,7 @@ def assert_hrefs(urls, text, header):

 # Derived from test_emptyChildUnicodeParent.
 # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py
-def test_render_logs_dir(txrequest, root):
+def test_logs_dir(txrequest, root):
     os.makedirs(os.path.join("logs", "quotesbot"))

     file = root.children[b"logs"]
@@ -49,7 +48,7 @@ def test_render_logs_dir(txrequest, root):

 # Derived from test_indexNames.
 # https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py
-def test_render_logs_file(txrequest, root):
+def test_logs_file(txrequest, root):
     os.makedirs(os.path.join("logs", "quotesbot"))
     with open(os.path.join("logs", "foo.txt"), "wb") as f:
         f.write(b"baz")
@@ -74,16 +73,16 @@ def cbRendered(ignored):

 @pytest.mark.parametrize("cancel", [True, False], ids=["cancel", "no_cancel"])
 @pytest.mark.parametrize("header", [True, False], ids=["header", "no_header"])
-def test_render_jobs(txrequest, config, cancel, header):
+def test_jobs(txrequest, config, cancel, header):
     if not cancel:
         config.cp.remove_option("services", "cancel.json")

     root = Root(config, application(config))
     root_add_version(root, "quotesbot", "0.1", "quotesbot")
     root.update_projects()

-    root.launcher.finished.add(Job("p1", "s1", "j-finished"))
-    root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j-running", {}, [])
+    root.launcher.finished.add(get_finished_job("p1", "s1", "j-finished"))
+    root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j-running", env={}, args=[])
     root.poller.queues["quotesbot"].add("quotesbot", _job="j-pending")

     if header:
@@ -117,11 +116,12 @@ def test_render_jobs(txrequest, config, cancel, header):
     else:
         assert b"<th>Cancel</th>" not in content
         assert b'/cancel.json">' not in content
+        assert b' value="j-finished">' not in content


 @pytest.mark.parametrize("with_egg", [True, False])
 @pytest.mark.parametrize("header", [True, False])
-def test_render_home(txrequest, root, with_egg, header):
+def test_home(txrequest, root, with_egg, header):
     if with_egg:
         root_add_version(root, "quotesbot", "0.1", "quotesbot")
         root.update_projects()
