Skip to content

Commit 657dff6

Browse files
Merge pull request #169 from pricingassistant/0.9.x
0.9.x
2 parents 9851294 + a6d89c4 commit 657dff6

File tree

138 files changed

+4788
-2214
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

138 files changed

+4788
-2214
lines changed

Diff for: .gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ develop-eggs
1717
lib
1818
lib64
1919
__pycache__
20+
.cache
2021

2122
# Installer logs
2223
pip-log.txt
@@ -44,3 +45,5 @@ mrq-config.py
4445
dump.rdb
4546
supervisord.pid
4647
memory_traces
48+
mrq/dashboard/static/node_modules/
49+
.vscode

Diff for: .travis.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ services:
66
env:
77
- PYTHON_BIN=python
88
- PYTHON_BIN=python3
9+
- PYTHON_BIN=/pypy/bin/pypy
910

1011
before_install:
1112
- docker ps
@@ -15,5 +16,6 @@ before_install:
1516

1617
# TODO: coveralls?
1718
script:
18-
- docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
19-
- docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pytest tests/ -v --junitxml=pytest-report.xml --cov mrq --cov-report term
19+
- docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq $PYTHON_BIN -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
20+
# - docker run -i -t -v `pwd`:/app:rw -w /app mrq_local $PYTHON_BIN -m pytest tests/ --collect-only
21+
- docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq $PYTHON_BIN -m pytest tests/ -v --junitxml=pytest-report.xml --cov mrq --cov-report term --timeout-method=thread --timeout=240

Diff for: Dockerfile

+32-9
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@ FROM debian:jessie
55
# https://github.com/docker-library/buildpack-deps/issues/40
66
#
77

8-
RUN echo \
9-
'deb ftp://ftp.us.debian.org/debian/ jessie main\n \
10-
deb ftp://ftp.us.debian.org/debian/ jessie-updates main\n \
11-
deb http://security.debian.org jessie/updates main\n' \
12-
> /etc/apt/sources.list
13-
14-
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
15-
RUN echo "deb http://repo.mongodb.org/apt/debian wheezy/mongodb-org/3.0 main" > /etc/apt/sources.list.d/mongodb-org-3.0.list
8+
# RUN echo \
9+
# 'deb ftp://ftp.us.debian.org/debian/ jessie main\n \
10+
# deb ftp://ftp.us.debian.org/debian/ jessie-updates main\n \
11+
# deb http://security.debian.org jessie/updates main\n' \
12+
# > /etc/apt/sources.list
13+
14+
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6
15+
RUN echo "deb http://repo.mongodb.org/apt/debian jessie/mongodb-org/3.4 main" > /etc/apt/sources.list.d/mongodb-org-3.4.list
1616
RUN apt-get update && \
1717
apt-get install -y --no-install-recommends \
1818
curl \
@@ -21,17 +21,27 @@ RUN apt-get update && \
2121
python-pip \
2222
python3-pip \
2323
python3-dev \
24+
make \
2425
git \
2526
vim \
26-
mongodb-org-server \
27+
bzip2 \
28+
mongodb-org \
2729
nginx redis-server \
30+
g++ \
2831
&& \
2932
apt-get clean -y && \
3033
rm -rf /var/lib/apt/lists/*
3134

35+
RUN curl -sL https://deb.nodesource.com/setup_7.x | bash -
36+
RUN apt-get install -y --no-install-recommends nodejs
37+
38+
# Download pypy
39+
RUN curl -sL 'https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.8-1-linux_x86_64-portable.tar.bz2' > /pypy.tar.bz2 && tar jxvf /pypy.tar.bz2 && rm -rf /pypy.tar.bz2 && mv /pypy-* /pypy
40+
3241
# Upgrade pip
3342
RUN pip install --upgrade --ignore-installed pip
3443
RUN pip3 install --upgrade --ignore-installed pip
44+
RUN /pypy/bin/pypy -m ensurepip
3545

3646
ADD requirements-heroku.txt /app/requirements-heroku.txt
3747
ADD requirements-base.txt /app/requirements-base.txt
@@ -50,8 +60,21 @@ RUN pip install -r /app/requirements-heroku.txt && \
5060
pip install -r /app/requirements-dashboard.txt && \
5161
rm -rf ~/.cache
5262

63+
RUN /pypy/bin/pip install -r /app/requirements-heroku.txt && \
64+
/pypy/bin/pip install -r /app/requirements-base.txt && \
65+
/pypy/bin/pip install -r /app/requirements-dev.txt && \
66+
/pypy/bin/pip install -r /app/requirements-dashboard.txt && \
67+
rm -rf ~/.cache
68+
5369
RUN mkdir -p /data/db
5470

71+
RUN ln -s /app/mrq/bin/mrq_run.py /usr/bin/mrq-run
72+
RUN ln -s /app/mrq/bin/mrq_worker.py /usr/bin/mrq-worker
73+
RUN ln -s /app/mrq/bin/mrq_agent.py /usr/bin/mrq-agent
74+
RUN ln -s /app/mrq/dashboard/app.py /usr/bin/mrq-dashboard
75+
76+
ENV PYTHONPATH /app
77+
5578
VOLUME ["/data"]
5679
WORKDIR /app
5780

Diff for: Dockerfile-with-code

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
FROM pricingassistant/mrq-env:latest
2+
3+
ADD ./mrq /app/mrq

Diff for: MANIFEST.in

-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
include *.md
2-
recursive-include mrq/supervisord_templates *
32
include requirements*
43
recursive-include mrq/dashboard/static *
54
recursive-include mrq/dashboard/templates *

Diff for: Makefile

+29-21
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,57 @@
11
docker:
2-
docker build -t mrq_local .
2+
docker build -t pricingassistant/mrq-env .
3+
docker build -t pricingassistant/mrq -f Dockerfile-with-code .
4+
5+
docker_push:
6+
docker push pricingassistant/mrq-env:latest
7+
docker push pricingassistant/mrq:latest
38

49
test: docker
5-
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app mrq_local python -m pytest tests/ -v --instafail"
10+
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python -m pytest tests/ -v --instafail"
611

712
test3: docker
8-
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app mrq_local python3 -m pytest tests/ -v --instafail"
13+
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python3 -m pytest tests/ -v --instafail"
14+
15+
testpypy: docker
16+
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env /pypy/bin/pypy -m pytest tests/ -v --instafail"
917

1018
shell:
11-
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -p 8000:8000 -v `pwd`:/app:rw -w /app mrq_local bash"
19+
sh -c "docker run --rm -i -t -p 27017:27017 -p 6379:6379 -p 5555:5555 -p 20020:20020 -p 8000:8000 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env bash"
20+
21+
reshell:
22+
# Reconnect in the current taskqueue container
23+
sh -c 'docker exec -t -i `docker ps | grep pricingassistant/mrq-env | cut -f 1 -d " "` bash'
1224

1325
shell_noport:
14-
sh -c "docker run --rm -i -t -v `pwd`:/app:rw -w /app mrq_local bash"
26+
sh -c "docker run --rm -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env bash"
1527

1628
docs_serve:
17-
sh -c "docker run --rm -i -t-p 8000:8000 -v `pwd`:/app:rw -w /app mrq_local mkdocs serve"
29+
sh -c "docker run --rm -i -t -p 8000:8000 -v `pwd`:/app:rw -w /app pricingassistant/mrq-env mkdocs serve"
1830

1931
lint: docker
20-
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local pylint --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq
32+
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env pylint -j 0 --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq
2133

2234
linterrors: docker
23-
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
35+
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env pylint -j 0 --errors-only --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq
2436

2537
linterrors3: docker
26-
docker run -i -t -v `pwd`:/app:rw -w /app mrq_local python3 -m pylint --errors-only --init-hook="import sys; sys.path.append('.')" -d E1103 --rcfile .pylintrc mrq
38+
docker run -i -t -v `pwd`:/app:rw -w /app pricingassistant/mrq-env python3 -m pylint -j 0 --errors-only --init-hook="import sys; sys.path.append('.')" --rcfile .pylintrc mrq
2739

2840
virtualenv:
29-
virtualenv venv --distribute
30-
31-
virtualenv_pypy:
32-
virtualenv -p /usr/bin/pypy pypy --distribute
41+
virtualenv venv --distribute --python=python2.7
3342

3443
deps:
3544
pip install -r requirements.txt
3645
pip install -r requirements-dev.txt
3746
pip install -r requirements-dashboard.txt
3847

39-
deps_pypy:
40-
pip install git+git://github.com/schmir/gevent@pypy-hacks
41-
pip install cffi
42-
pip install git+git://github.com/gevent-on-pypy/pypycore
43-
export GEVENT_LOOP=pypycore.loop
44-
pip install -r requirements-pypy.txt
45-
4648
clean:
4749
find . -path ./venv -prune -o -name "*.pyc" -exec rm {} \;
4850
find . -name __pycache__ | xargs rm -r
4951

52+
build_dashboard:
53+
cd mrq/dashboard/static && npm install && mkdir -p bin && npm run build
54+
5055
dashboard:
5156
python mrq/dashboard/app.py
5257

@@ -62,8 +67,11 @@ pep8:
6267
autopep8:
6368
autopep8 --max-line-length 99 -aaaaaaaa --in-place --recursive mrq
6469

65-
pypi: linterrors
70+
pypi: linterrors linterrors3
6671
python setup.py sdist upload
6772

6873
build_docs:
6974
python scripts/propagate_docs.py
75+
76+
ensureindexes:
77+
mrq-run mrq.basetasks.indexes.EnsureIndexes

Diff for: docs/command-line.md

+3-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ The following general flags can be passed as command-line arguments to either **
1616
- `--mongodb_jobs, --mongodb`: MongoDB URI for the jobs, scheduled_jobs & workers database. Defaults to **mongodb://127.0.0.1:27017/mrq**.
1717
- `--mongodb_logs` :MongoDB URI for the logs database."0" will disable remote logs, "1" will use main MongoDB. Defaults to **1**
1818
- `--mongodb_logs_size`: If provided, sets the log collection to capped to that amount of bytes.
19-
- `--no_mongodb_ensure_indexes`: If provided, skip the creation of MongoDB indexes at worker startup.
2019
- `--redis`: Redis URI. Defaults to **redis://127.0.0.1:6379**.
2120
- `--redis_prefix`: Redis key prefix. Defaults to "mrq".
2221
- `--redis_max_connections`: Redis max connection pool size. Defaults to **1000**.
@@ -46,16 +45,14 @@ You can pass additional configuration flags:
4645

4746
- `--max_jobs`: Gevent:max number of jobs to do before quitting. Use as a workaround for memory leaks in your tasks. Defaults to **0**
4847
- `--max_memory`: Max memory (in Mb) after which the process will be shut down. Use with `--processes [1-N]`
49-
to have supervisord automatically respawn the worker when this happens. Defaults to **1**
48+
to have the worker automatically respawn when this happens. Defaults to **1**
5049
- `--grenlets, --gevent, --g`: Max number of greenlets to use. Defaults to **1**.
51-
- `--processes, --p`: Number of processes to launch with supervisord. Defaults to **0** (no supervisord).
52-
- `--supervisord_template`: Path of supervisord template to use. Defaults to **supervisord_templates/default.conf**.
50+
- `--processes, --p`: Number of processes to launch . Defaults to **0**.
5351
- `--scheduler`: Run the scheduler. Defaults to **false**.
5452
- `--scheduler_interval`: Seconds between scheduler checks. Defaults to **60** seconds, only ints are acceptable.
5553
- `--report_interval`: Seconds between worker reports to MongoDB. Defaults to **10** seconds, floats are acceptable too.
5654
- `--report_file`: Filepath of a json dump of the worker status. Disabled if none.
5755
- `--subqueues_refresh_interval`: Seconds between worker refreshes of the known subqueues.
58-
- `--subqueues_delimiter`: Delimiter between main queue and subqueue names.
5956
- `--paused_queues_refresh_interval`: Seconds between worker refreshes of the paused queues list.
6057
- `--admin_port`: Start an admin server on this port, if provided. Incompatible with --processes. Defaults to **0**
6158
- `--admin_ip`: IP for the admin server to listen on. Use "0.0.0.0" to allow access from outside. Defaults to **127.0.0.1**.
@@ -71,7 +68,7 @@ The default is to run tasks one at a time. You should obviously change this beha
7168

7269
This will start 30 greenlets over 3 UNIX processes. Each of them will run 10 jobs at the same time.
7370

74-
As soon as you use the `--processes` option (even with `--processes=1`) then supervisord will be used to control the processes. It is quite useful to manage long-running instances.
71+
The worker is autonomous to handle its processes. It is quite useful to manage long-running instances.
7572

7673
### Simulating network latency
7774

Diff for: docs/configuration.md

+2-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ Remember, put mrq-config.py in your workers directory.
2020
MONGODB_JOBS = "mongodb://127.0.0.1:27017/mrq" # MongoDB URI for the jobs, scheduled_jobs & workers database.Defaults to mongodb://127.0.0.1:27017/mrq
2121
MONGODB_LOGS = 1 #MongoDB URI for the logs database."0" will disable remote logs, "1" will use main MongoDB.Defaults to 1
2222
MONGODB_LOGS_SIZE = None #If provided, sets the log collection to capped to that amount of bytes.
23-
NO_MONGODB_ENSURE_INDEXES = None #If provided, skip the creation of MongoDB indexes at worker startup.
2423

2524
#Redis settings
2625
REDIS = "redis://127.0.0.1:6379" #Redis URI.Defaults to redis://127.0.0.1:6379
@@ -57,10 +56,9 @@ USE_LARGE_JOB_IDS = False #Do not use compacted job IDs in Redis. For compatibil
5756
QUEUES = ("default",) # The queues to listen on.Defaults to default , which will listen on all queues.
5857
MAX_JOBS = 0 #Gevent:max number of jobs to do before quitting. Workaround for memory leaks in your tasks. Defaults to 0
5958
MAX_TIME = 0 # max number of seconds a worker runs before quitting
60-
MAX_MEMORY = 1 #Max memory (in Mb) after which the process will be shut down. Use with PROCESS = [1-N] to have supervisord automatically respawn the worker when this happens.Defaults to 1
59+
MAX_MEMORY = 1 #Max memory (in Mb) after which the process will be shut down. Use with PROCESS = [1-N] to have the worker automatically respawned when this happens.Defaults to 1
6160
GRENLETS = 1 #Max number of greenlets to use.Defaults to 1.
62-
PROCESSES = 0 #Number of processes to launch with supervisord.Defaults to 0.
63-
SUPERVISORD_TEMPLATE = "supervisord_templates/default.conf" #Path of supervisord template to use. Defaults to supervisord_templates/default.conf.
61+
PROCESSES = 0 #Number of processes to launch.Defaults to 0.
6462
SCHEDULER = False #Run the scheduler.Defaults to False.
6563
SCHEDULER_INTERVAL = 60 #Seconds between scheduler checks.Defaults to 60 seconds, only ints are acceptable.
6664
REPORT_INTERVAL = 10.5 #Seconds between worker reports to MongoDB.Defaults to 10 seconds, floats are acceptable too.

Diff for: docs/design.md

-8
This file was deleted.

Diff for: docs/get-started.md

-2
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,3 @@ This was a preview on the very basic features of MRQ. What makes it actually use
108108
* You can run multiple workers in parallel. Each worker can also run multiple greenlets in parallel.
109109
* Workers can dequeue from multiple queues
110110
* You can queue jobs from your Python code to avoid using `mrq-run` from the command-line.
111-
112-
These features will be demonstrated in a future example of a simple web crawler.

Diff for: docs/index.md

-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ MRQ was first developed at [Pricing Assistant](http://pricingassistant.com) and
1616
* **Great [dashboard](http://mrq.readthedocs.org/en/latest/dashboard/):** Have visibility and control on everything: queued jobs, current jobs, worker status, ...
1717
* **Per-job logs:** Get the log output of each task separately in the dashboard
1818
* **Gevent worker:** IO-bound tasks can be done in parallel in the same UNIX process for maximum throughput
19-
* **Supervisord integration:** CPU-bound tasks can be split across several UNIX processes with a single command-line flag
2019
* **Job management:** You can retry, requeue, cancel jobs from the code or the dashboard.
2120
* **Performance:** Bulk job queueing, easy job profiling
2221
* **Easy [configuration](http://mrq.readthedocs.org/en/latest/configuration):** Every aspect of MRQ is configurable through command-line flags or a configuration file

Diff for: docs/jobs-maintenance.md

+2-11
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,9 @@ SCHEDULER_TASKS = [
3030
"interval": 3600
3131
},
3232
33-
# This will requeue jobs 'lost' between redis.blpop() and mongo.update(status=started).
34-
# This can happen only when the worker is killed brutally in the middle of dequeue_jobs()
33+
# This will make sure MRQ's indexes are built
3534
{
36-
"path": "mrq.basetasks.cleaning.RequeueLostJobs",
37-
"params": {},
38-
"interval": 24 * 3600
39-
},
40-
41-
# This will clean the list of known queues in Redis. It will mostly remove empty queues
42-
# so that they are not displayed in the dashboard anymore.
43-
{
44-
"path": "mrq.basetasks.cleaning.CleanKnownQueues",
35+
"path": "mrq.basetasks.indexes.EnsureIndexes",
4536
"params": {},
4637
"interval": 24 * 3600
4738
}

Diff for: docs/jobs.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ However, to be reliable a task queue needs to prepare for everything that can go
2424
* ```retry```: The method `task.retry()` was called to interrupt the job but mark it for being retried later. This may be useful when calling unreliable 3rd-party services.
2525
* ```maxretries```: The task was retried too many times. Max retries default to 3 and can be configured globally or per task. At this point it should be up to you to cancel them or requeue them again.
2626

27-
Only jobs in statuses `success` and `cancel` will be cleaned from MongoDB after a delay of `result_ttl` seconds (see [Task configuration](configuration.md))
27+
Jobs in status `success` will be cleaned from MongoDB after a delay of `result_ttl` seconds (see [Task configuration](configuration.md))
2828

2929
## Task API
3030

Diff for: docs/performance.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Performance
1+
# Worker performance
22

33
Performance is an explicit goal of MRQ as it was first developed at [Pricing Assistant](http://www.pricingassistant.com/) for crawling billions of web pages.
44

@@ -8,6 +8,8 @@ On a regular Macbook Pro, we see 1300 jobs/second in a single worker process wit
88

99
However what we are really measuring there is MongoDB's write performance. An install of MRQ with properly scaled MongoDB and Redis instances is be capable of much more.
1010

11+
For more, see our tutorial on [Queue performance](queue-performance.md).
12+
1113
## PyPy support
1214

1315
Earlier in its development MRQ was tested successfully on PyPy but we are waiting for better PyPy+gevent support to continue working on it, as performance was worse than CPython.

0 commit comments

Comments
 (0)