Skip to content

Commit 339a904

Browse files
authored
Merge pull request #63 from ClericPy/dev
1.7.2
2 parents f909140 + 9d9c4e9 commit 339a904

12 files changed

+521
-435
lines changed

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ uniparser>=1.4.5
22
fastapi
33
uvicorn
44
databases
5-
torequests>=5.0.3
5+
torequests>=5.0.4
66
fire
77
jinja2
88
aiofiles

watchdogs/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
from .config import Config
44
from .main import init_app
55

6-
__version__ = '1.7.1'
6+
__version__ = '1.7.2'
77
__all__ = ['Config', 'init_app']
88
logging.getLogger('watchdogs').addHandler(logging.NullHandler())

watchdogs/app.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from . import __version__
2121
from .config import md5_checker
22-
from .crawler import crawl_once
22+
from .crawler import crawl_once, find_next_check_time
2323
from .models import Task, query_tasks, tasks
2424
from .settings import (Config, get_host_freq_list, refresh_token, release_app,
2525
set_host_freq, setup_app)
@@ -119,6 +119,7 @@ async def index(request: Request, tag: str = ''):
119119
'custom_links': Config.custom_links,
120120
'callback_workers': Config.callback_handler.workers,
121121
'custom_tabs': Config.custom_tabs,
122+
'work_hours_doc': find_next_check_time.__doc__,
122123
})
123124
init_vars_b64 = b64encode(init_vars_json.encode('u8')).decode('u8')
124125
kwargs['init_vars'] = init_vars_b64
@@ -204,7 +205,7 @@ async def force_crawl(task_name: str):
204205
async def load_tasks(
205206
task_name: Optional[str] = None,
206207
page: int = 1,
207-
page_size: int = 30,
208+
page_size: int = Config.default_page_size,
208209
order_by: str = 'last_change_time',
209210
sort: str = 'desc',
210211
tag: str = '',
@@ -469,8 +470,12 @@ async def post_lite(request: Request,
469470

470471

471472
@app.get("/lite")
472-
async def lite(request: Request, tag: str = '', sign: str = '', page: int = 1):
473-
tasks, has_more = await query_tasks(tag=tag, page=page)
473+
async def lite(request: Request,
474+
tag: str = '',
475+
sign: str = '',
476+
page: int = 1,
477+
page_size: int = Config.default_page_size):
478+
tasks, has_more = await query_tasks(tag=tag, page=page, page_size=page_size)
474479
now = datetime.now()
475480
for task in tasks:
476481
result = loads(task['latest_result'] or '{}')

watchdogs/background.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
async def crawl_chunks(crawl_once):
77
loop_num = 0
88
while not Config.is_shutdown:
9-
loop_num += 1
109
has_more = await crawl_once()
1110
if isinstance(has_more, Exception):
1211
Config.logger.error(f'crawl_once error, {has_more!r}')
@@ -15,6 +14,7 @@ async def crawl_chunks(crawl_once):
1514
f'crawl_once finished, has_more: {has_more}, loop: {loop_num}')
1615
if not has_more:
1716
break
17+
loop_num += 1
1818

1919

2020
async def background_loop(coro_funcs: list = None):

watchdogs/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ class Config:
181181
custom_tabs: List[Dict] = []
182182
COLLATION: str = None
183183
cookie_max_age = 86400 * 7
184+
default_page_size = 15
184185

185186
@classmethod
186187
def add_custom_tabs(cls, label, url, name=None, desc=None):

watchdogs/crawler.py

+50-26
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
13
from asyncio import ensure_future, wait
24
from datetime import datetime, timedelta
35
from json import JSONDecodeError, dumps, loads
46
from typing import Optional, Tuple
57

6-
from torequests.utils import ttime
8+
from torequests.utils import timeago, ttime
79
from uniparser import Crawler, RuleNotFoundError
810

911
from .config import Config
@@ -38,8 +40,7 @@ def kwargs(self):
3840

3941

4042
def find_next_check_time(
41-
work_hours: str,
42-
interval: int,
43+
task: Task,
4344
now: Optional[datetime] = None,
4445
) -> Tuple[bool, datetime]:
4546
'''
@@ -63,30 +64,50 @@ def find_next_check_time(
6364
%w==5|20, 24 means every Friday or everyday 20:00 ~ 23:59
6465
%w==5|%w==2 means every Friday or Tuesday
6566
%w!=6&%w!=0 means everyday except Saturday & Sunday.
67+
5. Set an ensured change interval
68+
> If the work_hours string ends with `#` followed by `x` seconds, the next_change_time will be checked first.
69+
> In other words, I am very sure that the interval between two changes is more than `x` seconds
70+
> So the crawler of this task will not run until the time is `last_change_time + change_interval`
71+
%w==5#86400 means every Friday if it didn't change within 1 day
72+
0, 24#3600 means each hour if it didn't change within this hour. The task will only be crawled once if it has changed.
6673
'''
6774
# find the latest hour fit work_hours, if not exist, return next day 00:00
6875
now = now or datetime.now()
76+
work_hours = task.work_hours or '0, 24'
77+
if '#' in work_hours:
78+
# check if changed
79+
last_change_time = task.last_change_time or datetime.fromtimestamp(0)
80+
# split work_hours and change_interval
81+
work_hours, change_interval_str = work_hours.split('#')
82+
change_interval = int(change_interval_str)
83+
# not fit change interval, will wait for left seconds.
84+
next_change_time = last_change_time + timedelta(seconds=change_interval)
85+
if now < next_change_time:
86+
Config.logger.info(
87+
f'Task [{task.name}] has changed in {timeago(change_interval, accuracy=1, format=1, short_name=1)} ago.'
88+
)
89+
return False, next_change_time
6990

70-
ok = check_work_time(work_hours, now)
71-
if ok:
72-
# current time is ok, next_check_time is now+interval
73-
next_check_time = now + timedelta(seconds=interval)
74-
return ok, next_check_time
91+
need_crawl = check_work_time(work_hours, now)
92+
if need_crawl:
93+
# current time is within work hours, so next_check_time is now + interval
94+
next_check_time = now + timedelta(seconds=task.interval)
95+
return need_crawl, next_check_time
7596
else:
76-
# current time is not ok
97+
# current time is not within work hours
7798
next_check_time = now
78-
# time machine to check time fast
99+
# time machine to update next_check_time fast
79100
for _ in range(60):
80-
# check next interval
81-
next_check_time = next_check_time + timedelta(seconds=interval)
82-
_ok = check_work_time(work_hours, next_check_time)
83-
if _ok:
84-
# current is still False, but next_check_time is True
101+
# next interval
102+
next_check_time = next_check_time + timedelta(seconds=task.interval)
103+
_need_crawl = check_work_time(work_hours, next_check_time)
104+
if _need_crawl:
105+
# current time is still False, but next_check_time is True
85106
break
86-
return ok, next_check_time
107+
return need_crawl, next_check_time
87108

88109

89-
async def crawl(task):
110+
async def crawl(task: Task):
90111
crawler: Crawler = Config.crawler
91112
logger = Config.logger
92113
logger.info(f'Start crawling: {task.name}')
@@ -103,13 +124,17 @@ async def crawl(task):
103124
else:
104125
if len(crawl_result) == 1:
105126
# chain result for __request__ which fetch a new request
106-
result_list = get_watchdog_result(item=crawl_result.popitem()[1])
107-
if result_list == {'text': 'text not found'}:
127+
formated_result = get_watchdog_result(
128+
item=crawl_result.popitem()[1])
129+
if formated_result == {'text': 'text not found'}:
108130
error = f'{task.name} text not found, crawl result given: {crawl_result}'
109131
logger.error(error)
132+
result_list = None
110133
else:
111-
if not isinstance(result_list, list):
112-
result_list = [result_list]
134+
if isinstance(formated_result, list):
135+
result_list = formated_result
136+
else:
137+
result_list = [formated_result]
113138
# use force crawl one web UI for more log
114139
logger.info(f'{task.name} Crawl success: {result_list}'[:150])
115140
else:
@@ -141,12 +166,11 @@ async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
141166
for _task in fetched_tasks:
142167
task = Task(**dict(_task))
143168
# check work hours
144-
ok, next_check_time = find_next_check_time(task.work_hours or '0, 24',
145-
task.interval, now)
169+
need_crawl, next_check_time = find_next_check_time(task, now)
146170
if task_name:
147171
# always crawl for given task_name
148-
ok = True
149-
if ok:
172+
need_crawl = True
173+
if need_crawl:
150174
t = ensure_future(crawl(task))
151175
# add task_name for logger
152176
setattr(t, 'task_name', task.name)
@@ -160,7 +184,7 @@ async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
160184
# update task variable for callback
161185
task.__dict__.update(values)
162186
update_values.append(values)
163-
if not ok:
187+
if not need_crawl:
164188
logger.info(
165189
f'Task [{task.name}] is not on work time, next_check_time reset to {next_check_time}'
166190
)

watchdogs/models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ async def query_tasks(
274274
task_name: Optional[str] = None,
275275
task_id: Optional[int] = None,
276276
page: int = 1,
277-
page_size: int = 30,
277+
page_size: int = Config.default_page_size,
278278
order_by: str = 'last_change_time',
279279
sort: str = 'desc',
280280
tag: str = '',

watchdogs/static/css/watchdogs.css

+3
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ p.custom_links {
7979
.el-popover {
8080
max-width: 50%;
8181
}
82+
.el-message-box.work_hours_doc{
83+
width: 40%;
84+
}
8285
pre {
8386
word-wrap: break-word;
8487
white-space: pre-wrap;

watchdogs/static/css/watchdogs.min.css

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)