From 231df6787be5aaaa24a57833a0763481e153aba1 Mon Sep 17 00:00:00 2001
From: Duncan Dewhurst
Date: Wed, 20 Apr 2022 09:33:51 +1200
Subject: [PATCH] check_for_errors: check scrapy log file

---
 check_for_errors.ipynb | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/check_for_errors.ipynb b/check_for_errors.ipynb
index 5a799ca..81a00af 100644
--- a/check_for_errors.ipynb
+++ b/check_for_errors.ipynb
@@ -4,8 +4,7 @@
   "metadata": {
     "colab": {
       "name": "data_collection_and_processing_errors",
-      "provenance": [],
-      "authorship_tag": "ABX9TyPffcgelqfpE7r9y+mDKzan"
+      "provenance": []
     },
     "kernelspec": {
       "name": "python3",
@@ -22,6 +21,43 @@
         "## Check for data collection and processing errors"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Kingfisher Collect Log"
+      ],
+      "metadata": {
+        "id": "DWcRuKnZt--_"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Print the crawler statistics from the log file specified in the setup section. If `downloader/response_status_count/{code}` is non-zero and `{code}` is an HTTP error code (400-599), then the collection may be incomplete. Where possible, you should check the total number of releases and/or contracting processes against the front-end of the data source."
+      ],
+      "metadata": {
+        "id": "YoxNFk17uFZe"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "if log_url != '':\n",
+        "\n",
+        "    response = requests.get(log_url, auth=('scrape', scrapy_password))\n",
+        "\n",
+        "    with open('log_file', 'wb') as f:\n",
+        "        f.write(response.content)\n",
+        "    \n",
+        "    log = ScrapyLogFile('log_file').logparser\n",
+        "    pprint(dict(log['crawler_stats']))"
+      ],
+      "metadata": {
+        "id": "kfzwh_ExuEVX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -409,4 +445,4 @@
       "outputs": []
     }
   ]
-}
+}
\ No newline at end of file
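
For reference, below is a minimal standalone sketch of what the added cell does. It assumes, as the markdown cell says, that log_url and scrapy_password are defined in the notebook's setup section; the ScrapyLogFile import location, the placeholder values, the raise_for_status() check, and the final error-code loop are assumptions for illustration and are not part of the commit. Only the 'scrape' basic-auth username, the log download, and the crawler_stats lookup come from the patch itself.

# Standalone sketch of the cell added in this patch. Assumptions are
# marked in comments; only the requests call with the 'scrape' username
# and the ScrapyLogFile('log_file').logparser['crawler_stats'] lookup
# are taken directly from the committed cell.
from pprint import pprint

import requests
from scrapyloganalyzer import ScrapyLogFile  # assumed import location

log_url = 'https://example.com/crawl.log'  # hypothetical placeholder
scrapy_password = 'secret'                 # hypothetical placeholder

if log_url != '':
    # The log server uses HTTP basic auth with the 'scrape' username.
    response = requests.get(log_url, auth=('scrape', scrapy_password))
    response.raise_for_status()  # added safety check, not in the original cell

    with open('log_file', 'wb') as f:
        f.write(response.content)

    # .logparser exposes the parsed log; 'crawler_stats' holds Scrapy's
    # final stats, including the downloader/response_status_count/{code}
    # counters that the markdown cell says to inspect.
    stats = dict(ScrapyLogFile('log_file').logparser['crawler_stats'])
    pprint(stats)

    # Flag HTTP error responses (400-599), which the markdown cell warns
    # may indicate an incomplete collection.
    for key, count in stats.items():
        if key.startswith('downloader/response_status_count/'):
            code = int(key.rsplit('/', 1)[1])
            if 400 <= code <= 599 and count:
                print(f'Possible incomplete collection: {count} responses with HTTP {code}')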