From 231df6787be5aaaa24a57833a0763481e153aba1 Mon Sep 17 00:00:00 2001
From: Duncan Dewhurst
Date: Wed, 20 Apr 2022 09:33:51 +1200
Subject: [PATCH] check_for_errors: check scrapy log file

---
 check_for_errors.ipynb | 42 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/check_for_errors.ipynb b/check_for_errors.ipynb
index 5a799ca..81a00af 100644
--- a/check_for_errors.ipynb
+++ b/check_for_errors.ipynb
@@ -4,8 +4,7 @@
   "metadata": {
     "colab": {
       "name": "data_collection_and_processing_errors",
-      "provenance": [],
-      "authorship_tag": "ABX9TyPffcgelqfpE7r9y+mDKzan"
+      "provenance": []
     },
     "kernelspec": {
       "name": "python3",
@@ -22,6 +21,43 @@
         "## Check for data collection and processing errors"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Kingfisher Collect Log"
+      ],
+      "metadata": {
+        "id": "DWcRuKnZt--_"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Print the crawler statistics from the log file specified in the setup section. If `downloader/response_status_count/{code}` is non-zero and `{code}` is an HTTP error code (400-599), then the collection may be incomplete. Where possible, you should check the total number of releases and/or contracting processes against the front-end of the data source."
+      ],
+      "metadata": {
+        "id": "YoxNFk17uFZe"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "if log_url != '':\n",
+        "\n",
+        "    response = requests.get(log_url, auth=('scrape', scrapy_password))\n",
+        "\n",
+        "    with open('log_file', 'wb') as f:\n",
+        "        f.write(response.content)\n",
+        "    \n",
+        "    log = ScrapyLogFile('log_file').logparser\n",
+        "    pprint(dict(log['crawler_stats']))"
+      ],
+      "metadata": {
+        "id": "kfzwh_ExuEVX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -409,4 +445,4 @@
       "outputs": []
     }
   ]
-}
+}
\ No newline at end of file
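
For reference, below is a minimal standalone sketch of what the added cell does. It assumes, as the markdown cell says, that log_url and scrapy_password are defined in the notebook's setup section; the ScrapyLogFile import location, the placeholder values, the raise_for_status() check, and the final error-code loop are assumptions for illustration and are not part of the commit. Only the 'scrape' basic-auth username, the log download, and the crawler_stats lookup come from the patch itself.

# Standalone sketch of the cell added in this patch. Assumptions are
# marked in comments; only the requests call with the 'scrape' username
# and the ScrapyLogFile('log_file').logparser['crawler_stats'] lookup
# are taken directly from the committed cell.
from pprint import pprint

import requests
from scrapyloganalyzer import ScrapyLogFile  # assumed import location

log_url = 'https://example.com/crawl.log'  # hypothetical placeholder
scrapy_password = 'secret'                 # hypothetical placeholder

if log_url != '':
    # The log server uses HTTP basic auth with the 'scrape' username.
    response = requests.get(log_url, auth=('scrape', scrapy_password))
    response.raise_for_status()  # added safety check, not in the original cell

    with open('log_file', 'wb') as f:
        f.write(response.content)

    # .logparser exposes the parsed log; 'crawler_stats' holds Scrapy's
    # final stats, including the downloader/response_status_count/{code}
    # counters that the markdown cell says to inspect.
    stats = dict(ScrapyLogFile('log_file').logparser['crawler_stats'])
    pprint(stats)

    # Flag HTTP error responses (400-599), which the markdown cell warns
    # may indicate an incomplete collection.
    for key, count in stats.items():
        if key.startswith('downloader/response_status_count/'):
            code = int(key.rsplit('/', 1)[1])
            if 400 <= code <= 599 and count:
                print(f'Possible incomplete collection: {count} responses with HTTP {code}')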