diff --git a/ir-measures/Dockerfile b/ir-measures/Dockerfile new file mode 100644 index 0000000..cf2899a --- /dev/null +++ b/ir-measures/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.10 + +RUN pip3 install ir-measures approvaltests pytest + +COPY ir_measures_evaluator.py / + +COPY tests /tmp/tests/ + +RUN cd /tmp \ + && find -iname __pycache__ -exec rm -Rf {} \; || echo "" \ + && PYTHONPATH='../:.' pytest \ + && cd / \ + && rm -Rf /tmp/tests + +ENTRYPOINT [ "/ir_measures_evaluator.py" ] diff --git a/ir-measures/Makefile b/ir-measures/Makefile new file mode 100644 index 0000000..17003e4 --- /dev/null +++ b/ir-measures/Makefile @@ -0,0 +1,15 @@ +IMAGE_VERSION=1.0.5 + +build-docker-image: + docker build -t webis/ir_measures_evaluator:${IMAGE_VERSION} . + +.PHONY: tests +tests: + pytest + +example-execution: + rm -Rf output + docker run --rm -it -v ${PWD}/input:/input -v ${PWD}/output:/output webis/ir_measures_evaluator:${IMAGE_VERSION} --run /input/run.txt --qrels /input/qrels.txt --measures "AP(rel=2)" "P(rel=2)@10" --output_path /output/eval.prototext + +publish-docker-image: + docker push webis/ir_measures_evaluator:${IMAGE_VERSION} diff --git a/ir-measures/Pipfile b/ir-measures/Pipfile new file mode 100644 index 0000000..89c2636 --- /dev/null +++ b/ir-measures/Pipfile @@ -0,0 +1,14 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +ir-measures = "*" +pytest = "*" +approvaltests = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/ir-measures/Pipfile.lock b/ir-measures/Pipfile.lock new file mode 100644 index 0000000..6dfee08 --- /dev/null +++ b/ir-measures/Pipfile.lock @@ -0,0 +1,332 @@ +{ + "_meta": { + "hash": { + "sha256": "80d55cd819e6b1d0d55c59515e27befe560fb92c22513373ae54811e2bc012ae" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "allpairspy": { + "hashes": [ + "sha256:66dbcb30c22a2c73e1cccae9ae7093f8e18bdf542e4ba2864ce822463735b5b4", + "sha256:9358484c91abe74ba18daf9d6d6904c5be7cc8818397d05248c9d336023c28b1" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==2.5.0" + }, + "approval-utilities": { + "hashes": [ + "sha256:8e0b146716db2f7d95d01726947438904760a6cb005aa391cee6f7a623d627ee", + "sha256:d11f545ad318f3520e997657aaeb6511521a4d0b647a0868037ca817af7c73b3" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==7.4.0" + }, + "approvaltests": { + "hashes": [ + "sha256:64e8798f843f44ffd047935f18e8ec850caf9d4f924b9b24456008102cbb6aca", + "sha256:e09b2f2af6cd049d71a3dcdc7b3bb4c2cd887cc5eb4b1294b557675360c5f90d" + ], + "index": "pypi", + "version": "==7.4.0" + }, + "attrs": { + "hashes": [ + "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", + "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" + ], + "markers": "python_version >= '3.5'", + "version": "==22.1.0" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", + "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.11.1" + }, + "certifi": { + "hashes": [ + "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3", + "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18" + ], + "markers": "python_version >= '3.6'", 
+ "version": "==2022.12.7" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", + "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.1" + }, + "cwl-eval": { + "hashes": [ + "sha256:ff9e4a1241eed82067ebe7605e7cf44d923d7d9d764f222034fc8216d0ef327d" + ], + "markers": "python_version >= '3'", + "version": "==1.0.12" + }, + "empty-files": { + "hashes": [ + "sha256:87277db100a3bfdafc2ba18f6094cd37090e257058fb1c0b15873a89e1003149", + "sha256:ec464f7f88a028d4567b380d57983fc4ffb79147538626690cd94c33090cd216" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==0.0.3" + }, + "exceptiongroup": { + "hashes": [ + "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828", + "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec" + ], + "markers": "python_version < '3.11'", + "version": "==1.0.4" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "version": "==1.1.1" + }, + "ir-measures": { + "hashes": [ + "sha256:4e6ed5495e3655c8d44e492bb2890fe6a06604b8ab1c2cd3c8996c499b7b1cef" + ], + "index": "pypi", + "version": "==0.3.1" + }, + "mrjob": { + "hashes": [ + "sha256:2c8d8dc8aa4b354a97de18d0260f551f018693af74af104b3d41daf165eebdd4", + "sha256:d8fa1bafcada0ffe3e7166896a27e996815e2cb835088aec025e3dd12c7146ce" + ], + "version": "==0.7.4" + }, + "numpy": { + "hashes": [ + "sha256:01dd17cbb340bf0fc23981e52e1d18a9d4050792e8fb8363cecbf066a84b827d", + "sha256:06005a2ef6014e9956c09ba07654f9837d9e26696a0470e42beedadb78c11b07", + "sha256:09b7847f7e83ca37c6e627682f145856de331049013853f344f37b0c9690e3df", + "sha256:0aaee12d8883552fadfc41e96b4c82ee7d794949e2a7c3b3a7201e968c7ecab9", + "sha256:0cbe9848fad08baf71de1a39e12d1b6310f1d5b2d0ea4de051058e6e1076852d", + "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a", + "sha256:33161613d2269025873025b33e879825ec7b1d831317e68f4f2f0f84ed14c719", + "sha256:5039f55555e1eab31124a5768898c9e22c25a65c1e0037f4d7c495a45778c9f2", + "sha256:522e26bbf6377e4d76403826ed689c295b0b238f46c28a7251ab94716da0b280", + "sha256:56e454c7833e94ec9769fa0f86e6ff8e42ee38ce0ce1fa4cbb747ea7e06d56aa", + "sha256:58f545efd1108e647604a1b5aa809591ccd2540f468a880bedb97247e72db387", + "sha256:5e05b1c973a9f858c74367553e236f287e749465f773328c8ef31abe18f691e1", + "sha256:7903ba8ab592b82014713c491f6c5d3a1cde5b4a3bf116404e08f5b52f6daf43", + "sha256:8969bfd28e85c81f3f94eb4a66bc2cf1dbdc5c18efc320af34bffc54d6b1e38f", + "sha256:92c8c1e89a1f5028a4c6d9e3ccbe311b6ba53694811269b992c0b224269e2398", + "sha256:9c88793f78fca17da0145455f0d7826bcb9f37da4764af27ac945488116efe63", + "sha256:a7ac231a08bb37f852849bbb387a20a57574a97cfc7b6cabb488a4fc8be176de", + "sha256:abdde9f795cf292fb9651ed48185503a2ff29be87770c3b8e2a14b0cd7aa16f8", + "sha256:af1da88f6bc3d2338ebbf0e22fe487821ea4d8e89053e25fa59d1d79786e7481", + "sha256:b2a9ab7c279c91974f756c84c365a669a887efa287365a8e2c418f8b3ba73fb0", + "sha256:bf837dc63ba5c06dc8797c398db1e223a466c7ece27a1f7b5232ba3466aafe3d", + 
"sha256:ca51fcfcc5f9354c45f400059e88bc09215fb71a48d3768fb80e357f3b457e1e", + "sha256:ce571367b6dfe60af04e04a1834ca2dc5f46004ac1cc756fb95319f64c095a96", + "sha256:d208a0f8729f3fb790ed18a003f3a57895b989b40ea4dce4717e9cf4af62c6bb", + "sha256:dbee87b469018961d1ad79b1a5d50c0ae850000b639bcb1b694e9981083243b6", + "sha256:e9f4c4e51567b616be64e05d517c79a8a22f3606499941d97bb76f2ca59f982d", + "sha256:f063b69b090c9d918f9df0a12116029e274daf0181df392839661c4c7ec9018a", + "sha256:f9a909a8bae284d46bbfdefbdd4a262ba19d3bc9921b1e76126b1d21c3c34135" + ], + "markers": "python_version >= '3.8'", + "version": "==1.23.5" + }, + "packaging": { + "hashes": [ + "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3", + "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3" + ], + "markers": "python_version >= '3.7'", + "version": "==22.0" + }, + "pluggy": { + "hashes": [ + "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", + "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, + "pyperclip": { + "hashes": [ + "sha256:105254a8b04934f0bc84e9c24eb360a591aaf6535c9def5f29d92af107a9bf57" + ], + "version": "==1.8.2" + }, + "pytest": { + "hashes": [ + "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71", + "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59" + ], + "index": "pypi", + "version": "==7.2.0" + }, + "pytrec-eval-terrier": { + "hashes": [ + "sha256:0de661e617fb8b31525ed73c0589b67d67f2b2d4d264a227dd1346b3ddb233ad", + "sha256:1558cad61264e25d54a8c76e1fc2f1939bc4a68e72beec6a9b2d752da06d8c7f", + "sha256:159611ab0de9614e5ce98c5f834abad2672f6037c343a48339c0119ed61ae6d5", + "sha256:1713e0584d169b7506017b6c8963ff019637474cda8a7968a457a2e70a140782", + "sha256:24c2b7ccfbbcb6c2b4f3af8ac56bd08ba26c25e212a6e7b0d5f7e3ae384ed046", + "sha256:27ff02633f5ed94f7795cf7bd02643076d4b6bb2d4f363a2926901ab2c6320d9", + "sha256:2fa570fea2b102bcc3f1210d175bc04636330992abd4131bd3ad7b7e4f5f23e5", + "sha256:2fd2e77f5d31b0bc3e5bdf2a086c68fad7938b9ee48ec910786e4cc81563a5d0", + "sha256:43f3796a126e6fb50558f9a0894593a35d0a2f29972ac57506af7a63ddaf94d5", + "sha256:4e0a77f8f4d8a7e20384ee0902184dd538487ac920987edc3122fb9adad423e7", + "sha256:4fc2b87a4c5d78a9e83b9891e9716aa2855f89cf91d7a22f59c5642665e21a54", + "sha256:505837e21642822bb8dc8fee0c122d5b7dcc3943a481355f29c8cb029618c489", + "sha256:5575efa84276aed810e4f938d058d0366b51f2e1f039877e5145c9b8f4abd568", + "sha256:5a459527c74fa6cd06886169130bc7f20422ade4b4f1fc14c9aab299eeeaf4f0", + "sha256:5e8140f0932c2ef498d215cb80106472259b42048bf16b2cbd5ab9ffd9e539a8", + "sha256:5ee5b665132151c07ae89f0da75a4e81bf65ade2fc46a4aaecc6957ea70690b0", + "sha256:60877dede6a431ec549827c1c106f5c00b556ed31e5f8384f9ac9cc64b622a5d", + "sha256:6690beec72108a8d79192b31ee2231bd28a643b31330699d22516db7d6112743", + "sha256:68c0f6f40809e3338525e960f8256ef0b926495c8f83c6a1d9cd5edf4e2c8487", + "sha256:6b06b7e5c68ba98ba420c584d0fafdd3e2581d066345cd1d0837cfc619db19c8", + "sha256:6c15382020a78807a5601398ec2a2dc1491af495980338313ed837d46809c94c", + "sha256:8afa03c774691cb8f679e21a0414f3f656ae2fa9edab135f0fd2edd349314a64", + "sha256:9c62442588b0e4d5e40d74da3ed9c1108b4a1a027ac442f1ac71e279fe7c5cc5", + "sha256:9ec39dd579f2893016aa5c4cd66a31c573b4057277979b4a650307e3e9115e19", + "sha256:9f27c8afd6ae40db7f7bc7290f8952222e4e6f2c52796fe49f92909b6611cae1", + "sha256:a988f7185e9b99f15e32ccab4f74265549fe12df21a949165dd41c76a38470b9", + 
"sha256:ae80c5002cd798d055e9a4600abb1a4a00ae1ddd818d02c174749f8240721535", + "sha256:b0744f7ea5e4e60c33ec449c936077c57a5d3d2363ddf37306f55c61ad1619fb", + "sha256:b9284b3a2b7467095384927a3a01bb8bde6c06ee07280194310baba9d088ad89", + "sha256:c693d17d7fc60246a6ea5548f58d81a1b5960b12fc691001649f596116ebad7e", + "sha256:c8dcd150fb8b6324f23425d9881187b03a725772c20618b2fbaec53380cbb211", + "sha256:d11a70a91907daab061d053df42bdd7d5219e2d641fadb3bc207cc88615ffec0", + "sha256:d132bf0b1b64f93ba32fcf57a62dce9f66276fa5d6975ac81e92a2fc1e113f8c", + "sha256:d33f69048de6d575282ce25691d1ac6561b5ca3ab7ce2e5e228c06d96144c9c4", + "sha256:d6b0cd049e95efd1bde42e1befcd843193bd5037c1e33a8155cf78b05ed86ab7", + "sha256:d892525c27bdfeafe7f3ba49bc3dd7a1ace689cea380a98e5ee995027552b67c", + "sha256:e5c7a11e98c6381afcac061ddc20a586b518f6481aeabe58b569143918419eea", + "sha256:e84efeeb7a82e9b2c4a09e0cdea80f69fca37d07c0a1b484d513d469a0f86643", + "sha256:fb7f2d36f68c441f66984221500b5c9a3e9c21c27c9789ccdab492305f4d7674" + ], + "markers": "python_version >= '3'", + "version": "==0.5.5" + }, + "pyyaml": { + "hashes": [ + "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf", + "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", + "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", + "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", + "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", + "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", + "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", + "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", + "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", + "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", + "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", + "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", + "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782", + "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", + "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", + "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", + "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", + "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", + "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1", + "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", + "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", + "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", + "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", + "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", + "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", + "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d", + "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", + "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", + "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7", + "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", + "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", + 
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", + "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358", + "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", + "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", + "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", + "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", + "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f", + "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", + "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" + ], + "markers": "python_version >= '3.6'", + "version": "==6.0" + }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "markers": "python_version >= '3.7' and python_version < '4'", + "version": "==2.28.1" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "soupsieve": { + "hashes": [ + "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", + "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" + ], + "markers": "python_version >= '3.6'", + "version": "==2.3.2.post1" + }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version < '3.11'", + "version": "==2.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", + "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" + ], + "markers": "python_version >= '3.7'", + "version": "==4.4.0" + }, + "urllib3": { + "hashes": [ + "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc", + "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==1.26.13" + } + }, + "develop": {} +} diff --git a/ir-measures/README.md b/ir-measures/README.md new file mode 100644 index 0000000..2543b0d --- /dev/null +++ b/ir-measures/README.md @@ -0,0 +1,21 @@ + +Add the evaluator to tira with: + +Image: +``` +webis/ir_measures_evaluator:1.0 +``` + +Command (if no qrels are available): + +``` +/ir_measures_evaluator.py --run ${inputRun}/run.txt --output_path ${outputDir}/evaluation.prototext +``` + + +Command (if qrels are available): + +``` +/ir_measures_evaluator.py --run ${inputRun}/run.txt --topics ${inputDataset}/queries.jsonl --qrels ${inputDataset}/qrels.txt --output_path ${outputDir} --measures "P@10" "nDCG@10" "MRR" +``` + diff --git a/ir-measures/__init__.py b/ir-measures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc b/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc new file mode 100644 index 0000000..3ddf696 Binary files /dev/null and b/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc differ diff --git a/ir-measures/ir-measures.iml 
b/ir-measures/ir-measures.iml new file mode 100644 index 0000000..37dbae8 --- /dev/null +++ b/ir-measures/ir-measures.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/ir-measures/ir_measures_evaluator.py b/ir-measures/ir_measures_evaluator.py new file mode 100755 index 0000000..6861996 --- /dev/null +++ b/ir-measures/ir_measures_evaluator.py @@ -0,0 +1,808 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import re +from pathlib import Path +from typing import Tuple, Dict, List, Optional + +import ir_measures +from ir_measures import Qrel, ScoredDoc, Measure, Metric + + +def add_error( + error_log: Dict[str, Dict[str, List[str]]], + key: str, + line: int +) -> None: + if key not in error_log['errors']: + error_log['errors'][key] = [] + error_log['errors'][key].append(str(line)) + + +def add_warning( + error_log: Dict[str, Dict[str, List[str]]], + key: str, + line: int +) -> None: + if key not in error_log['warnings']: + error_log['warnings'][key] = [] + error_log['warnings'][key].append(str(line)) + + +def print_error(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Ballot X} {message}') + + +def print_warning(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Warning Sign} {message}') + + +def print_success(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Check Mark} {message}') + + +def print_info(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Information Source} {message}') + + +def check_file_path(path: Path, target: str) -> None: + print_info(f'Check {target} path: {path}', indent=False) + target = target.capitalize() + if not path.exists(): + print_error(f'{target} path does not exist.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not path.is_file(): + print_error(f'{target} path is not a file.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not os.access(path, os.R_OK): + print_error(f'{target} file is not readable.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if path.stat().st_size <= 0: + print_error(f'{target} file is empty.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + print_success(f'{target} path is valid.', indent=False) + + +def check_output_path(path: Path, target: str) -> None: + print_info(f'Check {target} path: {path}', indent=False) + target = target.capitalize() + if not path.exists(): + print_error(f'{target} path does not exist.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not path.is_dir(): + print_error(f'{target} path is not a directory.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not os.access(path, os.W_OK): + print_error(f'{target} directory is not writable.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if next(path.iterdir(), None) is not None: + print_warning(f'{target} directory is not empty.') + print_success(f'{target} path is valid: 1 warning', indent=False) + else: + print_success(f'{target} path is valid.', indent=False) + + +def _is_number(string: str) -> bool: + try: + float(string) + return True + except ValueError: + return False + + +def _is_integer(string: str) -> bool: + try: + int(string) + return True + except ValueError: + return False + + +regexp_special_chars = 
re.compile(r'[^a-zA-Z0-9-_]+') + + +def check_run_file_content(path: Path) -> Dict[str, Dict[str, List[str]]]: + error_log: Dict[str, Dict[str, List[str]]] = { + 'errors': {}, + 'warnings': {}, + } + + with path.open('rt') as file: + line_count = 1 + list_of_cols = [] + ignore_previous_cols = True + + lines = file.readlines() + + for line in lines: + cols = line.rstrip().split() + previous_cols = list_of_cols[-1] if list_of_cols else cols + + # check columns + if len(cols) > 6: + add_error(error_log, 'cols_more', line_count) + ignore_previous_cols = True + elif len(cols) < 6: + add_error(error_log, 'cols_less', line_count) + ignore_previous_cols = True + else: + # error_log everything column specific + # only when the column count is correct + # because otherwise the position of the data + # is potentially corrupted + + # check tags + if not cols[5] == previous_cols[5]: + add_error(error_log, 'tag_multi', line_count) + if re.search(regexp_special_chars, cols[5]): + add_warning(error_log, 'tag_chars', line_count) + + # check query ids + if re.search(regexp_special_chars, cols[0]): + add_warning(error_log, 'qid_chars', line_count) + if cols[0] < previous_cols[0]: + add_warning(error_log, 'qid_asc', line_count) + + # check doc ids + if re.search(regexp_special_chars, cols[2]): + add_warning(error_log, 'docid_chars', line_count) + + # check ignored column + if not cols[1] == 'Q0': + add_warning(error_log, 'ignored_col', line_count) + + # check scores + try: + float(cols[4]) + float(previous_cols[4]) + + if 'e' in cols[4].lower(): + add_warning(error_log, 'score_science', line_count) + if ( + float(cols[4]) == float(previous_cols[4]) + and not ignore_previous_cols + ): + add_warning(error_log, 'score_tied', line_count) + if float(cols[4]) > float(previous_cols[4]): + add_warning(error_log, 'score_desc', line_count) + except: + if not _is_number(cols[4]): + add_error(error_log, 'score_num', line_count) + if ( + cols[4] == previous_cols[4] + and not ignore_previous_cols + ): + add_warning(error_log, 'score_tied', line_count) + if cols[4] > previous_cols[4]: + add_warning(error_log, 'scor_desc', line_count) + + # check ranks + if not _is_number(cols[3]): + add_error(error_log, 'rank_num', line_count) + if not _is_integer(cols[3]): + add_warning(error_log, 'rank_int', line_count) + else: + if line_count == 1 and int(cols[3]) != 0: + add_warning(error_log, 'rank_start', line_count) + if cols[3] == previous_cols[3] and not ignore_previous_cols: + add_warning(error_log, 'rank_tied', line_count) + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if int(cols[3]) < int(previous_cols[3]): + add_warning(error_log, 'rank_asc', line_count) + if ( + int(cols[3]) != int(previous_cols[3]) + 1 + and line_count > 1 and not ignore_previous_cols + ): + add_warning(error_log, 'rank_consecutive', line_count) + else: + if cols[3] < previous_cols[3]: + add_warning(error_log, 'rank_asc', line_count) + + # check consistency + if _is_number(cols[4]) and _is_number(previous_cols[4]): + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if ( + ( + int(cols[3]) < int(previous_cols[3]) + and not float(cols[4]) > float(previous_cols[4])) + or + ( + float(cols[4]) > float(previous_cols[4]) + and not int(cols[3]) < int(previous_cols[3]) + ) + ): + add_warning(error_log, 'consistency', line_count) + else: + if ( + ( + cols[3] < previous_cols[3] + and not float(cols[4]) > float(previous_cols[4]) + ) + or + ( + float(cols[4]) > float(previous_cols[4]) + and not cols[3] < previous_cols[3] + ) + ): + 
add_warning(error_log, 'consistency', line_count) + else: + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if ( + ( + int(cols[3]) < int(previous_cols[3]) + and not cols[4] > previous_cols[4] + ) + or + ( + cols[4] > previous_cols[4] + and not cols[3] < previous_cols[3] + ) + ): + add_warning(error_log, 'consistency', line_count) + else: + if ( + ( + cols[3] < previous_cols[3] + and not cols[4] > previous_cols[4] + ) + or + ( + cols[4] > previous_cols[4] + and not cols[3] < previous_cols[3] + ) + ): + add_warning(error_log, 'consistency', line_count) + + # at the end of iteration if columns are correct: + # save actual line as previous line for the next iteration + list_of_cols.append(cols) + ignore_previous_cols = False + # at the end of iteration: count line + line_count += 1 + + return error_log + + +def check_consistency( + run: List[ScoredDoc], + qrels: List[Qrel], + topics: List[dict], +) -> Dict[str, Dict[str, List[str]]]: + run_queries = {scored_doc.query_id for scored_doc in run} + run_docs = {scored_doc.doc_id for scored_doc in run} + qrels_queries = {scored_doc.query_id for scored_doc in qrels} + qrels_docs = {scored_doc.doc_id for scored_doc in qrels} + topics_queries = {topic['qid'] for topic in topics} + + error_log: Dict[str, Dict[str, List[str]]] = { + 'errors': {}, + 'warnings': {}, + } + + for query in sorted(run_queries): + if query not in qrels_queries: + add_warning(error_log, 'run_qid_qrels', query) + + for doc in sorted(run_docs): + if doc not in qrels_docs: + add_warning(error_log, 'run_docid_qrels', doc) + + for query in sorted(topics_queries): + if query not in run_queries: + add_warning(error_log, 'topics_qid_run', query) + if query not in qrels_queries: + add_warning(error_log, 'topics_qid_qrels', query) + + return error_log + + +def load_run(path: Path) -> List[ScoredDoc]: + print_info('Load run with ir-measures.', indent=False) + run = list(ir_measures.read_trec_run(str(path))) + print_success(f'Run successfully loaded.', indent=False) + return run + + +def load_qrels(path: Path) -> List[Qrel]: + print_info('Load qrels with ir-measures.', indent=False) + qrels = list(ir_measures.read_trec_qrels(str(path))) + print_success(f'Qrels successfully loaded.', indent=False) + return qrels + + +def load_topics(path: Path) -> List[dict]: + print_info('Load topics.', indent=False) + with path.open('rt') as lines: + topics = [json.loads(line) for line in lines] + print_success(f'Topics successfully loaded.', indent=False) + return topics + + +def parse_measure_args(measures_str: List[str]) -> Optional[List[Measure]]: + print_info( + 'Parse measures: ' + f'{", ".join(measures_str)}', + indent=False + ) + measures = [] + unknown = 0 + invalid = 0 + for measure_str in measures_str: + try: + measure = ir_measures.parse_measure(measure_str) + measures.append(measure) + except NameError: + print_error(f'Measure is unknown: {measure_str}') + unknown += 1 + except ValueError: + print_error(f'Measure is invalid: {measure_str}') + invalid += 1 + if invalid > 0 or unknown > 0: + reasons = [] + if invalid > 0: + reasons.append(f'{invalid} invalid') + if unknown > 0: + reasons.append(f'{unknown} unknown') + print_error(f'Measures could not be parsed: {", ".join(reasons)}') + exit(1) + print_success(f'Measures successfully parsed.', indent=False) + return measures + + +def evaluate( + measures: list, + qrels: List[Qrel], + run: List[ScoredDoc], +) -> Tuple[Dict[Measure, float], List[Metric]]: + print_info( + f'Evaluate run with measures: ' + f'{", ".join([str(m) for m in 
measures])}', + indent=False + ) + aggregate_metrics = ir_measures.calc_aggregate(measures, qrels, run) + query_metrics = list(ir_measures.iter_calc(measures, qrels, run)) + print_success(f'Run successfully evaluated.', indent=False) + return aggregate_metrics, query_metrics + + +def write_aggregated_prototext( + measure_metrics: Dict[Measure, float], + path: Path, +) -> None: + # Sort by measure name. + metrics = ( + (str(measure), value) + for measure, value in measure_metrics.items() + ) + metrics = sorted(metrics, key=lambda item: item[0]) + with path.open('wt') as file: + for index, (measure, value) in enumerate(metrics): + file.write( + f'measure {{\n' + f'\tkey: "{measure}"\n' + f'\tvalue: "{value}"\n' + f'}}\n' + ) + + +def write_per_query_prototext(metrics: List[Metric], path: Path) -> None: + numeric = all(metric.query_id.isnumeric() for metric in metrics) + # Sort by measure name. + metrics = sorted( + metrics, + key=lambda metric: str(metric.measure) + ) + # Sort by query id. + metrics = sorted( + list(metrics), + key=lambda metric: int(metric.query_id) if numeric else metric.query_id + ) + with path.open('wt') as file: + for index, metric in enumerate(metrics): + file.write( + f'measure {{\n' + f'\tquery_id: "{metric.query_id}"\n' + f'\tmeasure: "{metric.measure}"\n' + f'\tvalue: "{metric.value}"\n' + f'}}\n' + ) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Evaluate submissions with ir-measures.' + ) + parser.add_argument( + '--run', + type=Path, + help='Run in TREC format.', + required=True, + ) + parser.add_argument( + '--qrels', + type=Path, + help='Qrels in TREC format. ' + 'If no qrels are provided, only the run file is validated.', + required=False, + ) + parser.add_argument( + '--topics', + type=Path, + help='Topics in JSON-Lines format. 
' + 'If no topics are provided, only the run file is validated.', + required=False, + ) + parser.add_argument( + '--measures', + type=str, + nargs='+', + help='Measure(s) to evaluate.', + required=False, + ) + parser.add_argument( + '--output', + type=Path, + help='Output path for the prototext file with evaluation results.', + required=False, + ) + return parser.parse_args() + + +def _error_count(error_log: Dict[str, Dict[str, List[str]]]) -> int: + return sum(len(errors) for errors in error_log['errors'].values()) + + +def _warnings_count(error_log: Dict[str, Dict[str, List[str]]]) -> int: + return sum(len(warnings) for warnings in error_log['warnings'].values()) + + +def check_run_format(run_path: Path) -> None: + print_info(f'Check run file format.', indent=False) + run_error_log = check_run_file_content(run_path) + + if _error_count(run_error_log): + for key, value in run_error_log['errors'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'cols_more': + print_error( + f'More then 6 columns at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'cols_less': + print_error( + f'Fewer then 6 columns at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_num': + print_error( + f'Non-numeric scores at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'rank_num': + print_error( + f'Non-numeric ranks at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'tag_multi': + lines = [ + f'{int(line) - 1}≠{line}' + for line in value[:5] + ] + print_error( + f'Conflicting run tags at lines: ' + f'{", ".join(lines)} {more}' + ) + + if _warnings_count(run_error_log): + for key, value in run_error_log['warnings'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'tag_chars': + print_warning( + f'Run tags with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'qid_chars': + print_warning( + f'Query IDs with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'qid_asc': + lines = [ + f'{int(line) - 1}>{line}' + for line in value[:5] + ] + print_warning( + f'Query IDs not in ascending order at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'docid_chars': + print_warning( + f'Document IDs with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'ignored_col': + print_warning( + f'Ignored column is not "Q0" at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_science': + print_warning( + f'Score in scientific notation at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_tied': + lines = [ + f'{int(line) - 1}={line}' + for line in value[:5] + ] + print_warning( + f'Scores ties at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'score_desc': + lines = [ + f'{int(line) - 1}<{line}' + for line in value[:5] + ] + print_warning( + f'Scores not in descending order at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'rank_int': + print_warning( + f'Non-integer ranks at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'rank_start': + print_warning(f'Ranks do not start at 0.') + if key == 'rank_tied': + lines = [ + f'{int(line) - 1}={line}' + for line in value[:5] + ] + print_warning( + f'Rank ties at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'rank_asc': + lines = [ + f'{int(line) - 1}>{line}' + for line in value[:5] + ] + print_warning( + f'Ranks not in ascending order at lines: ' + f'{", 
".join(lines)} {more}' + ) + if key == 'rank_consecutive': + lines = [ + f'{int(line) - 1}↛{line}' + for line in value[:5] + ] + print_warning( + f'Ranks not consecutive at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'consistency': + lines = [ + f'{int(line) - 1}≷{line}' + for line in value[:5] + ] + print_warning( + f'Ranks and scores inconsistent at lines: ' + f'{", ".join(lines)} {more}' + ) + + if _error_count(run_error_log): + warn_str = '' + if _warnings_count(run_error_log): + warn_str = f', {_warnings_count(run_error_log)} warnings' + print_error( + f'Run file format is invalid: ' + f'{_error_count(run_error_log)} errors{warn_str}', + indent=False + ) + exit(1) + elif _warnings_count(run_error_log): + print_warning( + f'Run file format is valid: ' + f'{_warnings_count(run_error_log)} warnings', + indent=False + ) + else: + print_success(f'Run file format is valid.', indent=False) + + +def check_run_consistency( + run: List[ScoredDoc], + qrels: List[Qrel], + topics: List[dict], +) -> None: + print_info('Check run, qrels, and topics consistency.', indent=False) + consistency_error_log = check_consistency(run, qrels, topics) + if _warnings_count(consistency_error_log): + for key, value in consistency_error_log['warnings'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'run_qid_qrels': + print_warning( + f'Query IDs of run file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'run_docid_qrels': + print_warning( + f'Document IDs of run file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'topics_qid_run': + print_warning( + f'Query IDs of topics file not found in run file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'topics_qid_qrels': + print_warning( + f'Query IDs of topics file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + print_warning( + f'Run, qrels, and topics are inconsistent: ' + f'{_warnings_count(consistency_error_log)} warnings', + indent=False + ) + else: + print_success( + f'Run, qrels, and topics are consistent.', + indent=False + ) + + +def write_prototext( + aggregated: Dict[Measure, float], + per_query: List[Metric], + output_path: Path, +) -> None: + print_info('Export metrics.', indent=False) + write_aggregated_prototext( + aggregated, + output_path / "evaluation.prototext" + ) + write_per_query_prototext( + per_query, + output_path / "evaluation-per-query.prototext" + ) + print_success('Metrics successfully exported.', indent=False) + + +def main(): + # Parse command line arguments. + args = parse_args() + + # Check run file. + run_path = args.run + if run_path is not None: + # Check and load run. + check_file_path(run_path, 'run') + check_run_format(run_path) + run = load_run(run_path) + else: + run = None + + # Check qrels. + qrels_path = args.qrels + if qrels_path is not None: + # Check and load qrels. + check_file_path(qrels_path, 'qrels') + qrels = load_qrels(qrels_path) + else: + qrels = None + + # Check topic. + topics_path = args.topics + if topics_path is not None: + # Check and load topics. + check_file_path(topics_path, 'topics') + topics = load_topics(topics_path) + else: + topics = None + + # Check measures. + measures_str = args.measures + if measures_str is not None: + # Check and load measures. + measures = parse_measure_args(measures_str) + else: + measures = None + + # Check output path. 
+ output_path = args.output + if output_path is not None: + check_output_path(output_path, 'output') + + # Shortcuts for early exit. + if run is None: + print_error('Unable to validate without run file.', indent=False) + exit(1) + + if qrels is None or topics is None: + if qrels is not None: + # Must have qrels and topics or neither. + print_error( + 'Consistency check without topics file is not allowed.', + indent=False + ) + exit(1) + if topics is not None: + # Must have qrels and topics or neither. + print_error( + 'Consistency check without qrels file is not allowed.', + indent=False + ) + exit(1) + missing = [] + if qrels is None: + missing.append('qrels') + if topics is None: + missing.append('topics') + if measures is not None: + print_error( + f'Measuring without {" and ".join(missing)} files ' + f'is not allowed.', + indent=False + ) + exit(1) + if output_path is not None: + print_error( + f'Exporting metrics without {" and ".join(missing)} files ' + f'is not allowed.', + indent=False + ) + exit(1) + # Only check run format, no consistency check. + return + + # Check run, qrels, and topics consistency. + check_run_consistency(run, qrels, topics) + + # Shortcuts for early exit. + if measures is None: + if output_path is not None: + print_error( + 'Exporting metrics without measures is not allowed.', + indent=False + ) + exit(1) + # Only consistency check, no measurement. + return + + aggregated, per_query = evaluate(measures, qrels, run) + + # Shortcuts for early exit. + if output_path is None: + # Only measure, no writing to output. + return + + write_prototext(aggregated, per_query, output_path) + + +if __name__ == '__main__': + main() diff --git a/ir-measures/output/evaluation-per-query.prototext b/ir-measures/output/evaluation-per-query.prototext new file mode 100644 index 0000000..b77d315 --- /dev/null +++ b/ir-measures/output/evaluation-per-query.prototext @@ -0,0 +1,960 @@ +measure { + query_id: "51" + measure: "AP" + value: "0.08431784264198276" +} +measure { + query_id: "51" + measure: "ERR@20" + value: "0.16736" +} +measure { + query_id: "51" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "51" + measure: "nDCG@20" + value: "0.31366546266699225" +} +measure { + query_id: "52" + measure: "AP" + value: "0.31008322890753487" +} +measure { + query_id: "52" + measure: "ERR@20" + value: "0.64243" +} +measure { + query_id: "52" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "52" + measure: "nDCG@20" + value: "0.6397676289506087" +} +measure { + query_id: "53" + measure: "AP" + value: "0.006766846052119314" +} +measure { + query_id: "53" + measure: "ERR@20" + value: "0.01042" +} +measure { + query_id: "53" + measure: "P@20" + value: "0.05" +} +measure { + query_id: "53" + measure: "nDCG@20" + value: "0.025474423234207066" +} +measure { + query_id: "54" + measure: "AP" + value: "0.08239186557890366" +} +measure { + query_id: "54" + measure: "ERR@20" + value: "0.04318" +} +measure { + query_id: "54" + measure: "P@20" + value: "0.15" +} +measure { + query_id: "54" + measure: "nDCG@20" + value: "0.09439235986160549" +} +measure { + query_id: "55" + measure: "AP" + value: "0.025442795139649605" +} +measure { + query_id: "55" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "55" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "55" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "56" + measure: "AP" + value: "0.014797358670446365" +} +measure { + query_id: "56" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "56" 
+ measure: "P@20" + value: "0.0" +} +measure { + query_id: "56" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "57" + measure: "AP" + value: "0.08271814470215763" +} +measure { + query_id: "57" + measure: "ERR@20" + value: "0.18104" +} +measure { + query_id: "57" + measure: "P@20" + value: "1.0" +} +measure { + query_id: "57" + measure: "nDCG@20" + value: "0.37814263992105696" +} +measure { + query_id: "58" + measure: "AP" + value: "0.047981620592769274" +} +measure { + query_id: "58" + measure: "ERR@20" + value: "0.2002" +} +measure { + query_id: "58" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "58" + measure: "nDCG@20" + value: "0.15470717233038872" +} +measure { + query_id: "59" + measure: "AP" + value: "0.000998727734206654" +} +measure { + query_id: "59" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "59" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "59" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "60" + measure: "AP" + value: "0.12562945326365674" +} +measure { + query_id: "60" + measure: "ERR@20" + value: "0.07527" +} +measure { + query_id: "60" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "60" + measure: "nDCG@20" + value: "0.2996408129012732" +} +measure { + query_id: "61" + measure: "AP" + value: "0.04222096080018801" +} +measure { + query_id: "61" + measure: "ERR@20" + value: "0.0227" +} +measure { + query_id: "61" + measure: "P@20" + value: "0.25" +} +measure { + query_id: "61" + measure: "nDCG@20" + value: "0.11833938829847222" +} +measure { + query_id: "62" + measure: "AP" + value: "0.33499884410457975" +} +measure { + query_id: "62" + measure: "ERR@20" + value: "0.11955" +} +measure { + query_id: "62" + measure: "P@20" + value: "0.7" +} +measure { + query_id: "62" + measure: "nDCG@20" + value: "0.46481395345418863" +} +measure { + query_id: "63" + measure: "AP" + value: "0.09440786331475708" +} +measure { + query_id: "63" + measure: "ERR@20" + value: "0.07744" +} +measure { + query_id: "63" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "63" + measure: "nDCG@20" + value: "0.32402552769835197" +} +measure { + query_id: "64" + measure: "AP" + value: "0.08685313627059385" +} +measure { + query_id: "64" + measure: "ERR@20" + value: "0.21479" +} +measure { + query_id: "64" + measure: "P@20" + value: "0.45" +} +measure { + query_id: "64" + measure: "nDCG@20" + value: "0.35944389019892176" +} +measure { + query_id: "65" + measure: "AP" + value: "0.20467862651522445" +} +measure { + query_id: "65" + measure: "ERR@20" + value: "0.12497" +} +measure { + query_id: "65" + measure: "P@20" + value: "0.95" +} +measure { + query_id: "65" + measure: "nDCG@20" + value: "0.41366710335886037" +} +measure { + query_id: "66" + measure: "AP" + value: "0.014383592976393763" +} +measure { + query_id: "66" + measure: "ERR@20" + value: "0.01158" +} +measure { + query_id: "66" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "66" + measure: "nDCG@20" + value: "0.04742937553647769" +} +measure { + query_id: "67" + measure: "AP" + value: "0.1523541663660107" +} +measure { + query_id: "67" + measure: "ERR@20" + value: "0.25877" +} +measure { + query_id: "67" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "67" + measure: "nDCG@20" + value: "0.5324232213504778" +} +measure { + query_id: "68" + measure: "AP" + value: "0.0539546705439429" +} +measure { + query_id: "68" + measure: "ERR@20" + value: "0.07295" +} +measure { + query_id: "68" + measure: "P@20" + value: "0.2" +} +measure { + 
query_id: "68" + measure: "nDCG@20" + value: "0.18905274852308146" +} +measure { + query_id: "69" + measure: "AP" + value: "0.04092753746619256" +} +measure { + query_id: "69" + measure: "ERR@20" + value: "0.11001" +} +measure { + query_id: "69" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "69" + measure: "nDCG@20" + value: "0.22456039597469996" +} +measure { + query_id: "70" + measure: "AP" + value: "0.0" +} +measure { + query_id: "70" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "70" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "70" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "71" + measure: "AP" + value: "0.009362496897326355" +} +measure { + query_id: "71" + measure: "ERR@20" + value: "0.01232" +} +measure { + query_id: "71" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "71" + measure: "nDCG@20" + value: "0.037256819695418365" +} +measure { + query_id: "72" + measure: "AP" + value: "0.001789027566111398" +} +measure { + query_id: "72" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "72" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "72" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "73" + measure: "AP" + value: "0.09857345907915036" +} +measure { + query_id: "73" + measure: "ERR@20" + value: "0.16964" +} +measure { + query_id: "73" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "73" + measure: "nDCG@20" + value: "0.42288861454588955" +} +measure { + query_id: "74" + measure: "AP" + value: "0.0031613061126710886" +} +measure { + query_id: "74" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "74" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "74" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "75" + measure: "AP" + value: "0.1853848101935964" +} +measure { + query_id: "75" + measure: "ERR@20" + value: "0.08819" +} +measure { + query_id: "75" + measure: "P@20" + value: "0.7" +} +measure { + query_id: "75" + measure: "nDCG@20" + value: "0.28892416616087263" +} +measure { + query_id: "76" + measure: "AP" + value: "0.1596744031456364" +} +measure { + query_id: "76" + measure: "ERR@20" + value: "0.18655" +} +measure { + query_id: "76" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "76" + measure: "nDCG@20" + value: "0.39505803833045194" +} +measure { + query_id: "77" + measure: "AP" + value: "0.05916086544499962" +} +measure { + query_id: "77" + measure: "ERR@20" + value: "0.17976" +} +measure { + query_id: "77" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "77" + measure: "nDCG@20" + value: "0.1823684709075654" +} +measure { + query_id: "78" + measure: "AP" + value: "0.07309254260205446" +} +measure { + query_id: "78" + measure: "ERR@20" + value: "0.12988" +} +measure { + query_id: "78" + measure: "P@20" + value: "0.25" +} +measure { + query_id: "78" + measure: "nDCG@20" + value: "0.27305922100120666" +} +measure { + query_id: "79" + measure: "AP" + value: "0.1660727051407492" +} +measure { + query_id: "79" + measure: "ERR@20" + value: "0.15387" +} +measure { + query_id: "79" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "79" + measure: "nDCG@20" + value: "0.44242395871513723" +} +measure { + query_id: "80" + measure: "AP" + value: "0.10493115399768116" +} +measure { + query_id: "80" + measure: "ERR@20" + value: "0.1751" +} +measure { + query_id: "80" + measure: "P@20" + value: "0.95" +} +measure { + query_id: "80" + measure: "nDCG@20" + value: "0.9665625086152866" +} +measure { + 
query_id: "81" + measure: "AP" + value: "0.01715890156076979" +} +measure { + query_id: "81" + measure: "ERR@20" + value: "0.03841" +} +measure { + query_id: "81" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "81" + measure: "nDCG@20" + value: "0.08885921609572776" +} +measure { + query_id: "82" + measure: "AP" + value: "0.14060018745496566" +} +measure { + query_id: "82" + measure: "ERR@20" + value: "0.0158" +} +measure { + query_id: "82" + measure: "P@20" + value: "0.15" +} +measure { + query_id: "82" + measure: "nDCG@20" + value: "0.05286949541504003" +} +measure { + query_id: "83" + measure: "AP" + value: "0.0032003192840618978" +} +measure { + query_id: "83" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "83" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "83" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "84" + measure: "AP" + value: "0.26403019394946936" +} +measure { + query_id: "84" + measure: "ERR@20" + value: "0.37726" +} +measure { + query_id: "84" + measure: "P@20" + value: "1.0" +} +measure { + query_id: "84" + measure: "nDCG@20" + value: "0.8234510385252299" +} +measure { + query_id: "85" + measure: "AP" + value: "0.04984746146436076" +} +measure { + query_id: "85" + measure: "ERR@20" + value: "0.13017" +} +measure { + query_id: "85" + measure: "P@20" + value: "0.35" +} +measure { + query_id: "85" + measure: "nDCG@20" + value: "0.2072412325901197" +} +measure { + query_id: "86" + measure: "AP" + value: "0.38665923711198213" +} +measure { + query_id: "86" + measure: "ERR@20" + value: "0.57431" +} +measure { + query_id: "86" + measure: "P@20" + value: "0.9" +} +measure { + query_id: "86" + measure: "nDCG@20" + value: "0.9357498985251277" +} +measure { + query_id: "87" + measure: "AP" + value: "0.013672391980620615" +} +measure { + query_id: "87" + measure: "ERR@20" + value: "0.0125" +} +measure { + query_id: "87" + measure: "P@20" + value: "0.05" +} +measure { + query_id: "87" + measure: "nDCG@20" + value: "0.04218127244872386" +} +measure { + query_id: "88" + measure: "AP" + value: "0.07687099066731841" +} +measure { + query_id: "88" + measure: "ERR@20" + value: "0.07895" +} +measure { + query_id: "88" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "88" + measure: "nDCG@20" + value: "0.29983243845001567" +} +measure { + query_id: "89" + measure: "AP" + value: "0.20466545770487468" +} +measure { + query_id: "89" + measure: "ERR@20" + value: "0.11247" +} +measure { + query_id: "89" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "89" + measure: "nDCG@20" + value: "0.2343846571103149" +} +measure { + query_id: "90" + measure: "AP" + value: "0.1296490231661245" +} +measure { + query_id: "90" + measure: "ERR@20" + value: "0.07679" +} +measure { + query_id: "90" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "90" + measure: "nDCG@20" + value: "0.3285795110302251" +} +measure { + query_id: "91" + measure: "AP" + value: "0.0902393945671462" +} +measure { + query_id: "91" + measure: "ERR@20" + value: "0.09979" +} +measure { + query_id: "91" + measure: "P@20" + value: "0.45" +} +measure { + query_id: "91" + measure: "nDCG@20" + value: "0.25477116590093857" +} +measure { + query_id: "92" + measure: "AP" + value: "0.00030078355911961535" +} +measure { + query_id: "92" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "92" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "92" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "93" + measure: "AP" + value: 
"0.2061604493260041" +} +measure { + query_id: "93" + measure: "ERR@20" + value: "0.16062" +} +measure { + query_id: "93" + measure: "P@20" + value: "0.4" +} +measure { + query_id: "93" + measure: "nDCG@20" + value: "0.3265855299470895" +} +measure { + query_id: "94" + measure: "AP" + value: "5.2742616033755275e-05" +} +measure { + query_id: "94" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "94" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "94" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "96" + measure: "AP" + value: "0.15053669942589204" +} +measure { + query_id: "96" + measure: "ERR@20" + value: "0.16757" +} +measure { + query_id: "96" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "96" + measure: "nDCG@20" + value: "0.4710935901006018" +} +measure { + query_id: "97" + measure: "AP" + value: "0.033941105876571405" +} +measure { + query_id: "97" + measure: "ERR@20" + value: "0.05199" +} +measure { + query_id: "97" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "97" + measure: "nDCG@20" + value: "0.11506784902368167" +} +measure { + query_id: "98" + measure: "AP" + value: "0.11126690348306295" +} +measure { + query_id: "98" + measure: "ERR@20" + value: "0.1414" +} +measure { + query_id: "98" + measure: "P@20" + value: "0.65" +} +measure { + query_id: "98" + measure: "nDCG@20" + value: "0.34040441310101244" +} +measure { + query_id: "99" + measure: "AP" + value: "0.17954069638973388" +} +measure { + query_id: "99" + measure: "ERR@20" + value: "0.07698" +} +measure { + query_id: "99" + measure: "P@20" + value: "0.65" +} +measure { + query_id: "99" + measure: "nDCG@20" + value: "0.350016617656165" +} diff --git a/ir-measures/output/evaluation.prototext b/ir-measures/output/evaluation.prototext new file mode 100644 index 0000000..837b9fb --- /dev/null +++ b/ir-measures/output/evaluation.prototext @@ -0,0 +1,16 @@ +measure { + key: "AP" + value: "0.0984479789876958" +} +measure { + key: "ERR@20" + value: "0.11547874999999996" +} +measure { + key: "P@20" + value: "0.3895833333333332" +} +measure { + key: "nDCG@20" + value: "0.2595661630864897" +} diff --git a/ir-measures/tests/__init__.py b/ir-measures/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/__pycache__/__init__.cpython-310.pyc b/ir-measures/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..1957981 Binary files /dev/null and b/ir-measures/tests/__pycache__/__init__.cpython-310.pyc differ diff --git a/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc b/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc new file mode 100644 index 0000000..a03eae5 Binary files /dev/null and b/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc differ diff --git a/ir-measures/tests/approvaltests_config.json b/ir-measures/tests/approvaltests_config.json new file mode 100644 index 0000000..e8126f5 --- /dev/null +++ b/ir-measures/tests/approvaltests_config.json @@ -0,0 +1,3 @@ +{ + "subdirectory": "approved_files" +} \ No newline at end of file diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt new file mode 100644 index 0000000..27821a3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt @@ -0,0 +1,52 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run 
path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. +ℹ Export metrics. +✓ Metrics successfully exported. + + +#### +files: ['test-output/evaluation-per-query.prototext', 'test-output/evaluation.prototext'] + + +####test-output/evaluation-per-query.prototext +measure { + query_id: "1" + measure: "P@2" + value: "1.0" +} +measure { + query_id: "1" + measure: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output/evaluation.prototext +measure { + key: "P@2" + value: "1.0" +} +measure { + key: "nDCG@2" + value: "0.6666666666666667" +} diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt new file mode 100644 index 0000000..68e1764 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_warning_docid_not_in_qrels.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Document IDs of run file not found in qrels file: 9 +⚠ Run, qrels, and topics are inconsistent: 1 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt new file mode 100644 index 0000000..b936341 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. 
+ℹ Parse measures: P@X + ✗ Measure is invalid: P@X + ✗ Measures could not be parsed: 1 invalid + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt new file mode 100644 index 0000000..a9d1a00 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: FOOBAR + ✗ Measure is unknown: FOOBAR + ✗ Measures could not be parsed: 1 unknown + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt new file mode 100644 index 0000000..c1cabd6 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt @@ -0,0 +1,24 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2 +✓ Run successfully evaluated. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt new file mode 100644 index 0000000..ccf205c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +✗ Consistency check without qrels file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt new file mode 100644 index 0000000..e46e28f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. 
+ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt new file mode 100644 index 0000000..2444a03 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt @@ -0,0 +1,24 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt new file mode 100644 index 0000000..0fc60c3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt @@ -0,0 +1,56 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output-not-empty + ⚠ Output directory is not empty. +✓ Output path is valid: 1 warning +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. +ℹ Export metrics. +✓ Metrics successfully exported. + + +#### +files: ['test-output-not-empty/evaluation-per-query.prototext', 'test-output-not-empty/evaluation.prototext', 'test-output-not-empty/file.txt'] + + +####test-output-not-empty/evaluation-per-query.prototext +measure { + query_id: "1" + measure: "P@2" + value: "1.0" +} +measure { + query_id: "1" + measure: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output-not-empty/evaluation.prototext +measure { + key: "P@2" + value: "1.0" +} +measure { + key: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output-not-empty/file.txt diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt new file mode 100644 index 0000000..1a428f5 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. 
+ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output-not-empty/file.txt + ✗ Output path is not a directory. +✗ Output path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt new file mode 100644 index 0000000..2f1d220 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: 42 + ✗ Output path does not exist. +✗ Output path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt new file mode 100644 index 0000000..c5b07e2 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check output path: test-output +✓ Output path is valid. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +✗ Exporting metrics without measures is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt new file mode 100644 index 0000000..31b1004 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt @@ -0,0 +1,19 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +✗ Consistency check without qrels file is not allowed. 
+ + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt new file mode 100644 index 0000000..383199c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt @@ -0,0 +1,19 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt new file mode 100644 index 0000000..11a310c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/empty_file.txt + ✗ Qrels file is empty. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt new file mode 100644 index 0000000..9de4b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input + ✗ Qrels path is not a file. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt new file mode 100644 index 0000000..fb91d52 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: bar + ✗ Qrels path does not exist. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt new file mode 100644 index 0000000..c67c1df --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt @@ -0,0 +1,20 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. 
+ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt new file mode 100644 index 0000000..ec59681 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt @@ -0,0 +1,15 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt new file mode 100644 index 0000000..41bbe1e --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_warning_qid_not_in_qrels.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Query IDs of run file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 1 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt new file mode 100644 index 0000000..04ec584 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt @@ -0,0 +1,22 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_warning_qid_not_in_qrels.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. 
+ ⚠ Query IDs of topics file not found in run file: 2 + ⚠ Query IDs of topics file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 2 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt new file mode 100644 index 0000000..297637b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt @@ -0,0 +1,22 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_warning_qid_not_in_run.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Query IDs of topics file not found in run file: 2 + ⚠ Query IDs of topics file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 2 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt new file mode 100644 index 0000000..b18b4f9 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_docid_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Document IDs with special characters at lines: 2 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt new file mode 100644 index 0000000..a9ba29c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_less_columns.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Fewer then 6 columns at lines: 2 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt new file mode 100644 index 0000000..1a3ce50 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: test-input/empty_file.txt + ✗ Run file is empty. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt new file mode 100644 index 0000000..2fc565e --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_start_at_0.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ⚠ Ranks do not start at 0. +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt new file mode 100644 index 0000000..c3adb40 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_ignored_column_wrong.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ignored column is not "Q0" at lines: 2 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt new file mode 100644 index 0000000..2017b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_more_columns.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ More then 6 columns at lines: 2 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt new file mode 100644 index 0000000..a179738 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_multiple_tags.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Conflicting run tags at lines: 1≠2, 2≠3 +✗ Run file format is invalid: 2 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt new file mode 100644 index 0000000..2939b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: test-input + ✗ Run path is not a file. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt new file mode 100644 index 0000000..a2d9cce --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: foo + ✗ Run path does not exist. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt new file mode 100644 index 0000000..a6ab3ed --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_qid_not_asc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Query IDs not in ascending order at lines: 2>3 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. 
+✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt new file mode 100644 index 0000000..d209f69 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_qid_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Query IDs with special characters at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt new file mode 100644 index 0000000..bfe0591 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_asc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 3↛4, 4↛5 + ⚠ Ranks not in ascending order at lines: 4>5 + ⚠ Ranks and scores inconsistent at lines: 4≷5 +⚠ Run file format is valid: 4 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt new file mode 100644 index 0000000..ebd0404 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_consecutive.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 4↛5 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt new file mode 100644 index 0000000..a6e761f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_int.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Non-integer ranks at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt new file mode 100644 index 0000000..8949a41 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt @@ -0,0 +1,10 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_num.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ✗ Non-numeric ranks at lines: 1, 2, 3, 4, 5 + ⚠ Non-integer ranks at lines: 1, 2, 3, 4, 5 +✗ Run file format is invalid: 5 errors, 5 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt new file mode 100644 index 0000000..5bb7dba --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt @@ -0,0 +1,12 @@ +ℹ Check run path: test-input/run_sample_warning_rank_ties.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Rank ties at lines: 4=5 + ⚠ Ranks not consecutive at lines: 4↛5 +⚠ Run file format is valid: 2 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt new file mode 100644 index 0000000..2286079 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt @@ -0,0 +1,12 @@ +ℹ Check run path: test-input/run_sample_warning_score_not_desc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Scores not in descending order at lines: 2<3 + ⚠ Ranks and scores inconsistent at lines: 2≷3 +⚠ Run file format is valid: 2 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt new file mode 100644 index 0000000..1233400 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_warning_score_not_num.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Non-numeric scores at lines: 5 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt new file mode 100644 index 0000000..18a35f1 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt @@ -0,0 +1,14 @@ +ℹ Check run path: test-input/run_sample_warning_consistency.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 1↛2, 2↛3, 3↛4 + ⚠ Ranks not in ascending order at lines: 2>3 + ⚠ Ranks and scores inconsistent at lines: 2≷3, 4≷5 + ⚠ Scores not in descending order at lines: 4<5 +⚠ Run file format is valid: 7 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt new file mode 100644 index 0000000..475cf76 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_score_scientific.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ⚠ Score in scientific notation at lines: 1 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt new file mode 100644 index 0000000..a317cef --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_score_ties.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Scores ties at lines: 2=3 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt new file mode 100644 index 0000000..40ef954 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_tag_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Run tags with special characters at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt new file mode 100644 index 0000000..07cd91f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt @@ -0,0 +1,10 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt new file mode 100644 index 0000000..81dffb3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/empty_file.jsonl + ✗ Topics path does not exist. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt new file mode 100644 index 0000000..bdefadf --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. 
+ℹ Check topics path: test-input + ✗ Topics path is not a file. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt new file mode 100644 index 0000000..136aafc --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: baz + ✗ Topics path does not exist. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt new file mode 100644 index 0000000..24ce5d1 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt @@ -0,0 +1,15 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +✗ Consistency check without qrels file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/test-io/test-input/empty_file.txt b/ir-measures/tests/test-io/test-input/empty_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt b/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt new file mode 100644 index 0000000..3bc9d94 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt @@ -0,0 +1,5 @@ +1 0 1 2 +1 0 2 2 +1 0 3 2 +1 0 4 3 +1 0 5 3 \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt new file mode 100644 index 0000000..794165c --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt new file mode 100644 index 0000000..a49d72b --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier 4815162342 +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt new file mode 100644 index 0000000..ef4b8fc --- /dev/null +++ 
b/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pydolphin +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_valid.txt b/ir-measures/tests/test-io/test-input/run_sample_valid.txt new file mode 100644 index 0000000..8597673 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_valid.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt new file mode 100644 index 0000000..2c98be3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 2 3.352227701368739 pyterrier +1 Q0 3 1 3.292554298236954 pyterrier +1 Q0 4 3 3.251238969336898 pyterrier +1 Q0 5 4 3.260319364736074 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt new file mode 100644 index 0000000..1c5c3d3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 9 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt new file mode 100644 index 0000000..e4d95bc --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2% 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt new file mode 100644 index 0000000..7633888 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q1 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt new file mode 100644 index 0000000..ecb0625 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +2 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 5 3 3.260319364736074 pyterrier +1 Q0 4 4 3.251238969336898 pyterrier \ No newline at end of file diff --git 
a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt new file mode 100644 index 0000000..7b90606 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +2 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt new file mode 100644 index 0000000..b4d8a18 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt @@ -0,0 +1,5 @@ +1$ Q0 1 0 3.446771712469712 pyterrier +1$ Q0 2 1 3.352227701368739 pyterrier +1$ Q0 3 2 3.292554298236954 pyterrier +1$ Q0 4 3 3.260319364736074 pyterrier +1$ Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt new file mode 100644 index 0000000..e40a7f6 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 4 3.260319364736074 pyterrier +1 Q0 5 3 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt new file mode 100644 index 0000000..c496737 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt new file mode 100644 index 0000000..ac109b1 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt @@ -0,0 +1,5 @@ +1 Q0 1 0.2 3.446771712469712 pyterrier +1 Q0 2 1.2 3.352227701368739 pyterrier +1 Q0 3 2.3 3.292554298236954 pyterrier +1 Q0 4 3.4 3.260319364736074 pyterrier +1 Q0 5 4.5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt new file mode 100644 index 0000000..e2468dd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt @@ -0,0 +1,5 @@ +1 Q0 1 A 3.446771712469712 pyterrier +1 Q0 2 B 3.352227701368739 pyterrier +1 Q0 3 C 3.292554298236954 pyterrier +1 Q0 4 D 3.260319364736074 pyterrier +1 Q0 5 E 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt new file mode 100644 index 0000000..fd75cf4 --- /dev/null +++ 
b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt @@ -0,0 +1,5 @@ +1 Q0 1 1 3.446771712469712 pyterrier +1 Q0 2 2 3.352227701368739 pyterrier +1 Q0 3 3 3.292554298236954 pyterrier +1 Q0 4 4 3.260319364736074 pyterrier +1 Q0 5 5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt new file mode 100644 index 0000000..d9009ac --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 3 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt new file mode 100644 index 0000000..b4507b7 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.292554298236954 pyterrier +1 Q0 3 2 3.352227701368739 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt new file mode 100644 index 0000000..e412c49 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898a pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt new file mode 100644 index 0000000..31330d3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712e10 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt new file mode 100644 index 0000000..e06e6f1 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.352227701368739 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt new file mode 100644 index 0000000..52ee5fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 py+terrier +1 Q0 2 1 3.352227701368739 py+terrier +1 Q0 3 2 3.292554298236954 py+terrier +1 Q0 4 3 3.260319364736074 py+terrier +1 Q0 5 4 3.251238969336898 py+terrier \ No newline at end of file diff --git 
a/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl new file mode 100644 index 0000000..c4fc25a --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "1", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl new file mode 100644 index 0000000..d3ff1fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "2", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl new file mode 100644 index 0000000..d3ff1fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "2", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-output-not-empty/file.txt b/ir-measures/tests/test-io/test-output-not-empty/file.txt new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/test_with_approvals.py b/ir-measures/tests/test_with_approvals.py new file mode 100644 index 0000000..d534805 --- /dev/null +++ b/ir-measures/tests/test_with_approvals.py @@ -0,0 +1,478 @@ +import io +import os +import sys +from 
contextlib import redirect_stdout
+from pathlib import Path
+from shutil import copytree
+from tempfile import TemporaryDirectory
+from typing import List
+
+from approvaltests import set_default_reporter, DiffReporter
+from approvaltests.approvals import verify
+from pytest import raises
+
+from ir_measures_evaluator import main
+
+_TEST_IO_DIR = Path(__file__).parent / 'test-io'
+
+
+def setup_module():
+    set_default_reporter(DiffReporter())
+
+
+def run_capture_stdout_files(
+        argv: List[str],
+        exit_normal: bool,
+        output_dir: str = 'test-output',
+):
+    buffer = io.StringIO()
+    captured_files = ''
+    with TemporaryDirectory() as temp_dir:
+        tmp_path = Path(temp_dir) / 'test-io'
+        # Copy test_io to temp_dir
+        copytree(_TEST_IO_DIR, tmp_path)
+        # Change to temp_dir
+        os.chdir(tmp_path)
+        # Override sys.argv
+        sys.argv = ['', *argv]
+        with redirect_stdout(buffer):
+            if exit_normal:
+                main()
+            else:
+                with raises(SystemExit):
+                    main()
+        # List files in temp output dir
+        tmp_out_path = tmp_path / output_dir
+        files = sorted(tmp_out_path.glob('**/*'))
+        filenames = [
+            str(file.relative_to(tmp_path)) for file in files
+        ]
+        captured_files += f'\n\n####\nfiles: {filenames}\n'
+        for file in files:
+            if not file.is_file():
+                continue
+            captured_files += f'\n\n####{file.relative_to(tmp_path)}\n' + \
+                open(file).read()
+    return buffer.getvalue() + captured_files
+
+
+def _run_capture_stdout_files_fail(
+        argv: List[str],
+        output_dir: str = 'test-output',
+):
+    return run_capture_stdout_files(argv, False, output_dir)
+
+
+def _run_capture_stdout_files_pass(
+        argv: List[str],
+        output_dir: str = 'test-output',
+):
+    return run_capture_stdout_files(argv, True, output_dir)
+
+
+def test_run_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'foo',
+    ])
+    verify(actual)
+
+
+def test_run_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input',
+    ])
+    verify(actual)
+
+
+def test_run_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/empty_file.txt',
+    ])
+    verify(actual)
+
+
+def test_run_fewer_columns():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_less_columns.txt',
+    ])
+    verify(actual)
+
+
+def test_run_more_columns():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_more_columns.txt',
+    ])
+    verify(actual)
+
+
+def test_run_multiple_tags():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_multiple_tags.txt',
+    ])
+    verify(actual)
+
+
+def test_run_tag_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_tag_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_query_id_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_query_id_not_ascending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_not_asc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_document_id_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_docid_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_ignored_column_not_default():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_ignored_column_wrong.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_not_numeric():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_warning_score_not_num.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_scientific_notation():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_scientific.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_ties():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_ties.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_not_descending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_not_desc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_numeric():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_warning_rank_not_num.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_integer():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_int.txt',
+    ])
+    verify(actual)
+
+
+def test_run_first_rank_not_zero():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_start_at_0.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_ties():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_ties.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_ascending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_asc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_consecutive():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_consecutive.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_rank_inconsistent():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_consistency.txt',
+    ])
+    verify(actual)
+
+
+def test_run_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_qrels_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'bar',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/empty_file.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_topics_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'baz',
+    ])
+    verify(actual)
+
+
+def test_topics_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input',
+    ])
+    verify(actual)
+
+
+def test_topics_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/empty_file.jsonl',
+    ])
+    verify(actual)
+
+
+def test_topics_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_topics_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_run_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_not_in_qrels.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_document_ids_inconsistent_run_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_docid_not_in_qrels.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_topics_run():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_warning_qid_not_in_run.jsonl',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_topics_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_warning_qid_not_in_qrels.jsonl',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_measure_unknown():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'FOOBAR',
+    ])
+    verify(actual)
+
+
+def test_measure_invalid():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@X',
+    ])
+    verify(actual)
+
+
+def test_measure_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measure_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measure_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measures_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+    ])
+    verify(actual)
+
+
+def test_output_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', '42',
+    ])
+    verify(actual)
+
+
+def test_output_path_is_file():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output-not-empty/file.txt',
+    ])
+    verify(actual)
+
+
+def test_output_dir_not_empty():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output-not-empty',
+    ], output_dir='test-output-not-empty')
+    verify(actual)
+
+
+def test_output_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_output_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_output_valid_no_measures():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_all_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
diff --git a/reproducibility-experiments/README.md b/reproducibility-experiments/README.md
new file mode 100644
index 0000000..03b94ce
--- /dev/null
+++ b/reproducibility-experiments/README.md
@@ -0,0 +1,4 @@
+# Examples of Reproducibility Experiments
+
+We will add the Jupyter notebooks for the reproducibility experiments within the next two days; at the moment, we are cleaning and documenting the code.
+