diff --git a/ir-measures/Dockerfile b/ir-measures/Dockerfile new file mode 100644 index 0000000..cf2899a --- /dev/null +++ b/ir-measures/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.10 + +RUN pip3 install ir-measures approvaltests pytest + +COPY ir_measures_evaluator.py / + +COPY tests /tmp/tests/ + +RUN cd /tmp \ + && find -iname __pycache__ -exec rm -Rf {} \; || echo "" \ + && PYTHONPATH='../:.' pytest \ + && cd / \ + && rm -Rf /tmp/tests + +ENTRYPOINT [ "/ir_measures_evaluator.py" ] diff --git a/ir-measures/Makefile b/ir-measures/Makefile new file mode 100644 index 0000000..17003e4 --- /dev/null +++ b/ir-measures/Makefile @@ -0,0 +1,15 @@ +IMAGE_VERSION=1.0.5 + +build-docker-image: + docker build -t webis/ir_measures_evaluator:${IMAGE_VERSION} . + +.PHONY: tests +tests: + pytest + +example-execution: + rm -Rf output + docker run --rm -it -v ${PWD}/input:/input -v ${PWD}/output:/output webis/ir_measures_evaluator:${IMAGE_VERSION} --run /input/run.txt --qrels /input/qrels.txt --measures "AP(rel=2)" "P(rel=2)@10" --output_path /output/eval.prototext + +publish-docker-image: + docker push webis/ir_measures_evaluator:${IMAGE_VERSION} diff --git a/ir-measures/Pipfile b/ir-measures/Pipfile new file mode 100644 index 0000000..89c2636 --- /dev/null +++ b/ir-measures/Pipfile @@ -0,0 +1,14 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +ir-measures = "*" +pytest = "*" +approvaltests = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/ir-measures/Pipfile.lock b/ir-measures/Pipfile.lock new file mode 100644 index 0000000..6dfee08 --- /dev/null +++ b/ir-measures/Pipfile.lock @@ -0,0 +1,332 @@ +{ + "_meta": { + "hash": { + "sha256": "80d55cd819e6b1d0d55c59515e27befe560fb92c22513373ae54811e2bc012ae" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "allpairspy": { + "hashes": [ + "sha256:66dbcb30c22a2c73e1cccae9ae7093f8e18bdf542e4ba2864ce822463735b5b4", + "sha256:9358484c91abe74ba18daf9d6d6904c5be7cc8818397d05248c9d336023c28b1" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==2.5.0" + }, + "approval-utilities": { + "hashes": [ + "sha256:8e0b146716db2f7d95d01726947438904760a6cb005aa391cee6f7a623d627ee", + "sha256:d11f545ad318f3520e997657aaeb6511521a4d0b647a0868037ca817af7c73b3" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==7.4.0" + }, + "approvaltests": { + "hashes": [ + "sha256:64e8798f843f44ffd047935f18e8ec850caf9d4f924b9b24456008102cbb6aca", + "sha256:e09b2f2af6cd049d71a3dcdc7b3bb4c2cd887cc5eb4b1294b557675360c5f90d" + ], + "index": "pypi", + "version": "==7.4.0" + }, + "attrs": { + "hashes": [ + "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", + "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" + ], + "markers": "python_version >= '3.5'", + "version": "==22.1.0" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", + "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.11.1" + }, + "certifi": { + "hashes": [ + "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3", + "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18" + ], + "markers": "python_version >= '3.6'", 
+ "version": "==2022.12.7" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", + "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.1" + }, + "cwl-eval": { + "hashes": [ + "sha256:ff9e4a1241eed82067ebe7605e7cf44d923d7d9d764f222034fc8216d0ef327d" + ], + "markers": "python_version >= '3'", + "version": "==1.0.12" + }, + "empty-files": { + "hashes": [ + "sha256:87277db100a3bfdafc2ba18f6094cd37090e257058fb1c0b15873a89e1003149", + "sha256:ec464f7f88a028d4567b380d57983fc4ffb79147538626690cd94c33090cd216" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==0.0.3" + }, + "exceptiongroup": { + "hashes": [ + "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828", + "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec" + ], + "markers": "python_version < '3.11'", + "version": "==1.0.4" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "version": "==1.1.1" + }, + "ir-measures": { + "hashes": [ + "sha256:4e6ed5495e3655c8d44e492bb2890fe6a06604b8ab1c2cd3c8996c499b7b1cef" + ], + "index": "pypi", + "version": "==0.3.1" + }, + "mrjob": { + "hashes": [ + "sha256:2c8d8dc8aa4b354a97de18d0260f551f018693af74af104b3d41daf165eebdd4", + "sha256:d8fa1bafcada0ffe3e7166896a27e996815e2cb835088aec025e3dd12c7146ce" + ], + "version": "==0.7.4" + }, + "numpy": { + "hashes": [ + "sha256:01dd17cbb340bf0fc23981e52e1d18a9d4050792e8fb8363cecbf066a84b827d", + "sha256:06005a2ef6014e9956c09ba07654f9837d9e26696a0470e42beedadb78c11b07", + "sha256:09b7847f7e83ca37c6e627682f145856de331049013853f344f37b0c9690e3df", + "sha256:0aaee12d8883552fadfc41e96b4c82ee7d794949e2a7c3b3a7201e968c7ecab9", + "sha256:0cbe9848fad08baf71de1a39e12d1b6310f1d5b2d0ea4de051058e6e1076852d", + "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a", + "sha256:33161613d2269025873025b33e879825ec7b1d831317e68f4f2f0f84ed14c719", + "sha256:5039f55555e1eab31124a5768898c9e22c25a65c1e0037f4d7c495a45778c9f2", + "sha256:522e26bbf6377e4d76403826ed689c295b0b238f46c28a7251ab94716da0b280", + "sha256:56e454c7833e94ec9769fa0f86e6ff8e42ee38ce0ce1fa4cbb747ea7e06d56aa", + "sha256:58f545efd1108e647604a1b5aa809591ccd2540f468a880bedb97247e72db387", + "sha256:5e05b1c973a9f858c74367553e236f287e749465f773328c8ef31abe18f691e1", + "sha256:7903ba8ab592b82014713c491f6c5d3a1cde5b4a3bf116404e08f5b52f6daf43", + "sha256:8969bfd28e85c81f3f94eb4a66bc2cf1dbdc5c18efc320af34bffc54d6b1e38f", + "sha256:92c8c1e89a1f5028a4c6d9e3ccbe311b6ba53694811269b992c0b224269e2398", + "sha256:9c88793f78fca17da0145455f0d7826bcb9f37da4764af27ac945488116efe63", + "sha256:a7ac231a08bb37f852849bbb387a20a57574a97cfc7b6cabb488a4fc8be176de", + "sha256:abdde9f795cf292fb9651ed48185503a2ff29be87770c3b8e2a14b0cd7aa16f8", + "sha256:af1da88f6bc3d2338ebbf0e22fe487821ea4d8e89053e25fa59d1d79786e7481", + "sha256:b2a9ab7c279c91974f756c84c365a669a887efa287365a8e2c418f8b3ba73fb0", + "sha256:bf837dc63ba5c06dc8797c398db1e223a466c7ece27a1f7b5232ba3466aafe3d", + 
"sha256:ca51fcfcc5f9354c45f400059e88bc09215fb71a48d3768fb80e357f3b457e1e", + "sha256:ce571367b6dfe60af04e04a1834ca2dc5f46004ac1cc756fb95319f64c095a96", + "sha256:d208a0f8729f3fb790ed18a003f3a57895b989b40ea4dce4717e9cf4af62c6bb", + "sha256:dbee87b469018961d1ad79b1a5d50c0ae850000b639bcb1b694e9981083243b6", + "sha256:e9f4c4e51567b616be64e05d517c79a8a22f3606499941d97bb76f2ca59f982d", + "sha256:f063b69b090c9d918f9df0a12116029e274daf0181df392839661c4c7ec9018a", + "sha256:f9a909a8bae284d46bbfdefbdd4a262ba19d3bc9921b1e76126b1d21c3c34135" + ], + "markers": "python_version >= '3.8'", + "version": "==1.23.5" + }, + "packaging": { + "hashes": [ + "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3", + "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3" + ], + "markers": "python_version >= '3.7'", + "version": "==22.0" + }, + "pluggy": { + "hashes": [ + "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", + "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, + "pyperclip": { + "hashes": [ + "sha256:105254a8b04934f0bc84e9c24eb360a591aaf6535c9def5f29d92af107a9bf57" + ], + "version": "==1.8.2" + }, + "pytest": { + "hashes": [ + "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71", + "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59" + ], + "index": "pypi", + "version": "==7.2.0" + }, + "pytrec-eval-terrier": { + "hashes": [ + "sha256:0de661e617fb8b31525ed73c0589b67d67f2b2d4d264a227dd1346b3ddb233ad", + "sha256:1558cad61264e25d54a8c76e1fc2f1939bc4a68e72beec6a9b2d752da06d8c7f", + "sha256:159611ab0de9614e5ce98c5f834abad2672f6037c343a48339c0119ed61ae6d5", + "sha256:1713e0584d169b7506017b6c8963ff019637474cda8a7968a457a2e70a140782", + "sha256:24c2b7ccfbbcb6c2b4f3af8ac56bd08ba26c25e212a6e7b0d5f7e3ae384ed046", + "sha256:27ff02633f5ed94f7795cf7bd02643076d4b6bb2d4f363a2926901ab2c6320d9", + "sha256:2fa570fea2b102bcc3f1210d175bc04636330992abd4131bd3ad7b7e4f5f23e5", + "sha256:2fd2e77f5d31b0bc3e5bdf2a086c68fad7938b9ee48ec910786e4cc81563a5d0", + "sha256:43f3796a126e6fb50558f9a0894593a35d0a2f29972ac57506af7a63ddaf94d5", + "sha256:4e0a77f8f4d8a7e20384ee0902184dd538487ac920987edc3122fb9adad423e7", + "sha256:4fc2b87a4c5d78a9e83b9891e9716aa2855f89cf91d7a22f59c5642665e21a54", + "sha256:505837e21642822bb8dc8fee0c122d5b7dcc3943a481355f29c8cb029618c489", + "sha256:5575efa84276aed810e4f938d058d0366b51f2e1f039877e5145c9b8f4abd568", + "sha256:5a459527c74fa6cd06886169130bc7f20422ade4b4f1fc14c9aab299eeeaf4f0", + "sha256:5e8140f0932c2ef498d215cb80106472259b42048bf16b2cbd5ab9ffd9e539a8", + "sha256:5ee5b665132151c07ae89f0da75a4e81bf65ade2fc46a4aaecc6957ea70690b0", + "sha256:60877dede6a431ec549827c1c106f5c00b556ed31e5f8384f9ac9cc64b622a5d", + "sha256:6690beec72108a8d79192b31ee2231bd28a643b31330699d22516db7d6112743", + "sha256:68c0f6f40809e3338525e960f8256ef0b926495c8f83c6a1d9cd5edf4e2c8487", + "sha256:6b06b7e5c68ba98ba420c584d0fafdd3e2581d066345cd1d0837cfc619db19c8", + "sha256:6c15382020a78807a5601398ec2a2dc1491af495980338313ed837d46809c94c", + "sha256:8afa03c774691cb8f679e21a0414f3f656ae2fa9edab135f0fd2edd349314a64", + "sha256:9c62442588b0e4d5e40d74da3ed9c1108b4a1a027ac442f1ac71e279fe7c5cc5", + "sha256:9ec39dd579f2893016aa5c4cd66a31c573b4057277979b4a650307e3e9115e19", + "sha256:9f27c8afd6ae40db7f7bc7290f8952222e4e6f2c52796fe49f92909b6611cae1", + "sha256:a988f7185e9b99f15e32ccab4f74265549fe12df21a949165dd41c76a38470b9", + 
"sha256:ae80c5002cd798d055e9a4600abb1a4a00ae1ddd818d02c174749f8240721535", + "sha256:b0744f7ea5e4e60c33ec449c936077c57a5d3d2363ddf37306f55c61ad1619fb", + "sha256:b9284b3a2b7467095384927a3a01bb8bde6c06ee07280194310baba9d088ad89", + "sha256:c693d17d7fc60246a6ea5548f58d81a1b5960b12fc691001649f596116ebad7e", + "sha256:c8dcd150fb8b6324f23425d9881187b03a725772c20618b2fbaec53380cbb211", + "sha256:d11a70a91907daab061d053df42bdd7d5219e2d641fadb3bc207cc88615ffec0", + "sha256:d132bf0b1b64f93ba32fcf57a62dce9f66276fa5d6975ac81e92a2fc1e113f8c", + "sha256:d33f69048de6d575282ce25691d1ac6561b5ca3ab7ce2e5e228c06d96144c9c4", + "sha256:d6b0cd049e95efd1bde42e1befcd843193bd5037c1e33a8155cf78b05ed86ab7", + "sha256:d892525c27bdfeafe7f3ba49bc3dd7a1ace689cea380a98e5ee995027552b67c", + "sha256:e5c7a11e98c6381afcac061ddc20a586b518f6481aeabe58b569143918419eea", + "sha256:e84efeeb7a82e9b2c4a09e0cdea80f69fca37d07c0a1b484d513d469a0f86643", + "sha256:fb7f2d36f68c441f66984221500b5c9a3e9c21c27c9789ccdab492305f4d7674" + ], + "markers": "python_version >= '3'", + "version": "==0.5.5" + }, + "pyyaml": { + "hashes": [ + "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf", + "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", + "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", + "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", + "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", + "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", + "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", + "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", + "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", + "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", + "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", + "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", + "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782", + "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", + "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", + "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", + "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", + "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", + "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1", + "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", + "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", + "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", + "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", + "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", + "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", + "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d", + "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", + "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", + "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7", + "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", + "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", + 
"sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", + "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358", + "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", + "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", + "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", + "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", + "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f", + "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", + "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" + ], + "markers": "python_version >= '3.6'", + "version": "==6.0" + }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "markers": "python_version >= '3.7' and python_version < '4'", + "version": "==2.28.1" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "soupsieve": { + "hashes": [ + "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", + "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" + ], + "markers": "python_version >= '3.6'", + "version": "==2.3.2.post1" + }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version < '3.11'", + "version": "==2.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", + "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" + ], + "markers": "python_version >= '3.7'", + "version": "==4.4.0" + }, + "urllib3": { + "hashes": [ + "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc", + "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==1.26.13" + } + }, + "develop": {} +} diff --git a/ir-measures/README.md b/ir-measures/README.md new file mode 100644 index 0000000..2543b0d --- /dev/null +++ b/ir-measures/README.md @@ -0,0 +1,21 @@ + +Add the evaluator to tira with: + +Image: +``` +webis/ir_measures_evaluator:1.0 +``` + +Command (if no qrels are available): + +``` +/ir_measures_evaluator.py --run ${inputRun}/run.txt --output_path ${outputDir}/evaluation.prototext +``` + + +Command (if qrels are available): + +``` +/ir_measures_evaluator.py --run ${inputRun}/run.txt --topics ${inputDataset}/queries.jsonl --qrels ${inputDataset}/qrels.txt --output_path ${outputDir} --measures "P@10" "nDCG@10" "MRR" +``` + diff --git a/ir-measures/__init__.py b/ir-measures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc b/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc new file mode 100644 index 0000000..3ddf696 Binary files /dev/null and b/ir-measures/__pycache__/ir_measures_evaluator.cpython-310.pyc differ diff --git a/ir-measures/ir-measures.iml 
b/ir-measures/ir-measures.iml new file mode 100644 index 0000000..37dbae8 --- /dev/null +++ b/ir-measures/ir-measures.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/ir-measures/ir_measures_evaluator.py b/ir-measures/ir_measures_evaluator.py new file mode 100755 index 0000000..6861996 --- /dev/null +++ b/ir-measures/ir_measures_evaluator.py @@ -0,0 +1,808 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import re +from pathlib import Path +from typing import Tuple, Dict, List, Optional + +import ir_measures +from ir_measures import Qrel, ScoredDoc, Measure, Metric + + +def add_error( + error_log: Dict[str, Dict[str, List[str]]], + key: str, + line: int +) -> None: + if key not in error_log['errors']: + error_log['errors'][key] = [] + error_log['errors'][key].append(str(line)) + + +def add_warning( + error_log: Dict[str, Dict[str, List[str]]], + key: str, + line: int +) -> None: + if key not in error_log['warnings']: + error_log['warnings'][key] = [] + error_log['warnings'][key].append(str(line)) + + +def print_error(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Ballot X} {message}') + + +def print_warning(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Warning Sign} {message}') + + +def print_success(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Check Mark} {message}') + + +def print_info(message: str, indent: bool = True) -> None: + tab = '\t' if indent else '' + print(f'{tab}\N{Information Source} {message}') + + +def check_file_path(path: Path, target: str) -> None: + print_info(f'Check {target} path: {path}', indent=False) + target = target.capitalize() + if not path.exists(): + print_error(f'{target} path does not exist.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not path.is_file(): + print_error(f'{target} path is not a file.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not os.access(path, os.R_OK): + print_error(f'{target} file is not readable.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if path.stat().st_size <= 0: + print_error(f'{target} file is empty.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + print_success(f'{target} path is valid.', indent=False) + + +def check_output_path(path: Path, target: str) -> None: + print_info(f'Check {target} path: {path}', indent=False) + target = target.capitalize() + if not path.exists(): + print_error(f'{target} path does not exist.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not path.is_dir(): + print_error(f'{target} path is not a directory.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if not os.access(path, os.W_OK): + print_error(f'{target} directory is not writable.') + print_error(f'{target} path is invalid: 1 error', indent=False) + exit(1) + if next(path.iterdir(), None) is not None: + print_warning(f'{target} directory is not empty.') + print_success(f'{target} path is valid: 1 warning', indent=False) + else: + print_success(f'{target} path is valid.', indent=False) + + +def _is_number(string: str) -> bool: + try: + float(string) + return True + except ValueError: + return False + + +def _is_integer(string: str) -> bool: + try: + int(string) + return True + except ValueError: + return False + + +regexp_special_chars = 
re.compile(r'[^a-zA-Z0-9-_]+') + + +def check_run_file_content(path: Path) -> Dict[str, Dict[str, List[str]]]: + error_log: Dict[str, Dict[str, List[str]]] = { + 'errors': {}, + 'warnings': {}, + } + + with path.open('rt') as file: + line_count = 1 + list_of_cols = [] + ignore_previous_cols = True + + lines = file.readlines() + + for line in lines: + cols = line.rstrip().split() + previous_cols = list_of_cols[-1] if list_of_cols else cols + + # check columns + if len(cols) > 6: + add_error(error_log, 'cols_more', line_count) + ignore_previous_cols = True + elif len(cols) < 6: + add_error(error_log, 'cols_less', line_count) + ignore_previous_cols = True + else: + # error_log everything column specific + # only when the column count is correct + # because otherwise the position of the data + # is potentially corrupted + + # check tags + if not cols[5] == previous_cols[5]: + add_error(error_log, 'tag_multi', line_count) + if re.search(regexp_special_chars, cols[5]): + add_warning(error_log, 'tag_chars', line_count) + + # check query ids + if re.search(regexp_special_chars, cols[0]): + add_warning(error_log, 'qid_chars', line_count) + if cols[0] < previous_cols[0]: + add_warning(error_log, 'qid_asc', line_count) + + # check doc ids + if re.search(regexp_special_chars, cols[2]): + add_warning(error_log, 'docid_chars', line_count) + + # check ignored column + if not cols[1] == 'Q0': + add_warning(error_log, 'ignored_col', line_count) + + # check scores + try: + float(cols[4]) + float(previous_cols[4]) + + if 'e' in cols[4].lower(): + add_warning(error_log, 'score_science', line_count) + if ( + float(cols[4]) == float(previous_cols[4]) + and not ignore_previous_cols + ): + add_warning(error_log, 'score_tied', line_count) + if float(cols[4]) > float(previous_cols[4]): + add_warning(error_log, 'score_desc', line_count) + except: + if not _is_number(cols[4]): + add_error(error_log, 'score_num', line_count) + if ( + cols[4] == previous_cols[4] + and not ignore_previous_cols + ): + add_warning(error_log, 'score_tied', line_count) + if cols[4] > previous_cols[4]: + add_warning(error_log, 'scor_desc', line_count) + + # check ranks + if not _is_number(cols[3]): + add_error(error_log, 'rank_num', line_count) + if not _is_integer(cols[3]): + add_warning(error_log, 'rank_int', line_count) + else: + if line_count == 1 and int(cols[3]) != 0: + add_warning(error_log, 'rank_start', line_count) + if cols[3] == previous_cols[3] and not ignore_previous_cols: + add_warning(error_log, 'rank_tied', line_count) + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if int(cols[3]) < int(previous_cols[3]): + add_warning(error_log, 'rank_asc', line_count) + if ( + int(cols[3]) != int(previous_cols[3]) + 1 + and line_count > 1 and not ignore_previous_cols + ): + add_warning(error_log, 'rank_consecutive', line_count) + else: + if cols[3] < previous_cols[3]: + add_warning(error_log, 'rank_asc', line_count) + + # check consistency + if _is_number(cols[4]) and _is_number(previous_cols[4]): + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if ( + ( + int(cols[3]) < int(previous_cols[3]) + and not float(cols[4]) > float(previous_cols[4])) + or + ( + float(cols[4]) > float(previous_cols[4]) + and not int(cols[3]) < int(previous_cols[3]) + ) + ): + add_warning(error_log, 'consistency', line_count) + else: + if ( + ( + cols[3] < previous_cols[3] + and not float(cols[4]) > float(previous_cols[4]) + ) + or + ( + float(cols[4]) > float(previous_cols[4]) + and not cols[3] < previous_cols[3] + ) + ): + 
add_warning(error_log, 'consistency', line_count) + else: + if _is_integer(cols[3]) and _is_integer(previous_cols[3]): + if ( + ( + int(cols[3]) < int(previous_cols[3]) + and not cols[4] > previous_cols[4] + ) + or + ( + cols[4] > previous_cols[4] + and not cols[3] < previous_cols[3] + ) + ): + add_warning(error_log, 'consistency', line_count) + else: + if ( + ( + cols[3] < previous_cols[3] + and not cols[4] > previous_cols[4] + ) + or + ( + cols[4] > previous_cols[4] + and not cols[3] < previous_cols[3] + ) + ): + add_warning(error_log, 'consistency', line_count) + + # at the end of iteration if columns are correct: + # save actual line as previous line for the next iteration + list_of_cols.append(cols) + ignore_previous_cols = False + # at the end of iteration: count line + line_count += 1 + + return error_log + + +def check_consistency( + run: List[ScoredDoc], + qrels: List[Qrel], + topics: List[dict], +) -> Dict[str, Dict[str, List[str]]]: + run_queries = {scored_doc.query_id for scored_doc in run} + run_docs = {scored_doc.doc_id for scored_doc in run} + qrels_queries = {scored_doc.query_id for scored_doc in qrels} + qrels_docs = {scored_doc.doc_id for scored_doc in qrels} + topics_queries = {topic['qid'] for topic in topics} + + error_log: Dict[str, Dict[str, List[str]]] = { + 'errors': {}, + 'warnings': {}, + } + + for query in sorted(run_queries): + if query not in qrels_queries: + add_warning(error_log, 'run_qid_qrels', query) + + for doc in sorted(run_docs): + if doc not in qrels_docs: + add_warning(error_log, 'run_docid_qrels', doc) + + for query in sorted(topics_queries): + if query not in run_queries: + add_warning(error_log, 'topics_qid_run', query) + if query not in qrels_queries: + add_warning(error_log, 'topics_qid_qrels', query) + + return error_log + + +def load_run(path: Path) -> List[ScoredDoc]: + print_info('Load run with ir-measures.', indent=False) + run = list(ir_measures.read_trec_run(str(path))) + print_success(f'Run successfully loaded.', indent=False) + return run + + +def load_qrels(path: Path) -> List[Qrel]: + print_info('Load qrels with ir-measures.', indent=False) + qrels = list(ir_measures.read_trec_qrels(str(path))) + print_success(f'Qrels successfully loaded.', indent=False) + return qrels + + +def load_topics(path: Path) -> List[dict]: + print_info('Load topics.', indent=False) + with path.open('rt') as lines: + topics = [json.loads(line) for line in lines] + print_success(f'Topics successfully loaded.', indent=False) + return topics + + +def parse_measure_args(measures_str: List[str]) -> Optional[List[Measure]]: + print_info( + 'Parse measures: ' + f'{", ".join(measures_str)}', + indent=False + ) + measures = [] + unknown = 0 + invalid = 0 + for measure_str in measures_str: + try: + measure = ir_measures.parse_measure(measure_str) + measures.append(measure) + except NameError: + print_error(f'Measure is unknown: {measure_str}') + unknown += 1 + except ValueError: + print_error(f'Measure is invalid: {measure_str}') + invalid += 1 + if invalid > 0 or unknown > 0: + reasons = [] + if invalid > 0: + reasons.append(f'{invalid} invalid') + if unknown > 0: + reasons.append(f'{unknown} unknown') + print_error(f'Measures could not be parsed: {", ".join(reasons)}') + exit(1) + print_success(f'Measures successfully parsed.', indent=False) + return measures + + +def evaluate( + measures: list, + qrels: List[Qrel], + run: List[ScoredDoc], +) -> Tuple[Dict[Measure, float], List[Metric]]: + print_info( + f'Evaluate run with measures: ' + f'{", ".join([str(m) for m in 
measures])}', + indent=False + ) + aggregate_metrics = ir_measures.calc_aggregate(measures, qrels, run) + query_metrics = list(ir_measures.iter_calc(measures, qrels, run)) + print_success(f'Run successfully evaluated.', indent=False) + return aggregate_metrics, query_metrics + + +def write_aggregated_prototext( + measure_metrics: Dict[Measure, float], + path: Path, +) -> None: + # Sort by measure name. + metrics = ( + (str(measure), value) + for measure, value in measure_metrics.items() + ) + metrics = sorted(metrics, key=lambda item: item[0]) + with path.open('wt') as file: + for index, (measure, value) in enumerate(metrics): + file.write( + f'measure {{\n' + f'\tkey: "{measure}"\n' + f'\tvalue: "{value}"\n' + f'}}\n' + ) + + +def write_per_query_prototext(metrics: List[Metric], path: Path) -> None: + numeric = all(metric.query_id.isnumeric() for metric in metrics) + # Sort by measure name. + metrics = sorted( + metrics, + key=lambda metric: str(metric.measure) + ) + # Sort by query id. + metrics = sorted( + list(metrics), + key=lambda metric: int(metric.query_id) if numeric else metric.query_id + ) + with path.open('wt') as file: + for index, metric in enumerate(metrics): + file.write( + f'measure {{\n' + f'\tquery_id: "{metric.query_id}"\n' + f'\tmeasure: "{metric.measure}"\n' + f'\tvalue: "{metric.value}"\n' + f'}}\n' + ) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Evaluate submissions with ir-measures.' + ) + parser.add_argument( + '--run', + type=Path, + help='Run in TREC format.', + required=True, + ) + parser.add_argument( + '--qrels', + type=Path, + help='Qrels in TREC format. ' + 'If no qrels are provided, only the run file is validated.', + required=False, + ) + parser.add_argument( + '--topics', + type=Path, + help='Topics in JSON-Lines format. 
' + 'If no topics are provided, only the run file is validated.', + required=False, + ) + parser.add_argument( + '--measures', + type=str, + nargs='+', + help='Measure(s) to evaluate.', + required=False, + ) + parser.add_argument( + '--output', + type=Path, + help='Output path for the prototext file with evaluation results.', + required=False, + ) + return parser.parse_args() + + +def _error_count(error_log: Dict[str, Dict[str, List[str]]]) -> int: + return sum(len(errors) for errors in error_log['errors'].values()) + + +def _warnings_count(error_log: Dict[str, Dict[str, List[str]]]) -> int: + return sum(len(warnings) for warnings in error_log['warnings'].values()) + + +def check_run_format(run_path: Path) -> None: + print_info(f'Check run file format.', indent=False) + run_error_log = check_run_file_content(run_path) + + if _error_count(run_error_log): + for key, value in run_error_log['errors'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'cols_more': + print_error( + f'More then 6 columns at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'cols_less': + print_error( + f'Fewer then 6 columns at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_num': + print_error( + f'Non-numeric scores at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'rank_num': + print_error( + f'Non-numeric ranks at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'tag_multi': + lines = [ + f'{int(line) - 1}≠{line}' + for line in value[:5] + ] + print_error( + f'Conflicting run tags at lines: ' + f'{", ".join(lines)} {more}' + ) + + if _warnings_count(run_error_log): + for key, value in run_error_log['warnings'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'tag_chars': + print_warning( + f'Run tags with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'qid_chars': + print_warning( + f'Query IDs with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'qid_asc': + lines = [ + f'{int(line) - 1}>{line}' + for line in value[:5] + ] + print_warning( + f'Query IDs not in ascending order at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'docid_chars': + print_warning( + f'Document IDs with special characters at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'ignored_col': + print_warning( + f'Ignored column is not "Q0" at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_science': + print_warning( + f'Score in scientific notation at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'score_tied': + lines = [ + f'{int(line) - 1}={line}' + for line in value[:5] + ] + print_warning( + f'Scores ties at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'score_desc': + lines = [ + f'{int(line) - 1}<{line}' + for line in value[:5] + ] + print_warning( + f'Scores not in descending order at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'rank_int': + print_warning( + f'Non-integer ranks at lines: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'rank_start': + print_warning(f'Ranks do not start at 0.') + if key == 'rank_tied': + lines = [ + f'{int(line) - 1}={line}' + for line in value[:5] + ] + print_warning( + f'Rank ties at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'rank_asc': + lines = [ + f'{int(line) - 1}>{line}' + for line in value[:5] + ] + print_warning( + f'Ranks not in ascending order at lines: ' + f'{", 
".join(lines)} {more}' + ) + if key == 'rank_consecutive': + lines = [ + f'{int(line) - 1}↛{line}' + for line in value[:5] + ] + print_warning( + f'Ranks not consecutive at lines: ' + f'{", ".join(lines)} {more}' + ) + if key == 'consistency': + lines = [ + f'{int(line) - 1}≷{line}' + for line in value[:5] + ] + print_warning( + f'Ranks and scores inconsistent at lines: ' + f'{", ".join(lines)} {more}' + ) + + if _error_count(run_error_log): + warn_str = '' + if _warnings_count(run_error_log): + warn_str = f', {_warnings_count(run_error_log)} warnings' + print_error( + f'Run file format is invalid: ' + f'{_error_count(run_error_log)} errors{warn_str}', + indent=False + ) + exit(1) + elif _warnings_count(run_error_log): + print_warning( + f'Run file format is valid: ' + f'{_warnings_count(run_error_log)} warnings', + indent=False + ) + else: + print_success(f'Run file format is valid.', indent=False) + + +def check_run_consistency( + run: List[ScoredDoc], + qrels: List[Qrel], + topics: List[dict], +) -> None: + print_info('Check run, qrels, and topics consistency.', indent=False) + consistency_error_log = check_consistency(run, qrels, topics) + if _warnings_count(consistency_error_log): + for key, value in consistency_error_log['warnings'].items(): + + more = f'(+{str(len(value) - 5)} more)' \ + if (len(value) - 5) > 0 else '' + + if key == 'run_qid_qrels': + print_warning( + f'Query IDs of run file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'run_docid_qrels': + print_warning( + f'Document IDs of run file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'topics_qid_run': + print_warning( + f'Query IDs of topics file not found in run file: ' + f'{", ".join(value[:5])} {more}' + ) + if key == 'topics_qid_qrels': + print_warning( + f'Query IDs of topics file not found in qrels file: ' + f'{", ".join(value[:5])} {more}' + ) + print_warning( + f'Run, qrels, and topics are inconsistent: ' + f'{_warnings_count(consistency_error_log)} warnings', + indent=False + ) + else: + print_success( + f'Run, qrels, and topics are consistent.', + indent=False + ) + + +def write_prototext( + aggregated: Dict[Measure, float], + per_query: List[Metric], + output_path: Path, +) -> None: + print_info('Export metrics.', indent=False) + write_aggregated_prototext( + aggregated, + output_path / "evaluation.prototext" + ) + write_per_query_prototext( + per_query, + output_path / "evaluation-per-query.prototext" + ) + print_success('Metrics successfully exported.', indent=False) + + +def main(): + # Parse command line arguments. + args = parse_args() + + # Check run file. + run_path = args.run + if run_path is not None: + # Check and load run. + check_file_path(run_path, 'run') + check_run_format(run_path) + run = load_run(run_path) + else: + run = None + + # Check qrels. + qrels_path = args.qrels + if qrels_path is not None: + # Check and load qrels. + check_file_path(qrels_path, 'qrels') + qrels = load_qrels(qrels_path) + else: + qrels = None + + # Check topic. + topics_path = args.topics + if topics_path is not None: + # Check and load topics. + check_file_path(topics_path, 'topics') + topics = load_topics(topics_path) + else: + topics = None + + # Check measures. + measures_str = args.measures + if measures_str is not None: + # Check and load measures. + measures = parse_measure_args(measures_str) + else: + measures = None + + # Check output path. 
+ output_path = args.output + if output_path is not None: + check_output_path(output_path, 'output') + + # Shortcuts for early exit. + if run is None: + print_error('Unable to validate without run file.', indent=False) + exit(1) + + if qrels is None or topics is None: + if qrels is not None: + # Must have qrels and topics or neither. + print_error( + 'Consistency check without topics file is not allowed.', + indent=False + ) + exit(1) + if topics is not None: + # Must have qrels and topics or neither. + print_error( + 'Consistency check without qrels file is not allowed.', + indent=False + ) + exit(1) + missing = [] + if qrels is None: + missing.append('qrels') + if topics is None: + missing.append('topics') + if measures is not None: + print_error( + f'Measuring without {" and ".join(missing)} files ' + f'is not allowed.', + indent=False + ) + exit(1) + if output_path is not None: + print_error( + f'Exporting metrics without {" and ".join(missing)} files ' + f'is not allowed.', + indent=False + ) + exit(1) + # Only check run format, no consistency check. + return + + # Check run, qrels, and topics consistency. + check_run_consistency(run, qrels, topics) + + # Shortcuts for early exit. + if measures is None: + if output_path is not None: + print_error( + 'Exporting metrics without measures is not allowed.', + indent=False + ) + exit(1) + # Only consistency check, no measurement. + return + + aggregated, per_query = evaluate(measures, qrels, run) + + # Shortcuts for early exit. + if output_path is None: + # Only measure, no writing to output. + return + + write_prototext(aggregated, per_query, output_path) + + +if __name__ == '__main__': + main() diff --git a/ir-measures/output/evaluation-per-query.prototext b/ir-measures/output/evaluation-per-query.prototext new file mode 100644 index 0000000..b77d315 --- /dev/null +++ b/ir-measures/output/evaluation-per-query.prototext @@ -0,0 +1,960 @@ +measure { + query_id: "51" + measure: "AP" + value: "0.08431784264198276" +} +measure { + query_id: "51" + measure: "ERR@20" + value: "0.16736" +} +measure { + query_id: "51" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "51" + measure: "nDCG@20" + value: "0.31366546266699225" +} +measure { + query_id: "52" + measure: "AP" + value: "0.31008322890753487" +} +measure { + query_id: "52" + measure: "ERR@20" + value: "0.64243" +} +measure { + query_id: "52" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "52" + measure: "nDCG@20" + value: "0.6397676289506087" +} +measure { + query_id: "53" + measure: "AP" + value: "0.006766846052119314" +} +measure { + query_id: "53" + measure: "ERR@20" + value: "0.01042" +} +measure { + query_id: "53" + measure: "P@20" + value: "0.05" +} +measure { + query_id: "53" + measure: "nDCG@20" + value: "0.025474423234207066" +} +measure { + query_id: "54" + measure: "AP" + value: "0.08239186557890366" +} +measure { + query_id: "54" + measure: "ERR@20" + value: "0.04318" +} +measure { + query_id: "54" + measure: "P@20" + value: "0.15" +} +measure { + query_id: "54" + measure: "nDCG@20" + value: "0.09439235986160549" +} +measure { + query_id: "55" + measure: "AP" + value: "0.025442795139649605" +} +measure { + query_id: "55" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "55" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "55" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "56" + measure: "AP" + value: "0.014797358670446365" +} +measure { + query_id: "56" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "56" 
+ measure: "P@20" + value: "0.0" +} +measure { + query_id: "56" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "57" + measure: "AP" + value: "0.08271814470215763" +} +measure { + query_id: "57" + measure: "ERR@20" + value: "0.18104" +} +measure { + query_id: "57" + measure: "P@20" + value: "1.0" +} +measure { + query_id: "57" + measure: "nDCG@20" + value: "0.37814263992105696" +} +measure { + query_id: "58" + measure: "AP" + value: "0.047981620592769274" +} +measure { + query_id: "58" + measure: "ERR@20" + value: "0.2002" +} +measure { + query_id: "58" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "58" + measure: "nDCG@20" + value: "0.15470717233038872" +} +measure { + query_id: "59" + measure: "AP" + value: "0.000998727734206654" +} +measure { + query_id: "59" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "59" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "59" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "60" + measure: "AP" + value: "0.12562945326365674" +} +measure { + query_id: "60" + measure: "ERR@20" + value: "0.07527" +} +measure { + query_id: "60" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "60" + measure: "nDCG@20" + value: "0.2996408129012732" +} +measure { + query_id: "61" + measure: "AP" + value: "0.04222096080018801" +} +measure { + query_id: "61" + measure: "ERR@20" + value: "0.0227" +} +measure { + query_id: "61" + measure: "P@20" + value: "0.25" +} +measure { + query_id: "61" + measure: "nDCG@20" + value: "0.11833938829847222" +} +measure { + query_id: "62" + measure: "AP" + value: "0.33499884410457975" +} +measure { + query_id: "62" + measure: "ERR@20" + value: "0.11955" +} +measure { + query_id: "62" + measure: "P@20" + value: "0.7" +} +measure { + query_id: "62" + measure: "nDCG@20" + value: "0.46481395345418863" +} +measure { + query_id: "63" + measure: "AP" + value: "0.09440786331475708" +} +measure { + query_id: "63" + measure: "ERR@20" + value: "0.07744" +} +measure { + query_id: "63" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "63" + measure: "nDCG@20" + value: "0.32402552769835197" +} +measure { + query_id: "64" + measure: "AP" + value: "0.08685313627059385" +} +measure { + query_id: "64" + measure: "ERR@20" + value: "0.21479" +} +measure { + query_id: "64" + measure: "P@20" + value: "0.45" +} +measure { + query_id: "64" + measure: "nDCG@20" + value: "0.35944389019892176" +} +measure { + query_id: "65" + measure: "AP" + value: "0.20467862651522445" +} +measure { + query_id: "65" + measure: "ERR@20" + value: "0.12497" +} +measure { + query_id: "65" + measure: "P@20" + value: "0.95" +} +measure { + query_id: "65" + measure: "nDCG@20" + value: "0.41366710335886037" +} +measure { + query_id: "66" + measure: "AP" + value: "0.014383592976393763" +} +measure { + query_id: "66" + measure: "ERR@20" + value: "0.01158" +} +measure { + query_id: "66" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "66" + measure: "nDCG@20" + value: "0.04742937553647769" +} +measure { + query_id: "67" + measure: "AP" + value: "0.1523541663660107" +} +measure { + query_id: "67" + measure: "ERR@20" + value: "0.25877" +} +measure { + query_id: "67" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "67" + measure: "nDCG@20" + value: "0.5324232213504778" +} +measure { + query_id: "68" + measure: "AP" + value: "0.0539546705439429" +} +measure { + query_id: "68" + measure: "ERR@20" + value: "0.07295" +} +measure { + query_id: "68" + measure: "P@20" + value: "0.2" +} +measure { + 
query_id: "68" + measure: "nDCG@20" + value: "0.18905274852308146" +} +measure { + query_id: "69" + measure: "AP" + value: "0.04092753746619256" +} +measure { + query_id: "69" + measure: "ERR@20" + value: "0.11001" +} +measure { + query_id: "69" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "69" + measure: "nDCG@20" + value: "0.22456039597469996" +} +measure { + query_id: "70" + measure: "AP" + value: "0.0" +} +measure { + query_id: "70" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "70" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "70" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "71" + measure: "AP" + value: "0.009362496897326355" +} +measure { + query_id: "71" + measure: "ERR@20" + value: "0.01232" +} +measure { + query_id: "71" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "71" + measure: "nDCG@20" + value: "0.037256819695418365" +} +measure { + query_id: "72" + measure: "AP" + value: "0.001789027566111398" +} +measure { + query_id: "72" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "72" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "72" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "73" + measure: "AP" + value: "0.09857345907915036" +} +measure { + query_id: "73" + measure: "ERR@20" + value: "0.16964" +} +measure { + query_id: "73" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "73" + measure: "nDCG@20" + value: "0.42288861454588955" +} +measure { + query_id: "74" + measure: "AP" + value: "0.0031613061126710886" +} +measure { + query_id: "74" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "74" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "74" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "75" + measure: "AP" + value: "0.1853848101935964" +} +measure { + query_id: "75" + measure: "ERR@20" + value: "0.08819" +} +measure { + query_id: "75" + measure: "P@20" + value: "0.7" +} +measure { + query_id: "75" + measure: "nDCG@20" + value: "0.28892416616087263" +} +measure { + query_id: "76" + measure: "AP" + value: "0.1596744031456364" +} +measure { + query_id: "76" + measure: "ERR@20" + value: "0.18655" +} +measure { + query_id: "76" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "76" + measure: "nDCG@20" + value: "0.39505803833045194" +} +measure { + query_id: "77" + measure: "AP" + value: "0.05916086544499962" +} +measure { + query_id: "77" + measure: "ERR@20" + value: "0.17976" +} +measure { + query_id: "77" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "77" + measure: "nDCG@20" + value: "0.1823684709075654" +} +measure { + query_id: "78" + measure: "AP" + value: "0.07309254260205446" +} +measure { + query_id: "78" + measure: "ERR@20" + value: "0.12988" +} +measure { + query_id: "78" + measure: "P@20" + value: "0.25" +} +measure { + query_id: "78" + measure: "nDCG@20" + value: "0.27305922100120666" +} +measure { + query_id: "79" + measure: "AP" + value: "0.1660727051407492" +} +measure { + query_id: "79" + measure: "ERR@20" + value: "0.15387" +} +measure { + query_id: "79" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "79" + measure: "nDCG@20" + value: "0.44242395871513723" +} +measure { + query_id: "80" + measure: "AP" + value: "0.10493115399768116" +} +measure { + query_id: "80" + measure: "ERR@20" + value: "0.1751" +} +measure { + query_id: "80" + measure: "P@20" + value: "0.95" +} +measure { + query_id: "80" + measure: "nDCG@20" + value: "0.9665625086152866" +} +measure { + 
query_id: "81" + measure: "AP" + value: "0.01715890156076979" +} +measure { + query_id: "81" + measure: "ERR@20" + value: "0.03841" +} +measure { + query_id: "81" + measure: "P@20" + value: "0.1" +} +measure { + query_id: "81" + measure: "nDCG@20" + value: "0.08885921609572776" +} +measure { + query_id: "82" + measure: "AP" + value: "0.14060018745496566" +} +measure { + query_id: "82" + measure: "ERR@20" + value: "0.0158" +} +measure { + query_id: "82" + measure: "P@20" + value: "0.15" +} +measure { + query_id: "82" + measure: "nDCG@20" + value: "0.05286949541504003" +} +measure { + query_id: "83" + measure: "AP" + value: "0.0032003192840618978" +} +measure { + query_id: "83" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "83" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "83" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "84" + measure: "AP" + value: "0.26403019394946936" +} +measure { + query_id: "84" + measure: "ERR@20" + value: "0.37726" +} +measure { + query_id: "84" + measure: "P@20" + value: "1.0" +} +measure { + query_id: "84" + measure: "nDCG@20" + value: "0.8234510385252299" +} +measure { + query_id: "85" + measure: "AP" + value: "0.04984746146436076" +} +measure { + query_id: "85" + measure: "ERR@20" + value: "0.13017" +} +measure { + query_id: "85" + measure: "P@20" + value: "0.35" +} +measure { + query_id: "85" + measure: "nDCG@20" + value: "0.2072412325901197" +} +measure { + query_id: "86" + measure: "AP" + value: "0.38665923711198213" +} +measure { + query_id: "86" + measure: "ERR@20" + value: "0.57431" +} +measure { + query_id: "86" + measure: "P@20" + value: "0.9" +} +measure { + query_id: "86" + measure: "nDCG@20" + value: "0.9357498985251277" +} +measure { + query_id: "87" + measure: "AP" + value: "0.013672391980620615" +} +measure { + query_id: "87" + measure: "ERR@20" + value: "0.0125" +} +measure { + query_id: "87" + measure: "P@20" + value: "0.05" +} +measure { + query_id: "87" + measure: "nDCG@20" + value: "0.04218127244872386" +} +measure { + query_id: "88" + measure: "AP" + value: "0.07687099066731841" +} +measure { + query_id: "88" + measure: "ERR@20" + value: "0.07895" +} +measure { + query_id: "88" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "88" + measure: "nDCG@20" + value: "0.29983243845001567" +} +measure { + query_id: "89" + measure: "AP" + value: "0.20466545770487468" +} +measure { + query_id: "89" + measure: "ERR@20" + value: "0.11247" +} +measure { + query_id: "89" + measure: "P@20" + value: "0.75" +} +measure { + query_id: "89" + measure: "nDCG@20" + value: "0.2343846571103149" +} +measure { + query_id: "90" + measure: "AP" + value: "0.1296490231661245" +} +measure { + query_id: "90" + measure: "ERR@20" + value: "0.07679" +} +measure { + query_id: "90" + measure: "P@20" + value: "0.55" +} +measure { + query_id: "90" + measure: "nDCG@20" + value: "0.3285795110302251" +} +measure { + query_id: "91" + measure: "AP" + value: "0.0902393945671462" +} +measure { + query_id: "91" + measure: "ERR@20" + value: "0.09979" +} +measure { + query_id: "91" + measure: "P@20" + value: "0.45" +} +measure { + query_id: "91" + measure: "nDCG@20" + value: "0.25477116590093857" +} +measure { + query_id: "92" + measure: "AP" + value: "0.00030078355911961535" +} +measure { + query_id: "92" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "92" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "92" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "93" + measure: "AP" + value: 
"0.2061604493260041" +} +measure { + query_id: "93" + measure: "ERR@20" + value: "0.16062" +} +measure { + query_id: "93" + measure: "P@20" + value: "0.4" +} +measure { + query_id: "93" + measure: "nDCG@20" + value: "0.3265855299470895" +} +measure { + query_id: "94" + measure: "AP" + value: "5.2742616033755275e-05" +} +measure { + query_id: "94" + measure: "ERR@20" + value: "0.0" +} +measure { + query_id: "94" + measure: "P@20" + value: "0.0" +} +measure { + query_id: "94" + measure: "nDCG@20" + value: "0.0" +} +measure { + query_id: "96" + measure: "AP" + value: "0.15053669942589204" +} +measure { + query_id: "96" + measure: "ERR@20" + value: "0.16757" +} +measure { + query_id: "96" + measure: "P@20" + value: "0.6" +} +measure { + query_id: "96" + measure: "nDCG@20" + value: "0.4710935901006018" +} +measure { + query_id: "97" + measure: "AP" + value: "0.033941105876571405" +} +measure { + query_id: "97" + measure: "ERR@20" + value: "0.05199" +} +measure { + query_id: "97" + measure: "P@20" + value: "0.2" +} +measure { + query_id: "97" + measure: "nDCG@20" + value: "0.11506784902368167" +} +measure { + query_id: "98" + measure: "AP" + value: "0.11126690348306295" +} +measure { + query_id: "98" + measure: "ERR@20" + value: "0.1414" +} +measure { + query_id: "98" + measure: "P@20" + value: "0.65" +} +measure { + query_id: "98" + measure: "nDCG@20" + value: "0.34040441310101244" +} +measure { + query_id: "99" + measure: "AP" + value: "0.17954069638973388" +} +measure { + query_id: "99" + measure: "ERR@20" + value: "0.07698" +} +measure { + query_id: "99" + measure: "P@20" + value: "0.65" +} +measure { + query_id: "99" + measure: "nDCG@20" + value: "0.350016617656165" +} diff --git a/ir-measures/output/evaluation.prototext b/ir-measures/output/evaluation.prototext new file mode 100644 index 0000000..837b9fb --- /dev/null +++ b/ir-measures/output/evaluation.prototext @@ -0,0 +1,16 @@ +measure { + key: "AP" + value: "0.0984479789876958" +} +measure { + key: "ERR@20" + value: "0.11547874999999996" +} +measure { + key: "P@20" + value: "0.3895833333333332" +} +measure { + key: "nDCG@20" + value: "0.2595661630864897" +} diff --git a/ir-measures/tests/__init__.py b/ir-measures/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/__pycache__/__init__.cpython-310.pyc b/ir-measures/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..1957981 Binary files /dev/null and b/ir-measures/tests/__pycache__/__init__.cpython-310.pyc differ diff --git a/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc b/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc new file mode 100644 index 0000000..a03eae5 Binary files /dev/null and b/ir-measures/tests/__pycache__/test_with_approvals.cpython-310-pytest-7.2.0.pyc differ diff --git a/ir-measures/tests/approvaltests_config.json b/ir-measures/tests/approvaltests_config.json new file mode 100644 index 0000000..e8126f5 --- /dev/null +++ b/ir-measures/tests/approvaltests_config.json @@ -0,0 +1,3 @@ +{ + "subdirectory": "approved_files" +} \ No newline at end of file diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt new file mode 100644 index 0000000..27821a3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_all_valid.approved.txt @@ -0,0 +1,52 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run 
path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. +ℹ Export metrics. +✓ Metrics successfully exported. + + +#### +files: ['test-output/evaluation-per-query.prototext', 'test-output/evaluation.prototext'] + + +####test-output/evaluation-per-query.prototext +measure { + query_id: "1" + measure: "P@2" + value: "1.0" +} +measure { + query_id: "1" + measure: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output/evaluation.prototext +measure { + key: "P@2" + value: "1.0" +} +measure { + key: "nDCG@2" + value: "0.6666666666666667" +} diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt new file mode 100644 index 0000000..68e1764 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_document_ids_inconsistent_run_qrels.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_warning_docid_not_in_qrels.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Document IDs of run file not found in qrels file: 9 +⚠ Run, qrels, and topics are inconsistent: 1 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt new file mode 100644 index 0000000..b936341 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_invalid.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. 
+ℹ Parse measures: P@X + ✗ Measure is invalid: P@X + ✗ Measures could not be parsed: 1 invalid + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt new file mode 100644 index 0000000..a9d1a00 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_unknown.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: FOOBAR + ✗ Measure is unknown: FOOBAR + ✗ Measures could not be parsed: 1 unknown + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt new file mode 100644 index 0000000..c1cabd6 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid.approved.txt @@ -0,0 +1,24 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2 +✓ Run successfully evaluated. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt new file mode 100644 index 0000000..ccf205c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_qrels.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +✗ Consistency check without qrels file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt new file mode 100644 index 0000000..e46e28f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measure_valid_no_topics.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. 
+ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Parse measures: P@2 +✓ Measures successfully parsed. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt new file mode 100644 index 0000000..2444a03 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_measures_valid.approved.txt @@ -0,0 +1,24 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt new file mode 100644 index 0000000..0fc60c3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_dir_not_empty.approved.txt @@ -0,0 +1,56 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output-not-empty + ⚠ Output directory is not empty. +✓ Output path is valid: 1 warning +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +ℹ Evaluate run with measures: P@2, nDCG@2 +✓ Run successfully evaluated. +ℹ Export metrics. +✓ Metrics successfully exported. + + +#### +files: ['test-output-not-empty/evaluation-per-query.prototext', 'test-output-not-empty/evaluation.prototext', 'test-output-not-empty/file.txt'] + + +####test-output-not-empty/evaluation-per-query.prototext +measure { + query_id: "1" + measure: "P@2" + value: "1.0" +} +measure { + query_id: "1" + measure: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output-not-empty/evaluation.prototext +measure { + key: "P@2" + value: "1.0" +} +measure { + key: "nDCG@2" + value: "0.6666666666666667" +} + + +####test-output-not-empty/file.txt diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt new file mode 100644 index 0000000..1a428f5 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_is_file.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. 
+ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output-not-empty/file.txt + ✗ Output path is not a directory. +✗ Output path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt new file mode 100644 index 0000000..2f1d220 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_path_not_found.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: 42 + ✗ Output path does not exist. +✗ Output path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt new file mode 100644 index 0000000..c5b07e2 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_measures.approved.txt @@ -0,0 +1,23 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check output path: test-output +✓ Output path is valid. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. +✗ Exporting metrics without measures is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt new file mode 100644 index 0000000..31b1004 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_qrels.approved.txt @@ -0,0 +1,19 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +✗ Consistency check without qrels file is not allowed. 
+ + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt new file mode 100644 index 0000000..383199c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_output_valid_no_topics.approved.txt @@ -0,0 +1,19 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Parse measures: P@2, nDCG@2 +✓ Measures successfully parsed. +ℹ Check output path: test-output +✓ Output path is valid. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt new file mode 100644 index 0000000..11a310c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_file_empty.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/empty_file.txt + ✗ Qrels file is empty. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt new file mode 100644 index 0000000..9de4b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_is_dir.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input + ✗ Qrels path is not a file. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt new file mode 100644 index 0000000..fb91d52 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_path_not_found.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: bar + ✗ Qrels path does not exist. +✗ Qrels path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt new file mode 100644 index 0000000..c67c1df --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_topics_valid.approved.txt @@ -0,0 +1,20 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. 
+ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. +✓ Run, qrels, and topics are consistent. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt new file mode 100644 index 0000000..ec59681 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_qrels_valid_no_topics.approved.txt @@ -0,0 +1,15 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +✗ Consistency check without topics file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt new file mode 100644 index 0000000..41bbe1e --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_run_qrels.approved.txt @@ -0,0 +1,21 @@ +ℹ Check run path: test-input/run_sample_warning_qid_not_in_qrels.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Query IDs of run file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 1 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt new file mode 100644 index 0000000..04ec584 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_qrels.approved.txt @@ -0,0 +1,22 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_warning_qid_not_in_qrels.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. 
+ ⚠ Query IDs of topics file not found in run file: 2 + ⚠ Query IDs of topics file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 2 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt new file mode 100644 index 0000000..297637b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_query_ids_inconsistent_topics_run.approved.txt @@ -0,0 +1,22 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/topics_sample_warning_qid_not_in_run.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +ℹ Check run, qrels, and topics consistency. + ⚠ Query IDs of topics file not found in run file: 2 + ⚠ Query IDs of topics file not found in qrels file: 2 +⚠ Run, qrels, and topics are inconsistent: 2 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt new file mode 100644 index 0000000..b18b4f9 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_document_id_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_docid_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Document IDs with special characters at lines: 2 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt new file mode 100644 index 0000000..a9ba29c --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_fewer_columns.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_less_columns.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Fewer then 6 columns at lines: 2 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt new file mode 100644 index 0000000..1a3ce50 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_file_empty.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: test-input/empty_file.txt + ✗ Run file is empty. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt new file mode 100644 index 0000000..2fc565e --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_first_rank_not_zero.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_start_at_0.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ⚠ Ranks do not start at 0. +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt new file mode 100644 index 0000000..c3adb40 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_ignored_column_not_default.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_ignored_column_wrong.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ignored column is not "Q0" at lines: 2 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt new file mode 100644 index 0000000..2017b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_more_columns.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_more_columns.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ More then 6 columns at lines: 2 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt new file mode 100644 index 0000000..a179738 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_multiple_tags.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_invalid_multiple_tags.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Conflicting run tags at lines: 1≠2, 2≠3 +✗ Run file format is invalid: 2 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt new file mode 100644 index 0000000..2939b7b --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_is_dir.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: test-input + ✗ Run path is not a file. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt new file mode 100644 index 0000000..a2d9cce --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_path_not_found.approved.txt @@ -0,0 +1,7 @@ +ℹ Check run path: foo + ✗ Run path does not exist. +✗ Run path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt new file mode 100644 index 0000000..a6ab3ed --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_not_ascending.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_qid_not_asc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Query IDs not in ascending order at lines: 2>3 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. 
+✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt new file mode 100644 index 0000000..d209f69 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_query_id_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_qid_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Query IDs with special characters at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt new file mode 100644 index 0000000..bfe0591 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_ascending.approved.txt @@ -0,0 +1,13 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_asc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 3↛4, 4↛5 + ⚠ Ranks not in ascending order at lines: 4>5 + ⚠ Ranks and scores inconsistent at lines: 4≷5 +⚠ Run file format is valid: 4 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt new file mode 100644 index 0000000..ebd0404 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_consecutive.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_consecutive.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 4↛5 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt new file mode 100644 index 0000000..a6e761f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_integer.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_int.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Non-integer ranks at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt new file mode 100644 index 0000000..8949a41 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_not_numeric.approved.txt @@ -0,0 +1,10 @@ +ℹ Check run path: test-input/run_sample_warning_rank_not_num.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ✗ Non-numeric ranks at lines: 1, 2, 3, 4, 5 + ⚠ Non-integer ranks at lines: 1, 2, 3, 4, 5 +✗ Run file format is invalid: 5 errors, 5 warnings + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt new file mode 100644 index 0000000..5bb7dba --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_rank_ties.approved.txt @@ -0,0 +1,12 @@ +ℹ Check run path: test-input/run_sample_warning_rank_ties.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Rank ties at lines: 4=5 + ⚠ Ranks not consecutive at lines: 4↛5 +⚠ Run file format is valid: 2 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt new file mode 100644 index 0000000..2286079 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_descending.approved.txt @@ -0,0 +1,12 @@ +ℹ Check run path: test-input/run_sample_warning_score_not_desc.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Scores not in descending order at lines: 2<3 + ⚠ Ranks and scores inconsistent at lines: 2≷3 +⚠ Run file format is valid: 2 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt new file mode 100644 index 0000000..1233400 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_not_numeric.approved.txt @@ -0,0 +1,9 @@ +ℹ Check run path: test-input/run_sample_warning_score_not_num.txt +✓ Run path is valid. +ℹ Check run file format. + ✗ Non-numeric scores at lines: 5 +✗ Run file format is invalid: 1 errors + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt new file mode 100644 index 0000000..18a35f1 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_rank_inconsistent.approved.txt @@ -0,0 +1,14 @@ +ℹ Check run path: test-input/run_sample_warning_consistency.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Ranks not consecutive at lines: 1↛2, 2↛3, 3↛4 + ⚠ Ranks not in ascending order at lines: 2>3 + ⚠ Ranks and scores inconsistent at lines: 2≷3, 4≷5 + ⚠ Scores not in descending order at lines: 4<5 +⚠ Run file format is valid: 7 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt new file mode 100644 index 0000000..475cf76 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_scientific_notation.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_score_scientific.txt +✓ Run path is valid. +ℹ Check run file format. 
+ ⚠ Score in scientific notation at lines: 1 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt new file mode 100644 index 0000000..a317cef --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_score_ties.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_score_ties.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Scores ties at lines: 2=3 +⚠ Run file format is valid: 1 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt new file mode 100644 index 0000000..40ef954 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_tag_special_chars.approved.txt @@ -0,0 +1,11 @@ +ℹ Check run path: test-input/run_sample_warning_tag_special_chars.txt +✓ Run path is valid. +ℹ Check run file format. + ⚠ Run tags with special characters at lines: 1, 2, 3, 4, 5 +⚠ Run file format is valid: 5 warnings +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt new file mode 100644 index 0000000..07cd91f --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_run_valid.approved.txt @@ -0,0 +1,10 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt new file mode 100644 index 0000000..81dffb3 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_file_empty.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: test-input/empty_file.jsonl + ✗ Topics path does not exist. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt new file mode 100644 index 0000000..bdefadf --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_is_dir.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. 
+ℹ Check topics path: test-input + ✗ Topics path is not a file. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt new file mode 100644 index 0000000..136aafc --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_path_not_found.approved.txt @@ -0,0 +1,17 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check qrels path: test-input/qrels_sample_valid.txt +✓ Qrels path is valid. +ℹ Load qrels with ir-measures. +✓ Qrels successfully loaded. +ℹ Check topics path: baz + ✗ Topics path does not exist. +✗ Topics path is invalid: 1 error + + +#### +files: [] diff --git a/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt b/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt new file mode 100644 index 0000000..24ce5d1 --- /dev/null +++ b/ir-measures/tests/approved_files/test_with_approvals.test_topics_valid_no_qrels.approved.txt @@ -0,0 +1,15 @@ +ℹ Check run path: test-input/run_sample_valid.txt +✓ Run path is valid. +ℹ Check run file format. +✓ Run file format is valid. +ℹ Load run with ir-measures. +✓ Run successfully loaded. +ℹ Check topics path: test-input/topics_sample_valid.jsonl +✓ Topics path is valid. +ℹ Load topics. +✓ Topics successfully loaded. +✗ Consistency check without qrels file is not allowed. + + +#### +files: [] diff --git a/ir-measures/tests/test-io/test-input/empty_file.txt b/ir-measures/tests/test-io/test-input/empty_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt b/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt new file mode 100644 index 0000000..3bc9d94 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/qrels_sample_valid.txt @@ -0,0 +1,5 @@ +1 0 1 2 +1 0 2 2 +1 0 3 2 +1 0 4 3 +1 0 5 3 \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt new file mode 100644 index 0000000..794165c --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_invalid_less_columns.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt new file mode 100644 index 0000000..a49d72b --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_invalid_more_columns.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier 4815162342 +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt b/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt new file mode 100644 index 0000000..ef4b8fc --- /dev/null +++ 
b/ir-measures/tests/test-io/test-input/run_sample_invalid_multiple_tags.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pydolphin +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_valid.txt b/ir-measures/tests/test-io/test-input/run_sample_valid.txt new file mode 100644 index 0000000..8597673 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_valid.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt new file mode 100644 index 0000000..2c98be3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_consistency.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 2 3.352227701368739 pyterrier +1 Q0 3 1 3.292554298236954 pyterrier +1 Q0 4 3 3.251238969336898 pyterrier +1 Q0 5 4 3.260319364736074 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt new file mode 100644 index 0000000..1c5c3d3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_not_in_qrels.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 9 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt new file mode 100644 index 0000000..e4d95bc --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_docid_special_chars.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2% 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt new file mode 100644 index 0000000..7633888 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_ignored_column_wrong.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q1 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt new file mode 100644 index 0000000..ecb0625 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_asc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +2 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 5 3 3.260319364736074 pyterrier +1 Q0 4 4 3.251238969336898 pyterrier \ No newline at end of file diff --git 
a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt new file mode 100644 index 0000000..7b90606 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_not_in_qrels.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +2 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt new file mode 100644 index 0000000..b4d8a18 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_qid_special_chars.txt @@ -0,0 +1,5 @@ +1$ Q0 1 0 3.446771712469712 pyterrier +1$ Q0 2 1 3.352227701368739 pyterrier +1$ Q0 3 2 3.292554298236954 pyterrier +1$ Q0 4 3 3.260319364736074 pyterrier +1$ Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt new file mode 100644 index 0000000..e40a7f6 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_asc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 4 3.260319364736074 pyterrier +1 Q0 5 3 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt new file mode 100644 index 0000000..c496737 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_consecutive.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt new file mode 100644 index 0000000..ac109b1 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_int.txt @@ -0,0 +1,5 @@ +1 Q0 1 0.2 3.446771712469712 pyterrier +1 Q0 2 1.2 3.352227701368739 pyterrier +1 Q0 3 2.3 3.292554298236954 pyterrier +1 Q0 4 3.4 3.260319364736074 pyterrier +1 Q0 5 4.5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt new file mode 100644 index 0000000..e2468dd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_num.txt @@ -0,0 +1,5 @@ +1 Q0 1 A 3.446771712469712 pyterrier +1 Q0 2 B 3.352227701368739 pyterrier +1 Q0 3 C 3.292554298236954 pyterrier +1 Q0 4 D 3.260319364736074 pyterrier +1 Q0 5 E 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt new file mode 100644 index 0000000..fd75cf4 --- /dev/null +++ 
b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_not_start_at_0.txt @@ -0,0 +1,5 @@ +1 Q0 1 1 3.446771712469712 pyterrier +1 Q0 2 2 3.352227701368739 pyterrier +1 Q0 3 3 3.292554298236954 pyterrier +1 Q0 4 4 3.260319364736074 pyterrier +1 Q0 5 5 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt new file mode 100644 index 0000000..d9009ac --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_rank_ties.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 3 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt new file mode 100644 index 0000000..b4507b7 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_desc.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.292554298236954 pyterrier +1 Q0 3 2 3.352227701368739 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt new file mode 100644 index 0000000..e412c49 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_not_num.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898a pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt new file mode 100644 index 0000000..31330d3 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_scientific.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712e10 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.292554298236954 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt new file mode 100644 index 0000000..e06e6f1 --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_score_ties.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 pyterrier +1 Q0 2 1 3.352227701368739 pyterrier +1 Q0 3 2 3.352227701368739 pyterrier +1 Q0 4 3 3.260319364736074 pyterrier +1 Q0 5 4 3.251238969336898 pyterrier \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt b/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt new file mode 100644 index 0000000..52ee5fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/run_sample_warning_tag_special_chars.txt @@ -0,0 +1,5 @@ +1 Q0 1 0 3.446771712469712 py+terrier +1 Q0 2 1 3.352227701368739 py+terrier +1 Q0 3 2 3.292554298236954 py+terrier +1 Q0 4 3 3.260319364736074 py+terrier +1 Q0 5 4 3.251238969336898 py+terrier \ No newline at end of file diff --git 
a/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl new file mode 100644 index 0000000..c4fc25a --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_valid.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "1", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl new file mode 100644 index 0000000..d3ff1fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_qrels.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "2", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl new file mode 100644 index 0000000..d3ff1fd --- /dev/null +++ b/ir-measures/tests/test-io/test-input/topics_sample_warning_qid_not_in_run.jsonl @@ -0,0 +1,5 @@ +{"qid": "1", "query": "what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft ."} +{"qid": "1", "query": "what are the structural and aeroelastic problems associated with flight\nof high speed aircraft ."} +{"qid": "1", "query": "what problems of heat conduction in composite slabs have been solved so\nfar ."} +{"qid": "1", "query": "can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium ."} +{"qid": "2", "query": "what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems ."} \ No newline at end of file diff --git a/ir-measures/tests/test-io/test-output-not-empty/file.txt b/ir-measures/tests/test-io/test-output-not-empty/file.txt new file mode 100644 index 0000000..e69de29 diff --git a/ir-measures/tests/test_with_approvals.py b/ir-measures/tests/test_with_approvals.py new file mode 100644 index 0000000..d534805 --- /dev/null +++ b/ir-measures/tests/test_with_approvals.py @@ -0,0 +1,478 @@ +import io +import os +import sys +from 
contextlib import redirect_stdout
+from pathlib import Path
+from shutil import copytree
+from tempfile import TemporaryDirectory
+from typing import List
+
+from approvaltests import set_default_reporter, DiffReporter
+from approvaltests.approvals import verify
+from pytest import raises
+
+from ir_measures_evaluator import main
+
+_TEST_IO_DIR = Path(__file__).parent / 'test-io'
+
+
+def setup_module():
+    set_default_reporter(DiffReporter())
+
+
+def run_capture_stdout_files(
+        argv: List[str],
+        exit_normal: bool,
+        output_dir: str = 'test-output',
+):
+    buffer = io.StringIO()
+    captured_files = ''
+    with TemporaryDirectory() as temp_dir:
+        tmp_path = Path(temp_dir) / 'test-io'
+        # Copy test_io to temp_dir
+        copytree(_TEST_IO_DIR, tmp_path)
+        # Change to temp_dir
+        os.chdir(tmp_path)
+        # Override sys.argv
+        sys.argv = ['', *argv]
+        with redirect_stdout(buffer):
+            if exit_normal:
+                main()
+            else:
+                with raises(SystemExit):
+                    main()
+        # List files in temp output dir
+        tmp_out_path = tmp_path / output_dir
+        files = sorted(tmp_out_path.glob('**/*'))
+        filenames = [
+            str(file.relative_to(tmp_path)) for file in files
+        ]
+        captured_files += f'\n\n####\nfiles: {filenames}\n'
+        for file in files:
+            if not file.is_file():
+                continue
+            captured_files += f'\n\n####{file.relative_to(tmp_path)}\n' + \
+                open(file).read()
+    return buffer.getvalue() + captured_files
+
+
+def _run_capture_stdout_files_fail(
+        argv: List[str],
+        output_dir: str = 'test-output',
+):
+    return run_capture_stdout_files(argv, False, output_dir)
+
+
+def _run_capture_stdout_files_pass(
+        argv: List[str],
+        output_dir: str = 'test-output',
+):
+    return run_capture_stdout_files(argv, True, output_dir)
+
+
+def test_run_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'foo',
+    ])
+    verify(actual)
+
+
+def test_run_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input',
+    ])
+    verify(actual)
+
+
+def test_run_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/empty_file.txt',
+    ])
+    verify(actual)
+
+
+def test_run_fewer_columns():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_less_columns.txt',
+    ])
+    verify(actual)
+
+
+def test_run_more_columns():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_more_columns.txt',
+    ])
+    verify(actual)
+
+
+def test_run_multiple_tags():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_invalid_multiple_tags.txt',
+    ])
+    verify(actual)
+
+
+def test_run_tag_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_tag_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_query_id_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_query_id_not_ascending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_not_asc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_document_id_special_chars():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_docid_special_chars.txt',
+    ])
+    verify(actual)
+
+
+def test_run_ignored_column_not_default():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_ignored_column_wrong.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_not_numeric():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_warning_score_not_num.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_scientific_notation():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_scientific.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_ties():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_ties.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_not_descending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_score_not_desc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_numeric():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_warning_rank_not_num.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_integer():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_int.txt',
+    ])
+    verify(actual)
+
+
+def test_run_first_rank_not_zero():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_start_at_0.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_ties():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_ties.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_ascending():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_asc.txt',
+    ])
+    verify(actual)
+
+
+def test_run_rank_not_consecutive():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_rank_not_consecutive.txt',
+    ])
+    verify(actual)
+
+
+def test_run_score_rank_inconsistent():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_consistency.txt',
+    ])
+    verify(actual)
+
+
+def test_run_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_qrels_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'bar',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/empty_file.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_topics_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'baz',
+    ])
+    verify(actual)
+
+
+def test_topics_path_is_dir():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input',
+    ])
+    verify(actual)
+
+
+def test_topics_file_empty():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/empty_file.jsonl',
+    ])
+    verify(actual)
+
+
+def test_topics_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_qrels_topics_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_run_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_qid_not_in_qrels.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_document_ids_inconsistent_run_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_warning_docid_not_in_qrels.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_topics_run():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_warning_qid_not_in_run.jsonl',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_query_ids_inconsistent_topics_qrels():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_warning_qid_not_in_qrels.jsonl',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+    ])
+    verify(actual)
+
+
+def test_measure_unknown():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'FOOBAR',
+    ])
+    verify(actual)
+
+
+def test_measure_invalid():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@X',
+    ])
+    verify(actual)
+
+
+def test_measure_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measure_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measure_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2',
+    ])
+    verify(actual)
+
+
+def test_measures_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+    ])
+    verify(actual)
+
+
+def test_output_path_not_found():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', '42',
+    ])
+    verify(actual)
+
+
+def test_output_path_is_file():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output-not-empty/file.txt',
+    ])
+    verify(actual)
+
+
+def test_output_dir_not_empty():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output-not-empty',
+    ], output_dir='test-output-not-empty')
+    verify(actual)
+
+
+def test_output_valid_no_qrels():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_output_valid_no_topics():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_output_valid_no_measures():
+    actual = _run_capture_stdout_files_fail([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--output', 'test-output',
+    ])
+    verify(actual)
+
+
+def test_all_valid():
+    actual = _run_capture_stdout_files_pass([
+        '--run', 'test-input/run_sample_valid.txt',
+        '--qrels', 'test-input/qrels_sample_valid.txt',
+        '--topics', 'test-input/topics_sample_valid.jsonl',
+        '--measures', 'P@2', 'nDCG@2',
+        '--output', 'test-output',
+    ])
+    verify(actual)
diff --git a/reproducibility-experiments/README.md b/reproducibility-experiments/README.md
new file mode 100644
index 0000000..03b94ce
--- /dev/null
+++ b/reproducibility-experiments/README.md
@@ -0,0 +1,4 @@
+# Examples of Reproducibility Experiments
+
+We will add the Jupyter notebooks for the reproducibility experiments within the next two days; at the moment, we are cleaning and documenting the code.
+