diff --git a/DECISIONS.md b/DECISIONS.md index 4e619c9c..ebb1045e 100644 --- a/DECISIONS.md +++ b/DECISIONS.md @@ -1,5 +1,7 @@ # Decision log +## Initial design decisions - v0.1-0.4 + 1. To get started quickly, we will start with an AWS Elasticsearch managed cluster running in the cloud. We may wish to run our own cluster in the future. @@ -32,3 +34,12 @@ results, and we are only seeking feature-parity with the classic system. When we address hit highlighting, we can show matching author names deep in author list to provide visual feedback to the user. + +## Subsequent decisions + +- 2018-12-18. Removing cross-list functionality in v0.1 was a regression. Users + expect to be able to search by cross-list category just like primary + category. We decided to include cross-list/secondary category in the + all-fields search, add a cross-list field to the advanced search interface, + and include cross-list classification in shortcut routes and the advanced + interface's classification filter (with an option to exclude). diff --git a/Dockerfile b/Dockerfile index 9ddc3a46..ab3adbec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,12 +3,14 @@ # Defines the runtime for the arXiv search service, which provides the main # UIs (and, eventually, APIs) for search. -FROM arxiv/base:0.6.1 +FROM arxiv/base:0.12.1 WORKDIR /opt/arxiv +# Install MySQL. +RUN yum install -y which mysql mysql-devel + # Add Python application and configuration. -ADD requirements/prod.txt /opt/arxiv/requirements.txt ADD app.py /opt/arxiv/ ADD Pipfile /opt/arxiv/ ADD Pipfile.lock /opt/arxiv/ diff --git a/Dockerfile-agent b/Dockerfile-agent index 12e8d369..ebea1c2b 100644 --- a/Dockerfile-agent +++ b/Dockerfile-agent @@ -4,7 +4,7 @@ # article metadata becomes available. Subscribes to a Kinesis stream for # notifications about new metadata. 
-FROM arxiv/search:0.4 +FROM arxiv/search:0.5.1 WORKDIR /opt/arxiv diff --git a/Dockerfile-api b/Dockerfile-api new file mode 100644 index 00000000..edd4c7ea --- /dev/null +++ b/Dockerfile-api @@ -0,0 +1,59 @@ +# arxiv/search-api +# +# Defines the runtime for the arXiv search API, which provides a metadata +# query API backed by Elasticsearch. + +FROM arxiv/base:0.12.1 + +WORKDIR /opt/arxiv + +# Install MySQL. +RUN yum install -y which mysql mysql-devel + +# Add Python application and configuration. +ADD app.py /opt/arxiv/ +ADD Pipfile /opt/arxiv/ +ADD Pipfile.lock /opt/arxiv/ +RUN pip install -U pip pipenv +RUN pipenv install + +ENV PATH "/opt/arxiv:${PATH}" + +ADD schema /opt/arxiv/schema +ADD mappings /opt/arxiv/mappings +ADD search /opt/arxiv/search +ADD wsgi-api.py /opt/arxiv/wsgi.py +RUN pip install uwsgi + +ADD bin/start_search.sh /opt/arxiv/ +RUN chmod +x /opt/arxiv/start_search.sh + +ENV LC_ALL en_US.utf8 +ENV LANG en_US.utf8 +ENV LOGLEVEL 40 +ENV FLASK_DEBUG 1 +ENV FLASK_APP /opt/arxiv/app.py + +ENV ELASTICSEARCH_SERVICE_HOST 127.0.0.1 +ENV ELASTICSEARCH_SERVICE_PORT 9200 +ENV ELASTICSEARCH_PORT_9200_PROTO http +ENV ELASTICSEARCH_INDEX arxiv +ENV ELASTICSEARCH_USER elastic +ENV ELASTICSEARCH_PASSWORD changeme +ENV METADATA_ENDPOINT https://arxiv.org/docmeta_bulk/ + +EXPOSE 8000 + +#CMD /bin/bash +ENTRYPOINT ["/opt/arxiv/start_search.sh"] +CMD ["--http-socket", ":8000", \ + "-M", \ + "-t 3000", \ + "--manage-script-name", \ + "--processes", "8", \ + "--threads", "1", \ + "--async", "100", \ + "--ugreen", \ + "--buffer-size", "65535", \ + "--mount", "/metadata=wsgi.py", \ + "--logformat", "%(addr) %(addr) - %(user_id)|%(session_id) [%(rtime)] [%(uagent)] \"%(method) %(uri) %(proto)\" %(status) %(size) %(micros) %(ttfb)"] diff --git a/Dockerfile-index b/Dockerfile-index index 834dd631..41e9d4b8 100644 --- a/Dockerfile-index +++ b/Dockerfile-index @@ -16,20 +16,9 @@ # # See also ELASTICSEARCH_* and METADATA_ENDPOINT parameters, below. 
-FROM arxiv/base - -# Add Python consumer and configuration. -ADD requirements/prod.txt /opt/arxiv/requirements.txt -ADD app.py /opt/arxiv/ -RUN pip install -U pip -RUN pip install -r /opt/arxiv/requirements.txt +FROM arxiv/search:0.5.1 ENV PATH "/opt/arxiv:${PATH}" - -ADD schema /opt/arxiv/schema -ADD mappings /opt/arxiv/mappings -ADD search /opt/arxiv/search -ADD tests /opt/arxiv/tests ADD bulk_index.py /opt/arxiv/ WORKDIR /opt/arxiv/ diff --git a/Pipfile b/Pipfile index 0b4c7b5f..4fb12cf6 100644 --- a/Pipfile +++ b/Pipfile @@ -1,13 +1,11 @@ [[source]] - url = "https://pypi.python.org/simple" verify_ssl = true name = "pypi" - [packages] - -arxiv-base = "==0.6.1" +arxiv-auth = "==0.2.3" +arxiv-base = "==0.12.1" boto = "==2.48.0" "boto3" = "==1.6.6" botocore = "==1.9.6" @@ -18,8 +16,8 @@ coverage = "==4.4.2" dataclasses = "==0.4" docutils = "==0.14" elasticsearch = "==6.2.0" -elasticsearch-dsl = "==6.1.0" -flask = "==0.12.2" +elasticsearch-dsl = "==6.3.1" +flask = "==1.0.2" "flask-s3" = "==0.3.3" idna = "==2.6" ipaddress = "==1.0.19" @@ -40,18 +38,20 @@ pyflakes = "==1.6.0" pylama = "==7.4.3" python-dateutil = "==2.6.1" pytz = "==2017.3" -requests = "==2.18.4" +requests = "==2.20.0" "s3transfer" = "==0.1.13" snowballstemmer = "==1.2.1" thrift = "==0.11.0" thrift-connector = "==0.23" typed-ast = "==1.1.0" "urllib3" = "==1.22" -werkzeug = "==0.13" +werkzeug = "==0.14.1" wtforms = "==2.1" bleach = "*" - +lxml = "*" [dev-packages] - coveralls = "*" +sphinx = "*" +sphinxcontrib-websupport = "*" +sphinx-autodoc-typehints = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6b6a0b31..d1fe2f73 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "26b9ee6a01276378844c890e95157932326d43062f134a35f345e19371d9f678" + "sha256": "2821b0070a7746b316adfe7d71044ce7f3d081c12b4370eeb84e9e34a88391e4" }, "pipfile-spec": 6, "requires": {}, @@ -14,20 +14,27 @@ ] }, "default": { + "arxiv-auth": { + "hashes": [ + 
"sha256:6d4a5b3e27bf718a95c79e3fef92e7a148b5aca5a0719dca7bdb300c52ce7abb" + ], + "index": "pypi", + "version": "==0.2.3" + }, "arxiv-base": { "hashes": [ - "sha256:54836ab321c9c10c2dbf1d0478120a4c4961c2eb982716d81596579277bafe5f" + "sha256:f8fa599e50550e0c6ee9de53030c22c0b8921fca7ac1268753a6b2b0fef177e9" ], "index": "pypi", - "version": "==0.6.1" + "version": "==0.12.1" }, "bleach": { "hashes": [ - "sha256:b8fa79e91f96c2c2cd9fd1f9eda906efb1b88b483048978ba62fef680e962b34", - "sha256:eb7386f632349d10d9ce9d4a838b134d4731571851149f9cc2c05a9a837a9a44" + "sha256:48d39675b80a75f6d1c3bdbffec791cf0bbbab665cf01e20da701c77de278718", + "sha256:73d26f018af5d5adcdabf5c1c974add4361a9c76af215fe32fdec8a6fc5fb9b9" ], "index": "pypi", - "version": "==2.1.3" + "version": "==3.0.2" }, "boto": { "hashes": [ @@ -150,19 +157,19 @@ }, "elasticsearch-dsl": { "hashes": [ - "sha256:5114a38a88e93a4663782eae07a1e8084ba333c49887335c83de8b8043bc72b2", - "sha256:d6d974cd2289543a3350690494a43fe9996485b8dc6f1d8758cb56bee01244bd" + "sha256:5f43196a3fd91b2eac90f7345e99f92c66004d85a1fd803cdecf756430827231", + "sha256:5f80b3b4a6e61db5d273bc57c32a80b2ddbc555afcc122c62c20440c355008be" ], "index": "pypi", - "version": "==6.1.0" + "version": "==6.3.1" }, "flask": { "hashes": [ - "sha256:0749df235e3ff61ac108f69ac178c9770caeaccad2509cb762ce1f65570a8856", - "sha256:49f44461237b69ecd901cc7ce66feea0319b9158743dd27a2899962ab214dac1" + "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48", + "sha256:a080b744b7e345ccfcbc77954861cb05b3c63786e93f2b3875e0913d44b43f05" ], "index": "pypi", - "version": "==0.12.2" + "version": "==1.0.2" }, "flask-s3": { "hashes": [ @@ -173,13 +180,6 @@ "index": "pypi", "version": "==0.3.3" }, - "html5lib": { - "hashes": [ - "sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3", - "sha256:66cb0dcfdbbc4f9c3ba1a63fdb511ffdbd4f513b2b6d81b80cd26ce6b3fb3736" - ], - "version": "==1.0.1" - }, "idna": { "hashes": [ 
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f", @@ -226,6 +226,42 @@ "index": "pypi", "version": "==2.6.0" }, + "lxml": { + "hashes": [ + "sha256:02bc220d61f46e9b9d5a53c361ef95e9f5e1d27171cd461dddb17677ae2289a5", + "sha256:22f253b542a342755f6cfc047fe4d3a296515cf9b542bc6e261af45a80b8caf6", + "sha256:2f31145c7ff665b330919bfa44aacd3a0211a76ca7e7b441039d2a0b0451e415", + "sha256:36720698c29e7a9626a0dc802ef8885f8f0239bfd1689628ecd459a061f2807f", + "sha256:438a1b0203545521f6616132bfe0f4bca86f8a401364008b30e2b26ec408ce85", + "sha256:4815892904c336bbaf73dafd54f45f69f4021c22b5bad7332176bbf4fb830568", + "sha256:5be031b0f15ad63910d8e5038b489d95a79929513b3634ad4babf77100602588", + "sha256:5c93ae37c3c588e829b037fdfbd64a6e40c901d3f93f7beed6d724c44829a3ad", + "sha256:60842230678674cdac4a1cf0f707ef12d75b9a4fc4a565add4f710b5fcf185d5", + "sha256:62939a8bb6758d1bf923aa1c13f0bcfa9bf5b2fc0f5fa917a6e25db5fe0cfa4e", + "sha256:75830c06a62fe7b8fe3bbb5f269f0b308f19f3949ac81cfd40062f47c1455faf", + "sha256:81992565b74332c7c1aff6a913a3e906771aa81c9d0c68c68113cffcae45bc53", + "sha256:8c892fb0ee52c594d9a7751c7d7356056a9682674b92cc1c4dc968ff0f30c52f", + "sha256:9d862e3cf4fc1f2837dedce9c42269c8c76d027e49820a548ac89fdcee1e361f", + "sha256:a623965c086a6e91bb703d4da62dabe59fe88888e82c4117d544e11fd74835d6", + "sha256:a7783ab7f6a508b0510490cef9f857b763d796ba7476d9703f89722928d1e113", + "sha256:aab09fbe8abfa3b9ce62aaf45aca2d28726b1b9ee44871dbe644050a2fff4940", + "sha256:abf181934ac3ef193832fb973fd7f6149b5c531903c2ec0f1220941d73eee601", + "sha256:ae07fa0c115733fce1e9da96a3ac3fa24801742ca17e917e0c79d63a01eeb843", + "sha256:b9c78242219f674ab645ec571c9a95d70f381319a23911941cd2358a8e0521cf", + "sha256:bccb267678b870d9782c3b44d0cefe3ba0e329f9af8c946d32bf3778e7a4f271", + "sha256:c4df4d27f4c93b2cef74579f00b1d3a31a929c7d8023f870c4b476f03a274db4", + "sha256:caf0e50b546bb60dfa99bb18dfa6748458a83131ecdceaf5c071d74907e7e78a", + 
"sha256:d3266bd3ac59ac4edcd5fa75165dee80b94a3e5c91049df5f7c057ccf097551c", + "sha256:db0d213987bcd4e6d41710fb4532b22315b0d8fb439ff901782234456556aed1", + "sha256:dbbd5cf7690a40a9f0a9325ab480d0fccf46d16b378eefc08e195d84299bfae1", + "sha256:e16e07a0ec3a75b5ee61f2b1003c35696738f937dc8148fbda9fe2147ccb6e61", + "sha256:e175a006725c7faadbe69e791877d09936c0ef2cf49d01b60a6c1efcb0e8be6f", + "sha256:edd9c13a97f6550f9da2236126bb51c092b3b1ce6187f2bd966533ad794bbb5e", + "sha256:fa39ea60d527fbdd94215b5e5552f1c6a912624521093f1384a491a8ad89ad8b" + ], + "index": "pypi", + "version": "==4.2.5" + }, "markupsafe": { "hashes": [ "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" @@ -257,6 +293,14 @@ "index": "pypi", "version": "==0.560" }, + "mysqlclient": { + "hashes": [ + "sha256:062d78953acb23066c0387a8f3bd0ecf946626f599145bb7fd201460e8f773e1", + "sha256:3981ae9ce545901a36a8b7aed76ed02960a429f75dc53b7ad77fb2f9ab7cd56b", + "sha256:b3591a00c0366de71d65108627899710d9cfb00e575c4d211aa8de59b1f130c9" + ], + "version": "==1.3.14" + }, "nose2": { "hashes": [ "sha256:5c79a2e46ad76999ca1ec7a83080424ed134eafaaf90b6e7554ebdf85aea6f23" @@ -289,14 +333,19 @@ }, "pycodestyle": { "hashes": [ - "sha256:1ec08a51c901dfe44921576ed6e4c1f5b7ecbad403f871397feedb5eb8e4fa14", - "sha256:5ff2fbcbab997895ba9ead77e1b38b3ebc2e5c3b8a6194ef918666e4c790a00e", "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" ], "index": "pypi", "version": "==2.3.1" }, + "pycountry": { + "hashes": [ + "sha256:104a8ca94c700898c42a0172da2eab5a5675c49637b729a11db9e1dac2d983cd", + "sha256:8ec4020b2b15cd410893d573820d42ee12fe50365332e58c0975c953b60a16de" + ], + "version": "==18.12.8" + }, "pydocstyle": { "hashes": [ "sha256:08a870edc94508264ed90510db466c6357c7192e0e866561d740624a8fc7d90c", @@ -314,6 +363,13 @@ "index": "pypi", "version": "==1.6.0" }, + "pyjwt": { + "hashes": [ + 
"sha256:5c6eca3c2940464d106b99ba83b00c6add741c9becaec087fb7ccdefea71350e", + "sha256:8d59a976fb773f3e6a39c85636357c4f0e242707394cadadd9814f5cbaa20e96" + ], + "version": "==1.7.1" + }, "pylama": { "hashes": [ "sha256:390c1dab1daebdf3d6acc923e551b035c3faa77d8b96b98530c230493f9ec712", @@ -345,13 +401,26 @@ "index": "pypi", "version": "==2017.3" }, + "redis": { + "hashes": [ + "sha256:8a1900a9f2a0a44ecf6e8b5eb3e967a9909dfed219ad66df094f27f7d6f330fb", + "sha256:a22ca993cea2962dbb588f9f30d0015ac4afcc45bee27d3978c0dbe9e97c6c0f" + ], + "version": "==2.10.6" + }, + "redis-py-cluster": { + "hashes": [ + "sha256:7db54b1de60bd34da3806676b112f07fc9afae556d8260ac02c3335d574ee42c" + ], + "version": "==1.3.6" + }, "requests": { "hashes": [ - "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", - "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e" + "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c", + "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279" ], "index": "pypi", - "version": "==2.18.4" + "version": "==2.20.0" }, "s3transfer": { "hashes": [ @@ -363,10 +432,10 @@ }, "six": { "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "version": "==1.11.0" + "version": "==1.12.0" }, "snowballstemmer": { "hashes": [ @@ -376,6 +445,12 @@ "index": "pypi", "version": "==1.2.1" }, + "sqlalchemy": { + "hashes": [ + "sha256:809547455d012734b4252081db1e6b4fc731de2299f3755708c39863625e1c77" + ], + "version": "==1.2.15" + }, "thrift": { "hashes": [ "sha256:7d59ac4fdcb2c58037ebd4a9da5f9a49e3e034bf75b3f26d9fe48ba3d8806e6b" @@ -394,6 +469,8 @@ "typed-ast": { "hashes": [ 
"sha256:0948004fa228ae071054f5208840a1e88747a357ec1101c17217bfe99b299d58", + "sha256:10703d3cec8dcd9eef5a630a04056bbc898abc19bac5691612acba7d1325b66d", + "sha256:1f6c4bd0bdc0f14246fd41262df7dfc018d65bb05f6e16390b7ea26ca454a291", "sha256:25d8feefe27eb0303b73545416b13d108c6067b846b543738a25ff304824ed9a", "sha256:29464a177d56e4e055b5f7b629935af7f49c196be47528cc94e0a7bf83fbc2b9", "sha256:2e214b72168ea0275efd6c884b114ab42e316de3ffa125b267e732ed2abda892", @@ -405,6 +482,9 @@ "sha256:6de012d2b166fe7a4cdf505eee3aaa12192f7ba365beeefaca4ec10e31241a85", "sha256:79b91ebe5a28d349b6d0d323023350133e927b4de5b651a8aa2db69c761420c6", "sha256:8550177fa5d4c1f09b5e5f524411c44633c80ec69b24e0e98906dd761941ca46", + "sha256:898f818399cafcdb93cbbe15fc83a33d05f18e29fb498ddc09b0214cdfc7cd51", + "sha256:94b091dc0f19291adcb279a108f5d38de2430411068b219f41b343c03b28fb1f", + "sha256:a26863198902cda15ab4503991e8cf1ca874219e0118cbf07c126bce7c4db129", "sha256:a8034021801bc0440f2e027c354b4eafd95891b573e12ff0418dec385c76785c", "sha256:bc978ac17468fe868ee589c795d06777f75496b1ed576d308002c8a5756fb9ea", "sha256:c05b41bc1deade9f90ddc5d988fe506208019ebba9f2578c622516fd201f5863", @@ -423,6 +503,12 @@ "index": "pypi", "version": "==1.22" }, + "uwsgi": { + "hashes": [ + "sha256:d2318235c74665a60021a4fc7770e9c2756f9fc07de7b8c22805efe85b5ab277" + ], + "version": "==2.0.17.1" + }, "webencodings": { "hashes": [ "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", @@ -432,11 +518,11 @@ }, "werkzeug": { "hashes": [ - "sha256:6246e5fc98a505824113fb6aca993d45ea284a2bcffdc2c65d0c538e53e4abd3", - "sha256:f3000aa146ce8a9da8ca3e978e0e931c2a58eb56c323a5efb6b4307f7832b549" + "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", + "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" ], "index": "pypi", - "version": "==0.13" + "version": "==0.14.1" }, "wtforms": { "hashes": [ @@ -447,6 +533,20 @@ } }, "develop": { + "alabaster": { + "hashes": [ + 
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, + "babel": { + "hashes": [ + "sha256:6778d85147d5d85345c14a26aada5e478ab04e39b078b0745ee6870c2b5cf669", + "sha256:8cba50f48c529ca3fa18cf81fa9403be176d374ac4d60738b839122dfaaa3d23" + ], + "version": "==2.6.0" + }, "certifi": { "hashes": [ "sha256:40523d2efb60523e113b44602298f0960e900388cf3bb6043f645cf57ea9e3f5", @@ -511,11 +611,11 @@ }, "coveralls": { "hashes": [ - "sha256:32569a43c9dbc13fa8199247580a4ab182ef439f51f65bb7f8316d377a1340e8", - "sha256:664794748d2e5673e347ec476159a9d87f43e0d2d44950e98ed0e27b98da8346" + "sha256:ab638e88d38916a6cedbf80a9cd8992d5fa55c77ab755e262e00b36792b7cd6d", + "sha256:b2388747e2529fa4c669fb1e3e2756e4e07b6ee56c7d9fce05f35ccccc913aa0" ], "index": "pypi", - "version": "==1.3.0" + "version": "==1.5.1" }, "docopt": { "hashes": [ @@ -523,6 +623,15 @@ ], "version": "==0.6.2" }, + "docutils": { + "hashes": [ + "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", + "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", + "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + ], + "index": "pypi", + "version": "==0.14" + }, "idna": { "hashes": [ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f", @@ -531,13 +640,110 @@ "index": "pypi", "version": "==2.6" }, + "imagesize": { + "hashes": [ + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" + ], + "version": "==1.1.0" + }, + "jinja2": { + "hashes": [ + "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", + "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + ], + "index": "pypi", + "version": "==2.10" + }, + "markupsafe": { + "hashes": [ + 
"sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" + ], + "index": "pypi", + "version": "==1.0" + }, + "packaging": { + "hashes": [ + "sha256:0886227f54515e592aaa2e5a553332c73962917f2831f1b0f9b9f4380a4b9807", + "sha256:f95a1e147590f204328170981833854229bb2912ac3d5f89e2a8ccd2834800c9" + ], + "version": "==18.0" + }, + "pygments": { + "hashes": [ + "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", + "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" + ], + "version": "==2.3.1" + }, + "pyparsing": { + "hashes": [ + "sha256:40856e74d4987de5d01761a22d1621ae1c7f8774585acae358aa5c5936c6c90b", + "sha256:f353aab21fd474459d97b709e527b5571314ee5f067441dc9f88e33eecd96592" + ], + "version": "==2.3.0" + }, + "pytz": { + "hashes": [ + "sha256:59707844a9825589878236ff2f4e0dc9958511b7ffaae94dc615da07d4a68d33", + "sha256:699d18a2a56f19ee5698ab1123bbcc1d269d061996aeb1eda6d89248d3542b82", + "sha256:80af0f3008046b9975242012a985f04c5df1f01eed4ec1633d56cc47a75a6a48", + "sha256:8cc90340159b5d7ced6f2ba77694d946fc975b09f1a51d93f3ce3bb399396f94", + "sha256:c41c62827ce9cafacd6f2f7018e4f83a6f1986e87bfd000b8cfbd4ab5da95f1a", + "sha256:d0ef5ef55ed3d37854320d4926b04a4cb42a2e88f71da9ddfdacfde8e364f027", + "sha256:dd2e4ca6ce3785c8dd342d1853dd9052b19290d5bf66060846e5dc6b8d6667f7", + "sha256:fae4cffc040921b8a2d60c6cf0b5d662c1190fe54d718271db4eb17d44a185b7", + "sha256:feb2365914948b8620347784b6b6da356f31c9d03560259070b2f30cff3d469d" + ], + "index": "pypi", + "version": "==2017.3" + }, "requests": { "hashes": [ - "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b", - "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e" + "sha256:99dcfdaaeb17caf6e526f32b6a7b780461512ab3f1d992187801694cba42770c", + "sha256:a84b8c9ab6239b578f22d1c21d51b696dcfe004032bb80ea832398d6909d7279" + ], + "index": "pypi", + "version": "==2.20.0" + }, + "six": { + "hashes": [ + 
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "snowballstemmer": { + "hashes": [ + "sha256:919f26a68b2c17a7634da993d91339e288964f93c274f1343e3bbbe2096e1128", + "sha256:9f3bcd3c401c3e862ec0ebe6d2c069ebc012ce142cce209c098ccb5b09136e89" + ], + "index": "pypi", + "version": "==1.2.1" + }, + "sphinx": { + "hashes": [ + "sha256:120732cbddb1b2364471c3d9f8bfd4b0c5b550862f99a65736c77f970b142aea", + "sha256:b348790776490894e0424101af9c8413f2a86831524bd55c5f379d3e3e12ca64" + ], + "index": "pypi", + "version": "==1.8.2" + }, + "sphinx-autodoc-typehints": { + "hashes": [ + "sha256:9f546fa18ee6bfb17f5cf937805d3c8afea48b976050db0bd14bb463eee97888", + "sha256:adfa4712e77c795522574ce644e084cded2f6e796a59d37e7f1cb98287687100" ], "index": "pypi", - "version": "==2.18.4" + "version": "==1.5.2" + }, + "sphinxcontrib-websupport": { + "hashes": [ + "sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd", + "sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9" + ], + "index": "pypi", + "version": "==1.1.0" }, "urllib3": { "hashes": [ diff --git a/README.md b/README.md index 1a31e886..d297f528 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,14 @@ manually restart to see those changes take effect. If all goes well... http://127.0.0.1:5000/ should render the basic search page. +You can run the API in dev mode by changing `FLASK_APP` to point to ``api.py``, +i.e.: + +```bash +FLASK_APP=api.py FLASK_DEBUG=1 ELASTICSEARCH_HOST=127.0.0.1 pipenv run flask run +``` + + ## Running the indexing agent. diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md new file mode 100644 index 00000000..d86050c9 --- /dev/null +++ b/RELEASE_NOTES.md @@ -0,0 +1,18 @@ +# Release notes + +## Version 0.5 - The cross-list release. 
+ +### Bug +- [ARXIVNG-1349] - Layout of category tags and DOI is really wonky in search results + + +### New Feature +- [ARXIVNG-1277] - Search result entries should display secondary categories if present +- [ARXIVNG-1278] - "All fields" queries should search secondary categories +- [ARXIVNG-1347] - Search by secondary/cross-list +- [ARXIVNG-1357] - Update document mapping for secondaries to be consistent with mapping for primary category +- [ARXIVNG-1362] - Search indexing agent should identify itself using a custom User-Agent + +### Task +- [ARXIVNG-1048] - Update accessibility notice in footer +- [ARXIVNG-1348] - Update requests dependency to 2.20.0 or greater diff --git a/api.py b/api.py new file mode 100644 index 00000000..c51ea39e --- /dev/null +++ b/api.py @@ -0,0 +1,5 @@ +"""Provides application for development purposes.""" + +from search.factory import create_api_web_app + +app = create_api_web_app() diff --git a/api/search.yaml b/api/search.yaml deleted file mode 100644 index b2e75ade..00000000 --- a/api/search.yaml +++ /dev/null @@ -1,82 +0,0 @@ -openapi: "3.0.0" -info: - version: "0.1" - title: "arXiv Search API" - description: "A RESTful API for arXiv documents." - termsOfService: "https://arxiv.org/help/general", - contact: - name: "arXiv API Team" - email: nextgen@arxiv.org - license: - name: MIT -servers: - - url: https://arxiv.org/api -paths: - /papers: - get: - operationId: queryPapers - description: | - Returns all published arXiv papers that respond to the specified - query parameters. By default, returns most recent papers first. - parameters: - - name: primary_category - in: query - description: | - Slug for the primary category or categories to which results - should be limited. - required: false - style: form - schema: - type: array - items: - type: string - responses: - '200': - description: All arXiv papers that respond to specified query. 
- content: - application/json: - schema: - $ref: '../schema/DocumentSet.json#DocumentSet' - default: - description: unexpected error - content: - application/json: - schema: - $ref: '#/components/schemas/Error' - /papers/{id}: - get: - description: Return metadata about an arXiv paper by arXiv ID. - operationId: getPaperByID - parameters: - - name: id - in: path - description: arXiv ID of paper to retrieve. - required: true - schema: - type: string - response: - '200': - description: Metadata about the requested arXiv paper. - content: - application/json: - schema: - $ref: '../schema/Document.json#Document' - default: - description: unexpected error - content: - application/json: - schema: - $ref: '#/components/schemas/Error' - -components: - schemas: - Error: - required: - - code - - message - properties: - code: - type: integer - format: int32 - message: - type: string diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 63a31812..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx-autodoc-typehints==1.2.5 -sphinxcontrib-websupport==1.0.1 -Sphinx==1.7.0 diff --git a/docs/source/api/search.agent.consumer.rst b/docs/source/api/search.agent.consumer.rst new file mode 100644 index 00000000..e5003bcb --- /dev/null +++ b/docs/source/api/search.agent.consumer.rst @@ -0,0 +1,7 @@ +search.agent.consumer module +============================ + +.. automodule:: search.agent.consumer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.agent.rst b/docs/source/api/search.agent.rst index e586b80b..5f56ca71 100644 --- a/docs/source/api/search.agent.rst +++ b/docs/source/api/search.agent.rst @@ -6,23 +6,17 @@ search.agent package :undoc-members: :show-inheritance: -Submodules ----------- +Subpackages +----------- -search.agent.base module ------------------------- +.. toctree:: -.. 
automodule:: search.agent.base - :members: - :undoc-members: - :show-inheritance: + search.agent.tests -search.agent.consumer module ----------------------------- +Submodules +---------- -.. automodule:: search.agent.consumer - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.agent.consumer diff --git a/docs/source/api/search.agent.tests.rst b/docs/source/api/search.agent.tests.rst new file mode 100644 index 00000000..1434db36 --- /dev/null +++ b/docs/source/api/search.agent.tests.rst @@ -0,0 +1,16 @@ +search.agent.tests package +========================== + +.. automodule:: search.agent.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + search.agent.tests.test_integration + search.agent.tests.test_record_processor + diff --git a/docs/source/api/search.agent.tests.test_integration.rst b/docs/source/api/search.agent.tests.test_integration.rst new file mode 100644 index 00000000..a48b0658 --- /dev/null +++ b/docs/source/api/search.agent.tests.test_integration.rst @@ -0,0 +1,7 @@ +search.agent.tests.test\_integration module +=========================================== + +.. automodule:: search.agent.tests.test_integration + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.agent.tests.test_record_processor.rst b/docs/source/api/search.agent.tests.test_record_processor.rst new file mode 100644 index 00000000..fecb043f --- /dev/null +++ b/docs/source/api/search.agent.tests.test_record_processor.rst @@ -0,0 +1,7 @@ +search.agent.tests.test\_record\_processor module +================================================= + +.. 
automodule:: search.agent.tests.test_record_processor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.config.rst b/docs/source/api/search.config.rst new file mode 100644 index 00000000..d73af70a --- /dev/null +++ b/docs/source/api/search.config.rst @@ -0,0 +1,7 @@ +search.config module +==================== + +.. automodule:: search.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.context.rst b/docs/source/api/search.context.rst new file mode 100644 index 00000000..8491c8cc --- /dev/null +++ b/docs/source/api/search.context.rst @@ -0,0 +1,7 @@ +search.context module +===================== + +.. automodule:: search.context + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.advanced.forms.rst b/docs/source/api/search.controllers.advanced.forms.rst new file mode 100644 index 00000000..d60013e3 --- /dev/null +++ b/docs/source/api/search.controllers.advanced.forms.rst @@ -0,0 +1,7 @@ +search.controllers.advanced.forms module +======================================== + +.. automodule:: search.controllers.advanced.forms + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.advanced.rst b/docs/source/api/search.controllers.advanced.rst index 8e0b6696..6d3f66f2 100644 --- a/docs/source/api/search.controllers.advanced.rst +++ b/docs/source/api/search.controllers.advanced.rst @@ -9,12 +9,8 @@ search.controllers.advanced package Submodules ---------- -search.controllers.advanced.forms module ----------------------------------------- - -.. automodule:: search.controllers.advanced.forms - :members: - :undoc-members: - :show-inheritance: +.. 
toctree:: + search.controllers.advanced.forms + search.controllers.advanced.tests diff --git a/docs/source/api/search.controllers.advanced.tests.rst b/docs/source/api/search.controllers.advanced.tests.rst new file mode 100644 index 00000000..f563dfd1 --- /dev/null +++ b/docs/source/api/search.controllers.advanced.tests.rst @@ -0,0 +1,7 @@ +search.controllers.advanced.tests module +======================================== + +.. automodule:: search.controllers.advanced.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.api.rst b/docs/source/api/search.controllers.api.rst new file mode 100644 index 00000000..81ce16b5 --- /dev/null +++ b/docs/source/api/search.controllers.api.rst @@ -0,0 +1,8 @@ +search.controllers.api package +============================== + +.. automodule:: search.controllers.api + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api/search.controllers.authors.rst b/docs/source/api/search.controllers.authors.rst deleted file mode 100644 index 5015a130..00000000 --- a/docs/source/api/search.controllers.authors.rst +++ /dev/null @@ -1,20 +0,0 @@ -search.controllers.authors package -================================== - -.. automodule:: search.controllers.authors - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -search.controllers.authors.forms module ---------------------------------------- - -.. automodule:: search.controllers.authors.forms - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/api/search.controllers.rst b/docs/source/api/search.controllers.rst index 21ac25be..90eb38db 100644 --- a/docs/source/api/search.controllers.rst +++ b/docs/source/api/search.controllers.rst @@ -12,18 +12,14 @@ Subpackages .. 
toctree:: search.controllers.advanced - search.controllers.authors + search.controllers.api search.controllers.simple Submodules ---------- -search.controllers.util module ------------------------------- - -.. automodule:: search.controllers.util - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.controllers.tests + search.controllers.util diff --git a/docs/source/api/search.controllers.simple.forms.rst b/docs/source/api/search.controllers.simple.forms.rst new file mode 100644 index 00000000..676a5e0a --- /dev/null +++ b/docs/source/api/search.controllers.simple.forms.rst @@ -0,0 +1,7 @@ +search.controllers.simple.forms module +====================================== + +.. automodule:: search.controllers.simple.forms + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.simple.rst b/docs/source/api/search.controllers.simple.rst index 4d85e87d..8a5c6bc5 100644 --- a/docs/source/api/search.controllers.simple.rst +++ b/docs/source/api/search.controllers.simple.rst @@ -9,12 +9,8 @@ search.controllers.simple package Submodules ---------- -search.controllers.simple.forms module --------------------------------------- - -.. automodule:: search.controllers.simple.forms - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.controllers.simple.forms + search.controllers.simple.tests diff --git a/docs/source/api/search.controllers.simple.tests.rst b/docs/source/api/search.controllers.simple.tests.rst new file mode 100644 index 00000000..6956a652 --- /dev/null +++ b/docs/source/api/search.controllers.simple.tests.rst @@ -0,0 +1,7 @@ +search.controllers.simple.tests module +====================================== + +.. 
automodule:: search.controllers.simple.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.tests.rst b/docs/source/api/search.controllers.tests.rst new file mode 100644 index 00000000..04407d04 --- /dev/null +++ b/docs/source/api/search.controllers.tests.rst @@ -0,0 +1,7 @@ +search.controllers.tests module +=============================== + +.. automodule:: search.controllers.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.controllers.util.rst b/docs/source/api/search.controllers.util.rst new file mode 100644 index 00000000..32286608 --- /dev/null +++ b/docs/source/api/search.controllers.util.rst @@ -0,0 +1,7 @@ +search.controllers.util module +============================== + +.. automodule:: search.controllers.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.converters.rst b/docs/source/api/search.converters.rst new file mode 100644 index 00000000..8ee1e63d --- /dev/null +++ b/docs/source/api/search.converters.rst @@ -0,0 +1,7 @@ +search.converters module +======================== + +.. automodule:: search.converters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.domain.advanced.rst b/docs/source/api/search.domain.advanced.rst new file mode 100644 index 00000000..f2fbd960 --- /dev/null +++ b/docs/source/api/search.domain.advanced.rst @@ -0,0 +1,7 @@ +search.domain.advanced module +============================= + +.. automodule:: search.domain.advanced + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.domain.api.rst b/docs/source/api/search.domain.api.rst new file mode 100644 index 00000000..3390a2ba --- /dev/null +++ b/docs/source/api/search.domain.api.rst @@ -0,0 +1,7 @@ +search.domain.api module +======================== + +.. 
automodule:: search.domain.api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.domain.base.rst b/docs/source/api/search.domain.base.rst new file mode 100644 index 00000000..3623beee --- /dev/null +++ b/docs/source/api/search.domain.base.rst @@ -0,0 +1,7 @@ +search.domain.base module +========================= + +.. automodule:: search.domain.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.domain.rst b/docs/source/api/search.domain.rst index 165458be..113ec00a 100644 --- a/docs/source/api/search.domain.rst +++ b/docs/source/api/search.domain.rst @@ -9,28 +9,9 @@ search.domain package Submodules ---------- -search.domain.advanced module ------------------------------ - -.. automodule:: search.domain.advanced - :members: - :undoc-members: - :show-inheritance: - -search.domain.author module ---------------------------- - -.. automodule:: search.domain.author - :members: - :undoc-members: - :show-inheritance: - -search.domain.base module -------------------------- - -.. automodule:: search.domain.base - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.domain.advanced + search.domain.api + search.domain.base diff --git a/docs/source/api/search.encode.rst b/docs/source/api/search.encode.rst new file mode 100644 index 00000000..4fedc406 --- /dev/null +++ b/docs/source/api/search.encode.rst @@ -0,0 +1,7 @@ +search.encode module +==================== + +.. automodule:: search.encode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.factory.rst b/docs/source/api/search.factory.rst new file mode 100644 index 00000000..80814f0d --- /dev/null +++ b/docs/source/api/search.factory.rst @@ -0,0 +1,7 @@ +search.factory module +===================== + +.. 
automodule:: search.factory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.process.rst b/docs/source/api/search.process.rst index df27115a..a8df3ddc 100644 --- a/docs/source/api/search.process.rst +++ b/docs/source/api/search.process.rst @@ -9,12 +9,8 @@ search.process package Submodules ---------- -search.process.transform module -------------------------------- - -.. automodule:: search.process.transform - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.process.tests + search.process.transform diff --git a/docs/source/api/search.process.tests.rst b/docs/source/api/search.process.tests.rst new file mode 100644 index 00000000..956540ca --- /dev/null +++ b/docs/source/api/search.process.tests.rst @@ -0,0 +1,7 @@ +search.process.tests module +=========================== + +.. automodule:: search.process.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.process.transform.rst b/docs/source/api/search.process.transform.rst new file mode 100644 index 00000000..e4858d04 --- /dev/null +++ b/docs/source/api/search.process.transform.rst @@ -0,0 +1,7 @@ +search.process.transform module +=============================== + +.. automodule:: search.process.transform + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.routes.api.exceptions.rst b/docs/source/api/search.routes.api.exceptions.rst new file mode 100644 index 00000000..107ac73d --- /dev/null +++ b/docs/source/api/search.routes.api.exceptions.rst @@ -0,0 +1,7 @@ +search.routes.api.exceptions module +=================================== + +.. 
automodule:: search.routes.api.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.routes.api.rst b/docs/source/api/search.routes.api.rst new file mode 100644 index 00000000..df729ea9 --- /dev/null +++ b/docs/source/api/search.routes.api.rst @@ -0,0 +1,23 @@ +search.routes.api package +========================= + +.. automodule:: search.routes.api + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + search.routes.api.tests + +Submodules +---------- + +.. toctree:: + + search.routes.api.exceptions + search.routes.api.serialize + diff --git a/docs/source/api/search.routes.api.serialize.rst b/docs/source/api/search.routes.api.serialize.rst new file mode 100644 index 00000000..bc2ea25b --- /dev/null +++ b/docs/source/api/search.routes.api.serialize.rst @@ -0,0 +1,7 @@ +search.routes.api.serialize module +================================== + +.. automodule:: search.routes.api.serialize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.routes.api.tests.rst b/docs/source/api/search.routes.api.tests.rst new file mode 100644 index 00000000..2fdc0aba --- /dev/null +++ b/docs/source/api/search.routes.api.tests.rst @@ -0,0 +1,16 @@ +search.routes.api.tests package +=============================== + +.. automodule:: search.routes.api.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + search.routes.api.tests.test_api + search.routes.api.tests.test_serialize + diff --git a/docs/source/api/search.routes.api.tests.test_api.rst b/docs/source/api/search.routes.api.tests.test_api.rst new file mode 100644 index 00000000..aa9840dc --- /dev/null +++ b/docs/source/api/search.routes.api.tests.test_api.rst @@ -0,0 +1,7 @@ +search.routes.api.tests.test\_api module +======================================== + +.. 
automodule:: search.routes.api.tests.test_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.routes.api.tests.test_serialize.rst b/docs/source/api/search.routes.api.tests.test_serialize.rst new file mode 100644 index 00000000..15883f1a --- /dev/null +++ b/docs/source/api/search.routes.api.tests.test_serialize.rst @@ -0,0 +1,7 @@ +search.routes.api.tests.test\_serialize module +============================================== + +.. automodule:: search.routes.api.tests.test_serialize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.routes.rst b/docs/source/api/search.routes.rst index 5332161d..de6b7302 100644 --- a/docs/source/api/search.routes.rst +++ b/docs/source/api/search.routes.rst @@ -6,15 +6,17 @@ search.routes package :undoc-members: :show-inheritance: +Subpackages +----------- + +.. toctree:: + + search.routes.api + Submodules ---------- -search.routes.ui module ------------------------ - -.. automodule:: search.routes.ui - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.routes.ui diff --git a/docs/source/api/search.routes.ui.rst b/docs/source/api/search.routes.ui.rst new file mode 100644 index 00000000..d69da82b --- /dev/null +++ b/docs/source/api/search.routes.ui.rst @@ -0,0 +1,7 @@ +search.routes.ui module +======================= + +.. automodule:: search.routes.ui + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.rst b/docs/source/api/search.rst index db5b77c6..da532f7f 100644 --- a/docs/source/api/search.rst +++ b/docs/source/api/search.rst @@ -17,56 +17,16 @@ Subpackages search.process search.routes search.services + search.tests Submodules ---------- -search.config module --------------------- - -.. automodule:: search.config - :members: - :undoc-members: - :show-inheritance: - -search.context module ---------------------- - -.. 
automodule:: search.context - :members: - :undoc-members: - :show-inheritance: - -search.converter module ------------------------ - -.. automodule:: search.converter - :members: - :undoc-members: - :show-inheritance: - -search.factory module ---------------------- - -.. automodule:: search.factory - :members: - :undoc-members: - :show-inheritance: - -search.logging module ---------------------- - -.. automodule:: search.logging - :members: - :undoc-members: - :show-inheritance: - -search.util module ------------------- - -.. automodule:: search.util - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + search.config + search.context + search.converters + search.encode + search.factory diff --git a/docs/source/api/search.services.fulltext.rst b/docs/source/api/search.services.fulltext.rst new file mode 100644 index 00000000..5a74cdfe --- /dev/null +++ b/docs/source/api/search.services.fulltext.rst @@ -0,0 +1,7 @@ +search.services.fulltext module +=============================== + +.. automodule:: search.services.fulltext + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.advanced.rst b/docs/source/api/search.services.index.advanced.rst new file mode 100644 index 00000000..7bad6d7d --- /dev/null +++ b/docs/source/api/search.services.index.advanced.rst @@ -0,0 +1,7 @@ +search.services.index.advanced module +===================================== + +.. automodule:: search.services.index.advanced + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.authors.rst b/docs/source/api/search.services.index.authors.rst new file mode 100644 index 00000000..73bc1385 --- /dev/null +++ b/docs/source/api/search.services.index.authors.rst @@ -0,0 +1,7 @@ +search.services.index.authors module +==================================== + +.. 
automodule:: search.services.index.authors + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.exceptions.rst b/docs/source/api/search.services.index.exceptions.rst new file mode 100644 index 00000000..c90e0a77 --- /dev/null +++ b/docs/source/api/search.services.index.exceptions.rst @@ -0,0 +1,7 @@ +search.services.index.exceptions module +======================================= + +.. automodule:: search.services.index.exceptions + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.highlighting.rst b/docs/source/api/search.services.index.highlighting.rst new file mode 100644 index 00000000..fcac46d3 --- /dev/null +++ b/docs/source/api/search.services.index.highlighting.rst @@ -0,0 +1,7 @@ +search.services.index.highlighting module +========================================= + +.. automodule:: search.services.index.highlighting + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.prepare.rst b/docs/source/api/search.services.index.prepare.rst new file mode 100644 index 00000000..4509ae16 --- /dev/null +++ b/docs/source/api/search.services.index.prepare.rst @@ -0,0 +1,7 @@ +search.services.index.prepare module +==================================== + +.. automodule:: search.services.index.prepare + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.results.rst b/docs/source/api/search.services.index.results.rst new file mode 100644 index 00000000..95eb785a --- /dev/null +++ b/docs/source/api/search.services.index.results.rst @@ -0,0 +1,7 @@ +search.services.index.results module +==================================== + +.. 
automodule:: search.services.index.results + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.rst b/docs/source/api/search.services.index.rst new file mode 100644 index 00000000..979c0cbc --- /dev/null +++ b/docs/source/api/search.services.index.rst @@ -0,0 +1,29 @@ +search.services.index package +============================= + +.. automodule:: search.services.index + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + search.services.index.tests + +Submodules +---------- + +.. toctree:: + + search.services.index.advanced + search.services.index.authors + search.services.index.exceptions + search.services.index.highlighting + search.services.index.prepare + search.services.index.results + search.services.index.simple + search.services.index.util + diff --git a/docs/source/api/search.services.index.simple.rst b/docs/source/api/search.services.index.simple.rst new file mode 100644 index 00000000..b59f8929 --- /dev/null +++ b/docs/source/api/search.services.index.simple.rst @@ -0,0 +1,7 @@ +search.services.index.simple module +=================================== + +.. automodule:: search.services.index.simple + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.tests.rst b/docs/source/api/search.services.index.tests.rst new file mode 100644 index 00000000..9479c75f --- /dev/null +++ b/docs/source/api/search.services.index.tests.rst @@ -0,0 +1,18 @@ +search.services.index.tests package +=================================== + +.. automodule:: search.services.index.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + + search.services.index.tests.test_reindex + search.services.index.tests.test_results + search.services.index.tests.test_util + search.services.index.tests.tests + diff --git a/docs/source/api/search.services.index.tests.test_reindex.rst b/docs/source/api/search.services.index.tests.test_reindex.rst new file mode 100644 index 00000000..d89bc7b8 --- /dev/null +++ b/docs/source/api/search.services.index.tests.test_reindex.rst @@ -0,0 +1,7 @@ +search.services.index.tests.test\_reindex module +================================================ + +.. automodule:: search.services.index.tests.test_reindex + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.tests.test_results.rst b/docs/source/api/search.services.index.tests.test_results.rst new file mode 100644 index 00000000..7bb1ba2b --- /dev/null +++ b/docs/source/api/search.services.index.tests.test_results.rst @@ -0,0 +1,7 @@ +search.services.index.tests.test\_results module +================================================ + +.. automodule:: search.services.index.tests.test_results + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.tests.test_util.rst b/docs/source/api/search.services.index.tests.test_util.rst new file mode 100644 index 00000000..cc535414 --- /dev/null +++ b/docs/source/api/search.services.index.tests.test_util.rst @@ -0,0 +1,7 @@ +search.services.index.tests.test\_util module +============================================= + +.. automodule:: search.services.index.tests.test_util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.tests.tests.rst b/docs/source/api/search.services.index.tests.tests.rst new file mode 100644 index 00000000..b8721d9d --- /dev/null +++ b/docs/source/api/search.services.index.tests.tests.rst @@ -0,0 +1,7 @@ +search.services.index.tests.tests module +======================================== + +.. 
automodule:: search.services.index.tests.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.index.util.rst b/docs/source/api/search.services.index.util.rst new file mode 100644 index 00000000..4ba90c85 --- /dev/null +++ b/docs/source/api/search.services.index.util.rst @@ -0,0 +1,7 @@ +search.services.index.util module +================================= + +.. automodule:: search.services.index.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.metadata.rst b/docs/source/api/search.services.metadata.rst new file mode 100644 index 00000000..93014a82 --- /dev/null +++ b/docs/source/api/search.services.metadata.rst @@ -0,0 +1,7 @@ +search.services.metadata module +=============================== + +.. automodule:: search.services.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.rst b/docs/source/api/search.services.rst index c8661d72..b353333b 100644 --- a/docs/source/api/search.services.rst +++ b/docs/source/api/search.services.rst @@ -6,31 +6,19 @@ search.services package :undoc-members: :show-inheritance: -Submodules ----------- +Subpackages +----------- -search.services.fulltext module -------------------------------- - -.. automodule:: search.services.fulltext - :members: - :undoc-members: - :show-inheritance: +.. toctree:: -search.services.index module ----------------------------- + search.services.index + search.services.tests -.. automodule:: search.services.index - :members: - :undoc-members: - :show-inheritance: - -search.services.metadata module -------------------------------- +Submodules +---------- -.. automodule:: search.services.metadata - :members: - :undoc-members: - :show-inheritance: +.. 
toctree:: + search.services.fulltext + search.services.metadata diff --git a/docs/source/api/search.services.tests.rst b/docs/source/api/search.services.tests.rst new file mode 100644 index 00000000..8a059df1 --- /dev/null +++ b/docs/source/api/search.services.tests.rst @@ -0,0 +1,16 @@ +search.services.tests package +============================= + +.. automodule:: search.services.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + search.services.tests.test_fulltext + search.services.tests.test_metadata + diff --git a/docs/source/api/search.services.tests.test_fulltext.rst b/docs/source/api/search.services.tests.test_fulltext.rst new file mode 100644 index 00000000..c36d8420 --- /dev/null +++ b/docs/source/api/search.services.tests.test_fulltext.rst @@ -0,0 +1,7 @@ +search.services.tests.test\_fulltext module +=========================================== + +.. automodule:: search.services.tests.test_fulltext + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.services.tests.test_metadata.rst b/docs/source/api/search.services.tests.test_metadata.rst new file mode 100644 index 00000000..4b654897 --- /dev/null +++ b/docs/source/api/search.services.tests.test_metadata.rst @@ -0,0 +1,7 @@ +search.services.tests.test\_metadata module +=========================================== + +.. automodule:: search.services.tests.test_metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.tests.rst b/docs/source/api/search.tests.rst new file mode 100644 index 00000000..d2022adc --- /dev/null +++ b/docs/source/api/search.tests.rst @@ -0,0 +1,16 @@ +search.tests package +==================== + +.. automodule:: search.tests + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + + search.tests.test_advanced_search + search.tests.test_param_persistence + diff --git a/docs/source/api/search.tests.test_advanced_search.rst b/docs/source/api/search.tests.test_advanced_search.rst new file mode 100644 index 00000000..5d4b705e --- /dev/null +++ b/docs/source/api/search.tests.test_advanced_search.rst @@ -0,0 +1,7 @@ +search.tests.test\_advanced\_search module +========================================== + +.. automodule:: search.tests.test_advanced_search + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/search.tests.test_param_persistence.rst b/docs/source/api/search.tests.test_param_persistence.rst new file mode 100644 index 00000000..a29664ed --- /dev/null +++ b/docs/source/api/search.tests.test_param_persistence.rst @@ -0,0 +1,7 @@ +search.tests.test\_param\_persistence module +============================================ + +.. automodule:: search.tests.test_param_persistence + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/architecture.rst b/docs/source/architecture.rst index 068da8ef..4f3f8da5 100644 --- a/docs/source/architecture.rst +++ b/docs/source/architecture.rst @@ -1,73 +1,14 @@ -arXiv Search -************ - -The current version of the arXiv search application is designed to meet the -goals outlined in arXiv-NG milestone H1: Replace Legacy Search. - -- H1.1. Replace the current advanced search interface, search results, and - search by author name. -- H1.2. The search result view should support pagination, and ordering by - publication date or relevance. -- H1.3. An indexing agent updates the search index at publication time in - response to a Kinesis notification, using metadata from the docmeta endpoint - in the classic system. - -Key Requirements -================ - -- Simple search: - - - Users should be able to search for arXiv papers by title, author, and - abstract. 
- - Searches can originate from any part of the arXiv.org site, via the - search bar in the site header. - -- Advanced search: - - - Users can search for papers using boolean combinations of search terms on - title, author names, and/or abstract. - - Users can filter results by primary classification, and submission date. - - Submission date supports prior year, specific year, and date range. - -- Author name search: - - - Users should be able to search for papers by author name. - - This should support queries originating on the abs page, and in search - results. - -- UI: The overall flavor of the search views should be substantially - similar to the classic views, but with styling improvements that improve - readability, usability, and accessibility. - -Quality Goals -============= -- Code quality: - - - 90% test coverage on Python components that we develop/control. - - Linting: ``pylint`` passes with >= 9/10. - - Documentation: ``pydocstyle`` passes. - - Static checking: ``mypy`` passes. - -- Performance & reliability: - - - Response time: 99% of requests have a latency of 1 second or less. - - Error rate: parity with classic search. - - Request rate: support request volume of existing search * safety factor 3. - -- Accessibility: meet or exceed WCAG 2.0 level A for accessibility. - -Constraints -=========== -- Must be implemented in Python/Flask, and be deployable behind Apache as a - Python/WSGI application. -- The search application itself must be stateless. It must be able to connect - to an arbitrary ElasticSearch cluster, which can be specified via - configuration. -- Notifications about new content are delivered via the Kinesis notification - broker. +Architectural overview +********************** Context ======= +The arXiv search system supports queries about arXiv papers both from human +users and from API clients (via the arXiv API gateway). 
Most readers arrive +at the search interface via a small search bar in the running header of +arxiv.org pages, or by clicking on the name of an author on the abstract page +or other listings. + .. _figure-ng-search-context: .. figure:: _static/diagrams/ng-search-context.png @@ -75,14 +16,8 @@ Context System context for arXiv search. -The arXiv search system supports queries about arXiv papers both from human -users and from API clients (via the arXiv API gateway). Most readers arrive -at the search interface via a small search bar in the running header of -arxiv.org pages, or by clicking on the name of an author on the abstract page -or other listings. - The search system draws content from the core metadata repository (currently -the classic arXiv application, via the ``/docmeta`` endpoint), and (future) +the classic arXiv application, via the ``docmeta`` endpoint), and (future) from the fulltext extraction service. Notifications about new content are disseminated by the publication system via @@ -107,9 +42,10 @@ Containers The core of the search system is an ElasticSearch cluster, provided by the `AWS Elasticsearch Service `_. -The search service, implemented in Python/Flask, provides both the user-facing -interfaces as well as a REST API. The search service is only responsible for -reading from ElasticSearch. +The search application, implemented in Python/Flask, provides both the +user-facing interfaces as well as a REST API. These two sets of endpoints +(:mod:`search.routes.ui` and :mod:`search.routes.api`) are deployed as two +separate services. The indexing agent application is responsible for coordinating updates to the ElasticSearch cluster. The agent subscribes to notifications about the @@ -119,8 +55,8 @@ repository and the plain text store (future), transforms those data into a search document, and sends that document to ES. -Components: Search service -========================== +Components: Search UI service +============================== .. 
_figure-ng-search-application-components: @@ -129,19 +65,17 @@ Components: Search service Components of the search service. -The search service is a Flask application that handles user/client requests to -search arXiv papers. +The search UI service is a Flask application that handles user/client requests +to search arXiv papers. -The entry-point to the application is the application factory module, which -provides :func:`search.factory.create_ui_web_app`. That application factory -function attaches templates and static files from `arxiv-base -`_ for use in search-specific templates, +The entry-point to the application is :func:`search.factory.create_ui_web_app`. +That application factory function attaches templates and static files from +:mod:`arxiv.base` for use in search-specific templates, and attaches the routing blueprint provided by :mod:`search.routes.ui`. -:mod:`search.routes.ui` routes parameters from incoming requests to the three +:mod:`search.routes.ui` routes parameters from incoming requests to the two main search controllers: -- :mod:`search.controllers.authors` - :mod:`search.controllers.simple` - :mod:`search.controllers.advanced` @@ -150,30 +84,10 @@ use-cases in one controller, because we expect user interface methods to change significantly as enhanced features are introduced. Each controller module implements its own classes for form handling and validation. - - Components: Indexing agent ========================== - -.. _figure-ng-search-indexing-agent-components: - -.. figure:: _static/diagrams/ng-search-indexing-agent-components.png - :target: _static/diagrams/ng-search-indexing-agent-components.png - - Components view of the search agent. - -Notification handling is provided by two components: a notification consumer -provided by Amazon, implemented using the Java-based Kinesis Consumer -Library, and a record processor component implemented in Python that -processes new notifications received by the consumer. 
A so-called -MultiLangDaemon, a stand-alone Java process, provides the glue between the -KCL and our record processor. When new notifications are received by the -consumer, the MultiLangDaemon invokes the record processor, which in turn -starts the processing pipeline. - -The :class:`search.agent.consumer.MetadataRecordProcessor` implements the -indexing procedure for each notification. The -:meth:`search.agent.consumer.MetadataRecordProcessor.process_record` method -coordinates retrieval of metadata from the docmeta endpoint (classic), -transformation of those content into a search document, and updating -ElasticSearch. +Notifications about new document metadata are handled by :mod:`search.agent`, +implemented using :mod:`arxiv.base.agent`. The agent coordinates retrieval of +metadata from the docmeta endpoint (classic), transforms that content +into a search document using :mod:`search.process.transform`, and updates +ElasticSearch using :mod:`search.services.index`. diff --git a/docs/source/conf.py b/docs/source/conf.py index 763c44dc..5ab00a0c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -181,4 +181,12 @@ ] -intersphinx_mapping = {'https://docs.python.org/3/': None} +intersphinx_mapping = { + 'python': ('https://docs.python.org/3.6', None), + 'arxitecture': ('https://cul-it.github.io/arxiv-arxitecture/', None), + 'arxiv.taxonomy': ('https://cul-it.github.io/arxiv-base', None), + 'arxiv.base': ('https://cul-it.github.io/arxiv-base', None), + 'browse': ('https://cul-it.github.io/arxiv-browse/', None), + 'search': ('https://cul-it.github.io/arxiv-search/', None), + 'zero': ('https://cul-it.github.io/arxiv-zero/', None), +} diff --git a/docs/source/index.rst b/docs/source/index.rst index 357345cc..1f31afe3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -6,6 +6,8 @@ arXiv Search System Documentation :caption: Contents: architecture.rst + search_ui.rst + search_api.rst api/modules.rst diff --git a/docs/source/search_api.rst 
b/docs/source/search_api.rst new file mode 100644 index 00000000..a967a595 --- /dev/null +++ b/docs/source/search_api.rst @@ -0,0 +1,34 @@ +Search API (Alpha) +****************** + +Release ``0.5.0-alpha`` introduces support for a metadata search API service. +This release targets milestone H2: Search API, with the following specific +goals: + +- H2.1: A search API is exposed via the API gateway, with feature-parity to + classic "arXiv API". + + - Consider content negotiation to support legacy XML and JSON(-LD). + +- H2.2: Opportunistic improvements, fixes, e.g. proper handling of UTF-8 + characters (ARXIVNG-257). +- H2.3: Deprecate classic arXiv API. + + +The current release supports only JSON serialization, provided by +:class:`search.routes.api.serialize.JSONSerializer`. An Atom/XML serializer +:class:`search.routes.api.serialize.AtomXMLSerializer` is planned but not yet +implemented. + +A formal description of the API (OpenAPI 3.0) and resources (JSON Schema) can +be found in the ``schema/`` directory of the arXiv search repository. + +The service endpoints are defined in :mod:`search.routes.api`: + +- The root endpoint :func:`search.routes.api.search` supports queries using the + same semantics as the advanced search UI. +- The paper metadata endpoint :func:`search.routes.api.paper` provides more + detailed metadata for a specific arXiv e-print. + +Requests are handled by the controllers in :mod:`search.controllers.api`, using +the :class:`search.domain.api.APIQuery` domain class. diff --git a/docs/source/search_ui.rst b/docs/source/search_ui.rst new file mode 100644 index 00000000..581b50e2 --- /dev/null +++ b/docs/source/search_ui.rst @@ -0,0 +1,67 @@ +Search Interface +**************** + +The current version of the arXiv search application is designed to meet the +goals outlined in arXiv-NG milestone H1: Replace Legacy Search. + +- H1.1. Replace the current advanced search interface, search results, and + search by author name. +- H1.2. 
The search result view should support pagination, and ordering by + publication date or relevance. +- H1.3. An indexing agent updates the search index at publication time in + response to a Kinesis notification, using metadata from the docmeta endpoint + in the classic system. + +Key Requirements +================ + +- Simple search: + + - Users should be able to search for arXiv papers by title, author, and + abstract. + - Searches can originate from any part of the arXiv.org site, via the + search bar in the site header. + +- Advanced search: + + - Users can search for papers using boolean combinations of search terms on + title, author names, and/or abstract. + - Users can filter results by primary classification, and submission date. + - Submission date supports prior year, specific year, and date range. + +- Author name search: + + - Users should be able to search for papers by author name. + - This should support queries originating on the abs page, and in search + results. + +- UI: The overall flavor of the search views should be substantially + similar to the classic views, but with styling that improves + readability, usability, and accessibility. + +Quality Goals +============= +- Code quality: + + - 90% test coverage on Python components that we develop/control. + - Linting: ``pylint`` passes with >= 9/10. + - Documentation: ``pydocstyle`` passes. + - Static checking: ``mypy`` passes. + +- Performance & reliability: + + - Response time: 99% of requests have a latency of 1 second or less. + - Error rate: parity with classic search. + - Request rate: support request volume of existing search * safety factor 3. + +- Accessibility: meet or exceed WCAG 2.0 level A for accessibility. + +Constraints +=========== +- Must be implemented in Python/Flask, and be deployable behind Apache as a + Python/WSGI application. +- The search application itself must be stateless. 
It must be able to connect + to an arbitrary ElasticSearch cluster, which can be specified via + configuration. +- Notifications about new content are delivered via the Kinesis notification + broker. diff --git a/mappings/DocumentMapping.json b/mappings/DocumentMapping.json index dc718f70..f3178241 100644 --- a/mappings/DocumentMapping.json +++ b/mappings/DocumentMapping.json @@ -388,6 +388,10 @@ "primary_classification": { "type": "object", "properties": { + "combined": { + "type": "text", + "analyzer": "simple" + }, "group": { "type": "object", "properties": { @@ -405,12 +409,12 @@ "id": { "type": "keyword", "normalizer": "simple", - "copy_to": ["combined"] + "copy_to": ["combined", "primary_classification.combined"] }, "name": { "type": "keyword", "normalizer": "simple", - "copy_to": ["combined"] + "copy_to": ["combined", "primary_classification.combined"] } } }, @@ -420,12 +424,12 @@ "id": { "type": "keyword", "normalizer": "simple", - "copy_to": ["combined"] + "copy_to": ["combined", "primary_classification.combined"] }, "name": { "type": "keyword", "normalizer": "simple", - "copy_to": ["combined"] + "copy_to": ["combined", "primary_classification.combined"] } } } @@ -434,6 +438,10 @@ "secondary_classification": { "type": "nested", "properties": { + "combined": { + "type": "text", + "analyzer": "simple" + }, "group": { "type": "object", "properties": { @@ -449,10 +457,14 @@ "type": "object", "properties": { "id": { - "type": "keyword" + "type": "keyword", + "normalizer": "simple", + "copy_to": ["combined", "secondary_classification.combined"] }, "name": { - "type": "keyword" + "type": "keyword", + "normalizer": "simple", + "copy_to": ["combined", "secondary_classification.combined"] } } }, @@ -460,10 +472,14 @@ "type": "object", "properties": { "id": { - "type": "keyword" + "type": "keyword", + "normalizer": "simple", + "copy_to": ["combined", "secondary_classification.combined"] }, "name": { - "type": "keyword" + "type": "keyword", + "normalizer": "simple", + 
"copy_to": ["combined", "secondary_classification.combined"] } } } diff --git a/requirements/dev.txt b/requirements/dev.txt deleted file mode 100644 index 9a97fe49..00000000 --- a/requirements/dev.txt +++ /dev/null @@ -1,45 +0,0 @@ -amazon-kclpy==1.4.4 -arxiv-base==0.6.1 -boto==2.48.0 -boto3==1.6.6 -botocore==1.9.6 -certifi==2017.7.27.1 -chardet==3.0.4 -click==6.7 -coverage==4.4.2 -dataclasses==0.4 -docutils==0.14 -elasticsearch==6.1.1 -elasticsearch-dsl==6.1.0 -Flask==0.12.2 -Flask-S3==0.3.3 -idna==2.6 -ipaddress==1.0.19 -itsdangerous==0.24 -Jinja2==2.10 -jmespath==0.9.3 -jsonschema==2.6.0 -MarkupSafe==1.0 -mccabe==0.6.1 -mock==2.0.0 -mypy==0.560 -nose2==0.7.3 -pbr==3.1.1 -psutil==5.4.3 -pycodestyle==2.3.1 -pydocstyle==2.1.1 -pyflakes==1.6.0 -pylama==7.4.3 -python-dateutil==2.6.1 -pytz==2017.3 -requests==2.18.4 -s3transfer==0.1.13 -six==1.11.0 -snowballstemmer==1.2.1 -thrift==0.11.0 -thrift-connector==0.23 -typed-ast==1.1.0 -urllib3==1.22 -Werkzeug==0.13 -WTForms==2.1 -bleach==2.0.0 diff --git a/requirements/prod.txt b/requirements/prod.txt deleted file mode 100644 index 9a97fe49..00000000 --- a/requirements/prod.txt +++ /dev/null @@ -1,45 +0,0 @@ -amazon-kclpy==1.4.4 -arxiv-base==0.6.1 -boto==2.48.0 -boto3==1.6.6 -botocore==1.9.6 -certifi==2017.7.27.1 -chardet==3.0.4 -click==6.7 -coverage==4.4.2 -dataclasses==0.4 -docutils==0.14 -elasticsearch==6.1.1 -elasticsearch-dsl==6.1.0 -Flask==0.12.2 -Flask-S3==0.3.3 -idna==2.6 -ipaddress==1.0.19 -itsdangerous==0.24 -Jinja2==2.10 -jmespath==0.9.3 -jsonschema==2.6.0 -MarkupSafe==1.0 -mccabe==0.6.1 -mock==2.0.0 -mypy==0.560 -nose2==0.7.3 -pbr==3.1.1 -psutil==5.4.3 -pycodestyle==2.3.1 -pydocstyle==2.1.1 -pyflakes==1.6.0 -pylama==7.4.3 -python-dateutil==2.6.1 -pytz==2017.3 -requests==2.18.4 -s3transfer==0.1.13 -six==1.11.0 -snowballstemmer==1.2.1 -thrift==0.11.0 -thrift-connector==0.23 -typed-ast==1.1.0 -urllib3==1.22 -Werkzeug==0.13 -WTForms==2.1 -bleach==2.0.0 diff --git a/requirements/test.txt b/requirements/test.txt 
deleted file mode 100644 index 49bc87f5..00000000 --- a/requirements/test.txt +++ /dev/null @@ -1,6 +0,0 @@ -coverage==4.5 -mypy==0.570 -nose2==0.7.3 -pylint==1.8.2 -coveralls==1.3.0 -pydocstyle==2.1.1 diff --git a/schema/DocumentSet.json b/schema/DocumentSet.json deleted file mode 100644 index 49bd8354..00000000 --- a/schema/DocumentSet.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "$schema": "http://json-schema.org/schema#", - "title": "DocumentSet", - "description": "Prototype schema for arXiv documents", - "type": "object", - "required": ["metadata", "results"], - "properties": { - "metadata": { - "type": "object", - "query": { - "description": "The request query from which the document set is derived", - "type": "string" - }, - "total": { - "description": "Total number of results that respond to the query.", - "type": "integer", - "minimum": 0 - }, - "pagination": { - "description": "Pagination details", - "type": "object", - "properties": { - "next": { - "type": "string", - "format": "uri", - "description": "URI for the next page of results" - }, - "previous": { - "type": "string", - "format": "uri", - "description": "URI for the previous page of results" - } - } - } - }, - "results": { - "type": "object", - "$ref": "Document.json#Document" - } - } -} diff --git a/schema/resources/Classification.json b/schema/resources/Classification.json new file mode 100644 index 00000000..7bb2e1cd --- /dev/null +++ b/schema/resources/Classification.json @@ -0,0 +1,9 @@ +{ + "title": "Classification", + "type": "object", + "properties": { + "archive": {"$ref": "./ClassificationTerm.json"}, + "group": {"$ref": "./ClassificationTerm.json"}, + "category": {"$ref": "./ClassificationTerm.json"} + } +} diff --git a/schema/resources/ClassificationTerm.json b/schema/resources/ClassificationTerm.json new file mode 100644 index 00000000..5f0a686b --- /dev/null +++ b/schema/resources/ClassificationTerm.json @@ -0,0 +1,13 @@ +{ + "title": "ClassificationTerm", + "type": "object", + 
"properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["id", "name"] +} diff --git a/schema/resources/Document.json b/schema/resources/Document.json new file mode 100644 index 00000000..53330af2 --- /dev/null +++ b/schema/resources/Document.json @@ -0,0 +1,192 @@ +{ + "Document": { + "title": "Document", + "description": "Schema for arXiv document metadata returned by the search API.", + "type": "object", + "properties": { + "abs_categories": { + "description": "Categories as they would appear on the /abs page", + "type": "string" + }, + "abstract": { + "type": "string" + }, + "acm_class": { + "description": "Classifications from ACM Computing Classification System", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "owners": { + "items": { + "type": "object", + "$ref": "./Person.json" + } + }, + "authors": { + "type": "array", + "items": { + "type": "object", + "$ref": "./Person.json" + }, + "minItems": 1 + }, + "comments": { + "type": "string" + }, + "submitted_date": { + "description": "Date this version of paper was submitted.", + "type": "string", + "format": "date-time" + }, + "announced_date_first": { + "description": "Year and month (``%Y-%m``) the paper was originally announced.", + "type": "string" + }, + "paper_id": { + "description": "arXiv paper identifier without version affix.", + "type": "string" + }, + "paper_id_v": { + "description": "arXiv paper identifier with version affix.", + "type": "string" + }, + "doi": { + "type": "string" + }, + "formats": { + "description": "Derivative paper formats available to users", + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "properties": { + "format": { + "type": "string" + }, + "href": { + "type": "string", + "format": "uri" + } + } + } + }, + "href": { + "type": "string", + "format": "uri" + }, + "fulltext": { + "type": "string" + }, + "is_current": { + "type": "boolean" + }, + "is_withdrawn": { + 
"type": "boolean" + }, + "journal_ref": { + "type": "string" + }, + "license": { + "type": "object", + "properties": { + "href": { + "type": "string", + "nullable": true + }, + "label": { + "type": "string", + "nullable": true + } + } + }, + "msc_class": { + "description": "Classifications from American Mathematical Society Mathematical Subject Classification (MSC)", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "primary_classification": { + "$ref": "./Classification.json" + }, + "secondary_classification": { + "type": "array", + "items": {"$ref": "./Classification.json"} + }, + "report_num": { + "type": "string" + }, + "source": { + "properties": { + "flags": { + "type": "string", + "nullable": true + }, + "format": { + "type": "string", + "nullable": true + }, + "size_bytes": { + "minimum": 0, + "type": "integer" + } + }, + "required": ["size_bytes"] + }, + "submitter": { + "description": "Submitter data. Name may not match those associated with user account, since user data is copied to submission data at the time of submission creation.", + "type": "object", + "$ref": "./Person.json" + }, + "title": { + "type": "string" + }, + "version": { + "description": "The version number for this paper.", + "minimum": 1, + "type": "integer" + }, + "latest": { + "description": "arXiv paper identifier (with version affix) of latest version of this paper.", + "type": "object", + "properties": { + "version": { + "description": "Number of the latest version of this paper.", + "minimum": 1, + "type": "integer" + }, + "href": { + "type": "string", + "format": "uri", + "description": "Location of the detailed metadata record available via the API" + }, + "canonical": { + "type": "string", + "format": "uri", + "description": "Canonical arXiv URI." 
+ }, + "paper_id": { + "description": "Paper ID with version affix of latest version.", + "type": "string" + } + } + }, + "canonical": { + "type": "string", + "format": "uri", + "description": "Canonical arXiv URI." + } + }, + "required": [ + "paper_id", + "paper_id_v", + "version", + "href", + "canonical" + ] + } +} diff --git a/schema/Document.json b/schema/resources/DocumentMetadata.json similarity index 82% rename from schema/Document.json rename to schema/resources/DocumentMetadata.json index 74166186..7a957ae9 100644 --- a/schema/Document.json +++ b/schema/resources/DocumentMetadata.json @@ -1,53 +1,6 @@ { - "$schema": "http://json-schema.org/schema#", - "title": "Document", - "description": "Prototype schema for arXiv documents", - "definitions": { - "category": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "archive": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "group": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "classification": { - "type": "object", - "properties": { - "archive": {"$ref": "#/definitions/archive"}, - "group": {"$ref": "#/definitions/group"}, - "category": {"$ref": "#/definitions/category"} - } - } - }, + "title": "DocumentMetadata", + "description": "Schema for arXiv document metadata provided by the docmeta endpoint.", "type": "object", "properties": { "abs_categories": { @@ -148,9 +101,9 @@ "minItems": 0 }, "announced_first": { - "desription": "Date (year and month) the paper was originally announced", + "description": "Date (year and month) the paper was originally announced", "type": "string" - } + }, "updated_date": { "description": "Date this version of paper was last touched", "type": "string" @@ -217,11 +170,11 @@ "type": 
"string" }, "primary_classification": { - "$ref": "#/definitions/classification" + "$ref": "./Classification.json" }, "secondary_classification": { "type": "array", - "items": {"$ref": "#/definitions/classification"} + "items": {"$ref": "./Classification.json"} }, "proxy": { "type": "string" diff --git a/schema/resources/DocumentSet.json b/schema/resources/DocumentSet.json new file mode 100644 index 00000000..27576192 --- /dev/null +++ b/schema/resources/DocumentSet.json @@ -0,0 +1,47 @@ +{ + "title": "DocumentSet", + "description": "A set of documents that respond to a query.", + "type": "object", + "required": ["metadata", "results"], + "properties": { + "metadata": { + "description": "Summary information about the search, including pagination.", + "properties": { + "start": { + "description": "Offset (zero-based) of first result in this documentset from start of original search results.", + "type": "integer" + }, + "end": { + "description": "Offset (zero-based) of last result in this documentset from start of original search results.", + "type": "integer" + }, + "total": { + "description": "Total number of documents that respond to this query.", + "type": "integer" + }, + "query": { + "description": "Query parameters interpreted from the request.", + "type": "array", + "items": { + "type": "object", + "properties": { + "parameter": { + "type": "string" + }, + "value": { + "type": "string" + } + } + } + } + } + }, + "results": { + "type": "array", + "items": { + "type": "object", + "$ref": "Document.json" + } + } + } +} diff --git a/schema/resources/Person.json b/schema/resources/Person.json new file mode 100644 index 00000000..dec4cec1 --- /dev/null +++ b/schema/resources/Person.json @@ -0,0 +1,48 @@ +{ + "title": "Person", + "description": "Schema for person in metadata returned by the search API.", + "type": "object", + "properties": { + "full_name": { + "description": "The fullest representation of the person's name available in arXiv metadata.", + "type": 
"string" + }, + "last_name": { + "description": "The family or surname part of the person's name, if available.", + "type": "string" + }, + "first_name": { + "description": "The personal or forename part of the person's name, if available.", + "type": "string" + }, + "suffix": { + "description": "The suffix part of the person's name, if available.", + "type": "string" + }, + "affiliation": { + "description": "Institutional affiliations as entered at the time of submission, if available.", + "type": "array", + "items": { + "type": "string" + }, + "minItems": 0 + }, + "orcid": { + "description": "ORCID identifier, if available.", + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + }, + "author_id": { + "description": "arXiv author identifier, if available.", + "oneOf": [ + {"type": "string"}, + {"type": "null"} + ] + } + }, + "required": [ + "full_name" + ] +} diff --git a/schema/search.yaml b/schema/search.yaml new file mode 100644 index 00000000..8f6d3306 --- /dev/null +++ b/schema/search.yaml @@ -0,0 +1,599 @@ +openapi: "3.0.0" +info: + version: "0.1.2" + title: "arXiv Search API" + description: | + A query API for arXiv paper metadata. + termsOfService: "https://arxiv.org/help/general" + contact: + name: "arXiv API Team" + email: nextgen@arxiv.org + license: + name: MIT +servers: + - url: https://api.arxiv.org/metadata/ + description: Metadata API endpoint. +paths: + /: + get: + operationId: queryPapers + description: | + Returns all published arXiv papers that respond to the specified + query parameters. By default, returns most recent papers first. 
+ + ## Example request + + ```bash + curl \ + -H "Authorization: Bearer 4mggHnvB3ZV1bV3GObE6wZFw8pul5nGyzfeABSdfDg" \ + https://api.arxiv.org/metadata/?size=5&license=http://arxiv.org/licenses/nonexclusive-distrib/1.0/&include=license + + ``` + + ## Example response + + ```json + { + "metadata": { + "end": 5, + "query": [ + { + "parameter": "license", + "value": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/" + }, + { + "parameter": "include", + "value": "license" + } + ], + "size": 5, + "start": 0, + "total": 993119 + }, + "results": [ + { + "canonical": "https://arxiv.org/abs/1812.01565v1", + "href": "http://127.0.0.1:5000/1812.01565v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.01565", + "paper_id_v": "1812.01565v1", + "title": "Impact of radiation backgrounds on the formation of massive black holes", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03980v1", + "href": "http://127.0.0.1:5000/1812.03980v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03980", + "paper_id_v": "1812.03980v1", + "title": "Building Ethically Bounded AI", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03942v1", + "href": "http://127.0.0.1:5000/1812.03942v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03942", + "paper_id_v": "1812.03942v1", + "title": "Accurate Evaluation of $\\mathcal{P}$,$\\mathcal{T}$-odd Faraday Effect in Atoms of Xe and Hg", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03969v1", + "href": "http://127.0.0.1:5000/1812.03969v1", + "license": { + "href": 
"http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03969", + "paper_id_v": "1812.03969v1", + "title": "Modified gravity, gravitational waves and the large-scale structure of the Universe: A brief report", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03956v1", + "href": "http://127.0.0.1:5000/1812.03956v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03956", + "paper_id_v": "1812.03956v1", + "title": "X-ray reflectivity with a twist: quantitative time-resolved X-ray reflectivity using monochromatic synchrotron radiation", + "version": 1 + } + ] + } + + ``` + + parameters: + - name: all + in: query + description: | + Performs a query across all fields. Has the same behavior as the + simple search function on the main arXiv website. Supports quoted + literals, wildcards, etc. + required: false + style: form + explode: true + schema: + type: string + + - name: author + in: query + description: | + Search by author name. For the most precise name search, use + ``surname(s), forename(s)`` or ``surname(s), initial(s)``. Author + names enclosed in quotes will return only exact matches only. + Diacritic character variants are automatically searched. For + more information about the limitations of searching by author name, + see https://blogs.cornell.edu/arxiv/2018/05/04/release-search-v0-2-some-notes-on-names/. + required: false + style: form + explode: true + schema: + type: string + example: sinskaja, e n + + - name: title + in: query + description: | + Text search in paper titles. Supports quoted literals and + wildcards. 
+ required: false + style: form + explode: true + schema: + type: string + example: "theory of life" + + - name: abstract + in: query + description: | + Text search in paper abstracts. Supports quoted literals and + wildcards. + required: false + style: form + explode: true + schema: + type: string + example: abstr?ct + + - name: comments + in: query + description: | + Text search in paper comments. Supports quoted literals and + wildcards. See https://arxiv.org/help/prep#comments for a + description of this field. + required: false + style: form + explode: true + schema: + type: string + example: "color figures" + + - name: journal_ref + in: query + description: | + Text search in journal reference. Supports quoted literals and + wildcards. See https://arxiv.org/help/prep#journal for a + description of this field. + required: false + style: form + explode: true + schema: + type: string + example: "j cool beans" + + - name: report_num + in: query + description: | + Text search in report number. Supports quoted literals and + wildcards. See https://arxiv.org/help/prep#report for a description + of this field. + required: false + style: form + explode: true + schema: + type: string + example: SU-4240-720 + + - name: acm_class + in: query + description: | + Keyword match on ACM classification code. Supports wildcards. See + https://arxiv.org/help/prep#acm for a description of this field. + required: false + style: form + explode: true + schema: + type: string + example: F.2.2 + + - name: msc_class + in: query + description: | + Keyword match on MSC classification code. Supports wildcards. See + https://arxiv.org/help/prep#msc for a description of this field. + required: false + style: form + explode: true + schema: + type: string + example: 14J60 + + - name: doi + in: query + description: | + Keyword match on DOI. Supports wildcards. See + https://arxiv.org/help/prep#doi for a description of this field. 
+ required: false + style: form + explode: true + schema: + type: string + example: 10.1016/S0550-3213(01)00405-9 + + - name: paper_id + in: query + description: | + Keyword match on arXiv paper ID, with or without a version affix. + Supports wildcards. See https://arxiv.org/help/arxiv_identifier for + information about arXiv paper identifiers. + required: false + style: form + explode: true + schema: + type: string + example: 1601.00123 + + - name: orcid + in: query + description: | + Match on author ORCID ID. For information about ORCID IDs, + see https://arxiv.org/help/orcid. Note that ORCID IDs are only + available for the submitter and any author-owners who have claimed + the paper. See also + https://blogs.cornell.edu/arxiv/2018/05/04/release-search-v0-2-some-notes-on-names/. + required: false + style: form + explode: true + schema: + type: string + example: 0000-0002-0564-9939 + + - name: author_id + in: query + description: | + Match on arXiv author identifier. For more information about author + identifiers, see https://arxiv.org/help/author_identifiers. Note + that author identifiers are only available for the submitter and + any author-owners who have claimed the paper. See also + https://blogs.cornell.edu/arxiv/2018/05/04/release-search-v0-2-some-notes-on-names/. + required: false + style: form + explode: true + schema: + type: string + example: warner_s_1 + + - name: primary_classification + in: query + description: | + Limit query by primary classification. This field supports + filtering by group, archive, and category. Note that group names + are prefixed by ``grp_``, e.g. ``grp_physics``, ``grp_q-bio``. See + https://arxiv.github.io/arxiv-base/arxiv/arxiv.taxonomy.html for + more information. + examples: + groupMath: + summary: Limit results to the math group. + value: grp_math + archivePhysics: + summary: Limit results to the physics archive.
+ value: physics + categoryHE: + summary: | + Limit results to the High Energy Astrophysical Phenomena + category. + value: astro-ph.HE + required: false + style: form + schema: + type: array + items: + type: string + + - name: secondary_classification + in: query + description: | + Limit query by secondary (cross-list) classification. This field + supports filtering by group, archive, and category. Note that group + names are prefixed by ``grp_``, e.g. ``grp_physics``, + ``grp_q-bio``. See + https://arxiv.github.io/arxiv-base/arxiv/arxiv.taxonomy.html for + more information. + examples: + groupMath: + summary: Limit results to those cross-listed in the math group. + value: grp_math + archivePhysics: + summary: | + Limit results to those cross-listed in the physics archive. + value: physics + categoryHE: + summary: | + Limit results to those cross-listed in the High Energy + Astrophysical Phenomena category. + value: astro-ph.HE + required: false + style: form + explode: true + schema: + type: array + items: + type: string + + - name: include + in: query + description: | + Fields to include in the result set, in addition to paper ID and + version. See ``DocumentMetadata.json`` specification for all + available fields. + required: false + style: form + explode: true + schema: + type: array + items: + type: string + example: title + + - name: start_date + in: query + description: | + Limit results to papers submitted or announced (see ``date_type``) + on or after this date. + required: false + style: form + schema: + type: string + format: date + example: "1998-04-03" + + - name: end_date + in: query + description: | + Limit results to papers submitted or announced (see ``date_type``) + on or before this date. + required: false + style: form + schema: + type: string + format: date + example: "1998-04-09" + + - name: date_type + in: query + description: | + The date property used to apply ``start_date`` and ``end_date`` + filters.
Note that ``announced_date_first`` has only year and month + precision. + required: false + style: form + schema: + type: string + default: submitted_date + enum: + - submitted_date_first + - submitted_date + - announced_date_first + example: submitted_date_first + + responses: + '200': + description: All arXiv papers that respond to specified query. + content: + application/json: + schema: + $ref: './resources/DocumentSet.json#' + default: + description: unexpected error + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + /{id}: + get: + description: | + Get metadata about an arXiv paper by arXiv ID. See + https://arxiv.org/help/arxiv_identifier for information about arXiv + paper identifiers. + + ## Example request + + ```bash + curl \ + -H "Authorization: Bearer 4mggHnvB3ZV1bV3GOaE8wZFw8pul5nGyzfeABSdfDg" \ + https://api.arxiv.org/metadata/1811.00536v1 + ``` + + ## Example response + ```json + { + "abs_categories": "cond-mat.str-el hep-th math-ph math.MP", + "abstract": "We give a systematic construction and classification of fermionic symmetry-protected topological states for generic fermionic symmetry group $G_f=\\mathbb Z_2^f\\rtimes G_b$, which is a central extension of bosonic symmetry group $G_b$ (may contain time reversal symmetry) by the fermion parity symmetry group $\\mathbb Z_2^f=\\{1,P_f\\}$. For each class in the classification (except those with 2D $p+ip$ chiral superconductor decorations), we construct a fixed-point wave function which admits exactly solvable commuting-projector Hamiltonian. 
The classification is based on the notion of equivalence class of fermionic symmetric local unitary transformations.", + "acm_class": [], + "announced_date_first": "2018-11", + "authors": [ + { + "affiliation": [], + "author_id": null, + "first_name": "Qing-Rui", + "full_name": "Qing-Rui Wang", + "last_name": "Wang", + "orcid": null, + "suffix": "" + }, + { + "affiliation": [], + "author_id": null, + "first_name": "Zheng-Cheng", + "full_name": "Zheng-Cheng Gu", + "last_name": "Gu", + "orcid": null, + "suffix": "" + } + ], + "authors_freeform": "Qing-Rui Wang and Zheng-Cheng Gu", + "canonical": "https://arxiv.org/abs/1811.00536v1", + "comments": "68 pages, 16 figures", + "doi": [], + "formats": [ + { + "format": "pdf", + "href": "https://arxiv.org/pdf/1811.00536v1" + }, + { + "format": "other", + "href": "https://arxiv.org/format/1811.00536v1" + } + ], + "href": "https://api.arxiv.org/metadata/1811.00536v1", + "is_current": true, + "is_withdrawn": false, + "journal_ref": "", + "latest": { + "canonical": "https://arxiv.org/abs/1811.00536v1", + "href": "https://api.arxiv.org/metadata/1811.00536v1", + "paper_id": "1811.00536v1", + "version": 1 + }, + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "msc_class": [], + "owners": [ + { + "affiliation": [], + "author_id": null, + "first_name": "Qing-Rui", + "full_name": "Qing-Rui Wang", + "last_name": "Wang", + "orcid": null, + "suffix": "" + } + ], + "paper_id": "1811.00536", + "paper_id_v": "1811.00536v1", + "primary_classification": { + "archive": { + "id": "cond-mat", + "name": "Condensed Matter" + }, + "category": { + "id": "cond-mat.str-el", + "name": "Strongly Correlated Electrons" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + }, + "report_num": "", + "secondary_classification": [ + { + "archive": { + "id": "hep-th", + "name": "High Energy Physics - Theory" + }, + "category": { 
+ "id": "hep-th", + "name": "High Energy Physics - Theory" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + }, + { + "archive": { + "id": "math-ph", + "name": "Mathematical Physics" + }, + "category": { + "id": "math-ph", + "name": "Mathematical Physics" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + } + ], + "source": { + "flags": "", + "format": "pdftex", + "size_bytes": 2478991 + }, + "submitted_date": "2018-11-01T13:57:22-04:00", + "submitted_date_first": "2018-11-01T13:57:22-04:00", + "submitter": { + "affiliation": [], + "author_id": null, + "first_name": "", + "full_name": "Qing-Rui Wang", + "last_name": "", + "orcid": null, + "suffix": "" + }, + "title": "General group super-cohomology theory of fermionic symmetry-protected topological phases", + "version": 1 + } + ``` + + operationId: getPaperByID + parameters: + - name: id + in: path + description: arXiv ID of paper to retrieve. + required: true + schema: + type: string + responses: + '200': + description: Metadata about the requested arXiv paper. + content: + application/json: + schema: + $ref: './resources/Document.json#Document' + default: + description: unexpected error + content: + application/json: + schema: + $ref: '#/components/schemas/Error' + +components: + schemas: + Error: + required: + - code + - message + properties: + code: + type: integer + format: int32 + message: + type: string diff --git a/search/agent/__init__.py b/search/agent/__init__.py index 6b9f440b..663cb3f0 100644 --- a/search/agent/__init__.py +++ b/search/agent/__init__.py @@ -9,17 +9,11 @@ and becomes available for discovery via :mod:`search.routes.ui`. 
""" from typing import Optional -from datetime import datetime -import warnings from flask import current_app as app -from arxiv.base import logging +from arxiv.base import agent from .consumer import MetadataRecordProcessor, DocumentFailed, IndexingFailed -from .base import CheckpointManager - -logger = logging.getLogger(__name__) -logger.propagate = False def process_stream(duration: Optional[int] = None) -> None: @@ -35,31 +29,6 @@ def process_stream(duration: Optional[int] = None) -> None: """ # We use the Flask application instance for configuration, and to manage # integrations with metadata service, search index. - with warnings.catch_warnings(): # boto3 is notoriously annoying. - warnings.simplefilter("ignore") - start_at = app.config.get('KINESIS_START_AT') - start_type = app.config.get('KINESIS_START_TYPE') - if not start_type: - start_type = 'AT_TIMESTAMP' - if start_type == 'AT_TIMESTAMP' and not start_at: - start_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + agent.process_stream(MetadataRecordProcessor, app.config, + duration=duration) - processor = MetadataRecordProcessor( - app.config['KINESIS_STREAM'], - app.config['KINESIS_SHARD_ID'], - app.config['AWS_ACCESS_KEY_ID'], - app.config['AWS_SECRET_ACCESS_KEY'], - app.config['AWS_REGION'], - CheckpointManager( - app.config['KINESIS_CHECKPOINT_VOLUME'], - app.config['KINESIS_STREAM'], - app.config['KINESIS_SHARD_ID'], - ), - endpoint=app.config.get('KINESIS_ENDPOINT', None), - verify=app.config.get('KINESIS_VERIFY', 'true') == 'true', - duration=duration, - start_type=start_type, - start_at=start_at, - sleep=float(app.config['KINESIS_SLEEP']) - ) - processor.go() diff --git a/search/agent/base.py b/search/agent/base.py deleted file mode 100644 index acbd5585..00000000 --- a/search/agent/base.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -Provides a base class for Kinesis record handling. - -.. _todo: This should move to arXiv-base, per ARXIVNG-281. 
-""" - -import time -import json -from datetime import datetime, timedelta -import os -from typing import Any, Optional, Tuple, Generator, Callable, Dict, Union -from contextlib import contextmanager -import signal - -import boto3 -from botocore.exceptions import WaiterError, NoCredentialsError, \ - PartialCredentialsError, BotoCoreError, ClientError - -from arxiv.base import logging -logger = logging.getLogger(__name__) -logger.propagate = False - -NOW = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') - - -class CheckpointError(RuntimeError): - """Checkpointing failed.""" - - -class StreamNotAvailable(RuntimeError): - """Could not find or connect to the stream.""" - - -class KinesisRequestFailed(RuntimeError): - """Raised when a Kinesis request failed permanently.""" - - -class StopProcessing(RuntimeError): - """Gracefully stopped processing upon unrecoverable error.""" - - -class ConfigurationError(RuntimeError): - """There was a problem with the configuration.""" - - -def retry(retries: int = 5, wait: int = 5) -> Callable: - """ - Decorator factory for retrying Kinesis calls. - - Parameters - ---------- - retries : int - Number of times to retry before failing. - wait : int - Number of seconds to wait between retries. - - Returns - ------- - function - A decorator that retries the decorated func ``retries`` times before - raising :class:`.KinesisRequestFailed`. 
- - """ - __retries = retries - - def decorator(func: Callable) -> Callable: - """Retry the decorated func on ClientErrors up to ``retries`` times.""" - _retries = __retries - - def inner(*args, **kwargs) -> Any: # type: ignore - retries = _retries - while retries > 0: - try: - return func(*args, **kwargs) - except ClientError as e: - code = e.response['Error']['Code'] - logger.error('Caught ClientError %s, retrying', code) - time.sleep(wait) - retries -= 1 - raise KinesisRequestFailed('Max retries; last code: {code}') - return inner - return decorator - - -class CheckpointManager(object): - """Provides on-disk loading and updating of consumer checkpoints.""" - - def __init__(self, base_path: str, stream_name: str, shard_id: str) \ - -> None: - """Load or create a new file for checkpointing.""" - if not os.path.exists(base_path): - raise ValueError(f'Path does not exist: {base_path}') - self.file_path = os.path.join(base_path, - f'{stream_name}__{shard_id}.json') - if not os.path.exists(self.file_path): - try: - with open(self.file_path, 'w') as f: - f.write('') - except Exception as e: # The containing path doesn't exist. - raise ValueError(f'Could not use {self.file_path}') from e - - with open(self.file_path) as f: - position = f.read() - self.position = position if position else None - - def checkpoint(self, position: str) -> None: - """Checkpoint at ``position``.""" - try: - with open(self.file_path, 'w') as f: - f.write(position) - self.position = position - except Exception as e: - raise CheckpointError('Could not checkpoint') from e - - -class BaseConsumer(object): - """ - Kinesis stream consumer. - - Consumes a single shard from a single stream, and checkpoints on disk - (to reduce external dependencies). 
- """ - - def __init__(self, stream_name: str = '', shard_id: str = '', - access_key: str = '', secret_key: str = '', region: str = '', - checkpointer: Optional[CheckpointManager] = None, - back_off: int = 5, batch_size: int = 50, - endpoint: Optional[str] = None, verify: bool = True, - duration: Optional[int] = None, - start_type: str = 'AT_TIMESTAMP', - start_at: str = NOW) -> None: - """Initialize a new stream consumer.""" - logger.info(f'New consumer for {stream_name} ({shard_id})') - self.stream_name = stream_name - self.shard_id = shard_id - self.checkpointer = checkpointer - if self.checkpointer: - self.position = self.checkpointer.position - else: - self.position = None - self.duration = duration - self.start_time = None - self.back_off = back_off - self.batch_size = batch_size - self.sleep_time = 5 - self.start_at = start_at - self.start_type = start_type - logger.info(f'Got start_type={start_type} and start_at={start_at}') - - if not self.stream_name or not self.shard_id: - logger.info( - 'No stream indicated; making no attempt to connect to Kinesis' - ) - return - - logger.info(f'Getting a new connection to Kinesis at {endpoint}' - f' in region {region}, with SSL verification={verify}') - self.client = boto3.client('kinesis', - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - endpoint_url=endpoint, - verify=verify, - region_name=region) - - logger.info(f'Waiting for {self.stream_name} to be available') - try: - self.wait_for_stream() - except (KinesisRequestFailed, StreamNotAvailable): - logger.info('Could not connect to stream; attempting to create') - self.client.create_stream( - StreamName=self.stream_name, - ShardCount=1 - ) - logger.info(f'Created; waiting for {self.stream_name} again') - self.wait_for_stream() - - # Intercept SIGINT and SIGTERM so that we can checkpoint before exit. 
- self.exit = False - signal.signal(signal.SIGINT, self.stop) - signal.signal(signal.SIGTERM, self.stop) - logger.info('Ready to start') - - def stop(self, signal: int, frame: Any) -> None: - """Set exit flag for a graceful stop.""" - logger.error(f'Received signal {signal}') - self._checkpoint() - logger.error('Done') - raise StopProcessing(f'Received signal {signal}') - - @retry(5, 10) - def wait_for_stream(self) -> None: - """ - Wait for the stream to become available. - - If the stream becomes available, returns ``None``. Otherwise, raises - a :class:`.StreamNotAvailable` exception. - - Raises - ------ - :class:`.StreamNotAvailable` - Raised when the stream could not be reached. - - """ - waiter = self.client.get_waiter('stream_exists') - try: - logger.error(f'Waiting for stream {self.stream_name}') - waiter.wait( - StreamName=self.stream_name, - Limit=1, - ExclusiveStartShardId=self.shard_id - ) - except WaiterError as e: - logger.error('Failed to get stream while waiting') - raise StreamNotAvailable('Could not connect to stream') from e - except (PartialCredentialsError, NoCredentialsError) as e: - logger.error('Credentials missing or incomplete: %s', e.msg) - raise ConfigurationError('Credentials missing') from e - - def _get_iterator(self) -> str: - """ - Get a new shard iterator. - - If our position is set, we will start with the record immediately after - that position. Otherwise, we start at ``start_at`` timestamp. - - Returns - ------- - str - The sequence ID of the record on which to start. 
- - """ - params: Dict[str, Any] = dict( - StreamName=self.stream_name, - ShardId=self.shard_id - ) - if self.position: - params.update(dict( - ShardIteratorType='AFTER_SEQUENCE_NUMBER', - StartingSequenceNumber=self.position - )) - elif self.start_type == 'AT_TIMESTAMP' and self.start_at: - start_at = datetime.strptime(self.start_at, '%Y-%m-%dT%H:%M:%S') - params.update(dict( - ShardIteratorType='AT_TIMESTAMP', - Timestamp=( - start_at - datetime.utcfromtimestamp(0) - ).total_seconds() - )) - elif self.start_type == 'TRIM_HORIZON': - params.update(dict(ShardIteratorType='TRIM_HORIZON')) - try: - it: str = self.client.get_shard_iterator(**params)['ShardIterator'] - return it - except self.client.exceptions.InvalidArgumentException as e: - logger.info('Got InvalidArgumentException: %s', str(e)) - # Iterator may not have come from this stream/shard. - if self.position is not None: - self.position = None - return self._get_iterator() - raise KinesisRequestFailed('Could not get shard iterator') - - def _checkpoint(self) -> None: - """ - Checkpoint at the current position. - - The current position is the sequence number of the last record that was - successfully processed. 
- """ - if self.position is not None and self.checkpointer: - self.checkpointer.checkpoint(self.position) - logger.debug(f'Set checkpoint at {self.position}') - - @retry(retries=10, wait=5) - def get_records(self, iterator: str, limit: int) -> Tuple[str, dict]: - """Get the next batch of ``limit`` or fewer records.""" - logger.debug(f'Get more records from {iterator}, limit {limit}') - response = self.client.get_records(ShardIterator=iterator, - Limit=limit) - iterator = response['NextShardIterator'] - return iterator, response - - def _check_timeout(self) -> None: - """If a processing duration is set, exit if duration is exceeded.""" - if not self.start_time or not self.duration: - return - running_for = time.time() - self.start_time - if running_for > self.duration: - logger.info(f'Ran for {running_for} seconds; exiting') - self._checkpoint() - raise StopProcessing(f'Ran for {running_for} seconds; exiting') - - def process_records(self, start: str) -> Tuple[str, int]: - """Retrieve and process records starting at ``start``.""" - logger.debug(f'Get more records, starting at {start}') - processed = 0 - try: - time.sleep(self.sleep_time) # Don't get carried away. - next_start, response = self.get_records(start, self.batch_size) - except Exception as e: - self._checkpoint() - raise StopProcessing('Unhandled exception: %s' % str(e)) from e - - logger.debug('Got %i records', len(response['Records'])) - for record in response['Records']: - self._check_timeout() - - # It is possible that Kinesis will replay the same message several - # times, especially at the end of the stream. There's no point in - # replaying the message, so we'll continue on. - if record['SequenceNumber'] == self.position: - continue - - self.process_record(record) - processed += 1 - - # Setting the position means that we have successfully - # processed this record. - if record['SequenceNumber']: # Make sure it's set. 
- self.position = record['SequenceNumber'] - logger.debug(f'Updated position to {self.position}') - logger.debug(f'Next start is {next_start}') - return next_start, processed - - def go(self) -> None: - """Main processing routine.""" - self.start_time = time.time() - logger.info(f'Starting processing from position {self.position}' - f' on stream {self.stream_name} and shard {self.shard_id}') - - start = self._get_iterator() - while True: - start, processed = self.process_records(start) - if processed > 0: - self._checkpoint() # Checkpoint after every batch. - if start is None: # Shard is closed. - logger.error('Shard closed unexpectedly; no new iterator') - self._checkpoint() - raise StopProcessing('Could not get a new iterator') - self._check_timeout() - - def process_record(self, record: dict) -> None: - """ - Process a single record from the stream. - - Parameters - ---------- - record : dict - - """ - logger.info(f'Processing record {record["SequenceNumber"]}') - logger.debug(f'Process record {record}') - # raise NotImplementedError('Should be implemented by a subclass') diff --git a/search/agent/consumer.py b/search/agent/consumer.py index 8507a5da..16823705 100644 --- a/search/agent/consumer.py +++ b/search/agent/consumer.py @@ -1,15 +1,14 @@ """Provides a record processor for MetadataIsAvailable notifications.""" -from typing import Dict, List import json import os import time -from typing import List, Any, Optional +from typing import List, Any, Optional, Dict from arxiv.base import logging from search.services import metadata, index from search.process import transform from search.domain import DocMeta, Document, asdict -from .base import BaseConsumer +from arxiv.base.agent import BaseConsumer logger = logging.getLogger(__name__) logger.propagate = False diff --git a/search/agent/tests/test_base_consumer.py b/search/agent/tests/test_base_consumer.py deleted file mode 100644 index ede647e5..00000000 --- a/search/agent/tests/test_base_consumer.py +++ 
/dev/null @@ -1,166 +0,0 @@ -"""Tests for :class:`.BaseConsumer`.""" - -from unittest import TestCase, mock -from botocore.exceptions import BotoCoreError, WaiterError, ClientError - -from search.agent.base import BaseConsumer, StreamNotAvailable, StopProcessing - - -class TestBaseConsumer(TestCase): - """Test :class:`.BaseConsumer` behavior and public methods.""" - - def setUp(self): - self.checkpointer = mock.MagicMock() - self.checkpointer.position = None - - @mock.patch('boto3.client') - def test_init(self, mock_client_factory): - """On init, consumer should wait for stream to be available.""" - mock_client = mock.MagicMock() - mock_waiter = mock.MagicMock() - mock_client.get_waiter.return_value = mock_waiter - mock_client_factory.return_value = mock_client - - try: - BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', 'us-east-1', - self.checkpointer) - except Exception as e: - self.fail('If the waiter returns without an exception, no' - ' exception should be raised.') - self.assertEqual(mock_waiter.wait.call_count, 1, - "A boto3 waiter should be used") - - @mock.patch('boto3.client') - def test_init_stream_not_available(self, mock_client_factory): - """If the stream is not available, should raise an exception.""" - mock_client = mock.MagicMock() - mock_waiter = mock.MagicMock() - - def raise_waiter_error(*a, **k): - raise WaiterError('', {}, {}) - - mock_waiter.wait.side_effect = raise_waiter_error - mock_client.get_waiter.return_value = mock_waiter - mock_client_factory.return_value = mock_client - with self.assertRaises(StreamNotAvailable): - BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', 'us-east-1', - self.checkpointer) - - @mock.patch('boto3.client') - def test_iteration(self, mock_client_factory): - """Test iteration behavior.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_records.return_value = { - 'Records': [ - {'SequenceNumber': str(i)} for i in range(10) - ], - 'NextShardIterator': '10' - } 
- consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer) - next_start, processed = consumer.process_records('0') - self.assertGreater(mock_client.get_records.call_count, 0) - self.assertEqual(processed, 10) - self.assertEqual(next_start, '10', "Should return NextShardIterator") - - @mock.patch('boto3.client') - def test_process_records_until_shard_closes(self, mock_client_factory): - """Should call GetRecords until no next iterator is available.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - def get_records(**kwargs): - start = int(kwargs['ShardIterator']) - end = start + int(kwargs['Limit']) - if start > 500: - return {'Records': [], 'NextShardIterator': None} - return { - 'Records': [ - {'SequenceNumber': str(i)} for i in range(start, end) - ], - 'NextShardIterator': str(end + 1) - } - - mock_client.get_records.side_effect = get_records - - batch_size = 50 - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - batch_size=batch_size) - with self.assertRaises(StopProcessing): - consumer.go() - self.assertEqual(mock_client.get_records.call_count, - (500/batch_size) + 1, - "Should call Kinesis GetRecords until no iterator" - " is returned.") - - @mock.patch('boto3.client') - def test_process_records_with_clienterror(self, mock_client_factory): - """Should try to checkpoint before exiting.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - def raise_client_error(*args, **kwargs): - raise ClientError({'Error': {'Code': 'foo'}}, {}) - - mock_client.get_records.side_effect = raise_client_error - - batch_size = 50 - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - batch_size=batch_size) - consumer.position = 'fooposition' 
- try: - consumer.go() - except Exception: - pass - self.assertEqual(self.checkpointer.checkpoint.call_count, 1) - - @mock.patch('boto3.client') - def test_start_from_timestamp(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='AT_TIMESTAMP') - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - self.assertEqual(kwargs['ShardIteratorType'], 'AT_TIMESTAMP') - self.assertIn('Timestamp', kwargs) - - @mock.patch('boto3.client') - def test_start_from_position(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='AT_TIMESTAMP') - consumer.position = 'fooposition' - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - self.assertEqual(kwargs['ShardIteratorType'], 'AFTER_SEQUENCE_NUMBER') - self.assertEqual(kwargs['StartingSequenceNumber'], 'fooposition') - - @mock.patch('boto3.client') - def test_start_from_trim_horizon(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='TRIM_HORIZON') - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - 
self.assertEqual(kwargs['ShardIteratorType'], 'TRIM_HORIZON') - self.assertNotIn('StartingSequenceNumber', kwargs) diff --git a/search/agent/tests/tests.py b/search/agent/tests/test_integration.py similarity index 99% rename from search/agent/tests/tests.py rename to search/agent/tests/test_integration.py index 9ad22496..9fed6ca3 100644 --- a/search/agent/tests/tests.py +++ b/search/agent/tests/test_integration.py @@ -10,7 +10,7 @@ import threading from search.agent import process_stream -from search.agent.base import StopProcessing +from arxiv.base.agent import StopProcessing from search.services import metadata from search.domain import DocMeta from search.factory import create_ui_web_app diff --git a/search/config.py b/search/config.py index cf287772..99f17598 100644 --- a/search/config.py +++ b/search/config.py @@ -57,7 +57,7 @@ application context. """ -APPLICATION_ROOT = os.environ.get('APPLICATION_ROOT', None) +APPLICATION_ROOT = os.environ.get('APPLICATION_ROOT', '/') """ If the application does not occupy a whole domain or subdomain this can be set to the path where the application is configured to live. 
This is for session @@ -224,9 +224,26 @@ FLASKS3_ACTIVE = os.environ.get('FLASKS3_ACTIVE', 0) # Settings for display of release information -RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/8H5OFQ' -RELEASE_NOTES_TEXT = 'Search v0.4 released 2018-07-18' - +RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/giazFQ' +RELEASE_NOTES_TEXT = 'Search v0.5 released 2018-12-20' + + +EXTERNAL_URL_SCHEME = os.environ.get('EXTERNAL_URL_SCHEME', 'https') +BASE_SERVER = os.environ.get('BASE_SERVER', 'arxiv.org') + +URLS = [ + ("pdf", "/pdf/v", BASE_SERVER), + ("abs", "/abs/v", BASE_SERVER), + ("abs_by_id", "/abs/", BASE_SERVER), + ("pdfonly", "/pdf/v", BASE_SERVER), + ("dvi", "/dvi/v", BASE_SERVER), + ("html", "/html/v", BASE_SERVER), + ("ps", "/ps/v", BASE_SERVER), + ("source", "/e-print/v", BASE_SERVER), + ("other", "/format/v", BASE_SERVER), +] + +JWT_SECRET = os.environ.get('JWT_SECRET', 'foosecret') # TODO: one place to set the version, update release notes text, JIRA issue # collector, etc. diff --git a/search/controllers/advanced/__init__.py b/search/controllers/advanced/__init__.py index 22f23d09..dbca1b23 100644 --- a/search/controllers/advanced/__init__.py +++ b/search/controllers/advanced/__init__.py @@ -130,6 +130,11 @@ def search(request_params: MultiDict) -> Response: "your search again. If this problem persists, please " "report it to help@arxiv.org." ) from e + except index.OutsideAllowedRange as e: + raise BadRequest( + "Hello clever friend. You can't get results in that range" + " right now." 
+ ) from e response_data['query'] = q else: logger.debug('form is invalid: %s', str(form.errors)) @@ -173,6 +178,8 @@ def _query_from_form(form: forms.AdvancedSearchForm) -> AdvancedQuery: q = _update_query_with_dates(q, form.date.data) q = _update_query_with_terms(q, form.terms.data) q = _update_query_with_classification(q, form.classification.data) + q.include_cross_list = form.classification.include_cross_list.data \ + == form.classification.INCLUDE_CROSS_LIST if form.include_older_versions.data: q.include_older_versions = True order = form.order.data @@ -184,7 +191,7 @@ def _query_from_form(form: forms.AdvancedSearchForm) -> AdvancedQuery: def _update_query_with_classification(q: AdvancedQuery, data: MultiDict) \ -> AdvancedQuery: - q.primary_classification = ClassificationList() + q.classification = ClassificationList() archives = [ ('computer_science', 'cs'), ('economics', 'econ'), ('eess', 'eess'), ('mathematics', 'math'), ('q_biology', 'q-bio'), @@ -194,19 +201,19 @@ def _update_query_with_classification(q: AdvancedQuery, data: MultiDict) \ if data.get(field): # Fix for these typing issues is coming soon! 
# See: https://github.com/python/mypy/pull/4397 - q.primary_classification.append( - Classification(archive=archive) # type: ignore + q.classification.append( + Classification(archive={'id': archive}) # type: ignore ) if data.get('physics') and 'physics_archives' in data: if 'all' in data['physics_archives']: - q.primary_classification.append( - Classification(group='grp_physics') # type: ignore + q.classification.append( + Classification(group={'id': 'grp_physics'}) # type: ignore ) else: - q.primary_classification.append( + q.classification.append( Classification( # type: ignore - group='grp_physics', - archive=data['physics_archives'] + group={'id': 'grp_physics'}, + archive={'id': data['physics_archives']} ) ) return q diff --git a/search/controllers/advanced/forms.py b/search/controllers/advanced/forms.py index 3512ad54..a5035e7f 100644 --- a/search/controllers/advanced/forms.py +++ b/search/controllers/advanced/forms.py @@ -13,7 +13,7 @@ from wtforms import widgets from arxiv import taxonomy -from search.domain import DateRange +from search.domain import DateRange, AdvancedQuery from search.controllers.util import does_not_start_with_wildcard, \ strip_white_space, has_balanced_quotes @@ -73,21 +73,7 @@ class FieldForm(Form): operator = SelectField("Operator", choices=[ ('AND', 'AND'), ('OR', 'OR'), ('NOT', 'NOT') ], default='AND') - field = SelectField("Field", choices=[ - ('title', 'Title'), - ('author', 'Author(s)'), - ('abstract', 'Abstract'), - ('comments', 'Comments'), - ('journal_ref', 'Journal reference'), - ('acm_class', 'ACM classification'), - ('msc_class', 'MSC classification'), - ('report_num', 'Report number'), - ('paper_id', 'arXiv identifier'), - ('doi', 'DOI'), - ('orcid', 'ORCID'), - ('author_id', 'arXiv author ID'), - ('all', 'All fields') - ]) + field = SelectField("Field", choices=AdvancedQuery.SUPPORTED_FIELDS) class ClassificationForm(Form): @@ -95,6 +81,8 @@ class ClassificationForm(Form): # pylint: disable=too-few-public-methods + # 
TODO: this should not be hard-coded! + # # Map arXiv archives to fields on this form. Ideally we would autogenerate # form fields based on the arXiv taxonomy, but this can't easily happen # until we replace the classic-style advanced interface with faceted @@ -114,6 +102,9 @@ class ClassificationForm(Form): in taxonomy.ARCHIVES_ACTIVE.items() if description['in_group'] == 'grp_physics'] + INCLUDE_CROSS_LIST = 'include' + EXCLUDE_CROSS_LIST = 'exclude' + computer_science = BooleanField('Computer Science (cs)') economics = BooleanField('Economics (econ)') eess = BooleanField('Electrical Engineering and Systems Science (eess)') @@ -124,6 +115,11 @@ class ClassificationForm(Form): q_finance = BooleanField('Quantitative Finance (q-fin)') statistics = BooleanField('Statistics (stat)') + include_cross_list = RadioField('Include cross-list', choices=[ + (INCLUDE_CROSS_LIST, 'Include cross-listed papers'), + (EXCLUDE_CROSS_LIST, 'Exclude cross-listed papers') + ], default=INCLUDE_CROSS_LIST) + def yearInBounds(form: Form, field: DateField) -> None: """May not be prior to 1991, or later than the current year.""" diff --git a/search/controllers/advanced/tests.py b/search/controllers/advanced/tests.py index d547cad7..283e0196 100644 --- a/search/controllers/advanced/tests.py +++ b/search/controllers/advanced/tests.py @@ -392,42 +392,44 @@ def test_classification_is_selected(self): class_data = {'computer_science': True} q = advanced._update_query_with_classification(Query(), class_data) self.assertIsInstance(q, Query) - self.assertIsInstance(q.primary_classification, list) - self.assertEqual(len(q.primary_classification), 1) - self.assertIsInstance(q.primary_classification[0], Classification) - self.assertEqual(q.primary_classification[0].archive, 'cs') + self.assertIsInstance(q.classification, list) + self.assertEqual(len(q.classification), 1) + self.assertIsInstance(q.classification[0], Classification) + self.assertEqual(q.classification[0].archive['id'], 'cs') def 
test_multiple_classifications_are_selected(self): """Selected classifications are added to the query.""" class_data = {'computer_science': True, 'eess': True} q = advanced._update_query_with_classification(Query(), class_data) self.assertIsInstance(q, Query) - self.assertIsInstance(q.primary_classification, list) - self.assertEqual(len(q.primary_classification), 2) - self.assertIsInstance(q.primary_classification[0], Classification) - self.assertIsInstance(q.primary_classification[1], Classification) + self.assertIsInstance(q.classification, list) + self.assertEqual(len(q.classification), 2) + self.assertIsInstance(q.classification[0], Classification) + self.assertIsInstance(q.classification[1], Classification) def test_physics_is_selected_all_archives(self): """The physics group is added to the query.""" class_data = {'physics': True, 'physics_archives': 'all'} q = advanced._update_query_with_classification(Query(), class_data) self.assertIsInstance(q, Query) - self.assertIsInstance(q.primary_classification, list) - self.assertEqual(len(q.primary_classification), 1) - self.assertIsInstance(q.primary_classification[0], Classification) - self.assertIsNone(q.primary_classification[0].archive) - self.assertEqual(q.primary_classification[0].group, 'grp_physics') + self.assertIsInstance(q.classification, list) + self.assertEqual(len(q.classification), 1) + self.assertIsInstance(q.classification[0], Classification) + self.assertIsNone(q.classification[0].archive) + self.assertEqual(q.classification[0].group['id'], + 'grp_physics') def test_physics_is_selected_specific_archive(self): """The physic group and specified archive are added to the query.""" class_data = {'physics': True, 'physics_archives': 'hep-ex'} q = advanced._update_query_with_classification(Query(), class_data) self.assertIsInstance(q, Query) - self.assertIsInstance(q.primary_classification, list) - self.assertEqual(len(q.primary_classification), 1) - self.assertIsInstance(q.primary_classification[0], 
Classification) - self.assertEqual(q.primary_classification[0].archive, 'hep-ex') - self.assertEqual(q.primary_classification[0].group, 'grp_physics') + self.assertIsInstance(q.classification, list) + self.assertEqual(len(q.classification), 1) + self.assertIsInstance(q.classification[0], Classification) + self.assertEqual(q.classification[0].archive['id'], 'hep-ex') + self.assertEqual(q.classification[0].group['id'], + 'grp_physics') def test_physics_is_selected_specific_archive_plus_other_groups(self): """The physics group and specified archive are added to the query.""" @@ -438,10 +440,10 @@ def test_physics_is_selected_specific_archive_plus_other_groups(self): } q = advanced._update_query_with_classification(Query(), class_data) self.assertIsInstance(q, Query) - self.assertIsInstance(q.primary_classification, list) - self.assertEqual(len(q.primary_classification), 2) - self.assertIsInstance(q.primary_classification[0], Classification) - self.assertIsInstance(q.primary_classification[1], Classification) + self.assertIsInstance(q.classification, list) + self.assertEqual(len(q.classification), 2) + self.assertIsInstance(q.classification[0], Classification) + self.assertIsInstance(q.classification[1], Classification) class TestUpdateQueryWithFieldedTerms(TestCase): diff --git a/search/controllers/api/__init__.py b/search/controllers/api/__init__.py new file mode 100644 index 00000000..d689f4e7 --- /dev/null +++ b/search/controllers/api/__init__.py @@ -0,0 +1,200 @@ +"""Controller for search API requests.""" + +from typing import Tuple, Dict, Any, Optional, List +import re +from datetime import date, datetime +from dateutil.relativedelta import relativedelta +import dateutil.parser +from pytz import timezone +import pytz + + +from werkzeug.datastructures import MultiDict, ImmutableMultiDict +from werkzeug.exceptions import InternalServerError, BadRequest, NotFound +from flask import url_for + +from arxiv import status, taxonomy +from arxiv.base import logging + +from 
search.services import index, fulltext, metadata +from search.controllers.util import paginate +from ...domain import Query, APIQuery, FieldedSearchList, FieldedSearchTerm, \ + DateRange, ClassificationList, Classification, asdict, DocumentSet, \ + Document + +logger = logging.getLogger(__name__) +EASTERN = timezone('US/Eastern') + + +def search(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any]]: + """ + Handle a search request from the API. + + Parameters + ---------- + params : :class:`MultiDict` + GET query parameters from the request. + + Returns + ------- + dict + Response data (to serialize). + int + HTTP status code. + dict + Extra headers for the response. + """ + q = APIQuery() + query_terms: List[Dict[str, Any]] = [] + terms = _get_fielded_terms(params, query_terms) + if terms is not None: + q.terms = terms + date_range = _get_date_params(params, query_terms) + if date_range is not None: + q.date_range = date_range + + primary = params.get('primary_classification') + if primary: + primary_classification = _get_classification(primary, + 'primary_classification', + query_terms) + q.primary_classification = primary_classification + + secondaries = params.getlist('secondary_classification') + if secondaries: + q.secondary_classification = [ + _get_classification(sec, 'secondary_classification', query_terms) + for sec in secondaries + ] + + include_fields = _get_include_fields(params, query_terms) + if include_fields: + q.include_fields += include_fields + + q = paginate(q, params) # type: ignore + document_set = index.search(q, highlight=False) + document_set.metadata['query'] = query_terms + logger.debug('Got document set with %i results', len(document_set.results)) + return {'results': document_set, 'query': q}, status.HTTP_200_OK, {} + + +def paper(paper_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]: + """ + Handle a request for paper metadata from the API. 
def _get_date_params(params: MultiDict, query_terms: List) \
        -> Optional[DateRange]:
    """
    Build a :class:`DateRange` from the date parameters of a request.

    Parameters
    ----------
    params : :class:`MultiDict`
        GET query parameters from the request. ``start_date``, ``end_date``
        and ``date_type`` are read; only the first value of each date
        parameter is used.
    query_terms : list
        Every parameter that is applied is appended here as a
        ``{'parameter': ..., 'value': ...}`` dict.

    Returns
    -------
    :class:`DateRange` or None
        ``None`` if none of the date parameters were provided.

    Raises
    ------
    :class:`BadRequest`
        Raised when a date parameter cannot be parsed as a datetime.

    """
    date_params = {}
    for field in ['start_date', 'end_date']:
        value = params.getlist(field)
        if not value:
            continue
        try:
            dt = dateutil.parser.parse(value[0])
        except (ValueError, OverflowError) as e:
            # dateutil raises OverflowError for out-of-range numeric input.
            raise BadRequest({'field': field,
                              'reason': 'invalid datetime'}) from e
        # The wall-clock value is labelled as US/Eastern, as before. pytz
        # zones must be attached with ``localize()``: ``replace(tzinfo=...)``
        # silently uses the zone's LMT offset (-04:56) rather than EST/EDT.
        # NOTE(review): any offset supplied by the client is discarded here,
        # which matches the previous behaviour -- confirm that is intended.
        dt = EASTERN.localize(dt.replace(tzinfo=None))
        date_params[field] = dt
        query_terms.append({'parameter': field, 'value': dt})
    if 'date_type' in params:
        date_params['date_type'] = params.get('date_type')
        query_terms.append({'parameter': 'date_type',
                            'value': date_params['date_type']})
    if date_params:
        return DateRange(**date_params)  # type: ignore
    return None
search.controllers import api +from search.domain import api as api_domain +from search.services.index import IndexConnectionError, QueryError + + +class TestAPISearch(TestCase): + """Tests for :func:`.api.search`.""" + + @mock.patch(f'{api.__name__}.index') + def test_no_params(self, mock_index): + """Request with no parameters.""" + params = MultiDict({}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + self.assertIn("results", data, "Results are returned") + self.assertIn("query", data, "Query object is returned") + expected_fields = api_domain.get_required_fields() \ + + api_domain.get_default_extra_fields() + self.assertEqual(set(data["query"].include_fields), + set(expected_fields), + "Default set of fields is included") + + @mock.patch(f'{api.__name__}.index') + def test_include_fields(self, mock_index): + """Request with specific fields included.""" + extra_fields = ['title', 'abstract', 'authors'] + params = MultiDict({'include': extra_fields}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + self.assertIn("results", data, "Results are returned") + self.assertIn("query", data, "Query object is returned") + expected_fields = api_domain.get_required_fields() + extra_fields + self.assertEqual(set(data["query"].include_fields), + set(expected_fields), + "Requested fields are included") + + @mock.patch(f'{api.__name__}.index') + def test_group_primary_classification(self, mock_index): + """Request with a group as primary classification.""" + group = 'grp_physics' + params = MultiDict({'primary_classification': group}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + self.assertEqual(query.primary_classification[0], + Classification(group={'id': group})) + + 
@mock.patch(f'{api.__name__}.index') + def test_archive_primary_classification(self, mock_index): + """Request with an archive as primary classification.""" + archive = 'physics' + params = MultiDict({'primary_classification': archive}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + self.assertEqual(query.primary_classification[0], + Classification(archive={'id': archive})) + + @mock.patch(f'{api.__name__}.index') + def test_archive_subsumed_classification(self, mock_index): + """Request with a subsumed archive as primary classification.""" + archive = 'chao-dyn' + params = MultiDict({'primary_classification': archive}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 2) + self.assertEqual(query.primary_classification[0], + Classification(archive={'id': archive})) + self.assertEqual(query.primary_classification[1], + Classification(archive={'id': 'nlin.CD'}), + "The canonical archive is used instead") + + @mock.patch(f'{api.__name__}.index') + def test_category_primary_classification(self, mock_index): + """Request with a category as primary classification.""" + category = 'cs.DL' + params = MultiDict({'primary_classification': category}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + self.assertEqual(query.primary_classification[0], + Classification(category={'id': category})) + + @mock.patch(f'{api.__name__}.index') + def test_bad_classification(self, mock_index): + """Request with nonsense as primary classification.""" + params = MultiDict({'primary_classification': 'nonsense'}) + with 
self.assertRaises(BadRequest): + api.search(params) + + @mock.patch(f'{api.__name__}.index') + def test_with_start_date(self, mock_index): + """Request with dates specified.""" + params = MultiDict({'start_date': '1999-01-02'}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertIsNotNone(query.date_range) + self.assertEqual(query.date_range.start_date.year, 1999) + self.assertEqual(query.date_range.start_date.month, 1) + self.assertEqual(query.date_range.start_date.day, 2) + self.assertEqual(query.date_range.date_type, + DateRange.SUBMITTED_CURRENT, + "Submitted date of current version is the default") + + @mock.patch(f'{api.__name__}.index') + def test_with_end_dates_and_type(self, mock_index): + """Request with end date and date type specified.""" + params = MultiDict({'end_date': '1999-01-02', + 'date_type': 'announced_date_first'}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertIsNotNone(query.date_range) + self.assertEqual(query.date_range.end_date.year, 1999) + self.assertEqual(query.date_range.end_date.month, 1) + self.assertEqual(query.date_range.end_date.day, 2) + + self.assertEqual(query.date_range.date_type, + DateRange.ANNOUNCED) diff --git a/search/controllers/simple/__init__.py b/search/controllers/simple/__init__.py index 3a2739a4..e871ada2 100644 --- a/search/controllers/simple/__init__.py +++ b/search/controllers/simple/__init__.py @@ -89,8 +89,8 @@ def search(request_params: MultiDict, arxiv_id = None if arxiv_id: - return {}, status.HTTP_301_MOVED_PERMANENTLY,\ - {'Location': f'https://arxiv.org/abs/{arxiv_id}'} + headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)} + return {}, status.HTTP_301_MOVED_PERMANENTLY, headers # Here we intervene on the user's query to look for holdouts from the # classic search 
system's author indexing syntax (surname_f). We @@ -154,6 +154,11 @@ def search(request_params: MultiDict, "search again. If this problem persists, please report it to " "help@arxiv.org." ) from e + except index.OutsideAllowedRange as e: + raise BadRequest( + "Hello clever friend. You can't get results in that range" + " right now." + ) from e except Exception as e: logger.error('Unhandled exception: %s', str(e)) @@ -242,8 +247,9 @@ def _update_with_archives(q: SimpleQuery, archives: List[str]) -> SimpleQuery: :class:`SimpleQuery` """ logger.debug('Search within %s', archives) - q.primary_classification = ClassificationList([ - Classification(archive=archive) for archive in archives # type: ignore + q.classification = ClassificationList([ + Classification(archive={'id': archive}) # type: ignore + for archive in archives ]) return q diff --git a/search/controllers/simple/forms.py b/search/controllers/simple/forms.py index 9e60163a..2d5b6049 100644 --- a/search/controllers/simple/forms.py +++ b/search/controllers/simple/forms.py @@ -9,28 +9,13 @@ from search.controllers.util import does_not_start_with_wildcard, \ has_balanced_quotes, strip_white_space +from ...domain import Query class SimpleSearchForm(Form): """Provides a simple field-query search form.""" - searchtype = SelectField("Field", choices=[ - ('all', 'All fields'), - ('title', 'Title'), - ('author', 'Author(s)'), - ('abstract', 'Abstract'), - ('comments', 'Comments'), - ('journal_ref', 'Journal reference'), - ('acm_class', 'ACM classification'), - ('msc_class', 'MSC classification'), - ('report_num', 'Report number'), - ('paper_id', 'arXiv identifier'), - ('doi', 'DOI'), - ('orcid', 'ORCID'), - ('author_id', 'arXiv author ID'), - ('help', 'Help pages'), - ('full_text', 'Full text') - ]) + searchtype = SelectField("Field", choices=Query.SUPPORTED_FIELDS) query = StringField('Search or Article ID', filters=[strip_white_space], validators=[does_not_start_with_wildcard, diff --git 
a/search/controllers/simple/tests.py b/search/controllers/simple/tests.py index 6512bc90..acb44db9 100644 --- a/search/controllers/simple/tests.py +++ b/search/controllers/simple/tests.py @@ -94,6 +94,8 @@ def _raiseDocumentNotFound(*args, **kwargs): class TestSearchController(TestCase): """Tests for :func:`.simple.search`.""" + @mock.patch('search.controllers.simple.url_for', + lambda *a, **k: f'https://arxiv.org/{k["paper_id"]}') @mock.patch('search.controllers.simple.index') def test_arxiv_id(self, mock_index): """Query parameter contains an arXiv ID.""" diff --git a/search/controllers/util.py b/search/controllers/util.py index 9110ad3f..a506f090 100644 --- a/search/controllers/util.py +++ b/search/controllers/util.py @@ -52,8 +52,8 @@ def paginate(query: Query, data: dict) -> Query: :class:`.Query` """ - query.page_start = int(data.get('start', 0)) - query.page_size = int(data.get('size', 50)) + query.page_start = max(int(data.get('start', 0)), 0) + query.size = min(int(data.get('size', 50)), Query.MAXIMUM_size) return query diff --git a/search/domain/__init__.py b/search/domain/__init__.py index 8e3e3512..9247c384 100644 --- a/search/domain/__init__.py +++ b/search/domain/__init__.py @@ -11,3 +11,4 @@ # pylint: disable=wildcard-import from .base import * from .advanced import * +from .api import * diff --git a/search/domain/advanced.py b/search/domain/advanced.py index 25b51c9d..c15b4db5 100644 --- a/search/domain/advanced.py +++ b/search/domain/advanced.py @@ -35,8 +35,31 @@ class AdvancedQuery(Query): An advanced query contains fielded search terms and boolean operators. 
""" + SUPPORTED_FIELDS = [ + ('title', 'Title'), + ('author', 'Author(s)'), + ('abstract', 'Abstract'), + ('comments', 'Comments'), + ('journal_ref', 'Journal reference'), + ('acm_class', 'ACM classification'), + ('msc_class', 'MSC classification'), + ('report_num', 'Report number'), + ('paper_id', 'arXiv identifier'), + ('cross_list_category', 'Cross-list category'), + ('doi', 'DOI'), + ('orcid', 'ORCID'), + ('author_id', 'arXiv author ID'), + ('all', 'All fields') + ] + date_range: Optional[DateRange] = None - primary_classification: ClassificationList = field( + + classification: ClassificationList = field( default_factory=ClassificationList ) + """Classification(s) by which to limit results.""" + + include_cross_list: bool = field(default=True) + """If True, secondaries are considered when limiting by classification.""" + terms: FieldedSearchList = field(default_factory=FieldedSearchList) diff --git a/search/domain/api.py b/search/domain/api.py new file mode 100644 index 00000000..1d7604c9 --- /dev/null +++ b/search/domain/api.py @@ -0,0 +1,43 @@ +"""API-specific domain classes.""" + +from .base import DateRange, Query, ClassificationList, Classification, List +from .advanced import FieldedSearchList, FieldedSearchTerm + +from dataclasses import dataclass, field +from typing import NamedTuple, Optional, Tuple + + +def get_default_extra_fields() -> List[str]: + """These are the default extra fields.""" + return ['title'] + + +def get_required_fields() -> List[str]: + """These fields should always be included.""" + return ['paper_id', 'paper_id_v', 'version', 'href', 'canonical'] + + +@dataclass +class APIQuery(Query): + """ + Represents an API query. + + Similar to an advanced query. + """ + + date_range: Optional[DateRange] = None + primary_classification: Tuple[Classification, ...] 
@dataclass
class Person:
    """Represents an author, owner, or other person in metadata."""

    full_name: str
    last_name: str = field(default_factory=str)
    first_name: str = field(default_factory=str)
    suffix: str = field(default_factory=str)

    affiliation: List[str] = field(default_factory=list)
    """Institutional affiliations."""

    orcid: Optional[str] = field(default=None)
    """ORCID identifier."""

    author_id: Optional[str] = field(default=None)
    """Legacy arXiv author identifier."""

    @classmethod
    def fields(cls) -> List[str]:
        """
        Get the names of fields on this class.

        Returns
        -------
        list
            Field names, in declaration order.
        """
        # Materialise the keys: the annotation promises a list, and a
        # ``dict_keys`` view can be neither indexed nor JSON-serialized.
        return list(cls.__dataclass_fields__)  # type: ignore
@property
def group_display(self) -> str:
    """
    Get a human-friendly display label for the group.

    Returns an empty string when no group is set. An explicit ``name`` on
    the group takes precedence; otherwise the label is looked up in the
    taxonomy by the group's ``id``.
    """
    group = self.group
    if group is None:
        return ""
    if "name" in group:
        return group["name"]
    return taxonomy.get_group_display(group["id"])
@property
def page(self) -> int:
    """
    Get the approximate (1-based) page number.

    Returns
    -------
    int
        ``1`` plus the rounded number of pages of ``size`` results that
        precede ``page_start``. Always ``1`` when ``size`` is not positive.
    """
    # ``size`` comes from the request and only has an upper bound applied,
    # so ``?size=0`` would otherwise raise ZeroDivisionError here.
    if self.size <= 0:
        return 1
    return 1 + int(round(self.page_start / self.size))
class ISO8601JSONEncoder(JSONEncoder):
    """JSON encoder that emits dates and datetimes as ISO 8601 strings."""

    def default(self, obj: Any) -> Union[str, List[Any]]:
        """
        Encode objects that the stock encoder does not handle.

        Dates and datetimes are rendered with ``isoformat()``; any other
        iterable is rendered as a list; everything else is delegated to
        the parent encoder.
        """
        try:
            # ``datetime`` is a subclass of ``date``, so one check covers both.
            if isinstance(obj, date):
                return obj.isoformat()
            members = iter(obj)
        except TypeError:
            # Not iterable: let the parent encoder handle or reject it.
            return JSONEncoder.default(self, obj)   # type: ignore
        return list(members)
app.register_blueprint(api.blueprint) + + wrap(app, [request_logs.ClassicLogsMiddleware, + auth.middleware.AuthMiddleware]) + + for error, handler in api.exceptions.get_handlers(): + app.errorhandler(error)(handler) + + return app diff --git a/search/routes/api/__init__.py b/search/routes/api/__init__.py new file mode 100644 index 00000000..a2dc6fb5 --- /dev/null +++ b/search/routes/api/__init__.py @@ -0,0 +1,50 @@ +"""Provides routing blueprint from the search API.""" + +import json +from typing import Dict, Callable, Union, Any, Optional, List +from functools import wraps +from urllib.parse import urljoin, urlparse, parse_qs, urlencode, urlunparse + +from flask.json import jsonify +from flask import Blueprint, render_template, redirect, request, Response, \ + url_for +from werkzeug.urls import Href, url_encode, url_parse, url_unparse, url_encode +from werkzeug.datastructures import MultiDict, ImmutableMultiDict + +from arxiv import status +from arxiv.base import logging +from werkzeug.exceptions import InternalServerError +from search.controllers import api + +from . 
import serialize, exceptions + +from arxiv.users.auth.decorators import scoped +from arxiv.users.auth import scopes + +logger = logging.getLogger(__name__) + +blueprint = Blueprint('api', __name__, url_prefix='/') + +ATOM_XML = "application/atom+xml" +JSON = "application/json" + + +@blueprint.route('/', methods=['GET']) +@scoped(required=scopes.READ_PUBLIC) +def search() -> Response: + """Main query endpoint.""" + logger.debug('Got query: %s', request.args) + data, status_code, headers = api.search(request.args) + # requested = request.accept_mimetypes.best_match([JSON, ATOM_XML]) + # if requested == ATOM_XML: + # return serialize.as_atom(data), status, headers + response_data = serialize.as_json(data['results'], query=data['query']) + return response_data, status_code, headers + + +@blueprint.route('v', methods=['GET']) +@scoped(required=scopes.READ_PUBLIC) +def paper(paper_id: str, version: str) -> Response: + """Document metadata endpoint.""" + data, status_code, headers = api.paper(f'{paper_id}v{version}') + return serialize.as_json(data['results']), status_code, headers diff --git a/search/routes/api/exceptions.py b/search/routes/api/exceptions.py new file mode 100644 index 00000000..bdf9db84 --- /dev/null +++ b/search/routes/api/exceptions.py @@ -0,0 +1,109 @@ +""" +Exception handlers for API endpoints. + +.. todo:: This module belongs in :mod:`arxiv.base`. 
+ +""" + +from typing import Callable, List, Tuple + +from werkzeug.exceptions import NotFound, Forbidden, Unauthorized, \ + MethodNotAllowed, RequestEntityTooLarge, BadRequest, InternalServerError, \ + HTTPException +from flask import make_response, Response, jsonify + +from arxiv import status +from arxiv.base import logging + +logger = logging.getLogger(__name__) + +_handlers = [] + + +def handler(exception: type) -> Callable: + """Generate a decorator to register a handler for an exception.""" + def deco(func: Callable) -> Callable: + """Register a function as an exception handler.""" + _handlers.append((exception, func)) + return func + return deco + + +def get_handlers() -> List[Tuple[type, Callable]]: + """ + Get a list of registered exception handlers. + + Returns + ------- + list + List of (:class:`.HTTPException`, callable) tuples. + """ + return _handlers + + +@handler(NotFound) +def handle_not_found(error: NotFound) -> Response: + """Render the base 404 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = make_response(rendered) + response.status_code = status.HTTP_404_NOT_FOUND + return response + + +@handler(Forbidden) +def handle_forbidden(error: Forbidden) -> Response: + """Render the base 403 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = make_response(rendered) + response.status_code = status.HTTP_403_FORBIDDEN + return response + + +@handler(Unauthorized) +def handle_unauthorized(error: Unauthorized) -> Response: + """Render the base 401 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = make_response(rendered) + response.status_code = status.HTTP_401_UNAUTHORIZED + return response + + +@handler(MethodNotAllowed) +def handle_method_not_allowed(error: MethodNotAllowed) -> Response: + """Render the base 405 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = 
make_response(rendered) + response.status_code = status.HTTP_405_METHOD_NOT_ALLOWED + return response + + +@handler(RequestEntityTooLarge) +def handle_request_entity_too_large(error: RequestEntityTooLarge) -> Response: + """Render the base 413 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = make_response(rendered) + response.status_code = status.HTTP_413_REQUEST_ENTITY_TOO_LARGE + return response + + +@handler(BadRequest) +def handle_bad_request(error: BadRequest) -> Response: + """Render the base 400 error page.""" + rendered = jsonify({'code': error.code, 'error': error.description}) + response = make_response(rendered) + response.status_code = status.HTTP_400_BAD_REQUEST + return response + + +@handler(InternalServerError) +def handle_internal_server_error(error: InternalServerError) -> Response: + """Render the base 500 error page.""" + if isinstance(error, HTTPException): + rendered = jsonify({'code': error.code, 'error': error.description}) + else: + logger.error('Caught unhandled exception: %s', error) + rendered = jsonify({'code': status.HTTP_500_INTERNAL_SERVER_ERROR, + 'error': 'Unexpected error'}) + response = make_response(rendered) + response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR + return response diff --git a/search/routes/api/serialize.py b/search/routes/api/serialize.py new file mode 100644 index 00000000..910d172a --- /dev/null +++ b/search/routes/api/serialize.py @@ -0,0 +1,205 @@ +"""Serializers for API responses.""" + +from typing import Union, Optional +from lxml import etree +from flask import jsonify, url_for + +from arxiv import status +from search.domain import DocumentSet, Document, Classification, Person, \ + APIQuery + + +class BaseSerializer(object): + """Base class for API serializers.""" + + +class JSONSerializer(BaseSerializer): + """Serializes a :class:`DocumentSet` as JSON.""" + + @classmethod + def _transform_person(cls, person: Person) -> dict: + return { + 
'first_name': person.first_name, + 'last_name': person.last_name, + 'suffix': person.suffix, + 'affiliation': person.affiliation, + 'orcid': person.orcid, + 'author_id': person.author_id, + 'full_name': person.full_name, + } + + @classmethod + def _transform_classification(cls, clsn: Classification) -> Optional[dict]: + if clsn.category is None: + return None + return { + 'group': clsn.group, + 'archive': clsn.archive, + 'category': clsn.category + } + + @classmethod + def _transform_format(cls, fmt: str, paper_id: str, version: int) -> dict: + return { + "format": fmt, + "href": url_for(fmt, paper_id=paper_id, version=version) + } + + @classmethod + def _transform_latest(cls, document: Document) -> Optional[dict]: + if not document.latest: + return None + return { + "paper_id": document.latest, + "href": url_for("api.paper", paper_id=document.paper_id, + version=document.latest_version, + _external=True), + "canonical": url_for("abs", paper_id=document.paper_id, + version=document.latest_version), + "version": document.latest_version + } + + @classmethod + def _transform_license(cls, license: dict) -> dict: + return { + 'label': license['label'], + 'href': license['uri'] + } + + @classmethod + def transform_document(cls, doc: Document, + query: Optional[APIQuery] = None) -> dict: + """Select a subset of :class:`Document` properties for public API.""" + fields = [ + ('abs_categories', doc.abs_categories), + ('abstract', doc.abstract), + ('acm_class', doc.acm_class), + ('owners', [ + cls._transform_person(owner) for owner in doc.owners + if owner is not None + ]), + ('authors', [ + cls._transform_person(author) for author in doc.authors + if author is not None + ]), + ('comments', doc.comments), + ('authors_freeform', doc.authors_freeform), + ('submitted_date', doc.submitted_date), + ('submitted_date_first', doc.submitted_date_first), + ('announced_date_first', ( + doc.announced_date_first.strftime('%Y-%m') + if doc.announced_date_first is not None + else None + )), 
+ ('paper_id', doc.paper_id), + ('paper_id_v', doc.paper_id_v), + ('doi', doc.doi), + ('formats', [ + cls._transform_format(fmt, doc.paper_id, doc.version) + for fmt in doc.formats + ]), + ('is_current', doc.is_current), + ('is_withdrawn', doc.is_withdrawn), + ('journal_ref', doc.journal_ref), + ('license', + cls._transform_license(doc.license) if doc.license else None), + ('msc_class', doc.msc_class), + ('primary_classification', + cls._transform_classification(doc.primary_classification) + if doc.primary_classification else None), + ('secondary_classification', [ + cls._transform_classification(clsn) + for clsn in doc.secondary_classification + ]), + ('report_num', doc.report_num), + ('source', doc.source), # TODO, link? + ('submitter', ( + cls._transform_person(doc.submitter) + if doc.submitter is not None else None + )), + ('title', doc.title), + ('version', doc.version), + ('latest', cls._transform_latest(doc)), + ('href', url_for("api.paper", paper_id=doc.paper_id, + version=doc.version, _external=True)), + ('canonical', url_for("abs", paper_id=doc.paper_id, + version=doc.version)) + ] + + # Only return fields that have been explicitly requested. 
def as_json(document_or_set: Union[DocumentSet, Document],
            query: Optional[APIQuery] = None) -> str:
    """
    Serialize a :class:`DocumentSet` or a single :class:`Document` as JSON.

    Parameters
    ----------
    document_or_set : :class:`DocumentSet` or :class:`Document`
        The search results, or the single document, to serialize.
    query : :class:`APIQuery` or None
        If provided, it is passed on to the serializer, which only emits
        the fields listed in ``query.include_fields``.

    Returns
    -------
    str
        Whatever :class:`JSONSerializer` produces for the input.
    """
    # ``isinstance`` rather than an exact ``type(...) is`` comparison, so a
    # subclass of DocumentSet is not mistaken for a single document.
    if isinstance(document_or_set, DocumentSet):
        return JSONSerializer.serialize(document_or_set, query=query)
    return JSONSerializer.serialize_document(document_or_set, query=query)
+class AtomXMLSerializer(BaseSerializer): + """Atom XML serializer for paper metadata.""" + + ATOM = "http://www.w3.org/2005/Atom" + OPENSEARCH = "http://a9.com/-/spec/opensearch/1.1/" + ARXIV = "http://arxiv.org/schemas/atom" + NSMAP = { + None: ATOM, + "opensearch": OPENSEARCH, + "arxiv": ARXIV + } +# fields = { +# 'title': '{%s}title' % ATOM, +# 'id': '{%s}id' % ATOM, +# 'submitted_date': '{%s}published' % ATOM, +# 'modified_date': '{%s}updated' % ATOM, +# 'abstract': '{%s}summary' % ATOM, +# '' +# } +# +# def __init__(cls, *args, **kwargs) -> None: +# super(AtomXMLSerializer, cls).__init__(*args, **kwargs) +# cls._root = etree.Element('feed', nsmap=cls.NSMAP) +# +# def transform(cls): +# for document in cls.iter_documents(): +# +# +# +# def __repr__(cls) -> str: +# return etree.tostring(cls._root, pretty_print=True) diff --git a/search/routes/api/tests/__init__.py b/search/routes/api/tests/__init__.py new file mode 100644 index 00000000..1b3faf02 --- /dev/null +++ b/search/routes/api/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for API routes.""" diff --git a/search/routes/api/tests/test_api.py b/search/routes/api/tests/test_api.py new file mode 100644 index 00000000..1f1c47f7 --- /dev/null +++ b/search/routes/api/tests/test_api.py @@ -0,0 +1,199 @@ +"""Tests for API routes.""" + +import os +import json +from datetime import datetime +from unittest import TestCase, mock + +import jsonschema + +from arxiv.users import helpers, auth +from arxiv.users.domain import Scope +from arxiv import status + +from search import factory +from search import domain + + +class TestAPISearchRequests(TestCase): + """Requests against the main search API.""" + + SCHEMA_PATH = os.path.abspath('schema/resources/DocumentSet.json') + + def setUp(self): + """Instantiate and configure an API app.""" + jwt_secret = 'foosecret' + os.environ['JWT_SECRET'] = jwt_secret + self.app = factory.create_api_web_app() + self.app.config['JWT_SECRET'] = jwt_secret + self.client = self.app.test_client() + 
+ with open(self.SCHEMA_PATH) as f: + self.schema = json.load(f) + + def test_request_without_token(self): + """No auth token is provided on the request.""" + response = self.client.get('/') + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + def test_with_token_lacking_scope(self): + """Client auth token lacks required public read scope.""" + token = helpers.generate_token('1234', 'foo@bar.com', 'foouser', + scope=[Scope('something', 'read')]) + response = self.client.get('/', headers={'Authorization': token}) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + @mock.patch(f'{factory.__name__}.api.api') + def test_with_valid_token(self, mock_controller): + """Client auth token has required public read scope.""" + document = domain.Document( + submitted_date=datetime.now(), + submitted_date_first=datetime.now(), + announced_date_first=datetime.now(), + id='1234.5678', + abstract='very abstract', + authors=[ + domain.Person(full_name='F. Bar', orcid='1234-5678-9012-3456') + ], + submitter=domain.Person(full_name='S. 
Ubmitter', author_id='su_1'), + modified_date=datetime.now(), + updated_date=datetime.now(), + is_current=True, + is_withdrawn=False, + license={ + 'uri': 'http://foo.license/1', + 'label': 'Notalicense 5.4' + }, + paper_id='1234.5678', + paper_id_v='1234.5678v6', + title='tiiiitle', + source={ + 'flags': 'A', + 'format': 'pdftotex', + 'size_bytes': 2 + }, + version=6, + latest='1234.5678v6', + latest_version=6, + report_num='somenum1', + msc_class=['c1'], + acm_class=['z2'], + journal_ref='somejournal (1991): 2-34', + doi='10.123456/7890', + comments='very science', + abs_categories='astro-ph.CO foo.BR', + formats=['pdf', 'other'], + primary_classification=domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BR', 'name': 'Foo Category'}, + ), + secondary_classification=[ + domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BZ', 'name': 'Baz Category'}, + ) + ] + ) + docs = domain.DocumentSet( + results=[document], + metadata={'start': 0, 'end': 1, 'size': 50, 'total': 1} + ) + r_data = {'results': docs, 'query': domain.APIQuery()} + mock_controller.search.return_value = r_data, status.HTTP_200_OK, {} + token = helpers.generate_token('1234', 'foo@bar.com', 'foouser', + scope=[auth.scopes.READ_PUBLIC]) + response = self.client.get('/', headers={'Authorization': token}) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + data = json.loads(response.data) + res = jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + self.assertIsNone(jsonschema.validate(data, self.schema, resolver=res), + 'Response content is valid per schema') + + for field in domain.api.get_required_fields(): + self.assertIn(field, data['results'][0]) + + @mock.patch(f'{factory.__name__}.api.api') + def test_with_valid_token_limit_fields(self, mock_controller): + 
"""Client auth token has required public read scope.""" + document = domain.Document( + submitted_date=datetime.now(), + submitted_date_first=datetime.now(), + announced_date_first=datetime.now(), + id='1234.5678', + abstract='very abstract', + authors=[ + domain.Person(full_name='F. Bar', orcid='1234-5678-9012-3456') + ], + submitter=domain.Person(full_name='S. Ubmitter', author_id='su_1'), + modified_date=datetime.now(), + updated_date=datetime.now(), + is_current=True, + is_withdrawn=False, + license={ + 'uri': 'http://foo.license/1', + 'label': 'Notalicense 5.4' + }, + paper_id='1234.5678', + paper_id_v='1234.5678v6', + title='tiiiitle', + source={ + 'flags': 'A', + 'format': 'pdftotex', + 'size_bytes': 2 + }, + version=6, + latest='1234.5678v6', + latest_version=6, + report_num='somenum1', + msc_class=['c1'], + acm_class=['z2'], + journal_ref='somejournal (1991): 2-34', + doi='10.123456/7890', + comments='very science', + abs_categories='astro-ph.CO foo.BR', + formats=['pdf', 'other'], + primary_classification=domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BR', 'name': 'Foo Category'}, + ), + secondary_classification=[ + domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BZ', 'name': 'Baz Category'}, + ) + ] + ) + docs = domain.DocumentSet( + results=[document], + metadata={'start': 0, 'end': 1, 'size': 50, 'total': 1} + ) + + query = domain.APIQuery(include_fields=['abstract', 'license']) + r_data = {'results': docs, 'query': query} + mock_controller.search.return_value = r_data, status.HTTP_200_OK, {} + token = helpers.generate_token('1234', 'foo@bar.com', 'foouser', + scope=[auth.scopes.READ_PUBLIC]) + response = self.client.get('/', headers={'Authorization': token}) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + data = json.loads(response.data) + res = 
jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + self.assertIsNone(jsonschema.validate(data, self.schema, resolver=res), + 'Response content is valid per schema') + + for field in domain.api.get_required_fields(): + self.assertEqual( + set(data['results'][0].keys()), + set(query.include_fields) + ) diff --git a/search/routes/api/tests/test_serialize.py b/search/routes/api/tests/test_serialize.py new file mode 100644 index 00000000..8d3ed542 --- /dev/null +++ b/search/routes/api/tests/test_serialize.py @@ -0,0 +1,161 @@ +"""Tests for serializers.""" + +import os +from unittest import TestCase, mock +from datetime import datetime +import json +import jsonschema +from .... import domain, encode +from .. import serialize + + +def mock_jsonify(o): + return json.dumps(o, cls=encode.ISO8601JSONEncoder) + + +class TestSerializeJSONDocument(TestCase): + """Serialize a single :class:`domain.Document` as JSON.""" + + SCHEMA_PATH = os.path.abspath('schema/resources/Document.json') + + def setUp(self): + with open(self.SCHEMA_PATH) as f: + self.schema = json.load(f) + + @mock.patch(f'{serialize.__name__}.url_for', lambda *a, **k: 'http://f/12') + @mock.patch(f'{serialize.__name__}.jsonify', mock_jsonify) + def test_to_json(self): + """Just your run-of-the-mill arXiv document generates valid JSON.""" + document = domain.Document( + submitted_date=datetime.now(), + submitted_date_first=datetime.now(), + announced_date_first=datetime.now(), + id='1234.5678', + abstract='very abstract', + authors=[ + domain.Person(full_name='F. Bar', orcid='1234-5678-9012-3456') + ], + submitter=domain.Person(full_name='S. 
Ubmitter', author_id='su_1'), + modified_date=datetime.now(), + updated_date=datetime.now(), + is_current=True, + is_withdrawn=False, + license={ + 'uri': 'http://foo.license/1', + 'label': 'Notalicense 5.4' + }, + paper_id='1234.5678', + paper_id_v='1234.5678v6', + title='tiiiitle', + source={ + 'flags': 'A', + 'format': 'pdftotex', + 'size_bytes': 2 + }, + version=6, + latest='1234.5678v6', + latest_version=6, + report_num='somenum1', + msc_class=['c1'], + acm_class=['z2'], + journal_ref='somejournal (1991): 2-34', + doi='10.123456/7890', + comments='very science', + abs_categories='astro-ph.CO foo.BR', + formats=['pdf', 'other'], + primary_classification=domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BR', 'name': 'Foo Category'}, + ), + secondary_classification=[ + domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BZ', 'name': 'Baz Category'}, + ) + ] + ) + srlzd = serialize.as_json(document) + res = jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + self.assertIsNone( + jsonschema.validate(json.loads(srlzd), self.schema, resolver=res) + ) + + +class TestSerializeJSONDocumentSet(TestCase): + """Serialize a :class:`domain.DocumentSet` as JSON.""" + + SCHEMA_PATH = os.path.abspath('schema/resources/DocumentSet.json') + + def setUp(self): + with open(self.SCHEMA_PATH) as f: + self.schema = json.load(f) + + @mock.patch(f'{serialize.__name__}.url_for', lambda *a, **k: 'http://f/12') + @mock.patch(f'{serialize.__name__}.jsonify', mock_jsonify) + def test_to_json(self): + """Just your run-of-the-mill arXiv document generates valid JSON.""" + document = domain.Document( + submitted_date=datetime.now(), + submitted_date_first=datetime.now(), + announced_date_first=datetime.now(), + id='1234.5678', + abstract='very abstract', + authors=[ + 
domain.Person(full_name='F. Bar', orcid='1234-5678-9012-3456') + ], + submitter=domain.Person(full_name='S. Ubmitter', author_id='su_1'), + modified_date=datetime.now(), + updated_date=datetime.now(), + is_current=True, + is_withdrawn=False, + license={ + 'uri': 'http://foo.license/1', + 'label': 'Notalicense 5.4' + }, + paper_id='1234.5678', + paper_id_v='1234.5678v6', + title='tiiiitle', + source={ + 'flags': 'A', + 'format': 'pdftotex', + 'size_bytes': 2 + }, + version=6, + latest='1234.5678v6', + latest_version=6, + report_num='somenum1', + msc_class=['c1'], + acm_class=['z2'], + journal_ref='somejournal (1991): 2-34', + doi='10.123456/7890', + comments='very science', + abs_categories='astro-ph.CO foo.BR', + formats=['pdf', 'other'], + primary_classification=domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BR', 'name': 'Foo Category'}, + ), + secondary_classification=[ + domain.Classification( + group={'id': 'foo', 'name': 'Foo Group'}, + archive={'id': 'foo', 'name': 'Foo Archive'}, + category={'id': 'foo.BZ', 'name': 'Baz Category'}, + ) + ] + ) + meta = {'start': 0, 'size': 50, 'end': 50, 'total': 500202} + document_set = domain.DocumentSet(results=[document], metadata=meta) + srlzd = serialize.as_json(document_set) + res = jsonschema.RefResolver( + 'file://%s/' % os.path.abspath(os.path.dirname(self.SCHEMA_PATH)), + None + ) + self.assertIsNone( + jsonschema.validate(json.loads(srlzd), self.schema, resolver=res) + ) diff --git a/search/routes/ui.py b/search/routes/ui.py index b0ac8be5..efe8341d 100644 --- a/search/routes/ui.py +++ b/search/routes/ui.py @@ -151,12 +151,12 @@ def external_url(service: str, name: str, **parameters: Any) \ @blueprint.context_processor def url_for_page_builder() -> Dict[str, Callable]: """Add a page URL builder function to the template context.""" - def url_for_page(page: int, page_size: int) -> str: + def url_for_page(page: int, size: int) 
-> str: """Build an URL to for a search result page.""" rule = request.url_rule parts = url_parse(url_for(rule.endpoint)) args = request.args.copy() - args['start'] = (page - 1) * page_size + args['start'] = (page - 1) * size parts = parts.replace(query=url_encode(args)) url: str = url_unparse(parts) return url diff --git a/search/services/index/__init__.py b/search/services/index/__init__.py index 58f00672..dba9db93 100644 --- a/search/services/index/__init__.py +++ b/search/services/index/__init__.py @@ -33,14 +33,15 @@ from search.context import get_application_config, get_application_global from arxiv.base import logging from search.domain import Document, DocumentSet, Query, AdvancedQuery, \ - SimpleQuery, asdict + SimpleQuery, asdict, APIQuery from .exceptions import QueryError, IndexConnectionError, DocumentNotFound, \ IndexingError, OutsideAllowedRange, MappingError from .util import MAX_RESULTS from .advanced import advanced_search from .simple import simple_search -from .highlighting import highlight +from .api import api_search +from . import highlighting from . import results logger = logging.getLogger(__name__) @@ -75,6 +76,9 @@ def handle_es_exceptions() -> Generator: elif e.error == 'parsing_exception': logger.error('ES parsing_exception: %s', e.info) raise QueryError(e.info) from e + elif e.status_code == 404: + logger.error('Caught NotFoundError: %s', e) + raise DocumentNotFound('No such document') logger.error('Problem communicating with ES: %s' % e.error) raise IndexConnectionError( 'Problem communicating with ES: %s' % e.error @@ -270,7 +274,8 @@ def add_document(self, document: Document) -> None: Parameters ---------- document : :class:`.Document` - Must be a valid search document, per ``schema/Document.json``. + Must be a valid search document, per + ``schema/DocumentMetadata.json``. 
Raises ------ @@ -297,7 +302,8 @@ def bulk_add_documents(self, documents: List[Document], Parameters ---------- document : :class:`.Document` - Must be a valid search document, per ``schema/Document.json``. + Must be a valid search document, per + ``schema/DocumentMetadata.json``. docs_per_chunk: int Number of documents to send to ES in a single chunk Raises @@ -355,10 +361,10 @@ def get_document(self, document_id: int) -> Document: if not record: logger.error("No such document: %s", document_id) raise DocumentNotFound('No such document') - return Document(**record['_source']) # type: ignore + return results.to_document(record['_source'], highlight=False) # See https://github.com/python/mypy/issues/3937 - def search(self, query: Query) -> DocumentSet: + def search(self, query: Query, highlight: bool = True) -> DocumentSet: """ Perform a search. @@ -379,7 +385,7 @@ def search(self, query: Query) -> DocumentSet: """ # Make sure that the user is not requesting a nonexistant page. - max_pages = int(MAX_RESULTS/query.page_size) + max_pages = int(MAX_RESULTS/query.size) if query.page > max_pages: _message = f'Requested page {query.page}, but max is {max_pages}' logger.error(_message) @@ -393,20 +399,29 @@ def search(self, query: Query) -> DocumentSet: current_search = advanced_search(current_search, query) elif isinstance(query, SimpleQuery): current_search = simple_search(current_search, query) + elif isinstance(query, APIQuery): + current_search = api_search(current_search, query) except TypeError as e: - logger.error('Malformed query: %s', str(e)) - raise QueryError('Malformed query') from e + raise e + # logger.error('Malformed query: %s', str(e)) + # raise QueryError('Malformed query') from e + + if highlight: + # Highlighting is performed by Elasticsearch; here we include the + # fields and configuration for highlighting. 
+ current_search = highlighting.highlight(current_search) - # Highlighting is performed by Elasticsearch; here we include the - # fields and configuration for highlighting. - current_search = highlight(current_search) + if isinstance(query, APIQuery): + current_search = current_search.extra( + _source={'include': query.include_fields} + ) with handle_es_exceptions(): # Slicing the search adds pagination parameters to the request. resp = current_search[query.page_start:query.page_end].execute() # Perform post-processing on the search results. - return results.to_documentset(query, resp) + return results.to_documentset(query, resp, highlight=highlight) def exists(self, paper_id_v: str) -> bool: """Determine whether a paper exists in the index.""" @@ -456,9 +471,9 @@ def current_session() -> SearchSession: @wraps(SearchSession.search) -def search(query: Query) -> DocumentSet: +def search(query: Query, highlight: bool = True) -> DocumentSet: """Retrieve search results.""" - return current_session().search(query) + return current_session().search(query, highlight=highlight) @wraps(SearchSession.add_document) diff --git a/search/services/index/advanced.py b/search/services/index/advanced.py index f43cc03c..d33fa3d1 100644 --- a/search/services/index/advanced.py +++ b/search/services/index/advanced.py @@ -1,13 +1,16 @@ """Supports the advanced search feature.""" -from typing import Any +from typing import Any, Union + +from functools import reduce, wraps +from operator import ior, iand from elasticsearch_dsl import Search, Q, SF from elasticsearch_dsl.query import Range, Match, Bool from search.domain import AdvancedQuery, Classification -from .prepare import SEARCH_FIELDS +from .prepare import SEARCH_FIELDS, limit_by_classification from .util import sort @@ -20,7 +23,7 @@ def advanced_search(search: Search, query: AdvancedQuery) -> Search: search : :class:`.Search` An Elasticsearch search in preparation. 
query : :class:`.AdvancedQuery` - An advanced query, originating from the advanced search controller. + A query originating from the advanced search UI. Returns ------- @@ -33,10 +36,14 @@ def advanced_search(search: Search, query: AdvancedQuery) -> Search: # behavior of faceted search. if not query.include_older_versions: search = search.filter("term", is_current=True) + _q_clsn = limit_by_classification(query.classification) + if query.include_cross_list: + _q_clsn |= limit_by_classification(query.classification, + "secondary_classification") q = ( _fielded_terms_to_q(query) & _date_range(query) - & _classifications(query) + & _q_clsn ) if query.order is None or query.order == 'relevance': # Boost the current version heavily when sorting by relevance. @@ -50,30 +57,6 @@ def advanced_search(search: Search, query: AdvancedQuery) -> Search: return search -def _classification(field: str, classification: Classification) -> Match: - """Get a query part for a :class:`.Classification`.""" - query = Q() - if classification.group: - field_name = '%s__group__id' % field - query &= Q('match', **{field_name: classification.group}) - if classification.archive: - field_name = '%s__archive__id' % field - query &= Q('match', **{field_name: classification.archive}) - return query - - -def _classifications(q: AdvancedQuery) -> Match: - """Get a query part for classifications on an :class:`.AdvancedQuery`.""" - if not q.primary_classification: - return Q() - query = _classification('primary_classification', - q.primary_classification[0]) - if len(q.primary_classification) > 1: - for classification in q.primary_classification[1:]: - query |= _classification('primary_classification', classification) - return query - - def _date_range(q: AdvancedQuery) -> Range: """Generate a query part for a date range.""" if not q.date_range: diff --git a/search/services/index/api.py b/search/services/index/api.py new file mode 100644 index 00000000..1d76c122 --- /dev/null +++ 
b/search/services/index/api.py @@ -0,0 +1,132 @@ +"""Supports the API search feature.""" + +from typing import Any, Union + +from functools import reduce, wraps +from operator import ior, iand + +from elasticsearch_dsl import Search, Q, SF +from elasticsearch_dsl.query import Range, Match, Bool + +from search.domain import Classification, APIQuery + +from .prepare import SEARCH_FIELDS, query_primary_exact, query_secondary_exact +from .util import sort + + +def api_search(search: Search, query: APIQuery) -> Search: + """ + Prepare a :class:`.Search` from a :class:`.APIQuery`. + + Parameters + ---------- + search : :class:`.Search` + An Elasticsearch search in preparation. + query : :class:`.APIQuery` + A query originating from the API. + + Returns + ------- + :class:`.Search` + The passed ES search object, updated with specific query parameters + that implement the API query. + + """ + # Classification and date are treated as filters; this foreshadows the + # behavior of faceted search. + if not query.include_older_versions: + search = search.filter("term", is_current=True) + + _q_clsn = Q() + if query.primary_classification: + _q_clsn &= reduce(ior, map(query_primary_exact, + list(query.primary_classification))) + if query.secondary_classification: + for classification in query.secondary_classification: + _q_clsn &= reduce(ior, map(query_secondary_exact, + list(classification))) + q = ( + _fielded_terms_to_q(query) + & _date_range(query) + & _q_clsn + ) + if query.order is None or query.order == 'relevance': + # Boost the current version heavily when sorting by relevance. 
+ q = Q('function_score', query=q, boost=5, boost_mode="multiply", + score_mode="max", + functions=[ + SF({'weight': 5, 'filter': Q('term', is_current=True)}) + ]) + search = sort(query, search) + search = search.query(q) + return search + + +def _date_range(q: APIQuery) -> Range: + """Generate a query part for a date range.""" + if not q.date_range: + return Q() + params = {} + if q.date_range.date_type == q.date_range.ANNOUNCED: + fmt = '%Y-%m' + else: + fmt = '%Y-%m-%dT%H:%M:%S%z' + if q.date_range.start_date: + params["gte"] = q.date_range.start_date.strftime(fmt) + if q.date_range.end_date: + params["lt"] = q.date_range.end_date.strftime(fmt) + return Q('range', **{q.date_range.date_type: params}) + + +def _grouped_terms_to_q(term_pair: tuple) -> Q: + """Generate a :class:`.Q` from grouped terms.""" + term_a_raw, operator, term_b_raw = term_pair + + if type(term_a_raw) is tuple: + term_a = _grouped_terms_to_q(term_a_raw) + else: + term_a = SEARCH_FIELDS[term_a_raw.field](term_a_raw.term) + + if type(term_b_raw) is tuple: + term_b = _grouped_terms_to_q(term_b_raw) + else: + term_b = SEARCH_FIELDS[term_b_raw.field](term_b_raw.term) + + if operator == 'OR': + return term_a | term_b + elif operator == 'AND': + return term_a & term_b + elif operator == 'NOT': + return term_a & ~term_b + else: + # TODO: Confirm proper exception. 
+ raise TypeError("Invalid operator for terms") + + +def _get_operator(obj: Any) -> str: + if type(obj) is tuple: + return _get_operator(obj[0]) + return obj.operator # type: ignore + + +def _group_terms(query: APIQuery) -> tuple: + """Group fielded search terms into a set of nested tuples.""" + terms = query.terms[:] + for operator in ['NOT', 'AND', 'OR']: + i = 0 + while i < len(terms) - 1: + if _get_operator(terms[i+1]) == operator: + terms[i] = (terms[i], operator, terms[i+1]) + terms.pop(i+1) + i -= 1 + i += 1 + assert len(terms) == 1 + return terms[0] # type: ignore + + +def _fielded_terms_to_q(query: APIQuery) -> Match: + if len(query.terms) == 1: + return SEARCH_FIELDS[query.terms[0].field](query.terms[0].term) + elif len(query.terms) > 1: + return _grouped_terms_to_q(_group_terms(query)) + return Q('match_all') diff --git a/search/services/index/highlighting.py b/search/services/index/highlighting.py index c9f4a504..0c30226c 100644 --- a/search/services/index/highlighting.py +++ b/search/services/index/highlighting.py @@ -9,15 +9,18 @@ """ import re -from typing import Any +from typing import Any, Union from elasticsearch_dsl import Search, Q, SF -from elasticsearch_dsl.response import Response +from elasticsearch_dsl.response import Response, Hit import bleach from flask import escape +from arxiv.base import logging from .util import TEXISM +logger = logging.getLogger(__name__) + HIGHLIGHT_TAG_OPEN = '' HIGHLIGHT_TAG_CLOSE = '' @@ -60,15 +63,11 @@ def highlight(search: Search) -> Search: search = search.highlight('doi', type='plain') search = search.highlight('report_num', type='plain') - # Setting number_of_fragments to 0 tells ES to highlight the entire - # abstract. + # Setting number_of_fragments to 0 tells ES to highlight the entire field. 
search = search.highlight('abstract', number_of_fragments=0) search = search.highlight('abstract.tex', type='plain', number_of_fragments=0) search = search.highlight('abstract.english', number_of_fragments=0) - - search = search.highlight('primary_classification*', type='plain', - number_of_fragments=0) return search @@ -143,10 +142,7 @@ def preview(value: str, fragment_size: int = 400, return snippet -# def _highlight(value: str, pattern: ) - - -def add_highlighting(result: dict, raw: Response) -> dict: +def add_highlighting(result: dict, raw: Union[Response, Hit]) -> dict: """ Add hit highlighting to a search result. @@ -166,21 +162,25 @@ def add_highlighting(result: dict, raw: Response) -> dict: """ # There may or may not be highlighting in the result set. highlighted_fields = getattr(raw.meta, 'highlight', None) + # ``meta.matched_queries`` contains a list of query ``_name``s that # matched. This is nice for non-string fields. matched_fields = getattr(raw.meta, 'matched_queries', []) + # These are from hits within child documents, e.g. + # secondary_classification. + inner_hits = getattr(raw.meta, 'inner_hits', None) + # The values here will (almost) always be list-like. So we need to stitch # them together. Note that dir(None) won't return anything, so this block # is skipped if there are no highlights from ES. for field in dir(highlighted_fields): + if field.startswith('_'): + continue value = getattr(highlighted_fields, field) if hasattr(value, '__iter__'): value = '…'.join(value) - if 'primary_classification' in field: - field = 'primary_classification' - # Non-TeX searches may hit inside of TeXisms. Highlighting those # fragments (i.e. inserting HTML) will break MathJax rendering. 
# To guard against this while preserving highlighting, we move @@ -207,6 +207,13 @@ def add_highlighting(result: dict, raw: Response) -> dict: if field not in result['highlight']: result['match'][field] = True + # We're using inner_hits to see which category in particular responded to + # the query. + if hasattr(inner_hits, 'secondary_classification'): + result['match']['secondary_classification'] = [ + ih.category.id for ih in inner_hits.secondary_classification + ] + # We just want to know whether there was a hit on the announcement date. result['match']['announced_date_first'] = ( bool('announced_date_first' in matched_fields) @@ -221,7 +228,6 @@ def add_highlighting(result: dict, raw: Response) -> dict: for field in ['abstract.tex', 'abstract.english', 'abstract']: if field in result['highlight']: - print(field) value = result['highlight'][field] abstract_snippet = preview(value) result['preview']['abstract'] = abstract_snippet diff --git a/search/services/index/prepare.py b/search/services/index/prepare.py index faa2b5aa..5ae48294 100644 --- a/search/services/index/prepare.py +++ b/search/services/index/prepare.py @@ -111,24 +111,53 @@ def _query_announcement_date(term: str) -> Optional[Q]: def _query_primary(term: str, operator: str = 'and') -> Q: - # In the 'or' case, we're basically just looking for hit highlighting - # after a match on the combined field. Since primary classification fields - # are keyword fields, they won't match the same way as the combined field - # (text). So we have to be a bit fuzzy here to get the highlight. - # TODO: in a future version, we should consider changes to the mappings - # to make this more straightforward. 
- if operator == 'or': - return reduce(ior, [( - Q("match", **{"primary_classification__category__id": {"query": part, "operator": operator}}) - | Q("wildcard", **{"primary_classification.category.name": f"*{part}*"}) - | Q("match", **{"primary_classification__archive__id": {"query": part, "operator": operator}}) - | Q("wildcard", **{"primary_classification.archive.name": f"*{part}*"}) - ) for part in term.split()]) - return ( - Q("match", **{"primary_classification__category__id": {"query": term, "operator": operator}}) - | Q("match", **{"primary_classification__category__name": {"query": term, "operator": operator}}) - | Q("match", **{"primary_classification__archive__id": {"query": term, "operator": operator}}) - | Q("match", **{"primary_classification__archive__name": {"query": term, "operator": operator}}) + # This now uses the "primary_classification.combined" field, which is + # isomorphic to the document-level "combined" field. So we get + # straightforward hit highlighting and a more consistent behavior. 
+ return Q("match", **{ + "primary_classification__combined": { + "query": term, + "operator": operator, + "_name": "primary_classification" + } + }) + + +def query_primary_exact(classification: Classification) -> Q: + """Generate a :class:`Q` for primary classification by ID.""" + return reduce(iand, [ + Q("match", **{f"primary_classification__{field}__id": + getattr(classification, field)['id']}) + for field in ['group', 'archive', 'category'] + if getattr(classification, field, None) is not None + ]) + + +def query_secondary_exact(classification: Classification) -> Q: + """Generate a :class:`Q` for secondary classification by ID.""" + return Q("nested", path="secondary_classification", + query=reduce(iand, [ + Q("match", **{f"secondary_classification__{field}__id": + getattr(classification, field)['id']}) + for field in ['group', 'archive', 'category'] + if getattr(classification, field, None) is not None + ])) + + +def _query_secondary(term: str, operator: str = 'and') -> Q: + return Q( + "nested", + path="secondary_classification", + query=Q( + "match", **{ + "secondary_classification.combined": { + "query": term, + "operator": operator + } + } + ), + _name="secondary_classification", + inner_hits={} # This gets us the specific category that matched. ) @@ -142,11 +171,15 @@ def _query_paper_id(term: str, operator: str = 'and') -> Q: return q +def _license_query(term: str, operator: str = 'and') -> Q: + """Search by license, using its URI (exact).""" + return Q('term', **{'license__uri': term}) + + def _query_combined(term: str) -> Q: # Only wildcards in literals should be escaped. wildcard_escaped, has_wildcard = wildcard_escape(term) query_term = (wildcard_escaped if has_wildcard else escape(term)).lower() - # All terms must match in the combined field. 
return Q("query_string", fields=['combined'], default_operator='AND', allow_leading_wildcard=False, query=query_term) @@ -215,7 +248,8 @@ def _query_all_fields(term: str) -> Q: _query_report_num(term, operator='or'), _query_acm_class(term, operator='or'), _query_msc_class(term, operator='or'), - _query_primary(term, operator='or') + _query_primary(term, operator='or'), + _query_secondary(term, operator='or'), ] # If the whole query matches on a specific field, we should consider that @@ -233,10 +267,10 @@ def _query_all_fields(term: str) -> Q: _query_report_num(term, operator='and'), _query_acm_class(term, operator='and'), _query_msc_class(term, operator='and'), - _query_primary(term, operator='and') + _query_primary(term, operator='and'), + _query_secondary(term, operator='and') ]) - # It is possible that the query includes a date-related term, which we # interpret as an announcement date of v1 of the paper. We currently # support both "standard" `yyyy` or `yyyy-MM`` formats as well as a @@ -293,7 +327,7 @@ def _query_all_fields(term: str) -> Q: match_remainder = _query_combined(remainder) match_all_fields |= (match_remainder & match_date) - match_individual_sans_date = reduce(ior, [ + match_sans_date = reduce(ior, [ _query_paper_id(remainder, operator='AND'), author_query(remainder, operator='AND'), _query_title(remainder, default_operator='and'), @@ -306,16 +340,12 @@ def _query_all_fields(term: str) -> Q: _query_report_num(remainder, operator='and'), _query_acm_class(remainder, operator='and'), _query_msc_class(remainder, operator='and'), - _query_primary(remainder, operator='and') + _query_primary(remainder, operator='and'), + _query_secondary(remainder, operator='and') ]) - match_individual_field = Q('bool', should=[ - match_individual_field, - match_individual_sans_date & match_date - ], minimum_should_match=1) + match_individual_field |= (match_sans_date & match_date) else: - match_all_fields = Q('bool', - should=[match_all_fields, match_date], - 
minimum_should_match=1) + match_all_fields |= match_date query = (match_all_fields | match_individual_field) query &= Q("bool", should=queries) # Partial matches across fields. @@ -325,26 +355,24 @@ def _query_all_fields(term: str) -> Q: boost_mode='multiply') -def limit_by_classification(classifications: ClassificationList) -> Q: +def limit_by_classification(classifications: ClassificationList, + field: str = 'primary_classification') -> Q: """Generate a :class:`Q` to limit a query by by classification.""" - def _to_q(classification: Classification) -> Q: - _qs = [] - if classification.group: - _qs.append( - Q("match", **{"primary_classification__group__id": {"query": classification.group}}) - ) - if classification.archive: - _qs.append( - Q("match", **{"primary_classification__archive__id": {"query": classification.archive}}) - ) - if classification.category: - _qs.append( - Q("match", **{"primary_classification__category__id": {"query": classification.category}}) - ) - return reduce(iand, _qs) - - return reduce(ior, [_to_q(clsn) for clsn in classifications]) + if len(classifications) == 0: + return Q() + + def _to_q(clsn: Classification) -> Q: + return reduce(iand, [ + Q('match', **{f'{field}__{level}__id': getattr(clsn, level)['id']}) + for level in ['group', 'archive', 'category'] + if getattr(clsn, level) is not None + ]) + + _q = reduce(ior, map(_to_q, classifications)) + if field == 'secondary_classification': + _q = Q("nested", path="secondary_classification", query=_q) + return _q SEARCH_FIELDS: Dict[str, Callable[[str], Q]] = dict([ @@ -356,9 +384,11 @@ def _to_q(classification: Classification) -> Q: ('report_num', _query_report_num), ('acm_class', _query_acm_class), ('msc_class', _query_msc_class), + ('cross_list_category', _query_secondary), ('doi', _query_doi), ('paper_id', _query_paper_id), ('orcid', orcid_query), ('author_id', author_id_query), + ('license', _license_query), ('all', _query_all_fields) ]) diff --git a/search/services/index/results.py 
b/search/services/index/results.py index 27174c75..fcfab44f 100644 --- a/search/services/index/results.py +++ b/search/services/index/results.py @@ -7,10 +7,11 @@ import re from datetime import datetime from math import floor -from typing import Any, Dict +from typing import Any, Dict, Union -from elasticsearch_dsl.response import Response -from search.domain import Document, Query, DocumentSet +from elasticsearch_dsl.response import Response, Hit +from elasticsearch_dsl.utils import AttrList, AttrDict +from search.domain import Document, Query, DocumentSet, Classification, Person from arxiv.base import logging from .util import MAX_RESULTS, TEXISM @@ -20,45 +21,94 @@ logger.propagate = False -def _to_document(raw: Response) -> Document: +def _to_author(author_data: dict) -> Person: + """Prevent e-mail, other extraneous data, from escaping.""" + data = {} + for key, value in author_data.items(): + if key == 'email': + continue + elif key == 'name': + key = 'full_name' + if key not in Person.fields(): + continue + data[key] = value + return Person(**data) # type: ignore + + +def to_document(raw: Union[Hit, dict], highlight: bool = True) -> Document: """Transform an ES search result back into a :class:`.Document`.""" # typing: ignore result: Dict[str, Any] = {} - result['highlight'] = {} + result['match'] = {} # Hit on field, but no highlighting. result['truncated'] = {} # Preview is truncated. + for key in Document.fields(): - if not hasattr(raw, key): + if type(raw) is Hit: + if not hasattr(raw, key): + continue + value = getattr(raw, key) + + elif type(raw) is dict: + if key not in raw: + continue + value = raw.get(key) + else: continue - value = getattr(raw, key) - if key == 'announced_date_first' and value and isinstance(value, str): + + # We want to prevent ES-specific data types from escaping the module + # API. 
+ if isinstance(value, AttrList): + value = value._l_ + elif isinstance(value, AttrDict): + value = value.to_dict() + + if key == 'primary_classification': + value = Classification(**value) # type: ignore + elif key == 'secondary_classification': + value = [Classification(**v) for v in value] # type: ignore + elif key in ['authors', 'owners']: + value = [_to_author(au) for au in value] + elif key == 'submitter': + value = _to_author(value) + + elif key == 'announced_date_first' and \ + value and isinstance(value, str): value = datetime.strptime(value, '%Y-%m').date() - if key in ['submitted_date', 'submitted_date_first', - 'submitted_date_latest']: + elif key in ['submitted_date', 'submitted_date_first', + 'submitted_date_latest']: try: value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S%z') except (ValueError, TypeError): - logger.warning( - f'Could not parse {key}: {value} as datetime' - ) + logger.warning(f'Could not parse {key}: {value} as datetime') pass - if key in ['acm_class', 'msc_class'] and value: + elif key in ['acm_class', 'msc_class'] and value: value = '; '.join(value) result[key] = value - result['score'] = raw.meta.score - if type(result['abstract']) is str: + + if type(raw) is Response: + result['score'] = raw.meta.score # type: ignore + + if type(result.get('abstract')) is str and highlight: + if 'preview' not in result: + result['preview'] = {} result['preview']['abstract'] = preview(result['abstract']) if result['preview']['abstract'].endswith('…'): result['truncated']['abstract'] = True - logger.debug('%s: add highlighting to result', raw.paper_id) - result = add_highlighting(result, raw) + if highlight and type(raw) in [Response, Hit]: + result['highlight'] = {} + logger.debug('%s: add highlighting to result', + raw.paper_id) # type: ignore + result = add_highlighting(result, raw) + return Document(**result) # type: ignore # See https://github.com/python/mypy/issues/3937 -def to_documentset(query: Query, response: Response) -> DocumentSet: 
+def to_documentset(query: Query, response: Response, highlight: bool = True) \ + -> DocumentSet: """ Transform a response from ES to a :class:`.DocumentSet`. @@ -76,23 +126,23 @@ def to_documentset(query: Query, response: Response) -> DocumentSet: page, along with pagination metadata. """ - max_pages = int(MAX_RESULTS/query.page_size) - N_pages_raw = response['hits']['total']/query.page_size + max_pages = int(MAX_RESULTS/query.size) + N_pages_raw = response['hits']['total']/query.size N_pages = int(floor(N_pages_raw)) + \ - int(N_pages_raw % query.page_size > 0) + int(N_pages_raw % query.size > 0) logger.debug('got %i results', response['hits']['total']) return DocumentSet(**{ # type: ignore 'metadata': { 'start': query.page_start, - 'end': min(query.page_start + query.page_size, + 'end': min(query.page_start + query.size, response['hits']['total']), 'total': response['hits']['total'], 'current_page': query.page, 'total_pages': N_pages, - 'page_size': query.page_size, + 'size': query.size, 'max_pages': max_pages }, - 'results': [_to_document(raw) for raw in response] + 'results': [to_document(raw, highlight=highlight) for raw in response] }) # See https://github.com/python/mypy/issues/3937 diff --git a/search/services/index/simple.py b/search/services/index/simple.py index 8ba01086..1406f0c1 100644 --- a/search/services/index/simple.py +++ b/search/services/index/simple.py @@ -28,8 +28,12 @@ def simple_search(search: Search, query: SimpleQuery) -> Search: """ search = search.filter("term", is_current=True) q = SEARCH_FIELDS[query.search_field](query.value) - if query.primary_classification: - q &= limit_by_classification(query.primary_classification) + if query.classification: + _q = limit_by_classification(query.classification) + if query.include_cross_list: + _q |= limit_by_classification(query.classification, + "secondary_classification") + q &= _q search = search.query(q) search = sort(query, search) return search diff --git 
a/search/services/index/tests/tests.py b/search/services/index/tests/tests.py index 34d5e3ec..a7d8f8e9 100644 --- a/search/services/index/tests/tests.py +++ b/search/services/index/tests/tests.py @@ -25,7 +25,11 @@ def test_advanced_query(self, mock_Elasticsearch, mock_Search): """:class:`.index.search` supports :class:`AdvancedQuery`.""" mock_results = mock.MagicMock() mock_results.__getitem__.return_value = {'total': 53} - mock_result = mock.MagicMock() + mock_result = mock.MagicMock( + authors=[{'full_name': 'N. Ame'}], + owners=[{'full_name': 'N. Ame'}], + submitter={'full_name': 'N. Ame'} + ) mock_result.meta.score = 1 mock_results.__iter__.return_value = [mock_result] mock_Search.execute.return_value = mock_results @@ -41,16 +45,16 @@ def test_advanced_query(self, mock_Elasticsearch, mock_Search): query = AdvancedQuery( order='relevance', - page_size=10, + size=10, date_range=DateRange( start_date=datetime.now() - timedelta(days=5), end_date=datetime.now() ), - primary_classification=ClassificationList([ + classification=ClassificationList([ Classification( - group='physics', - archive='physics', - category='hep-th' + group={'id': 'physics'}, + archive={'id': 'physics'}, + category={'id': 'hep-th'} ) ]), terms=FieldedSearchList([ @@ -80,7 +84,7 @@ def test_advanced_query(self, mock_Elasticsearch, mock_Search): self.assertEqual(document_set.metadata['total'], 53) self.assertEqual(document_set.metadata['current_page'], 1) self.assertEqual(document_set.metadata['total_pages'], 6) - self.assertEqual(document_set.metadata['page_size'], 10) + self.assertEqual(document_set.metadata['size'], 10) self.assertEqual(len(document_set.results), 1) @mock.patch('search.services.index.Search') @@ -89,7 +93,11 @@ def test_simple_query(self, mock_Elasticsearch, mock_Search): """:class:`.index.search` supports :class:`SimpleQuery`.""" mock_results = mock.MagicMock() mock_results.__getitem__.return_value = {'total': 53} - mock_result = mock.MagicMock() + mock_result = 
mock.MagicMock( + authors=[{'full_name': 'N. Ame'}], + owners=[{'full_name': 'N. Ame'}], + submitter={'full_name': 'N. Ame'} + ) mock_result.meta.score = 1 mock_results.__iter__.return_value = [mock_result] mock_Search.execute.return_value = mock_results @@ -105,7 +113,7 @@ def test_simple_query(self, mock_Elasticsearch, mock_Search): query = SimpleQuery( order='relevance', - page_size=10, + size=10, search_field='title', value='foo title' ) @@ -115,7 +123,7 @@ def test_simple_query(self, mock_Elasticsearch, mock_Search): self.assertEqual(document_set.metadata['total'], 53) self.assertEqual(document_set.metadata['current_page'], 1) self.assertEqual(document_set.metadata['total_pages'], 6) - self.assertEqual(document_set.metadata['page_size'], 10) + self.assertEqual(document_set.metadata['size'], 10) self.assertEqual(len(document_set.results), 1) diff --git a/search/services/metadata.py b/search/services/metadata.py index 0c518aec..4de6bfb9 100644 --- a/search/services/metadata.py +++ b/search/services/metadata.py @@ -114,7 +114,8 @@ def retrieve(self, document_id: str) -> DocMeta: f'{document_id}: retrieve metadata from {target} with SSL' f' verify {self._verify_cert}' ) - response = requests.get(target, verify=self._verify_cert) + response = requests.get(target, verify=self._verify_cert, + headers={'User-Agent': 'arXiv/system'}) except requests.exceptions.SSLError as e: logger.error('SSLError: %s', e) raise SecurityException('SSL failed: %s' % e) from e diff --git a/search/static/css/search.css b/search/static/css/search.css index c65ebeba..6095ded5 100644 --- a/search/static/css/search.css +++ b/search/static/css/search.css @@ -2,10 +2,17 @@ margin-bottom: 1.25em; } .content .arxiv-result p { margin-bottom: 0.25em; } + .content .arxiv-result .title { + line-height: 1.25em; } + .content .arxiv-result .tags { + margin-bottom: 0; } + .content .arxiv-result .tags:not(:last-child) { + margin-bottom: 0; } .content .arxiv-result span.tag { height: 1.25rem; } .content 
.arxiv-result .list-title { - font-size: 1.05em; } + font-size: 1.05em; + line-height: 1.25em; } .search-hit { color: #287916 !important; diff --git a/search/static/css/search.css.map b/search/static/css/search.css.map index e133266d..283b7c5f 100644 --- a/search/static/css/search.css.map +++ b/search/static/css/search.css.map @@ -1,6 +1,6 @@ { "version": 3, -"mappings": "AACE,sBAAa;EACX,aAAa,EAAE,MAAM;EACrB,wBAAC;IACC,aAAa,EAAE,MAAK;EACtB,+BAAQ;IACN,MAAM,EAAE,MAAM;EAChB,kCAAW;IACT,SAAS,EAAE,MAAM;;AAEvB,WAAW;EACT,KAAK,EAAE,OAAO;EACd,gBAAgB,EAAE,OAAO;EACzB,WAAW,EAAE,IAAI;;;AAInB,aAAa;EACX,WAAW,EAAE,SAAS;;AAExB,WAAW;EACT,UAAU,EAAE,IAAI;EAChB,aAAa,EAAE,IAAI;;AAErB,gBAAgB;EACd,MAAM,EAAE,YAAY;;AAEtB,iBAAiB;EACf,UAAU,EAAE,IAAI;EAChB,aAAa,EAAE,IAAI;;AAGnB,oCAAoC;EADtC,mBAAmB;IAEf,WAAW,EAAE,eAAe;IAC5B,YAAY,EAAE,eAAe;AAC/B,oCAAoC;EAJtC,mBAAmB;IAKf,WAAW,EAAE,gBAAgB;IAC7B,YAAY,EAAE,gBAAgB;;AAElC,mBAAmB;EACjB,UAAU,EAAE,IAAI;;AAElB,wBAA+D;EAC7D,MAAM,EAAE,CAAC;;AAEX,SAAS;EACP,MAAM,EAAE,iBAAkC;EAC1C,aAAa,EAAE,GAAG;EAClB,OAAO,EAAE,iBAAgB;EACzB,MAAM,EAAE,KAAK;;AAEf,oBAAoB;EAClB,MAAM,EAAE,iBAAiB;;AAE3B,OAAO;EACL,WAAW,EAAE,GAAG;EAChB,gBAAgB,EAAE,OAAO;EACzB,OAAO,EAAE,OAAM;EACf,WAAW,EAAE,MAAK;;AAEpB,YAAY;EACV,WAAW,EAAE,QAAQ;;AAEvB,aAAa;EACX,MAAM,EAAE,CAAC;EACT,IAAI,EAAE,gBAAa;EACnB,MAAM,EAAE,GAAG;EACX,KAAK,EAAE,CAAC;EACR,MAAM,EAAE,IAAI;EACZ,QAAQ,EAAE,MAAM;EAChB,OAAO,EAAE,CAAC;EACV,QAAQ,EAAE,QAAQ;;AAEpB,aAAa;EACX,SAAS,EAAE,IAAI;;;AAMX,kDAAkB;EAChB,aAAa,EAAE,MAAK;AACxB,oCAAoC;EAJtC,wBAAmB;IAKf,OAAO,EAAE,IAAI;IACb,eAAe,EAAE,UAAU;IAEzB,kDAAkB;MAChB,YAAY,EAAE,IAAI;MAClB,aAAa,EAAE,CAAC;IAEhB;;gEAAQ;MAGN,yBAAyB,EAAE,GAAG;MAC9B,sBAAsB,EAAE,GAAG;IAE7B;;+DAAQ;MAGN,0BAA0B,EAAE,GAAG;MAC/B,uBAAuB,EAAE,GAAG;IAChC;;oDAAQ;MAGN,aAAa,EAAE,CAAC;MAChB;;;;iEAAQ;QAEN,OAAO,EAAE,CAAC;MACZ;;;;;;;;gEAAQ;QAIN,OAAO,EAAE,CAAC;QACV;;;;;;;;wEAAO;UACL,OAAO,EAAE,CAAC;IAChB,6CAAa;MACX,SAAS,EAAE,CAAC;IAChB,4CAAqB;MACnB,eAAe,EAAE,MAAM;IACzB,yCAAkB;MAChB,eAAe,EAAE,QAAQ;IAEzB,sDAAQ;MACN,SAAS,EAAE,CAAC;MACZ,WAAW,EAAE,CAAC", 
+"mappings": "AACE,sBAAa;EACX,aAAa,EAAE,MAAM;EACrB,wBAAC;IACC,aAAa,EAAE,MAAK;EACtB,6BAAM;IACJ,WAAW,EAAE,MAAM;EACrB,4BAAK;IACH,aAAa,EAAE,CAAC;IAChB,6CAAkB;MAChB,aAAa,EAAE,CAAC;EACpB,+BAAQ;IACN,MAAM,EAAE,OAAO;EACjB,kCAAW;IACT,SAAS,EAAE,MAAM;IACjB,WAAW,EAAE,MAAM;;AAEzB,WAAW;EACT,KAAK,EAAE,kBAAkB;EACzB,gBAAgB,EAAE,kBAAkB;EACpC,WAAW,EAAE,IAAI;;;AAInB,aAAa;EACX,WAAW,EAAE,SAAS;;AAExB,WAAW;EACT,UAAU,EAAE,IAAI;EAChB,aAAa,EAAE,IAAI;;AAErB,gBAAgB;EACd,MAAM,EAAE,YAAY;;AAEtB,iBAAiB;EACf,UAAU,EAAE,IAAI;EAChB,aAAa,EAAE,IAAI;;AAGnB,oCAAoC;EADtC,mBAAmB;IAEf,WAAW,EAAE,eAAe;IAC5B,YAAY,EAAE,eAAe;AAC/B,oCAAoC;EAJtC,mBAAmB;IAKf,WAAW,EAAE,gBAAgB;IAC7B,YAAY,EAAE,gBAAgB;;AAElC,mBAAmB;EACjB,UAAU,EAAE,IAAI;;AAElB,wBAA+D;EAC7D,MAAM,EAAE,CAAC;;AAEX,SAAS;EACP,MAAM,EAAE,iBAAkC;EAC1C,aAAa,EAAE,GAAG;EAClB,OAAO,EAAE,iBAAgB;EACzB,MAAM,EAAE,KAAK;;AAEf,oBAAoB;EAClB,MAAM,EAAE,iBAAiB;;AAE3B,OAAO;EACL,WAAW,EAAE,GAAG;EAChB,gBAAgB,EAAE,OAAO;EACzB,OAAO,EAAE,OAAM;EACf,WAAW,EAAE,MAAK;;AAEpB,YAAY;EACV,WAAW,EAAE,QAAQ;;AAEvB,aAAa;EACX,MAAM,EAAE,CAAC;EACT,IAAI,EAAE,gBAAa;EACnB,MAAM,EAAE,GAAG;EACX,KAAK,EAAE,CAAC;EACR,MAAM,EAAE,IAAI;EACZ,QAAQ,EAAE,MAAM;EAChB,OAAO,EAAE,CAAC;EACV,QAAQ,EAAE,QAAQ;;AAEpB,aAAa;EACX,SAAS,EAAE,IAAI;;;AAMX,kDAAkB;EAChB,aAAa,EAAE,MAAK;AACxB,oCAAoC;EAJtC,wBAAmB;IAKf,OAAO,EAAE,IAAI;IACb,eAAe,EAAE,UAAU;IAEzB,kDAAkB;MAChB,YAAY,EAAE,IAAI;MAClB,aAAa,EAAE,CAAC;IAEhB;;gEAAQ;MAGN,yBAAyB,EAAE,GAAG;MAC9B,sBAAsB,EAAE,GAAG;IAE7B;;+DAAQ;MAGN,0BAA0B,EAAE,GAAG;MAC/B,uBAAuB,EAAE,GAAG;IAChC;;oDAAQ;MAGN,aAAa,EAAE,CAAC;MAChB;;;;iEAAQ;QAEN,OAAO,EAAE,CAAC;MACZ;;;;;;;;gEAAQ;QAIN,OAAO,EAAE,CAAC;QACV;;;;;;;;wEAAO;UACL,OAAO,EAAE,CAAC;IAChB,6CAAa;MACX,SAAS,EAAE,CAAC;IAChB,4CAAqB;MACnB,eAAe,EAAE,MAAM;IACzB,yCAAkB;MAChB,eAAe,EAAE,QAAQ;IAEzB,sDAAQ;MACN,SAAS,EAAE,CAAC;MACZ,WAAW,EAAE,CAAC", "sources": ["../sass/search.sass"], "names": [], "file": "search.css" diff --git a/search/static/sass/search.sass b/search/static/sass/search.sass index 443aab61..8c01fb60 100644 --- a/search/static/sass/search.sass +++ 
b/search/static/sass/search.sass @@ -3,10 +3,17 @@ margin-bottom: 1.25em p margin-bottom: .25em + .title + line-height: 1.25em + .tags + margin-bottom: 0 + &:not(:last-child) + margin-bottom: 0 span.tag height: 1.25rem .list-title font-size: 1.05em + line-height: 1.25em .search-hit color: #287916 !important diff --git a/search/templates/search/advanced_search.html b/search/templates/search/advanced_search.html index 7a2efb42..d7668351 100644 --- a/search/templates/search/advanced_search.html +++ b/search/templates/search/advanced_search.html @@ -147,6 +147,15 @@ {{ select_field(form.classification.statistics) }} +
+
+ {% for subfield in form.classification.include_cross_list %} + + {% endfor %} +
+
Date @@ -328,7 +337,7 @@
{# TODO - adjust this layout so that it matches across all forms less awkwardly #}
{% if query %} - Simple Search + Simple Search {% else %} Simple Search {% endif %} @@ -339,25 +348,22 @@ {% if has_classic_format %} {{ search_macros.show_classic_author_search() }} {% endif %} -
-
- -
- Refine This Query + +
+ -
-
+
+

{% if query %} - Simple Search + Simple Search {% else %} Simple Search {% endif %} -

+

diff --git a/search/templates/search/base.html b/search/templates/search/base.html index dece26ed..966ad7dc 100644 --- a/search/templates/search/base.html +++ b/search/templates/search/base.html @@ -28,7 +28,7 @@ }, fieldValues: { "components": ["16000"], // Search component. - "versions": ["14134"], // Release search-0.4 + "versions": ["14157"], // Release search-0.5 "customfield_11401": window.location.href } }; @@ -41,7 +41,8 @@

{% block title %}Search{% endblock title %}

-
+ @@ -50,5 +51,10 @@

{% block title %}Search{% endblock title %}

{% block within_content %} Specific results here {% endblock within_content %} +
+ + {{ config.RELEASE_NOTES_TEXT }}   + +
{% endblock content %} diff --git a/search/templates/search/search-macros.html b/search/templates/search/search-macros.html index d4ffec14..f157d106 100644 --- a/search/templates/search/search-macros.html +++ b/search/templates/search/search-macros.html @@ -34,7 +34,7 @@ {% macro pagination(metadata, url_for_page) -%}