Refactor opensearch away (#91)

* Remove OpenSearch code * Remove old dependencies * Consolidate tests into one folder and add setup: moves all tests into one folder and adds the necessary setup to run them * Add test setup to CI * Fix path for CLI install * Fix docker compose up command * Try to fix flaky ECR upload: these steps are failing for some reason, so this sets the versions to match the backend in case that is the cause * Fix tagging issue
climatepolicyradar · Feb 8, 2024 · 8a66238 · 8a66238
1 parent 31baac9
commit 8a66238
Show file tree

Hide file tree

Showing 43 changed files with 175 additions and 52,972 deletions.
diff --git a/.env.example b/.env.example
@@ -1,31 +1,3 @@
-# Opensearch connection settings
-OPENSEARCH_USER=admin
-OPENSEARCH_PASSWORD=admin
-OPENSEARCH_URL=http://localhost:9200
-OPENSEARCH_INDEX_PREFIX=navigator
-
-OPENSEARCH_USE_SSL=False
-OPENSEARCH_VERIFY_CERTS=False
-OPENSEARCH_SSL_SHOW_WARN=False
-
-TARGET_LANGUAGES=en,fr # comma-separated 2-letter ISO codes
-
-# Optional config. Defaults are set in src/config.py
-INDEX_ENCODER_CACHE_FOLDER=/models
-SBERT_MODEL=msmarco-distilbert-dot-v5
-ENCODING_BATCH_SIZE=32
-CDN_URL=https://cdn.climatepolicyradar.org
-OPENSEARCH_INDEX_NUM_SHARDS=1
-OPENSEARCH_INDEX_NUM_REPLICAS=2
-KNN_PARAM_EF_SEARCH=100
-NMSLIB_EF_CONSTRUCTION=512
-NMSLIB_M=16
-OPENSEARCH_INDEX_EMBEDDING_DIM=768
-OPENSEARCH_BULK_REQUEST_TIMEOUT=60
-
-EMBEDDINGS_INPUT_PREFIX=
-INDEXER_INPUT_PREFIX=
-
 VESPA_INSTANCE_URL=http://localhost:8080/
 VESPA_PRIVATE_KEY=
 VESPA_KEY_LOCATION=

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -20,27 +20,34 @@ jobs:
           cp .env.example .env
           make build
 
-      - name: Run Unit Tests
-        run:  make test
+      - name: Install latest Vespa CLI
+        env:
+          VESPA_CLI_VERSION: "8.250.43"
+        run: |
+          mkdir vespa-cli
+          curl -fsSL https://github.com/vespa-engine/vespa/releases/download/v${VESPA_CLI_VERSION}/vespa-cli_${VESPA_CLI_VERSION}_linux_amd64.tar.gz | \
+            tar -zxf - -C vespa-cli --strip-component=1
+          echo "vespa-cli/bin" >> $GITHUB_PATH
 
-      - name: Run Integration Tests
-        run: echo TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO
+      - name: Setup Vespa Test Instance
+        run:  make vespa_setup
+
+      - name: Run Tests
+        run:  make test
 
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
+        uses: aws-actions/configure-aws-credentials@v4
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: eu-west-1
 
       - name: Login to Amazon ECR
         id: login-ecr
-        uses: aws-actions/amazon-ecr-login@v1
+        uses: aws-actions/amazon-ecr-login@v1.6.1
 
       - name: Push Images to ECR
         run: |
           .github/retag-and-push.sh navigator-search-indexer latest
         env:
           DOCKER_REGISTRY: ${{ secrets.DOCKER_REGISTRY }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
diff --git a/Dockerfile b/Dockerfile
@@ -18,12 +18,12 @@ RUN poetry config virtualenvs.create false
 RUN poetry install
 
 # Copy files to image
-COPY ./data ./data
 COPY ./src ./src
 COPY ./cli ./cli
+COPY ./tests ./tests
 
 # Pre-download the model
 ENV PYTHONPATH "${PYTHONPATH}:/app"
 
 # Run the indexer on the input s3 directory
-ENTRYPOINT [ "sh", "./cli/run.sh" ]
+ENTRYPOINT [ "sh", "./cli/run.sh" ]
diff --git a/Makefile b/Makefile
@@ -1,47 +1,21 @@
 include .env
 
-.PHONY: build test dev_install opensearch_test_data
+.PHONY: build test dev_install
 
 setup:
 	cp .env.example .env
 
 build:
 	docker build -t navigator-search-indexer .
 
-download_indexer_inputs:
-	docker run --entrypoint aws --env-file=.env -v ${PWD}/data:/app/data navigator-search-indexer s3 sync ${INDEXER_INPUT_PREFIX} /app/data/indexer_input
-
-run_encoding_docker:
-	docker run --entrypoint python -v ${PWD}/data:/app/data navigator-search-indexer -m cli.text2embeddings ./data/embeddings_input ./data/indexer_input
-
-run_indexing_docker: download_indexer_inputs
-	docker run --entrypoint python --network=host --env-file=.env -v ${PWD}/data:/app/data navigator-search-indexer -m cli.index_data /app/data/indexer_input
+vespa_setup: vespa_confirm_cli_installed vespa_dev_start vespa_healthy vespa_deploy_schema
 
 test:
-	docker run --entrypoint python navigator-search-indexer -m pytest -vvv
+	docker-compose -f docker-compose.dev.yml run --rm navigator-search-indexer python -m pytest -vvv
 
 dev_install:
 	poetry install && poetry run pre-commit install
 
-test_against_aws:
-	cp Dockerfile.aws.example Dockerfile
-	docker build -t navigator-search-indexer-aws .
-	docker run -it navigator-search-indexer-aws python -m pytest
-
-run_local_against_aws:
-	cp Dockerfile.aws.example Dockerfile
-	docker build -t navigator-search-indexer-aws .
-	docker run -e EMBEDDINGS_INPUT_PREFIX=${EMBEDDINGS_INPUT_PREFIX} -e INDEXER_INPUT_PREFIX=${INDEXER_INPUT_PREFIX} -it navigator-search-indexer-aws
-
-# test data for backend
-create_test_index:
-	docker run --entrypoint python --network=host --env-file=.env -e OPENSEARCH_INDEX_PREFIX=navigator_test -v ${PWD}/data:/app/data navigator-search-indexer -m cli.test.create_test_index /app/data/embeddings_input
-
-opensearch_test_dump: create_test_index
-	rm -rf ./data/opensearch_test_dump/**
-	multielasticdump --input=http://admin:admin@localhost:9200 --output=./data/opensearch_test_dump --match="navigator_test_.*" --ignoreType=template
-
-
 # setup dev/test vespa
 vespa_confirm_cli_installed:
 	@if [ ! $$(which vespa) ]; then \
@@ -51,7 +25,7 @@ vespa_confirm_cli_installed:
 	fi
 
 vespa_dev_start:
-	docker-compose -f docker-compose.dev.yml up -d --remove-orphans --wait
+	docker compose -f docker-compose.dev.yml up --detach --wait vespaindexertest
 
 vespa_healthy:
 	@if [ ! $$(curl -f -s 'http://localhost:19071/status.html') ]; then \
@@ -63,5 +37,3 @@ vespa_healthy:
 vespa_deploy_schema:
 	vespa config set target local
 	@vespa deploy tests/vespa_test_schema --wait 300
-
-vespa_setup: vespa_confirm_cli_installed vespa_dev_start vespa_healthy vespa_deploy_schema
diff --git a/README.md b/README.md
@@ -1,202 +1,13 @@
-# Opensearch Indexer
+# Vespa Indexer
 
 The code in this folder contains a CLI tool used to index data into the Navigator search index:
 
 * `index_data.py`: loads document metadata from the Navigator database and indexes this data alongside the text and embeddings created from embeddings generation into the search index.
 
-There is also an `opensearch-query-example.ipynb` notebook that demonstrates running a query on the index. This is to be developed further and integrated into the Navigator APIs.
-
-## Creating a test data dump for the backend
-
-See `make opensearch_test_dump` and `cli/test/create_test_index.py`.
-
-## Running
-
-### 1. Building
-
-`make build`
-
-### 2. Loading data into Opensearch (in docker-compose)
-
-Note: this command will wipe and repopulate the index specified in `.env` if it's already populated.
-
-```shell
-docker run --net=host --env-file .env -v /path/to/text-ids-file:/text-ids-path -v /path/to/embeddings-file:/embeddings-path navigator-search-indexer python /app/index_data.py --text-ids-path /text-ids-path --embeddings-path /embeddings-path -d 768
-```
-
-## Opensearch index structure
-
-The following snippets are examples of the structure of different documents in the Opensearch index. Each document in the Opensearch index either describes a title, a description, or a text block of a document. **TODO: This will be revised once we remove the concept of actions from our database.**
-
-**Example opensearch document with text block:**
-
-``` json
-{
-    "document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
-    "document_id" : 167,
-    "document_name" : "Sustainable Finance Policy Framework of 2020",
-    "document_date" : "19/03/2020",
-    "document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks.&nbsp;",
-    "document_category" : "Policy",
-    "document_type" : "Framework",
-    "document_keyword" : [
-      "Finance",
-      "Central Bank"
-    ],
-    "document_sector_name" : "Finance",
-    "document_hazard_name" : [ ],
-    "document_instrument_name" : [
-      "Processes, plans and strategies|Governance",
-      "Capacity building|Governance"
-    ],
-    "document_language" : "English",
-    "document_instrument_parent" : [ ],
-    "document_framework_name" : [ ],
-    "document_response_name" : [
-      "Mitigation",
-      "Adaptation"
-    ],
-    "document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
-    "document_country_code" : "PHL",
-    "document_country_english_shortname" : "Philippines",
-    "document_region_english_shortname" : "East Asia & Pacific",
-    "document_region_code" : "East Asia & Pacific",
-    "document_source_name" : "CCLW",
-    "text_block_id" : "p0_b1",
-    "text" : "CIRCULAR NO. 1085",
-    "text_embedding" : [x768],
-    "text_block_coords" : [
-      [
-        263.2799987792969,
-        709.3638153076172
-      ],
-      [
-        364.5785827636719,
-        709.3638153076172
-      ],
-      [
-        364.5785827636719,
-        720.4228668212891
-      ],
-      [
-        263.2799987792969,
-        720.4228668212891
-      ]
-    ],
-    "text_block_page" : 0
-}
-```
-
-**Example Opensearch document with title:**
-
-Note the `for_search_document_name` field which is used for title search; the `document_name` field is identical to this field but appears on all documents for sorting purposes.
-
-``` json
-{
-  "document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
-  "document_id" : 167,
-  "document_name" : "Sustainable Finance Policy Framework of 2020",
-  "document_date" : "19/03/2020",
-  "document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks.&nbsp;",
-  "document_category" : "Policy",
-  "document_type" : "Framework",
-  "document_keyword" : [
-    "Finance",
-    "Central Bank"
-  ],
-  "document_sector_name" : "Finance",
-  "document_hazard_name" : [ ],
-  "document_instrument_name" : [
-    "Processes, plans and strategies|Governance",
-    "Capacity building|Governance"
-  ],
-  "document_language" : "English",
-  "document_instrument_parent" : [ ],
-  "document_framework_name" : [ ],
-  "document_response_name" : [
-    "Mitigation",
-    "Adaptation"
-  ],
-  "document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
-  "document_country_code" : "PHL",
-  "document_country_english_shortname" : "Philippines",
-  "document_region_english_shortname" : "East Asia & Pacific",
-  "document_region_code" : "East Asia & Pacific",
-  "document_source_name" : "CCLW",
-  "for_search_document_name" : "Sustainable Finance Policy Framework of 2020"
-}
-```
-
-**Example text block with description:**
-
-Note the `for_search_document_description` field and the `document_description` field - see comment about titles.
-
-``` json
-{
-  "document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
-  "document_id" : 167,
-  "document_name" : "Sustainable Finance Policy Framework of 2020",
-  "document_date" : "19/03/2020",
-  "document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks.&nbsp;",
-  "document_category" : "Policy",
-  "document_type" : "Framework",
-  "document_keyword" : [
-    "Finance",
-    "Central Bank"
-  ],
-  "document_sector_name" : "Finance",
-  "document_hazard_name" : [ ],
-  "document_instrument_name" : [
-    "Processes, plans and strategies|Governance",
-    "Capacity building|Governance"
-  ],
-  "document_language" : "English",
-  "document_instrument_parent" : [ ],
-  "document_framework_name" : [ ],
-  "document_response_name" : [
-    "Mitigation",
-    "Adaptation"
-  ],
-  "document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
-  "document_country_code" : "PHL",
-  "document_country_english_shortname" : "Philippines",
-  "document_region_english_shortname" : "East Asia & Pacific",
-  "document_region_code" : "East Asia & Pacific",
-  "document_source_name" : "CCLW",
-  "for_search_document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks.&nbsp;",
-  "document_description_embedding" : [x768],
-}
-```
-
-## Common issues
-
-### Virtual memory
-
-Error in docker logs:
-
-```shell
-opensearch-node1          | ERROR: [2] bootstrap checks failed
-opensearch-node1          | [1]: max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144]
-opensearch-node1          | [2]: the default discovery settings are unsuitable for production use; at least one of [discovery.seed_hosts, discovery.seed_providers, cluster.initial_master_nodes] must be configured
-opensearch-node1          | ERROR: OpenSearch did not exit normally - check the logs at /usr/share/opensearch/logs/opensearch-cluster.log
-opensearch-node1          | [2022-04-14T14:49:58,972][INFO ][o.o.n.Node               ] [opensearch-node1] stopping ...
-opensearch-node1          | [2022-04-14T14:49:58,985][INFO ][o.o.n.Node               ] [opensearch-node1] stopped
-opensearch-node1          | [2022-04-14T14:49:58,985][INFO ][o.o.n.Node               ] [opensearch-node1] closing ...
-opensearch-node1          | [2022-04-14T14:49:58,995][INFO ][o.o.n.Node               ] [opensearch-node1] closed
-opensearch-node1          | Killing performance analyzer process 34
-opensearch-node1          | OpenSearch exited with code 78
-opensearch-node1          | Performance analyzer exited with code 143
-```
-
-Run [this command](https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html) on the host machine:
-
-```shell
-sysctl -w vm.max_map_count=262144
-```
-
 # Vespa test setup
 
 ```
-make vespa_test_setup
-poetry run pytest ./tests
+make build
+make vespa_setup
+make test
 ```
diff --git a/cli/index_data.py b/cli/index_data.py
@@ -1,5 +1,3 @@
-"""Index data into a running Opensearch index."""
-
 import os
 import sys
 import time
@@ -13,7 +11,6 @@
 from tqdm.auto import tqdm
 from cpr_data_access.parser_models import ParserOutput
 
-from src.index.opensearch import populate_opensearch
 from src.index.vespa_ import populate_vespa
 
 LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
@@ -113,11 +110,8 @@ def run_as_cli(
     index_type: str,
 ) -> None:
     if index_type.lower() == "opensearch":
-        tasks, embedding_dir_as_path = _get_index_tasks(
-            indexer_input_dir, s3, files_to_index, limit
-        )
-        populate_opensearch(tasks=tasks, embedding_dir_as_path=embedding_dir_as_path)
-        sys.exit(0)
+        click.echo(f"Index type: {index_type}, is no longer used", err=True)
+        sys.exit(1)
     elif index_type.lower() == "vespa":
         _LOGGER.warning("Vespa indexing still experimental")
         tasks, embedding_dir_as_path = _get_index_tasks(