Skip to content

Commit

Permalink
Refactor opensearch away (#91)
Browse files Browse the repository at this point in the history
* Remove opensearch code

* Remove old dependencies

* Consolidate tests into one folder and add setup

Moves all tests into one folder and adds the necessary setup to run them

* Add test setup to ci

* fix path for cli install

* fix docker compose up command

* Try to fix flaky ECR upload

These steps are failing for some reason; this sets the versions to match
the backend in case that is the cause

* Fix tagging issue
  • Loading branch information
olaughter authored Feb 8, 2024
1 parent 31baac9 commit 8a66238
Show file tree
Hide file tree
Showing 43 changed files with 175 additions and 52,972 deletions.
28 changes: 0 additions & 28 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,31 +1,3 @@
# Opensearch connection settings
OPENSEARCH_USER=admin
OPENSEARCH_PASSWORD=admin
OPENSEARCH_URL=http://localhost:9200
OPENSEARCH_INDEX_PREFIX=navigator

OPENSEARCH_USE_SSL=False
OPENSEARCH_VERIFY_CERTS=False
OPENSEARCH_SSL_SHOW_WARN=False

TARGET_LANGUAGES=en,fr # comma-separated 2-letter ISO codes

# Optional config. Defaults are set in src/config.py
INDEX_ENCODER_CACHE_FOLDER=/models
SBERT_MODEL=msmarco-distilbert-dot-v5
ENCODING_BATCH_SIZE=32
CDN_URL=https://cdn.climatepolicyradar.org
OPENSEARCH_INDEX_NUM_SHARDS=1
OPENSEARCH_INDEX_NUM_REPLICAS=2
KNN_PARAM_EF_SEARCH=100
NMSLIB_EF_CONSTRUCTION=512
NMSLIB_M=16
OPENSEARCH_INDEX_EMBEDDING_DIM=768
OPENSEARCH_BULK_REQUEST_TIMEOUT=60

EMBEDDINGS_INPUT_PREFIX=
INDEXER_INPUT_PREFIX=

VESPA_INSTANCE_URL=http://localhost:8080/
VESPA_PRIVATE_KEY=
VESPA_KEY_LOCATION=
Expand Down
23 changes: 15 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,34 @@ jobs:
cp .env.example .env
make build
- name: Run Unit Tests
run: make test
- name: Install latest Vespa CLI
env:
VESPA_CLI_VERSION: "8.250.43"
run: |
mkdir vespa-cli
curl -fsSL https://github.com/vespa-engine/vespa/releases/download/v${VESPA_CLI_VERSION}/vespa-cli_${VESPA_CLI_VERSION}_linux_amd64.tar.gz | \
tar -zxf - -C vespa-cli --strip-component=1
echo "vespa-cli/bin" >> $GITHUB_PATH
- name: Run Integration Tests
run: echo TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO
- name: Setup Vespa Test Instance
run: make vespa_setup

- name: Run Tests
run: make test

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1-node16
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-1

- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
uses: aws-actions/amazon-ecr-login@v1.6.1

- name: Push Images to ECR
run: |
.github/retag-and-push.sh navigator-search-indexer latest
env:
DOCKER_REGISTRY: ${{ secrets.DOCKER_REGISTRY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ RUN poetry config virtualenvs.create false
RUN poetry install

# Copy files to image
COPY ./data ./data
COPY ./src ./src
COPY ./cli ./cli
COPY ./tests ./tests

# Add /app to the Python module search path
ENV PYTHONPATH "${PYTHONPATH}:/app"

# Run the indexer on the input s3 directory
ENTRYPOINT [ "sh", "./cli/run.sh" ]
ENTRYPOINT [ "sh", "./cli/run.sh" ]
36 changes: 4 additions & 32 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,47 +1,21 @@
include .env

.PHONY: build test dev_install opensearch_test_data
.PHONY: build test dev_install

setup:
cp .env.example .env

build:
docker build -t navigator-search-indexer .

download_indexer_inputs:
docker run --entrypoint aws --env-file=.env -v ${PWD}/data:/app/data navigator-search-indexer s3 sync ${INDEXER_INPUT_PREFIX} /app/data/indexer_input

run_encoding_docker:
docker run --entrypoint python -v ${PWD}/data:/app/data navigator-search-indexer -m cli.text2embeddings ./data/embeddings_input ./data/indexer_input

run_indexing_docker: download_indexer_inputs
docker run --entrypoint python --network=host --env-file=.env -v ${PWD}/data:/app/data navigator-search-indexer -m cli.index_data /app/data/indexer_input
vespa_setup: vespa_confirm_cli_installed vespa_dev_start vespa_healthy vespa_deploy_schema

test:
docker run --entrypoint python navigator-search-indexer -m pytest -vvv
docker-compose -f docker-compose.dev.yml run --rm navigator-search-indexer python -m pytest -vvv

dev_install:
poetry install && poetry run pre-commit install

test_against_aws:
cp Dockerfile.aws.example Dockerfile
docker build -t navigator-search-indexer-aws .
docker run -it navigator-search-indexer-aws python -m pytest

run_local_against_aws:
cp Dockerfile.aws.example Dockerfile
docker build -t navigator-search-indexer-aws .
docker run -e EMBEDDINGS_INPUT_PREFIX=${EMBEDDINGS_INPUT_PREFIX} -e INDEXER_INPUT_PREFIX=${INDEXER_INPUT_PREFIX} -it navigator-search-indexer-aws

# test data for backend
create_test_index:
docker run --entrypoint python --network=host --env-file=.env -e OPENSEARCH_INDEX_PREFIX=navigator_test -v ${PWD}/data:/app/data navigator-search-indexer -m cli.test.create_test_index /app/data/embeddings_input

opensearch_test_dump: create_test_index
rm -rf ./data/opensearch_test_dump/**
multielasticdump --input=http://admin:admin@localhost:9200 --output=./data/opensearch_test_dump --match="navigator_test_.*" --ignoreType=template


# setup dev/test vespa
vespa_confirm_cli_installed:
@if [ ! $$(which vespa) ]; then \
Expand All @@ -51,7 +25,7 @@ vespa_confirm_cli_installed:
fi

vespa_dev_start:
docker-compose -f docker-compose.dev.yml up -d --remove-orphans --wait
docker compose -f docker-compose.dev.yml up --detach --wait vespaindexertest

vespa_healthy:
@if [ ! $$(curl -f -s 'http://localhost:19071/status.html') ]; then \
Expand All @@ -63,5 +37,3 @@ vespa_healthy:
vespa_deploy_schema:
vespa config set target local
@vespa deploy tests/vespa_test_schema --wait 300

vespa_setup: vespa_confirm_cli_installed vespa_dev_start vespa_healthy vespa_deploy_schema
197 changes: 4 additions & 193 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,202 +1,13 @@
# Opensearch Indexer
# Vespa Indexer

The code in this folder contains a CLI tool used to index data into the Navigator search index:

* `index_data.py`: loads document metadata from the Navigator database and indexes it into the search index, alongside the text and the embeddings produced by the embeddings generation step.

There is also an `opensearch-query-example.ipynb` notebook that demonstrates running a query on the index. This is to be developed further and integrated into the Navigator APIs.

## Creating a test data dump for the backend

See `make opensearch_test_dump` and `cli/test/create_test_index.py`.

## Running

### 1. Building

`make build`

### 2. Loading data into Opensearch (in docker-compose)

Note: this command will wipe and repopulate the index specified in `.env` if it's already populated.

```shell
docker run --net=host --env-file .env -v /path/to/text-ids-file:/text-ids-path -v /path/to/embeddings-file:/embeddings-path navigator-search-indexer python /app/index_data.py --text-ids-path /text-ids-path --embeddings-path /embeddings-path -d 768
```

## Opensearch index structure

The following snippets are examples of the structure of different documents in the Opensearch index. Each document in the Opensearch index describes either a title, a description, or a text block of a document. **TODO: This will be revised once we remove the concept of actions from our database.**

**Example Opensearch document with text block:**

``` json
{
"document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
"document_id" : 167,
"document_name" : "Sustainable Finance Policy Framework of 2020",
"document_date" : "19/03/2020",
"document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks. ",
"document_category" : "Policy",
"document_type" : "Framework",
"document_keyword" : [
"Finance",
"Central Bank"
],
"document_sector_name" : "Finance",
"document_hazard_name" : [ ],
"document_instrument_name" : [
"Processes, plans and strategies|Governance",
"Capacity building|Governance"
],
"document_language" : "English",
"document_instrument_parent" : [ ],
"document_framework_name" : [ ],
"document_response_name" : [
"Mitigation",
"Adaptation"
],
"document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
"document_country_code" : "PHL",
"document_country_english_shortname" : "Philippines",
"document_region_english_shortname" : "East Asia & Pacific",
"document_region_code" : "East Asia & Pacific",
"document_source_name" : "CCLW",
"text_block_id" : "p0_b1",
"text" : "CIRCULAR NO. 1085",
"text_embedding" : [x768],
"text_block_coords" : [
[
263.2799987792969,
709.3638153076172
],
[
364.5785827636719,
709.3638153076172
],
[
364.5785827636719,
720.4228668212891
],
[
263.2799987792969,
720.4228668212891
]
],
"text_block_page" : 0
}
```

**Example Opensearch document with title:**

Note the `for_search_document_name` field which is used for title search; the `document_name` field is identical to this field but appears on all documents for sorting purposes.

``` json
{
"document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
"document_id" : 167,
"document_name" : "Sustainable Finance Policy Framework of 2020",
"document_date" : "19/03/2020",
"document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks. ",
"document_category" : "Policy",
"document_type" : "Framework",
"document_keyword" : [
"Finance",
"Central Bank"
],
"document_sector_name" : "Finance",
"document_hazard_name" : [ ],
"document_instrument_name" : [
"Processes, plans and strategies|Governance",
"Capacity building|Governance"
],
"document_language" : "English",
"document_instrument_parent" : [ ],
"document_framework_name" : [ ],
"document_response_name" : [
"Mitigation",
"Adaptation"
],
"document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
"document_country_code" : "PHL",
"document_country_english_shortname" : "Philippines",
"document_region_english_shortname" : "East Asia & Pacific",
"document_region_code" : "East Asia & Pacific",
"document_source_name" : "CCLW",
"for_search_document_name" : "Sustainable Finance Policy Framework of 2020"
}
```

**Example Opensearch document with description:**

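Note the `for_search_document_description` and `document_description` fields; they relate to each other in the same way as `for_search_document_name` and `document_name` do for titles (see the note on the title example).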

``` json
{
"document_url" : "https://cdn.climatepolicyradar.org/PHL/2020/PHL-2020-03-19-Sustainable Finance Policy Framework of 2020-319_1c11e58a696ca5741fdc3454b4369564.pdf",
"document_id" : 167,
"document_name" : "Sustainable Finance Policy Framework of 2020",
"document_date" : "19/03/2020",
"document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks. ",
"document_category" : "Policy",
"document_type" : "Framework",
"document_keyword" : [
"Finance",
"Central Bank"
],
"document_sector_name" : "Finance",
"document_hazard_name" : [ ],
"document_instrument_name" : [
"Processes, plans and strategies|Governance",
"Capacity building|Governance"
],
"document_language" : "English",
"document_instrument_parent" : [ ],
"document_framework_name" : [ ],
"document_response_name" : [
"Mitigation",
"Adaptation"
],
"document_name_and_id" : "Sustainable Finance Policy Framework of 2020 167",
"document_country_code" : "PHL",
"document_country_english_shortname" : "Philippines",
"document_region_english_shortname" : "East Asia & Pacific",
"document_region_code" : "East Asia & Pacific",
"document_source_name" : "CCLW",
"for_search_document_description" : "This document was approved by circular 1085/2020 of Philippines' central bank. It defines the bank's vision to integrate sustainability principles in corporate governance and risk management frameworks as well as in strategic objectives of banks. ",
"document_description_embedding" : [x768],
}
```

## Common issues

### Virtual memory

Error in docker logs:

```shell
opensearch-node1 | ERROR: [2] bootstrap checks failed
opensearch-node1 | [1]: max virtual memory areas vm.max_map_count [65530] is too low, increase to at least [262144]
opensearch-node1 | [2]: the default discovery settings are unsuitable for production use; at least one of [discovery.seed_hosts, discovery.seed_providers, cluster.initial_master_nodes] must be configured
opensearch-node1 | ERROR: OpenSearch did not exit normally - check the logs at /usr/share/opensearch/logs/opensearch-cluster.log
opensearch-node1 | [2022-04-14T14:49:58,972][INFO ][o.o.n.Node ] [opensearch-node1] stopping ...
opensearch-node1 | [2022-04-14T14:49:58,985][INFO ][o.o.n.Node ] [opensearch-node1] stopped
opensearch-node1 | [2022-04-14T14:49:58,985][INFO ][o.o.n.Node ] [opensearch-node1] closing ...
opensearch-node1 | [2022-04-14T14:49:58,995][INFO ][o.o.n.Node ] [opensearch-node1] closed
opensearch-node1 | Killing performance analyzer process 34
opensearch-node1 | OpenSearch exited with code 78
opensearch-node1 | Performance analyzer exited with code 143
```

Run [this command](https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html) on the host machine:

```shell
sysctl -w vm.max_map_count=262144
```

# Vespa test setup

```
make vespa_test_setup
poetry run pytest ./tests
make build
make vespa_setup
make test
```
10 changes: 2 additions & 8 deletions cli/index_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""Index data into a running Opensearch index."""

import os
import sys
import time
Expand All @@ -13,7 +11,6 @@
from tqdm.auto import tqdm
from cpr_data_access.parser_models import ParserOutput

from src.index.opensearch import populate_opensearch
from src.index.vespa_ import populate_vespa

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
Expand Down Expand Up @@ -113,11 +110,8 @@ def run_as_cli(
index_type: str,
) -> None:
if index_type.lower() == "opensearch":
tasks, embedding_dir_as_path = _get_index_tasks(
indexer_input_dir, s3, files_to_index, limit
)
populate_opensearch(tasks=tasks, embedding_dir_as_path=embedding_dir_as_path)
sys.exit(0)
click.echo(f"Index type: {index_type}, is no longer used", err=True)
sys.exit(1)
elif index_type.lower() == "vespa":
_LOGGER.warning("Vespa indexing still experimental")
tasks, embedding_dir_as_path = _get_index_tasks(
Expand Down
Loading

0 comments on commit 8a66238

Please sign in to comment.