Skip to content

Commit

Permalink
Fix: Allow vespa indexing direct from s3 (#69)
Browse files (browse the repository at this point in the history)
* Fix: Allow vespa indexing direct from s3

* Update src/index/vespa.py

Co-authored-by: olaughter <[email protected]>

* remove unnecessary seek

---------

Co-authored-by: olaughter <[email protected]>
  • Loading branch information
Joel Wright and olaughter authored Oct 25, 2023
1 parent b749366 commit 06b5b7a
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/index/vespa.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
import asyncio
import logging
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import (
Annotated,
Expand All @@ -11,6 +12,7 @@
Sequence,
Tuple,
Union,
cast,
)

from cloudpathlib import S3Path
Expand Down Expand Up @@ -133,7 +135,9 @@ def get_document_generator(

physical_document_count = 0
for task in tasks:
embeddings = np.load(str(embedding_dir_as_path / f"{task.document_id}.npy"))
task_array_file_path = cast(Path, embedding_dir_as_path / f"{task.document_id}.npy")
with open(task_array_file_path, "rb") as task_array_file_like:
embeddings = np.load(BytesIO(task_array_file_like.read()))

family_document_id = DocumentID(task.document_metadata.family_import_id)
family_document = VespaFamilyDocument(
Expand Down

0 comments on commit 06b5b7a

Please sign in to comment.