Skip to content

Commit 0b71d3c

Browse files
committed
refactor: moved document timestamp handling from cli into indexer
1 parent 50147e7 commit 0b71d3c

File tree

2 files changed

+97
-65
lines changed

2 files changed

+97
-65
lines changed

Diff for: gptme_rag/cli.py

+5-61
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import signal
55
import sys
66
import time
7-
from datetime import datetime
87
from pathlib import Path
98

109
import click
@@ -58,33 +57,7 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
5857
try:
5958
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
6059

61-
# Get existing files and their metadata from the index, using absolute paths
62-
existing_docs = indexer.get_all_documents()
63-
logger.debug("Found %d existing documents in index", len(existing_docs))
64-
65-
existing_files = {}
66-
for doc in existing_docs:
67-
if "source" in doc.metadata:
68-
abs_path = os.path.abspath(doc.metadata["source"])
69-
last_modified = doc.metadata.get("last_modified")
70-
if last_modified:
71-
try:
72-
# Parse ISO format timestamp to float
73-
existing_files[abs_path] = datetime.fromisoformat(
74-
last_modified
75-
).timestamp()
76-
except ValueError:
77-
logger.warning(
78-
"Invalid last_modified format: %s", last_modified
79-
)
80-
existing_files[abs_path] = 0
81-
else:
82-
existing_files[abs_path] = 0
83-
# logger.debug("Existing file: %s", abs_path) # Too spammy
84-
85-
logger.debug("Loaded %d existing files from index", len(existing_files))
86-
87-
# First, collect all documents and filter for new/modified
60+
# Collect all documents (indexer handles modification checking)
8861
all_documents = []
8962
with console.status("Collecting documents...") as status:
9063
for path in paths:
@@ -93,47 +66,18 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
9366
else:
9467
status.update(f"Processing directory: {path}")
9568

96-
documents = indexer.collect_documents(path)
97-
98-
# Filter for new or modified documents
99-
filtered_documents = []
100-
for doc in documents:
101-
source = doc.metadata.get("source")
102-
if source:
103-
# Resolve to absolute path for consistent comparison
104-
abs_source = os.path.abspath(source)
105-
doc.metadata["source"] = abs_source
106-
current_mtime = os.path.getmtime(abs_source)
107-
108-
# Include if file is new or modified
109-
if abs_source not in existing_files:
110-
logger.debug("New file: %s", abs_source)
111-
filtered_documents.append(doc)
112-
# Round to microseconds (6 decimal places) for comparison
113-
elif round(current_mtime, 6) > round(
114-
existing_files[abs_source], 6
115-
):
116-
logger.debug(
117-
"Modified file: %s (current: %s, stored: %s)",
118-
abs_source,
119-
current_mtime,
120-
existing_files[abs_source],
121-
)
122-
filtered_documents.append(doc)
123-
else:
124-
logger.debug("Unchanged file: %s", abs_source)
125-
126-
all_documents.extend(filtered_documents)
69+
documents = indexer.collect_documents(path, check_modified=True)
70+
all_documents.extend(documents)
12771

12872
if not all_documents:
12973
console.print("No new or modified documents to index", style="yellow")
13074
return
13175

132-
# Then process them with a progress bar
76+
# Process with progress bar
13377
n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
13478
n_chunks = len(all_documents)
13579

136-
logger.info(f"Found {n_files} new/modified files to index ({n_chunks} chunks)")
80+
logger.info(f"Processing {n_files} files ({n_chunks} chunks)")
13781

13882
with tqdm(
13983
total=n_chunks,

Diff for: gptme_rag/indexing/indexer.py

+92-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import logging
2+
import os
23
import subprocess
34
import time
45
from collections.abc import Generator
6+
from datetime import datetime
57
from fnmatch import fnmatch as fnmatch_path
68
from logging import Filter
79
from pathlib import Path
@@ -176,7 +178,22 @@ def add_documents(self, documents: list[Document], batch_size: int = 10) -> None
176178
documents: List of documents to add
177179
batch_size: Number of documents to process in each batch
178180
"""
179-
list(self.add_documents_progress(documents, batch_size=batch_size))
181+
# Process documents in batches
182+
for i in range(0, len(documents), batch_size):
183+
batch = documents[i : i + batch_size]
184+
self._add_documents(batch)
185+
186+
# Update stored timestamps after successful indexing
187+
for doc in batch:
188+
if "source" in doc.metadata:
189+
abs_path = str(Path(doc.metadata["source"]).resolve())
190+
current_mtime = int(os.path.getmtime(abs_path))
191+
doc.metadata["last_modified"] = current_mtime
192+
# Update the document in the collection
193+
self.collection.update(
194+
ids=[doc.doc_id],
195+
metadatas=[doc.metadata],
196+
)
180197

181198
def add_documents_progress(
182199
self, documents: list[Document], batch_size: int = 10
@@ -201,6 +218,14 @@ def _add_documents(self, documents: list[Document]) -> None:
201218
doc = self._generate_doc_id(doc)
202219
assert doc.doc_id is not None
203220

221+
# Update timestamp in metadata to current time
222+
if "source" in doc.metadata:
223+
abs_path = str(Path(doc.metadata["source"]).resolve())
224+
current_mtime = os.path.getmtime(abs_path)
225+
doc.metadata["last_modified"] = self._normalize_timestamp(
226+
current_mtime
227+
)
228+
204229
contents.append(doc.content)
205230
metadatas.append(doc.metadata)
206231
ids.append(doc.doc_id)
@@ -869,14 +894,54 @@ def _get_valid_files(
869894

870895
return valid_files
871896

897+
def _normalize_timestamp(self, timestamp: str | float | int | None) -> str:
898+
"""Normalize timestamp to ISO format string."""
899+
if timestamp is None:
900+
return datetime.fromtimestamp(0).isoformat()
901+
try:
902+
if isinstance(timestamp, int | float):
903+
return datetime.fromtimestamp(float(timestamp)).isoformat()
904+
# If it's already an ISO string, validate and return
905+
if isinstance(timestamp, str):
906+
datetime.fromisoformat(timestamp) # Validate format
907+
return timestamp
908+
raise ValueError(f"Unsupported timestamp type: {type(timestamp)}")
909+
except (ValueError, TypeError) as e:
910+
logger.warning("Invalid timestamp format: %s (%s)", timestamp, e)
911+
return datetime.fromtimestamp(0).isoformat()
912+
913+
def _compare_timestamps(self, stored: str, current: float) -> bool:
914+
"""Compare stored ISO timestamp with current Unix timestamp.
915+
916+
Returns True if current is newer than stored."""
917+
try:
918+
stored_ts = datetime.fromisoformat(stored).timestamp()
919+
# Round to seconds for comparison
920+
return int(current) > int(stored_ts)
921+
except (ValueError, TypeError) as e:
922+
logger.warning("Error comparing timestamps: %s", e)
923+
return True # If we can't compare, assume modified
924+
925+
def _get_stored_timestamps(self) -> dict[str, str]:
926+
"""Get stored timestamps for all indexed files."""
927+
stored = {}
928+
for doc in self.get_all_documents():
929+
if "source" in doc.metadata:
930+
abs_path = str(Path(doc.metadata["source"]).resolve())
931+
timestamp = self._normalize_timestamp(doc.metadata.get("last_modified"))
932+
stored[abs_path] = timestamp
933+
logger.debug("Stored timestamp for %s: %s", abs_path, timestamp)
934+
return stored
935+
872936
def collect_documents(
873-
self, path: Path, glob_pattern: str = "**/*.*"
937+
self, path: Path, glob_pattern: str = "**/*.*", check_modified: bool = True
874938
) -> list[Document]:
875939
"""Collect documents from a file or directory without processing them.
876940
877941
Args:
878942
path: Path to collect documents from
879943
glob_pattern: Pattern to match files (only used for directories)
944+
check_modified: Whether to check for modifications (skip unchanged files)
880945
881946
Returns:
882947
List of documents ready for processing
@@ -888,8 +953,33 @@ def collect_documents(
888953
logger.debug(f"No valid files found in {path}")
889954
return documents
890955

956+
if check_modified:
957+
stored_timestamps = self._get_stored_timestamps()
958+
891959
# Process files in order (least deep first)
892960
for file_path in sorted(valid_files, key=lambda x: len(x.parts)):
961+
abs_path = str(file_path.resolve())
962+
963+
if check_modified:
964+
current_mtime = os.path.getmtime(file_path)
965+
stored_timestamp = stored_timestamps.get(abs_path)
966+
967+
if stored_timestamp and not self._compare_timestamps(
968+
stored_timestamp, current_mtime
969+
):
970+
logger.debug("Skipping unchanged file: %s", abs_path)
971+
continue
972+
973+
if not stored_timestamp:
974+
logger.debug("New file: %s", abs_path)
975+
else:
976+
logger.debug(
977+
"Modified file: %s (current: %s, stored: %s)",
978+
abs_path,
979+
self._normalize_timestamp(current_mtime),
980+
stored_timestamp,
981+
)
982+
893983
logger.debug(f"Processing {file_path}")
894984
documents.extend(Document.from_file(file_path, processor=self.processor))
895985

@@ -918,6 +1008,4 @@ def get_all_documents(self) -> list[Document]:
9181008
"""
9191009
logger.debug("Getting all documents from index")
9201010
docs = self.list_documents(group_by_source=False)
921-
for doc in docs:
922-
logger.debug("Retrieved document with metadata: %s", doc.metadata)
9231011
return docs

0 commit comments

Comments
 (0)