refactor: moved document timestamp handling from cli into indexer

ErikBjare · ErikBjare · commit 0b71d3c98bb6 · 2025-01-18T15:41:59.000+01:00
diff --git a/gptme_rag/cli.py b/gptme_rag/cli.py
@@ -4,7 +4,6 @@
 import signal
 import sys
 import time
-from datetime import datetime
 from pathlib import Path
 
 import click
@@ -58,33 +57,7 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
     try:
         indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
 
-        # Get existing files and their metadata from the index, using absolute paths
-        existing_docs = indexer.get_all_documents()
-        logger.debug("Found %d existing documents in index", len(existing_docs))
-
-        existing_files = {}
-        for doc in existing_docs:
-            if "source" in doc.metadata:
-                abs_path = os.path.abspath(doc.metadata["source"])
-                last_modified = doc.metadata.get("last_modified")
-                if last_modified:
-                    try:
-                        # Parse ISO format timestamp to float
-                        existing_files[abs_path] = datetime.fromisoformat(
-                            last_modified
-                        ).timestamp()
-                    except ValueError:
-                        logger.warning(
-                            "Invalid last_modified format: %s", last_modified
-                        )
-                        existing_files[abs_path] = 0
-                else:
-                    existing_files[abs_path] = 0
-                # logger.debug("Existing file: %s", abs_path)  # Too spammy
-
-        logger.debug("Loaded %d existing files from index", len(existing_files))
-
-        # First, collect all documents and filter for new/modified
+        # Collect all documents (indexer handles modification checking)
         all_documents = []
         with console.status("Collecting documents...") as status:
             for path in paths:
@@ -93,47 +66,18 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
                 else:
                     status.update(f"Processing directory: {path}")
 
-                documents = indexer.collect_documents(path)
-
-                # Filter for new or modified documents
-                filtered_documents = []
-                for doc in documents:
-                    source = doc.metadata.get("source")
-                    if source:
-                        # Resolve to absolute path for consistent comparison
-                        abs_source = os.path.abspath(source)
-                        doc.metadata["source"] = abs_source
-                        current_mtime = os.path.getmtime(abs_source)
-
-                        # Include if file is new or modified
-                        if abs_source not in existing_files:
-                            logger.debug("New file: %s", abs_source)
-                            filtered_documents.append(doc)
-                        # Round to microseconds (6 decimal places) for comparison
-                        elif round(current_mtime, 6) > round(
-                            existing_files[abs_source], 6
-                        ):
-                            logger.debug(
-                                "Modified file: %s (current: %s, stored: %s)",
-                                abs_source,
-                                current_mtime,
-                                existing_files[abs_source],
-                            )
-                            filtered_documents.append(doc)
-                        else:
-                            logger.debug("Unchanged file: %s", abs_source)
-
-                all_documents.extend(filtered_documents)
+                documents = indexer.collect_documents(path, check_modified=True)
+                all_documents.extend(documents)
 
         if not all_documents:
             console.print("No new or modified documents to index", style="yellow")
             return
 
-        # Then process them with a progress bar
+        # Process with progress bar
         n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
         n_chunks = len(all_documents)
 
-        logger.info(f"Found {n_files} new/modified files to index ({n_chunks} chunks)")
+        logger.info(f"Processing {n_files} files ({n_chunks} chunks)")
 
         with tqdm(
             total=n_chunks,
diff --git a/gptme_rag/indexing/indexer.py b/gptme_rag/indexing/indexer.py
@@ -1,7 +1,9 @@
 import logging
+import os
 import subprocess
 import time
 from collections.abc import Generator
+from datetime import datetime
 from fnmatch import fnmatch as fnmatch_path
 from logging import Filter
 from pathlib import Path
@@ -176,7 +178,22 @@ def add_documents(self, documents: list[Document], batch_size: int = 10) -> None
             documents: List of documents to add
             batch_size: Number of documents to process in each batch
         """
-        list(self.add_documents_progress(documents, batch_size=batch_size))
+        # Process documents in batches
+        for i in range(0, len(documents), batch_size):
+            batch = documents[i : i + batch_size]
+            self._add_documents(batch)
+
+            # Update stored timestamps after successful indexing
+            for doc in batch:
+                if "source" in doc.metadata:
+                    abs_path = str(Path(doc.metadata["source"]).resolve())
+                    current_mtime = int(os.path.getmtime(abs_path))
+                    doc.metadata["last_modified"] = current_mtime
+                    # Update the document in the collection
+                    self.collection.update(
+                        ids=[doc.doc_id],
+                        metadatas=[doc.metadata],
+                    )
 
     def add_documents_progress(
         self, documents: list[Document], batch_size: int = 10
@@ -201,6 +218,14 @@ def _add_documents(self, documents: list[Document]) -> None:
                 doc = self._generate_doc_id(doc)
                 assert doc.doc_id is not None
 
+                # Update timestamp in metadata to current time
+                if "source" in doc.metadata:
+                    abs_path = str(Path(doc.metadata["source"]).resolve())
+                    current_mtime = os.path.getmtime(abs_path)
+                    doc.metadata["last_modified"] = self._normalize_timestamp(
+                        current_mtime
+                    )
+
                 contents.append(doc.content)
                 metadatas.append(doc.metadata)
                 ids.append(doc.doc_id)
@@ -869,14 +894,54 @@ def _get_valid_files(
 
         return valid_files
 
+    def _normalize_timestamp(self, timestamp: str | float | int | None) -> str:
+        """Normalize timestamp to ISO format string."""
+        if timestamp is None:
+            return datetime.fromtimestamp(0).isoformat()
+        try:
+            if isinstance(timestamp, int | float):
+                return datetime.fromtimestamp(float(timestamp)).isoformat()
+            # If it's already an ISO string, validate and return
+            if isinstance(timestamp, str):
+                datetime.fromisoformat(timestamp)  # Validate format
+                return timestamp
+            raise ValueError(f"Unsupported timestamp type: {type(timestamp)}")
+        except (ValueError, TypeError) as e:
+            logger.warning("Invalid timestamp format: %s (%s)", timestamp, e)
+            return datetime.fromtimestamp(0).isoformat()
+
+    def _compare_timestamps(self, stored: str, current: float) -> bool:
+        """Compare stored ISO timestamp with current Unix timestamp.
+
+        Returns True if current is newer than stored."""
+        try:
+            stored_ts = datetime.fromisoformat(stored).timestamp()
+            # Round to seconds for comparison
+            return int(current) > int(stored_ts)
+        except (ValueError, TypeError) as e:
+            logger.warning("Error comparing timestamps: %s", e)
+            return True  # If we can't compare, assume modified
+
+    def _get_stored_timestamps(self) -> dict[str, str]:
+        """Get stored timestamps for all indexed files."""
+        stored = {}
+        for doc in self.get_all_documents():
+            if "source" in doc.metadata:
+                abs_path = str(Path(doc.metadata["source"]).resolve())
+                timestamp = self._normalize_timestamp(doc.metadata.get("last_modified"))
+                stored[abs_path] = timestamp
+                logger.debug("Stored timestamp for %s: %s", abs_path, timestamp)
+        return stored
+
     def collect_documents(
-        self, path: Path, glob_pattern: str = "**/*.*"
+        self, path: Path, glob_pattern: str = "**/*.*", check_modified: bool = True
     ) -> list[Document]:
         """Collect documents from a file or directory without processing them.
 
         Args:
             path: Path to collect documents from
             glob_pattern: Pattern to match files (only used for directories)
+            check_modified: Whether to check for modifications (skip unchanged files)
 
         Returns:
             List of documents ready for processing
@@ -888,8 +953,33 @@ def collect_documents(
             logger.debug(f"No valid files found in {path}")
             return documents
 
+        if check_modified:
+            stored_timestamps = self._get_stored_timestamps()
+
         # Process files in order (least deep first)
         for file_path in sorted(valid_files, key=lambda x: len(x.parts)):
+            abs_path = str(file_path.resolve())
+
+            if check_modified:
+                current_mtime = os.path.getmtime(file_path)
+                stored_timestamp = stored_timestamps.get(abs_path)
+
+                if stored_timestamp and not self._compare_timestamps(
+                    stored_timestamp, current_mtime
+                ):
+                    logger.debug("Skipping unchanged file: %s", abs_path)
+                    continue
+
+                if not stored_timestamp:
+                    logger.debug("New file: %s", abs_path)
+                else:
+                    logger.debug(
+                        "Modified file: %s (current: %s, stored: %s)",
+                        abs_path,
+                        self._normalize_timestamp(current_mtime),
+                        stored_timestamp,
+                    )
+
             logger.debug(f"Processing {file_path}")
             documents.extend(Document.from_file(file_path, processor=self.processor))
 
@@ -918,6 +1008,4 @@ def get_all_documents(self) -> list[Document]:
         """
         logger.debug("Getting all documents from index")
         docs = self.list_documents(group_by_source=False)
-        for doc in docs:
-            logger.debug("Retrieved document with metadata: %s", doc.metadata)
         return docs