Skip to content

Commit d055a02

Browse files
committed
attempt to anonymize transcripts
1 parent fa8a413 commit d055a02

File tree

11 files changed

+1062
-18
lines changed

11 files changed

+1062
-18
lines changed

src/app/endpoints/conversations.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from app.database import get_session
2121
from utils.endpoints import check_configuration_loaded, validate_conversation_ownership
2222
from utils.suid import check_suid
23+
from utils.user_anonymization import get_anonymous_user_id
2324

2425
logger = logging.getLogger("app.endpoints.handlers")
2526
router = APIRouter(tags=["conversations"])
@@ -154,13 +155,22 @@ def get_conversations_list_endpoint_handler(
154155

155156
user_id, _, _ = auth
156157

157-
logger.info("Retrieving conversations for user %s", user_id)
158+
# Get anonymous user ID for database lookup
159+
anonymous_user_id = get_anonymous_user_id(user_id)
160+
161+
logger.info(
162+
"Retrieving conversations for user %s (anonymous: %s)",
163+
user_id[:8] + "..." if len(user_id) > 8 else user_id,
164+
anonymous_user_id,
165+
)
158166

159167
with get_session() as session:
160168
try:
161-
# Get all conversations for this user
169+
# Get all conversations for this user using anonymous ID
162170
user_conversations = (
163-
session.query(UserConversation).filter_by(user_id=user_id).all()
171+
session.query(UserConversation)
172+
.filter_by(anonymous_user_id=anonymous_user_id)
173+
.all()
164174
)
165175

166176
# Return conversation summaries with metadata

src/app/endpoints/query.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,17 @@
2424
from configuration import configuration
2525
from app.database import get_session
2626
import metrics
27+
import constants
2728
from models.database.conversations import UserConversation
2829
from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse
2930
from models.requests import QueryRequest, Attachment
30-
import constants
3131
from utils.endpoints import (
3232
check_configuration_loaded,
3333
get_agent,
3434
get_system_prompt,
3535
validate_conversation_ownership,
3636
)
37+
from utils.user_anonymization import get_anonymous_user_id
3738
from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups
3839
from utils.transcripts import store_transcript
3940
from utils.types import TurnSummary
@@ -76,25 +77,31 @@ def is_transcripts_enabled() -> bool:
7677
def persist_user_conversation_details(
7778
user_id: str, conversation_id: str, model: str, provider_id: str
7879
) -> None:
79-
"""Associate conversation to user in the database."""
80+
"""Associate conversation to user in the database using anonymous user ID."""
81+
# Get anonymous user ID for database storage
82+
anonymous_user_id = get_anonymous_user_id(user_id)
83+
8084
with get_session() as session:
8185
existing_conversation = (
8286
session.query(UserConversation)
83-
.filter_by(id=conversation_id, user_id=user_id)
87+
.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
8488
.first()
8589
)
8690

8791
if not existing_conversation:
8892
conversation = UserConversation(
8993
id=conversation_id,
90-
user_id=user_id,
94+
anonymous_user_id=anonymous_user_id,
9195
last_used_model=model,
9296
last_used_provider=provider_id,
9397
message_count=1,
9498
)
9599
session.add(conversation)
96100
logger.debug(
97-
"Associated conversation %s to user %s", conversation_id, user_id
101+
"Associated conversation %s to anonymous user %s (original: %s)",
102+
conversation_id,
103+
anonymous_user_id,
104+
user_id[:8] + "..." if len(user_id) > 8 else user_id,
98105
)
99106
else:
100107
existing_conversation.last_used_model = model

src/models/database/conversations.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ class UserConversation(Base): # pylint: disable=too-few-public-methods
1616
# The conversation ID
1717
id: Mapped[str] = mapped_column(primary_key=True)
1818

19-
# The user ID associated with the conversation
20-
user_id: Mapped[str] = mapped_column(index=True)
19+
# The anonymous user ID associated with the conversation
20+
anonymous_user_id: Mapped[str] = mapped_column(index=True)
2121

2222
# The last provider/model used in the conversation
2323
last_used_model: Mapped[str] = mapped_column()
src/models/database/user_mapping.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""User ID anonymization mapping model."""
2+
3+
from datetime import datetime
4+
5+
from sqlalchemy.orm import Mapped, mapped_column
6+
from sqlalchemy import DateTime, func, Index
7+
8+
from models.database.base import Base
9+
10+
11+
class UserMapping(Base):  # pylint: disable=too-few-public-methods
    """Model for mapping real user IDs to anonymous UUIDs.

    Each row pairs the hash of one authentication user ID with the anonymous
    identifier that is stored in its place.
    """

    __tablename__ = "user_mapping"

    # Anonymous UUID used for all storage/analytics (primary key)
    anonymous_id: Mapped[str] = mapped_column(primary_key=True)

    # Hash of the original user ID from authentication. Uniqueness and fast
    # lookup both come from the single unique index in __table_args__, so the
    # column itself carries no index/unique flags (setting them as well would
    # create a second, redundant index on the same column).
    user_id_hash: Mapped[str] = mapped_column()

    # Row creation time, filled in by the database server.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),  # pylint: disable=not-callable
    )

    # One unique index: enforces a single mapping per hashed user ID and
    # serves the lookups by hash.
    __table_args__ = (
        Index("ix_user_mapping_hash_lookup", "user_id_hash", unique=True),
    )

src/utils/endpoints.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from configuration import AppConfig
1414
from utils.suid import get_suid
1515
from utils.types import GraniteToolParser
16+
from utils.user_anonymization import get_anonymous_user_id
1617

1718

1819
logger = logging.getLogger("utils.endpoints")
@@ -21,11 +22,14 @@
2122
def validate_conversation_ownership(
2223
user_id: str, conversation_id: str
2324
) -> UserConversation | None:
24-
"""Validate that the conversation belongs to the user."""
25+
"""Validate that the conversation belongs to the user using anonymous ID lookup."""
26+
# Get anonymous user ID for database lookup
27+
anonymous_user_id = get_anonymous_user_id(user_id)
28+
2529
with get_session() as session:
2630
conversation = (
2731
session.query(UserConversation)
28-
.filter_by(id=conversation_id, user_id=user_id)
32+
.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
2933
.first()
3034
)
3135
return conversation

src/utils/transcripts.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,16 @@
1414
from models.requests import Attachment, QueryRequest
1515
from utils.suid import get_suid
1616
from utils.types import TurnSummary
17+
from utils.user_anonymization import get_anonymous_user_id
1718

1819
logger = logging.getLogger("utils.transcripts")
1920

2021

21-
def construct_transcripts_path(user_id: str, conversation_id: str) -> Path:
22-
"""Construct path to transcripts."""
22+
def construct_transcripts_path(anonymous_user_id: str, conversation_id: str) -> Path:
23+
"""Construct path to transcripts using anonymous user ID."""
2324
# these two normalizations are required by Snyk as it detects
2425
# this Path sanitization pattern
25-
uid = os.path.normpath("/" + user_id).lstrip("/")
26+
uid = os.path.normpath("/" + anonymous_user_id).lstrip("/")
2627
cid = os.path.normpath("/" + conversation_id).lstrip("/")
2728
file_path = (
2829
configuration.user_data_collection_configuration.transcripts_storage or ""
@@ -46,7 +47,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional-
4647
"""Store transcript in the local filesystem.
4748
4849
Args:
49-
user_id: The user ID (UUID).
50+
user_id: The original user ID from authentication (will be anonymized).
5051
conversation_id: The conversation ID (UUID).
5152
query_is_valid: The result of the query validation.
5253
query: The query (without attachments).
@@ -56,7 +57,15 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional-
5657
truncated: The flag indicating if the history was truncated.
5758
attachments: The list of `Attachment` objects.
5859
"""
59-
transcripts_path = construct_transcripts_path(user_id, conversation_id)
60+
# Get anonymous user ID for storage
61+
anonymous_user_id = get_anonymous_user_id(user_id)
62+
logger.debug(
63+
"Anonymized user %s to %s for transcript storage",
64+
user_id[:8] + "..." if len(user_id) > 8 else user_id,
65+
anonymous_user_id,
66+
)
67+
68+
transcripts_path = construct_transcripts_path(anonymous_user_id, conversation_id)
6069
transcripts_path.mkdir(parents=True, exist_ok=True)
6170

6271
data_to_store = {
@@ -65,7 +74,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional-
6574
"model": model_id,
6675
"query_provider": query_request.provider,
6776
"query_model": query_request.model,
68-
"user_id": user_id,
77+
"anonymous_user_id": anonymous_user_id, # Store anonymous ID only
6978
"conversation_id": conversation_id,
7079
"timestamp": datetime.now(UTC).isoformat(),
7180
},

src/utils/user_anonymization.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
"""User ID anonymization utilities."""
2+
3+
import hashlib
4+
import logging
5+
from typing import Optional
6+
7+
from sqlalchemy.exc import IntegrityError
8+
9+
from models.database.user_mapping import UserMapping
10+
from app.database import get_session
11+
from utils.suid import get_suid
12+
13+
logger = logging.getLogger("utils.user_anonymization")
14+
15+
16+
def _hash_user_id(user_id: str) -> str:
17+
"""
18+
Create a consistent hash of the user ID for mapping purposes.
19+
20+
Uses SHA-256 with a fixed salt to ensure consistent hashing
21+
while preventing rainbow table attacks.
22+
"""
23+
# Use a fixed salt - in production, this should be configurable
24+
salt = "lightspeed_user_anonymization_salt_v1"
25+
hash_input = f"{salt}:{user_id}".encode("utf-8")
26+
return hashlib.sha256(hash_input).hexdigest()
27+
28+
29+
def get_anonymous_user_id(auth_user_id: str) -> str:
    """
    Get or create an anonymous UUID for a user ID from authentication.

    This function:
    1. Hashes the original user ID for secure storage
    2. Looks up existing anonymous mapping
    3. Creates new anonymous UUID if none exists
    4. Returns the anonymous UUID for use in storage/analytics

    Args:
        auth_user_id: The original user ID from authentication

    Returns:
        Anonymous UUID string for this user

    Raises:
        RuntimeError: If inserting the mapping fails with an integrity error
            and no existing mapping can be found afterwards.
    """
    user_id_hash = _hash_user_id(auth_user_id)
    # Only this short prefix of the hash is ever written to the logs.
    hash_prefix = user_id_hash[:8] + "..."

    with get_session() as session:
        # Try to find existing mapping
        existing_mapping = (
            session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first()
        )

        if existing_mapping:
            logger.debug("Found existing anonymous ID for user hash %s", hash_prefix)
            return existing_mapping.anonymous_id

        # Create new anonymous mapping
        anonymous_id = get_suid()
        new_mapping = UserMapping(anonymous_id=anonymous_id, user_id_hash=user_id_hash)

        try:
            session.add(new_mapping)
            session.commit()
        except IntegrityError as e:
            session.rollback()
            # Race condition - another thread created the mapping.
            # The exception itself is deliberately not logged: its string form
            # normally includes the failed statement's bound parameters, i.e.
            # the full user hash together with the anonymous ID.
            logger.warning(
                "Race condition creating user mapping for hash %s", hash_prefix
            )

            # Try to fetch the mapping created by the other thread
            existing_mapping = (
                session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first()
            )

            if existing_mapping:
                return existing_mapping.anonymous_id

            # If we still can't find it, something is wrong
            logger.error(
                "Failed to create or retrieve user mapping for hash %s",
                hash_prefix,
            )
            raise RuntimeError("Unable to create or retrieve anonymous user ID") from e

        logger.info(
            "Created new anonymous ID %s for user hash %s",
            anonymous_id,
            hash_prefix,
        )
        return anonymous_id
92+
93+
94+
def get_user_count() -> int:
    """
    Count the rows stored in the user mapping table.

    Returns:
        Number of ``UserMapping`` rows, i.e. of known anonymous users
    """
    with get_session() as session:
        mapping_query = session.query(UserMapping)
        return mapping_query.count()
103+
104+
105+
def find_anonymous_user_id(auth_user_id: str) -> Optional[str]:
    """
    Look up the anonymous ID already mapped to a user, never creating one.

    Args:
        auth_user_id: The original user ID from authentication

    Returns:
        The mapped anonymous UUID, or None when no mapping exists
    """
    hashed_id = _hash_user_id(auth_user_id)

    with get_session() as session:
        lookup = session.query(UserMapping).filter_by(user_id_hash=hashed_id)
        mapping = lookup.first()

        # Guard clause: no row for this hash means the user is unknown.
        if not mapping:
            return None
        return mapping.anonymous_id

0 commit comments

Comments
 (0)