Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Stabilize support for MSC3970: updated transaction semantics (scope to device_id) #15629

Merged
merged 15 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/15629.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Scope transaction IDs to devices (implement [MSC3970](https://github.com/matrix-org/matrix-spec-proposals/pull/3970)).
3 changes: 0 additions & 3 deletions synapse/config/experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,6 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
"msc3981_recurse_relations", False
)

# MSC3970: Scope transaction IDs to devices
self.msc3970_enabled = experimental.get("msc3970_enabled", False)

# MSC4009: E.164 Matrix IDs
self.msc4009_e164_mxids = experimental.get("msc4009_e164_mxids", False)

Expand Down
31 changes: 12 additions & 19 deletions synapse/events/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,16 +405,13 @@ def serialize_event(
time_now_ms: int,
*,
config: SerializeEventConfig = _DEFAULT_SERIALIZE_EVENT_CONFIG,
msc3970_enabled: bool = False,
) -> JsonDict:
"""Serialize event for clients

Args:
e
time_now_ms
config: Event serialization config
msc3970_enabled: Whether MSC3970 is enabled. It changes whether we should
include the `transaction_id` in the event's `unsigned` section.

Returns:
The serialized event dictionary.
Expand All @@ -440,28 +437,29 @@ def serialize_event(
e.unsigned["redacted_because"],
time_now_ms,
config=config,
msc3970_enabled=msc3970_enabled,
)

# If we have a txn_id saved in the internal_metadata, we should include it in the
# unsigned section of the event if it was sent by the same session as the one
# requesting the event.
txn_id: Optional[str] = getattr(e.internal_metadata, "txn_id", None)
if txn_id is not None and config.requester is not None:
# For the MSC3970 rules to be applied, we *need* to have the device ID in the
# event internal metadata. Since we were not recording them before, if it hasn't
# been recorded, we fallback to the old behaviour.
# Old events do not have the device ID stored in the internal metadata,
# if it hasn't been recorded, fallback to using the access token instead.
event_device_id: Optional[str] = getattr(e.internal_metadata, "device_id", None)
if msc3970_enabled and event_device_id is not None:
if event_device_id is not None:
clokep marked this conversation as resolved.
Show resolved Hide resolved
if event_device_id == config.requester.device_id:
d["unsigned"]["transaction_id"] = txn_id

else:
# The pre-MSC3970 behaviour is to only include the transaction ID if the
# event was sent from the same access token. For regular users, we can use
# the access token ID to determine this. For guests, we can't, but since
# each guest only has one access token, we can just check that the event was
# sent by the same user as the one requesting the event.
# Fallback behaviour: only include the transaction ID if the event
# was sent from the same access token.
#
# For regular users, the access token ID can be used to determine this.
#
# For guests, we can't check the access token ID, but since each guest
# only has one access token, just check the event was sent by the same
# user as the one requesting the event.
event_token_id: Optional[int] = getattr(
e.internal_metadata, "token_id", None
)
Expand Down Expand Up @@ -504,9 +502,6 @@ class EventClientSerializer:
clients.
"""

def __init__(self, *, msc3970_enabled: bool = False):
self._msc3970_enabled = msc3970_enabled

def serialize_event(
self,
event: Union[JsonDict, EventBase],
Expand All @@ -531,9 +526,7 @@ def serialize_event(
if not isinstance(event, EventBase):
return event

serialized_event = serialize_event(
event, time_now, config=config, msc3970_enabled=self._msc3970_enabled
)
serialized_event = serialize_event(event, time_now, config=config)

# Check if there are any bundled aggregations to include with the event.
if bundle_aggregations:
Expand Down
4 changes: 1 addition & 3 deletions synapse/handlers/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,8 +560,6 @@ def __init__(self, hs: "HomeServer"):
expiry_ms=30 * 60 * 1000,
)

self._msc3970_enabled = hs.config.experimental.msc3970_enabled

async def create_event(
self,
requester: Requester,
Expand Down Expand Up @@ -906,7 +904,7 @@ async def get_event_from_transaction(
An event if one could be found, None otherwise.
"""

if self._msc3970_enabled and requester.device_id:
if requester.device_id:
# When MSC3970 is enabled, we lookup for events sent by the same device first,
# and fallback to the old behaviour if none were found.
existing_event_id = (
Expand Down
8 changes: 3 additions & 5 deletions synapse/handlers/room_member.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@ def __init__(self, hs: "HomeServer"):
self.request_ratelimiter = hs.get_request_ratelimiter()
hs.get_notifier().add_new_join_in_room_callback(self._on_user_joined_room)

self._msc3970_enabled = hs.config.experimental.msc3970_enabled

def _on_user_joined_room(self, event_id: str, room_id: str) -> None:
"""Notify the rate limiter that a room join has occurred.

Expand Down Expand Up @@ -424,9 +422,9 @@ async def _local_membership_update(
# do it up front for efficiency.)
if txn_id:
existing_event_id = None
if self._msc3970_enabled and requester.device_id:
# When MSC3970 is enabled, we lookup for events sent by the same device
# first, and fallback to the old behaviour if none were found.
# Look for an event sent by the same device. If none is found, fallback
# to an event sent by the same access token.
if requester.device_id:
clokep marked this conversation as resolved.
Show resolved Hide resolved
existing_event_id = (
await self.store.get_event_id_from_transaction_id_and_device_id(
room_id,
Expand Down
12 changes: 6 additions & 6 deletions synapse/rest/client/transactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ def __init__(self, hs: "HomeServer"):
# for at *LEAST* 30 mins, and at *MOST* 60 mins.
self.cleaner = self.clock.looping_call(self._cleanup, CLEANUP_PERIOD_MS)

self._msc3970_enabled = hs.config.experimental.msc3970_enabled

def _get_transaction_key(self, request: IRequest, requester: Requester) -> Hashable:
"""A helper function which returns a transaction key that can be used
with TransactionCache for idempotent requests.
Expand All @@ -78,18 +76,20 @@ def _get_transaction_key(self, request: IRequest, requester: Requester) -> Hasha
elif requester.app_service is not None:
return (path, "appservice", requester.app_service.id)

# With MSC3970, we use the user ID and device ID as the transaction key
elif self._msc3970_enabled:
# We use the user ID and device ID as the transaction key.
elif requester.device_id:
assert requester.user, "Requester must have a user"
assert requester.device_id, "Requester must have a device_id"
return (path, "user", requester.user, requester.device_id)

# Otherwise, the pre-MSC3970 behaviour is to use the access token ID
# Some requsters don't have device IDs, these are mostly handled above
# (appservice and guest users), but does not cover access tokens minted
# by the admin API. Use the access token ID instead.
else:
assert (
requester.access_token_id is not None
), "Requester must have an access_token_id"
return (path, "user", requester.access_token_id)
return (path, "user_admin", requester.access_token_id)

def fetch_or_execute_request(
self,
Expand Down
4 changes: 1 addition & 3 deletions synapse/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,9 +784,7 @@ def get_oidc_handler(self) -> "OidcHandler":

@cache_in_self
def get_event_client_serializer(self) -> EventClientSerializer:
return EventClientSerializer(
msc3970_enabled=self.config.experimental.msc3970_enabled
)
return EventClientSerializer()

@cache_in_self
def get_password_policy_handler(self) -> PasswordPolicyHandler:
Expand Down
15 changes: 6 additions & 9 deletions synapse/storage/databases/main/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,6 @@ def __init__(
self._backfill_id_gen: AbstractStreamIdGenerator = self.store._backfill_id_gen
self._stream_id_gen: AbstractStreamIdGenerator = self.store._stream_id_gen

self._msc3970_enabled = hs.config.experimental.msc3970_enabled

@trace
async def _persist_events_and_state_updates(
self,
Expand Down Expand Up @@ -1012,9 +1010,10 @@ def _persist_transaction_ids_txn(
)
)

# Pre-MSC3970, we rely on the access_token_id to scope the txn_id for events.
# Since this is an experimental flag, we still store the mapping even if the
# flag is disabled.
# Previously Synapsed used the access_token_id to scope the txn_id for events.
clokep marked this conversation as resolved.
Show resolved Hide resolved
#
# TODO Remove this once Synapse cannot be rolled back to a version without
# device IDs being stored.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a target version of SCHEMA_COMPAT_VERSION you need? If you can find out, might be worth writing it here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Errr, so right now we write to both event_txn_id and event_txn_id_device_id. I think we want to leave this as-is so we can rollback if needed.

I guess in the future we would want to:

  • Wait
  • Stop writing (and reading to) to event_txn_id and bump the SCHEMA_VERSION
  • Wait
  • Bump the SCHEMA_COMPAT_VERSION, rip out the code

Does that sound like the right series of steps?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We talked about this a bit: https://matrix.to/#/!vcyiEtMVHIhWXcJAfl:sw1v.org/$WQwonqO7etYpa0ik4aVzVsf32R_VOqIyPzBjTl3pjps?via=matrix.org&via=element.io&via=beeper.com

My tl;dr is that the above steps look reasonable, although you usually want to stop reading before writing. Unfortunately we do still use this for certain situations, e.g. appservice users don't have device IDs. So we would need to figure out what to do there?

My inclination is to merge this as-is (so as to not block Matrix 1.7 support) and file a follow-up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately we do still use this for certain situations, e.g. appservice users don't have device IDs. So we would need to figure out what to do there?

Probably good to add a comment to the diff on the caveat for why the access token fallback needs to stay until that is figured out

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was planning to point to an issue. Was waiting to see if others had thoughts before clicking merge.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Taking a step back and re-evaluating my other threads with @sandhose (and re-reading the comments in this PR) I don't actually think we can remove this since it gets used for guests, appservice users, and access tokens from the admin API. 😢

I've made two changes:

  • Consolidate some logic so we have fewer places we attempt to line up an event from a transaction ID.
  • Added a lot more comments.

Could folks (in particular @sandhose) let me know if these still read OK? I feel like I'm missing something, but I've tried to include my current understanding in the comments.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing is, guests and appservice users don't have access token IDs anyway, so it basically won't do anything if we keep it. The only thing left is admin-minted, device-less tokens.

Keep in mind, there are two layers of things that ensure idempotency:

  • the in-memory HTTP transaction cache, which properly covers guests, appservices and device-less tokens
  • the db-persisted (for 24h) transaction_id -> event_id mapping, which only worked for access tokens with IDs in it, so no guests and no appservices

Also, this is only for idempotency, the transaction ID echo in event serialisation is handled by the event internal metadata JSON blob, which is separate from those tables.

If I'm not mistaken, I think this would only affect

  • device-less real users
  • when they replay a request, either load-balanced to another worker, or after a Synapse restart

I don't mind keeping this, but also I don't think it's worth keeping it around.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this is only for idempotency, the transaction ID echo in event serialisation is handled by the event internal metadata JSON blob, which is separate from those tables.

And I suppose the same rules apply to the event internal metadata info -- guests and appservices users don't have access token IDs so we can't be storing it there anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking through this a bit more I think we need two separate approaches:

Database transaction mapping

This describes what to do with "the db-persisted (for 24h) transaction_id -> event_id mapping".

This only survives for 24 hours, but that's not really relevant to the conversation of rolling back (or forwards). As background, there's two tables (event_txn_id and event_txn_id_device_id) for access token IDs and device IDs, respectively. To gracefully handle up/downgrades I propose:

  1. (This PR = N) Always write to both and read data from either (first event_txn_id_device_id, fallback to event_txn_id). Bump the schema version to V.
  2. In a future PR (N+2 versions), stop reading and writing to event_txn_id, bump the schema version to V+1.

At this point we have a database schema still compatible with today, but we're not using data event_txn_id, so rolling back to e.g. N is fine, but not before N. To enforce this, bump the minimum schema version to V.

  1. In a future PR (N+4 versions), add a schema delta which drops the event_txn_id table.

At this point we no longer have a database schema compatible with N, but we can still rollback OK to N+2. So bump the minimum schema version to V+1.

Internal metadata

The device ID & access token ID are unconditionally written into the internal metadata of events. It then follows similar logic where the transaction ID is returned to a user when serializing the event if:

  1. The device ID exists in the internal metadata and matches that of the sender. 1
  2. The requester & sender user IDs match and:
    • The token ID exists in the internal metadata and matches that of the sender.
    • The requester is a guest.
    • The requester is an appservice.

A similar thing should be followed:

  1. (This PR = N) Start using the device ID in-lieu of the token ID, if it is set. (Note that both are currently saved in the internal metadata.)
  2. In a future PR (N+2), stop writing the token ID into the internal metadata.

This implies that we would leave the logic for the fallback "forever", but I think we're already at am minimum schema version where we've always written both values, so we only need to be concerned with old data.

An option here would be to do a data migration for any event who has a token ID which still exists in the database, add the device ID. Then remove the fallback code. This seems a bit onerous though.

Footnotes

  1. Device IDs matching should also be gated by the user ID, I think. There's more risk of those overlapping that the token IDs since they're client controllable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#16042 was filed to handle follow-up.

if to_insert_token_id:
self.db_pool.simple_insert_many_txn(
txn,
Expand All @@ -1030,10 +1029,8 @@ def _persist_transaction_ids_txn(
values=to_insert_token_id,
)

# With MSC3970, we rely on the device_id instead to scope the txn_id for events.
# We're only inserting if MSC3970 is *enabled*, because else the pre-MSC3970
# behaviour would allow for a UNIQUE constraint violation on this table
if to_insert_device_id and self._msc3970_enabled:
clokep marked this conversation as resolved.
Show resolved Hide resolved
# Synapse now relies on the device_id to scope the txn_id for events.
if to_insert_device_id:
self.db_pool.simple_insert_many_txn(
txn,
table="event_txn_id_device_id",
Expand Down
3 changes: 2 additions & 1 deletion synapse/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ class Requester:
request, or None if it came via the appservice API or similar
is_guest: True if the user making this request is a guest user
shadow_banned: True if the user making this request has been shadow-banned.
device_id: device_id which was set at authentication time
device_id: device_id which was set at authentication time, this will be
None for appservices, guests, and tokens generated by the admin API
app_service: the AS requesting on behalf of the user
authenticated_entity: The entity that authenticated when making the request.
This is different to the user_id when an admin user or the server is
Expand Down