diff --git a/Sources/LiveKit/Agent/Agent.swift b/Sources/LiveKit/Agent/Agent.swift new file mode 100644 index 000000000..2e2c1c081 --- /dev/null +++ b/Sources/LiveKit/Agent/Agent.swift @@ -0,0 +1,57 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Combine +import Foundation + +@MainActor +open class Agent: ObservableObject { + public typealias Identity = Participant.Identity + + @Published public private(set) var state: AgentState = .idle + + @Published public private(set) var audioTrack: (any AudioTrack)? + @Published public private(set) var avatarVideoTrack: (any VideoTrack)? + + public let participant: Participant + + public init(participant: Participant) { + self.participant = participant + observe(participant) + } + + private func observe(_ participant: Participant) { + Task { [weak self] in + for try await _ in participant.changes { + guard let self else { return } + + state = participant.agentState + updateTracks(of: participant) + } + } + } + + private func updateTracks(of participant: Participant) { + audioTrack = participant.audioTracks.first(where: { $0.source == .microphone })?.track as? AudioTrack + avatarVideoTrack = participant.avatarWorker?.firstCameraVideoTrack + } +} + +extension AgentState: CustomStringConvertible { + public var description: String { + rawValue.capitalized + } +} diff --git a/Sources/LiveKit/Agent/Chat/Message.swift b/Sources/LiveKit/Agent/Chat/Message.swift new file mode 100644 index 000000000..529728c1c --- /dev/null +++ b/Sources/LiveKit/Agent/Chat/Message.swift @@ -0,0 +1,41 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +/// A message received from the agent. +public struct ReceivedMessage: Identifiable, Equatable, Codable, Sendable { + public let id: String + public let timestamp: Date + public let content: Content + + public enum Content: Equatable, Codable, Sendable { + case agentTranscript(String) + case userTranscript(String) + case userInput(String) + } +} + +/// A message sent to the agent. +public struct SentMessage: Identifiable, Equatable, Codable, Sendable { + public let id: String + public let timestamp: Date + public let content: Content + + public enum Content: Equatable, Codable, Sendable { + case userInput(String) + } +} diff --git a/Sources/LiveKit/Agent/Chat/Receive/MessageReceiver.swift b/Sources/LiveKit/Agent/Chat/Receive/MessageReceiver.swift new file mode 100644 index 000000000..2344be30e --- /dev/null +++ b/Sources/LiveKit/Agent/Chat/Receive/MessageReceiver.swift @@ -0,0 +1,27 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +/// A protocol that defines a message receiver. +/// +/// A message receiver is responsible for creating a stream of messages from the agent. +/// It is used to receive messages from the agent and update the message feed. +/// +/// - SeeAlso: ``ReceivedMessage`` +public protocol MessageReceiver: Sendable { + func messages() async throws -> AsyncStream +} diff --git a/Sources/LiveKit/Agent/Chat/Receive/TranscriptionDelegateReceiver.swift b/Sources/LiveKit/Agent/Chat/Receive/TranscriptionDelegateReceiver.swift new file mode 100644 index 000000000..43c8bfe1a --- /dev/null +++ b/Sources/LiveKit/Agent/Chat/Receive/TranscriptionDelegateReceiver.swift @@ -0,0 +1,68 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +/// An actor that receives transcription messages from the room and yields them as messages. +/// +/// Room delegate methods are called multiple times for each message, with a stable message ID +/// that can be direcly used for diffing. +/// +/// Example: +/// ``` +/// { id: "1", content: "Hello" } +/// { id: "1", content: "Hello world!" } +/// ``` +@available(*, deprecated, message: "Use TranscriptionStreamReceiver compatible with livekit-agents 1.0") +actor TranscriptionDelegateReceiver: MessageReceiver, RoomDelegate { + private let room: Room + private var continuation: AsyncStream.Continuation? + + init(room: Room) { + self.room = room + room.add(delegate: self) + } + + deinit { + room.remove(delegate: self) + } + + /// Creates a new message stream for the transcription delegate receiver. + func messages() -> AsyncStream { + let (stream, continuation) = AsyncStream.makeStream(of: ReceivedMessage.self) + self.continuation = continuation + return stream + } + + nonisolated func room(_: Room, participant: Participant, trackPublication _: TrackPublication, didReceiveTranscriptionSegments segments: [TranscriptionSegment]) { + segments + .filter { !$0.text.isEmpty } + .forEach { segment in + let message = ReceivedMessage( + id: segment.id, + timestamp: segment.lastReceivedTime, + content: participant.isAgent ? .agentTranscript(segment.text) : .userTranscript(segment.text) + ) + Task { + await yield(message) + } + } + } + + private func yield(_ message: ReceivedMessage) { + continuation?.yield(message) + } +} diff --git a/Sources/LiveKit/Agent/Chat/Receive/TranscriptionStreamReceiver.swift b/Sources/LiveKit/Agent/Chat/Receive/TranscriptionStreamReceiver.swift new file mode 100644 index 000000000..bec7674cc --- /dev/null +++ b/Sources/LiveKit/Agent/Chat/Receive/TranscriptionStreamReceiver.swift @@ -0,0 +1,174 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +/// An actor that converts raw text streams from the LiveKit `Room` into `Message` objects. +/// - Note: Streams are supported by `livekit-agents` >= 1.0.0. +/// - SeeAlso: ``TranscriptionDelegateReceiver`` +/// +/// For agent messages, new text stream is emitted for each message, and the stream is closed when the message is finalized. +/// Each agent message is delivered in chunks, that are accumulated and published into the message stream. +/// +/// For user messages, the full transcription is sent each time, but may be updated until finalized. +/// +/// The ID of the segment is stable and unique across the lifetime of the message. +/// This ID can be used directly for `Identifiable` conformance. +/// +/// Example text stream for agent messages: +/// ``` +/// { segment_id: "1", content: "Hello" } +/// { segment_id: "1", content: " world" } +/// { segment_id: "1", content: "!" } +/// { segment_id: "2", content: "Hello" } +/// { segment_id: "2", content: " Apple" } +/// { segment_id: "2", content: "!" } +/// ``` +/// +/// Example text stream for user messages: +/// ``` +/// { segment_id: "3", content: "Hello" } +/// { segment_id: "3", content: "Hello world!" } +/// { segment_id: "4", content: "Hello" } +/// { segment_id: "4", content: "Hello Apple!" } +/// ``` +/// +/// Example output: +/// ``` +/// Message(id: "1", timestamp: 2025-01-01 12:00:00 +0000, content: .agentTranscript("Hello world!")) +/// Message(id: "2", timestamp: 2025-01-01 12:00:10 +0000, content: .agentTranscript("Hello Apple!")) +/// Message(id: "3", timestamp: 2025-01-01 12:00:20 +0000, content: .userTranscript("Hello world!")) +/// Message(id: "4", timestamp: 2025-01-01 12:00:30 +0000, content: .userTranscript("Hello Apple!")) +/// ``` +/// +actor TranscriptionStreamReceiver: MessageReceiver, Loggable { + private struct PartialMessageID: Hashable { + let segmentID: String + let participantID: Participant.Identity + } + + private struct PartialMessage { + var content: String + let timestamp: Date + var streamID: String + + mutating func appendContent(_ newContent: String) { + content += newContent + } + + mutating func replaceContent(_ newContent: String, streamID: String) { + content = newContent + self.streamID = streamID + } + } + + private let room: Room + private let topic: String + + private lazy var partialMessages: [PartialMessageID: PartialMessage] = [:] + + init(room: Room, topic: String = "lk.transcription") { + self.room = room + self.topic = topic + } + + /// Creates a new message stream for the chat topic. + func messages() async throws -> AsyncStream { + let (stream, continuation) = AsyncStream.makeStream(of: ReceivedMessage.self) + + let topic = topic + + try await room.registerTextStreamHandler(for: topic) { [weak self] reader, participantIdentity in + for try await message in reader where !message.isEmpty { + guard let self else { return } + await continuation.yield(processIncoming(partialMessage: message, reader: reader, participantIdentity: participantIdentity)) + } + } + + continuation.onTermination = { _ in + Task { [weak self] in + await self?.room.unregisterTextStreamHandler(for: topic) + } + } + + return stream + } + + /// Aggregates the incoming text into a message, storing the partial content in the `partialMessages` dictionary. + /// - Note: When the message is finalized, or a new message is started, the dictionary is purged to limit memory usage. + private func processIncoming(partialMessage message: String, reader: TextStreamReader, participantIdentity: Participant.Identity) -> ReceivedMessage { + let attributes = reader.info.attributes.mapped(to: TranscriptionAttributes.self) + if attributes == nil { + log("Unable to read message attributes from \(reader.info.attributes)", .error) + } + + let segmentID = attributes?.lkSegmentID ?? reader.info.id + let participantID = participantIdentity + let partialID = PartialMessageID(segmentID: segmentID, participantID: participantID) + + let currentStreamID = reader.info.id + + let timestamp: Date + let updatedContent: String + + if var existingMessage = partialMessages[partialID] { + // Update existing message + if existingMessage.streamID == currentStreamID { + // Same stream, append content + existingMessage.appendContent(message) + } else { + // Different stream for same segment, replace content + existingMessage.replaceContent(message, streamID: currentStreamID) + } + updatedContent = existingMessage.content + timestamp = existingMessage.timestamp + partialMessages[partialID] = existingMessage + } else { + // This is a new message + updatedContent = message + timestamp = reader.info.timestamp + partialMessages[partialID] = PartialMessage( + content: updatedContent, + timestamp: timestamp, + streamID: currentStreamID + ) + cleanupPreviousTurn(participantIdentity, exceptSegmentID: segmentID) + } + + let isFinal = attributes?.lkTranscriptionFinal ?? false + if isFinal { + partialMessages[partialID] = nil + } + + let newOrUpdatedMessage = ReceivedMessage( + id: segmentID, + timestamp: timestamp, + content: participantIdentity == room.localParticipant.identity ? .userTranscript(updatedContent) : .agentTranscript(updatedContent) + ) + + return newOrUpdatedMessage + } + + private func cleanupPreviousTurn(_ participantID: Participant.Identity, exceptSegmentID: String) { + let keysToRemove = partialMessages.keys.filter { + $0.participantID == participantID && $0.segmentID != exceptSegmentID + } + + for key in keysToRemove { + partialMessages[key] = nil + } + } +} diff --git a/Sources/LiveKit/Agent/AgentState+.swift b/Sources/LiveKit/Agent/Chat/Send/MessageSender.swift similarity index 63% rename from Sources/LiveKit/Agent/AgentState+.swift rename to Sources/LiveKit/Agent/Chat/Send/MessageSender.swift index 9bb45b096..fe78232c0 100644 --- a/Sources/LiveKit/Agent/AgentState+.swift +++ b/Sources/LiveKit/Agent/Chat/Send/MessageSender.swift @@ -14,8 +14,14 @@ * limitations under the License. */ -extension AgentState: CustomStringConvertible { - public var description: String { - rawValue.capitalized - } +import Foundation + +/// A protocol that defines a message sender. +/// +/// A message sender is responsible for sending messages to the agent. +/// It is used to send messages to the agent and update the message feed. +/// +/// - SeeAlso: ``SentMessage`` +public protocol MessageSender: Sendable { + func send(_ message: SentMessage) async throws } diff --git a/Sources/LiveKit/Agent/Chat/Send/TextMessageSender.swift b/Sources/LiveKit/Agent/Chat/Send/TextMessageSender.swift new file mode 100644 index 000000000..3fcfc87e0 --- /dev/null +++ b/Sources/LiveKit/Agent/Chat/Send/TextMessageSender.swift @@ -0,0 +1,55 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +/// An actor that sends local messages to the agent. +/// Currently, it only supports sending text messages. +/// +/// It also serves as the loopback for the local messages, +/// so that they can be displayed in the message feed +/// without relying on the agent-side transcription. +actor TextMessageSender: MessageSender, MessageReceiver { + private let room: Room + private let topic: String + + private var messageContinuation: AsyncStream.Continuation? + + init(room: Room, topic: String = "lk.chat") { + self.room = room + self.topic = topic + } + + func send(_ message: SentMessage) async throws { + guard case let .userInput(text) = message.content else { return } + + try await room.localParticipant.sendText(text, for: topic) + + let loopbackMessage = ReceivedMessage( + id: message.id, + timestamp: message.timestamp, + content: .userInput(text) + ) + + messageContinuation?.yield(loopbackMessage) + } + + func messages() async throws -> AsyncStream { + let (stream, continuation) = AsyncStream.makeStream() + messageContinuation = continuation + return stream + } +} diff --git a/Sources/LiveKit/Agent/LocalMedia.swift b/Sources/LiveKit/Agent/LocalMedia.swift new file mode 100644 index 000000000..fe2c09ed3 --- /dev/null +++ b/Sources/LiveKit/Agent/LocalMedia.swift @@ -0,0 +1,181 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@preconcurrency import AVFoundation +import Combine +import Foundation + +@MainActor +open class LocalMedia: ObservableObject { + // MARK: - Error + + public enum Error: LocalizedError { + case mediaDevice(Swift.Error) + + public var errorDescription: String? { + switch self { + case let .mediaDevice(error): + "Media device error: \(error.localizedDescription)" + } + } + } + + // MARK: - Devices + + @Published public private(set) var error: Error? + + @Published public private(set) var microphoneTrack: (any AudioTrack)? + @Published public private(set) var cameraTrack: (any VideoTrack)? + @Published public private(set) var screenShareTrack: (any VideoTrack)? + + @Published public private(set) var isMicrophoneEnabled: Bool = false + @Published public private(set) var isCameraEnabled: Bool = false + @Published public private(set) var isScreenShareEnabled: Bool = false + + @Published public private(set) var audioDevices: [AudioDevice] = AudioManager.shared.inputDevices + @Published public private(set) var selectedAudioDeviceID: String = AudioManager.shared.inputDevice.deviceId + + @Published public private(set) var videoDevices: [AVCaptureDevice] = [] + @Published public private(set) var selectedVideoDeviceID: String? + + @Published public private(set) var canSwitchCamera = false + + // MARK: - Dependencies + + private var localParticipant: LocalParticipant + + // MARK: - Initialization + + public init(localParticipant: LocalParticipant) { + self.localParticipant = localParticipant + + observe(localParticipant) + observeDevices() + } + + public convenience init(room: Room) { + self.init(localParticipant: room.localParticipant) + } + + public convenience init(session: Session) { + self.init(room: session.room) + } + + private func observe(_ localParticipant: LocalParticipant) { + Task { [weak self] in + for try await _ in localParticipant.changes { + guard let self else { return } + + microphoneTrack = localParticipant.firstAudioTrack + cameraTrack = localParticipant.firstCameraVideoTrack + screenShareTrack = localParticipant.firstScreenShareVideoTrack + + isMicrophoneEnabled = localParticipant.isMicrophoneEnabled() + isCameraEnabled = localParticipant.isCameraEnabled() + isScreenShareEnabled = localParticipant.isScreenShareEnabled() + } + } + } + + private func observeDevices() { + try? AudioManager.shared.set(microphoneMuteMode: .inputMixer) // don't play mute sound effect + Task { + try await AudioManager.shared.setRecordingAlwaysPreparedMode(true) + } + + AudioManager.shared.onDeviceUpdate = { _ in + Task { @MainActor [weak self] in + self?.audioDevices = AudioManager.shared.inputDevices + self?.selectedAudioDeviceID = AudioManager.shared.defaultInputDevice.deviceId + } + } + + Task { + canSwitchCamera = try await CameraCapturer.canSwitchPosition() + videoDevices = try await CameraCapturer.captureDevices() + selectedVideoDeviceID = videoDevices.first?.uniqueID + } + } + + deinit { + AudioManager.shared.onDeviceUpdate = nil + } + + // MARK: - Toggle + + public func toggleMicrophone() async { + do { + try await localParticipant.setMicrophone(enabled: !isMicrophoneEnabled) + } catch { + self.error = .mediaDevice(error) + } + } + + public func toggleCamera(disableScreenShare: Bool = false) async { + let enable = !isCameraEnabled + do { + if enable, disableScreenShare, isScreenShareEnabled { + try await localParticipant.setScreenShare(enabled: false) + } + + let device = try await CameraCapturer.captureDevices().first(where: { $0.uniqueID == selectedVideoDeviceID }) + try await localParticipant.setCamera(enabled: enable, captureOptions: CameraCaptureOptions(device: device)) + } catch { + self.error = .mediaDevice(error) + } + } + + public func toggleScreenShare(disableCamera: Bool = false) async { + let enable = !isScreenShareEnabled + do { + if enable, disableCamera, isCameraEnabled { + try await localParticipant.setCamera(enabled: false) + } + try await localParticipant.setScreenShare(enabled: enable) + } catch { + self.error = .mediaDevice(error) + } + } + + // MARK: - Select + + public func select(audioDevice: AudioDevice) { + selectedAudioDeviceID = audioDevice.deviceId + + let device = AudioManager.shared.inputDevices.first(where: { $0.deviceId == selectedAudioDeviceID }) ?? AudioManager.shared.defaultInputDevice + AudioManager.shared.inputDevice = device + } + + public func select(videoDevice: AVCaptureDevice) async { + selectedVideoDeviceID = videoDevice.uniqueID + + guard let cameraCapturer = getCameraCapturer() else { return } + let captureOptions = CameraCaptureOptions(device: videoDevice) + _ = try? await cameraCapturer.set(options: captureOptions) + } + + public func switchCamera() async { + guard let cameraCapturer = getCameraCapturer() else { return } + _ = try? await cameraCapturer.switchCameraPosition() + } + + // MARK: - Private + + private func getCameraCapturer() -> CameraCapturer? { + guard let cameraTrack = localParticipant.firstCameraVideoTrack as? LocalVideoTrack else { return nil } + return cameraTrack.capturer as? CameraCapturer + } +} diff --git a/Sources/LiveKit/Agent/Session.swift b/Sources/LiveKit/Agent/Session.swift new file mode 100644 index 000000000..fdfc6f00d --- /dev/null +++ b/Sources/LiveKit/Agent/Session.swift @@ -0,0 +1,281 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Combine +import Foundation +import OrderedCollections + +@MainActor +open class Session: ObservableObject { + private static let agentNameAttribute = "lk.agent_name" + + // MARK: - Error + + public enum Error: LocalizedError { + case agentNotConnected + case failedToConnect(Swift.Error) + case failedToSend(Swift.Error) + + public var errorDescription: String? { + switch self { + case .agentNotConnected: + "Agent not connected" + case let .failedToConnect(error): + "Failed to connect: \(error.localizedDescription)" + case let .failedToSend(error): + "Failed to send: \(error.localizedDescription)" + } + } + } + + // MARK: - State + + @Published public private(set) var error: Error? + + @Published public private(set) var connectionState: ConnectionState = .disconnected + @Published public private(set) var bufferingSpeechLocally = false + public var isReady: Bool { + switch connectionState { + case .disconnected where bufferingSpeechLocally, + .connecting where bufferingSpeechLocally, + .connected, + .reconnecting: + true + default: + false + } + } + + @Published private var agents: [Agent.Identity: Agent] = [:] + public var agent: Agent? { agents.values.first } + public var hasAgent: Bool { !agents.isEmpty } + + @Published private var messagesDict: OrderedDictionary = [:] + public var messages: [ReceivedMessage] { messagesDict.values.elements } + + // MARK: - Dependencies + + public let room: Room + + private enum TokenSourceConfiguration { + case fixed(any TokenSourceFixed) + case configurable(any TokenSourceConfigurable, TokenRequestOptions) + } + + private let tokenSourceConfiguration: TokenSourceConfiguration + private var options: SessionOptions + + private let senders: [any MessageSender] + private let receivers: [any MessageReceiver] + + // MARK: - Internal state + + private var waitForAgentTask: Task? + + // MARK: - Init + + private init(tokenSourceConfiguration: TokenSourceConfiguration, + options: SessionOptions, + senders: [any MessageSender]?, + receivers: [any MessageReceiver]?) + { + self.tokenSourceConfiguration = tokenSourceConfiguration + self.options = options + room = options.room + + let textMessageSender = TextMessageSender(room: room) + let resolvedSenders = senders ?? [textMessageSender] + let resolvedReceivers = receivers ?? [textMessageSender, TranscriptionStreamReceiver(room: room)] + + self.senders = resolvedSenders + self.receivers = resolvedReceivers + + observe(room: room) + observe(receivers: resolvedReceivers) + } + + public convenience init(tokenSource: any TokenSourceFixed, + options: SessionOptions = .init(), + senders: [any MessageSender]? = nil, + receivers: [any MessageReceiver]? = nil) + { + self.init(tokenSourceConfiguration: .fixed(tokenSource), + options: options, + senders: senders, + receivers: receivers) + } + + public convenience init(tokenSource: any TokenSourceConfigurable, + tokenOptions: TokenRequestOptions = .init(), + options: SessionOptions = .init(), + senders: [any MessageSender]? = nil, + receivers: [any MessageReceiver]? = nil) + { + self.init(tokenSourceConfiguration: .configurable(tokenSource, tokenOptions), + options: options, + senders: senders, + receivers: receivers) + } + + public static func withAgent(_ agentName: String, + agentMetadata: String? = nil, + tokenSource: any TokenSourceConfigurable, + options: SessionOptions = .init(), + senders: [any MessageSender]? = nil, + receivers: [any MessageReceiver]? = nil) -> Session + { + Session(tokenSource: tokenSource, + tokenOptions: .init(agentName: agentName, agentMetadata: agentMetadata), + options: options, + senders: senders, + receivers: receivers) + } + + deinit { + waitForAgentTask?.cancel() + } + + private func observe(room: Room) { + Task { [weak self] in + for try await _ in room.changes { + guard let self else { return } + + connectionState = room.connectionState + updateAgents(in: room) + } + } + } + + private func updateAgents(in room: Room) { + let agentParticipants = room.agentParticipants + + var newAgents: [Participant.Identity: Agent] = [:] + + for (identity, participant) in agentParticipants { + if let existingAgent = agents[identity] { + newAgents[identity] = existingAgent + } else { + let newAgent = Agent(participant: participant) + newAgents[identity] = newAgent + } + } + + agents = newAgents + } + + private func observe(receivers: [any MessageReceiver]) { + for receiver in receivers { + Task { [weak self] in + for await message in try await receiver.messages() { + guard let self else { return } + messagesDict.updateValue(message, forKey: message.id) + } + } + } + } + + // MARK: - Agents + + private func agent(named agentName: String) -> Agent? { + agents.values.first { $0.participant.attributes[Self.agentNameAttribute] == agentName } + } + + private subscript(agentName: String) -> Agent? { + agent(named: agentName) + } + + // MARK: - Lifecycle + + public func start() async { + guard connectionState == .disconnected else { return } + + error = nil + waitForAgentTask?.cancel() + + let timeout = options.agentConnectTimeout + + defer { + waitForAgentTask = Task { [weak self] in + try await Task.sleep(nanoseconds: UInt64(timeout * Double(NSEC_PER_SEC))) + try Task.checkCancellation() + guard let self else { return } + if connectionState == .connected, agents.isEmpty { + self.error = .agentNotConnected + } + } + } + + do { + let response = try await fetchToken() + + if options.preConnectAudio { + try await room.withPreConnectAudio(timeout: timeout) { + await MainActor.run { self.bufferingSpeechLocally = true } + try await self.room.connect(url: response.serverURL.absoluteString, + token: response.participantToken) + await MainActor.run { self.bufferingSpeechLocally = false } + } + } else { + try await room.connect(url: response.serverURL.absoluteString, + token: response.participantToken) + } + } catch { + self.error = .failedToConnect(error) + } + } + + public func end() async { + await room.disconnect() + } + + public func resetError() { + error = nil + } + + // MARK: - Messages + + @discardableResult + public func send(text: String) async -> SentMessage { + let message = SentMessage(id: UUID().uuidString, timestamp: Date(), content: .userInput(text)) + do { + for sender in senders { + try await sender.send(message) + } + } catch { + self.error = .failedToSend(error) + } + return message + } + + public func getMessageHistory() -> [ReceivedMessage] { + messages + } + + public func restoreMessageHistory(_ messages: [ReceivedMessage]) { + messagesDict = .init(uniqueKeysWithValues: messages.sorted(by: { $0.timestamp < $1.timestamp }).map { ($0.id, $0) }) + } + + // MARK: - Helpers + + private func fetchToken() async throws -> TokenSourceResponse { + switch tokenSourceConfiguration { + case let .fixed(source): + try await source.fetch() + case let .configurable(source, options): + try await source.fetch(options) + } + } +} diff --git a/Sources/LiveKit/Agent/SessionOptions.swift b/Sources/LiveKit/Agent/SessionOptions.swift new file mode 100644 index 000000000..1c86aa61c --- /dev/null +++ b/Sources/LiveKit/Agent/SessionOptions.swift @@ -0,0 +1,33 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Foundation + +public struct SessionOptions: Sendable { + public var room: Room + public var preConnectAudio: Bool + public var agentConnectTimeout: TimeInterval + + public init( + room: Room = .init(), + preConnectAudio: Bool = true, + agentConnectTimeout: TimeInterval = 20 + ) { + self.room = room + self.preConnectAudio = preConnectAudio + self.agentConnectTimeout = agentConnectTimeout + } +} diff --git a/Sources/LiveKit/Support/ObservableObject+.swift b/Sources/LiveKit/Support/ObservableObject+.swift new file mode 100644 index 000000000..3da521406 --- /dev/null +++ b/Sources/LiveKit/Support/ObservableObject+.swift @@ -0,0 +1,31 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@preconcurrency import Combine + +extension ObservableObject { + /// An async sequence that emits the `objectWillChange` events. + var changes: AsyncStream { + AsyncStream { continuation in + let cancellable = objectWillChange.sink { _ in + continuation.yield() + } + continuation.onTermination = { _ in + cancellable.cancel() + } + } + } +} diff --git a/Sources/LiveKit/Track/VideoTrack.swift b/Sources/LiveKit/Track/VideoTrack.swift index 65ff33f33..6598567cf 100644 --- a/Sources/LiveKit/Track/VideoTrack.swift +++ b/Sources/LiveKit/Track/VideoTrack.swift @@ -71,3 +71,11 @@ extension VideoTrackProtocol where Self: Track { return missingCodecs } } + +public extension Track { + /// The aspect ratio of the video track or 1 if the dimensions are not available. + var aspectRatio: CGFloat { + guard let dimensions else { return 1 } + return CGFloat(dimensions.width) / CGFloat(dimensions.height) + } +} diff --git a/Sources/LiveKit/Types/Attributes/AttributeTypings.swift b/Sources/LiveKit/Types/Attributes/AttributeTypings.swift index 35dbc8f0b..9fc56e609 100644 --- a/Sources/LiveKit/Types/Attributes/AttributeTypings.swift +++ b/Sources/LiveKit/Types/Attributes/AttributeTypings.swift @@ -20,6 +20,35 @@ import Foundation extension AgentAttributes: Hashable {} extension AgentAttributes: Equatable {} +// Bool as String encoding +extension TranscriptionAttributes { + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + lkSegmentID = try container.decodeIfPresent(String.self, forKey: .lkSegmentID) + lkTranscribedTrackID = try container.decodeIfPresent(String.self, forKey: .lkTranscribedTrackID) + + // Decode as Bool first, fallback to String + if let boolValue = try? container.decodeIfPresent(Bool.self, forKey: .lkTranscriptionFinal) { + lkTranscriptionFinal = boolValue + } else if let stringValue = try? container.decodeIfPresent(String.self, forKey: .lkTranscriptionFinal) { + lkTranscriptionFinal = (stringValue as NSString).boolValue + } else { + lkTranscriptionFinal = nil + } + } + + func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encodeIfPresent(lkSegmentID, forKey: .lkSegmentID) + try container.encodeIfPresent(lkTranscribedTrackID, forKey: .lkTranscribedTrackID) + + // Always encode Bool as a string if it exists + if let boolValue = lkTranscriptionFinal { + try container.encode(boolValue ? "true" : "false", forKey: .lkTranscriptionFinal) + } + } +} + // MARK: - AgentAttributes struct AgentAttributes: Codable, Sendable { diff --git a/Tests/LiveKitCoreTests/Agent/TranscriptionTests.swift b/Tests/LiveKitCoreTests/Agent/TranscriptionTests.swift new file mode 100644 index 000000000..33713d152 --- /dev/null +++ b/Tests/LiveKitCoreTests/Agent/TranscriptionTests.swift @@ -0,0 +1,187 @@ +/* + * Copyright 2025 LiveKit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +@testable import LiveKit +import OrderedCollections +#if canImport(LiveKitTestSupport) +import LiveKitTestSupport +#endif + +actor MessageCollector { + private var updates: [ReceivedMessage] = [] + private var messages: OrderedDictionary = [:] + + func add(_ message: ReceivedMessage) { + updates.append(message) + messages[message.id] = message + } + + func getUpdates() -> [ReceivedMessage] { + updates + } + + func getMessages() -> OrderedDictionary { + messages + } +} + +class TranscriptionTests: LKTestCase, @unchecked Sendable { + private var rooms: [Room] = [] + private var receiver: TranscriptionStreamReceiver! + private var senderRoom: Room! + private var messageCollector: MessageCollector! + private var collectionTask: Task! + private var messageExpectation: XCTestExpectation! + + // Same segment, same stream + func testUpdates() async throws { + let segmentID = "test-segment" + let streamID = UUID().uuidString + let testChunks = ["Hey", " there!", " What's up?"] + let expectedContent = ["Hey", "Hey there!", "Hey there! What's up?"] + + try await runTranscriptionTest( + chunks: testChunks, + segmentID: segmentID, + streamID: streamID, + expectedContent: expectedContent + ) + } + + // Same segment, different stream + func testReplace() async throws { + let segmentID = "test-segment" + let testChunks = ["Hey", "Hey there!", "Hey there! What's up?"] + let expectedContent = ["Hey", "Hey there!", "Hey there! What's up?"] + + try await runTranscriptionTest( + chunks: testChunks, + segmentID: segmentID, + streamID: nil, + expectedContent: expectedContent + ) + } + + private func setupTestEnvironment(expectedCount: Int) async throws { + messageExpectation = expectation(description: "Receives all message updates") + messageExpectation.expectedFulfillmentCount = expectedCount + + receiver = TranscriptionStreamReceiver(room: rooms[0]) + let messageStream = try await receiver.messages() + messageCollector = MessageCollector() + senderRoom = rooms[1] + + collectionTask = Task { @Sendable in + var iterator = messageStream.makeAsyncIterator() + while let message = await iterator.next() { + await self.messageCollector.add(message) + self.messageExpectation.fulfill() + } + } + } + + private func sendTranscriptionChunks( + chunks: [String], + segmentID: String, + streamID: String? = nil, + to room: Room + ) async throws { + let topic = "lk.transcription" + + for (index, chunk) in chunks.enumerated() { + let isLast = index == chunks.count - 1 + + var attributes: [String: String] = [ + "lk.segment_id": segmentID, + "lk.transcription_final": "false", + ] + + if isLast { + attributes["lk.transcription_final"] = "true" + } + + let options = StreamTextOptions( + topic: topic, + attributes: attributes, + id: streamID ?? UUID().uuidString + ) + + try await room.localParticipant.sendText(chunk, options: options) + try await Task.sleep(nanoseconds: 10_000_000) + } + } + + private func validateTranscriptionResults( + updates: [ReceivedMessage], + messages: OrderedDictionary, + segmentID: String, + expectedContent: [String] + ) { + // Validate updates + XCTAssertEqual(updates.count, expectedContent.count) + for (index, expected) in expectedContent.enumerated() { + XCTAssertEqual(updates[index].content, .agentTranscript(expected)) + XCTAssertEqual(updates[index].id, segmentID) + } + + // Validate timestamps are consistent + let firstTimestamp = updates[0].timestamp + for update in updates { + XCTAssertEqual(update.timestamp, firstTimestamp) + } + + // Validate final message + XCTAssertEqual(messages.count, 1) + XCTAssertEqual(messages.keys[0], segmentID) + XCTAssertEqual(messages.values[0].content, .agentTranscript(expectedContent.last!)) + XCTAssertEqual(messages.values[0].id, segmentID) + XCTAssertEqual(messages.values[0].timestamp, firstTimestamp) + } + + private func runTranscriptionTest( + chunks: [String], + segmentID: String, + streamID: String? = nil, + expectedContent: [String] + ) async throws { + try await withRooms([ + RoomTestingOptions(canSubscribe: true), + RoomTestingOptions(canPublishData: true), + ]) { rooms in + self.rooms = rooms + try await self.setupTestEnvironment(expectedCount: expectedContent.count) + try await self.sendTranscriptionChunks( + chunks: chunks, + segmentID: segmentID, + streamID: streamID, + to: self.senderRoom + ) + + await self.fulfillment(of: [self.messageExpectation], timeout: 5) + self.collectionTask.cancel() + + let updates = await self.messageCollector.getUpdates() + let messages = await self.messageCollector.getMessages() + + self.validateTranscriptionResults( + updates: updates, + messages: messages, + segmentID: segmentID, + expectedContent: expectedContent + ) + } + } +}