Files
clawdbot/Swabble/Sources/SwabbleCore/Speech/SpeechPipeline.swift
2025-12-23 01:31:59 +01:00

115 lines
4.1 KiB
Swift

import AVFoundation
import Foundation
import Speech
/// One transcription hypothesis emitted by `SpeechPipeline`.
@available(macOS 26.0, iOS 26.0, *)
public struct SpeechSegment: Sendable, Equatable, Hashable {
    /// The transcribed text for this segment.
    public let text: String
    /// `true` for a finalized result; `false` for a volatile (still-changing) one.
    public let isFinal: Bool

    /// Public memberwise initializer. The compiler-synthesized memberwise init of a
    /// `public` struct is only `internal`, which would make this type unconstructible
    /// from outside the module.
    public init(text: String, isFinal: Bool) {
        self.text = text
        self.isFinal = isFinal
    }
}
/// Failures `SpeechPipeline` can surface while setting up live transcription.
@available(macOS 26.0, iOS 26.0, *)
public enum SpeechPipelineError: Error, LocalizedError {
    /// The user (or system policy) denied speech-recognition authorization.
    case authorizationDenied
    /// No audio format compatible with the analyzer modules could be found.
    case analyzerFormatUnavailable
    /// The transcriber module was unexpectedly missing after configuration.
    case transcriberUnavailable

    /// Human-readable description for logging and user-facing error surfaces.
    public var errorDescription: String? {
        switch self {
        case .authorizationDenied:
            return "Speech recognition authorization was denied."
        case .analyzerFormatUnavailable:
            return "No compatible audio format is available for the speech analyzer."
        case .transcriberUnavailable:
            return "The speech transcriber is unavailable."
        }
    }
}
/// Live microphone SpeechAnalyzer SpeechTranscriber pipeline.
///
/// Call `start(localeIdentifier:etiquette:)` to begin streaming microphone audio
/// through a `SpeechAnalyzer`, then consume the returned stream of `SpeechSegment`
/// values. Call `stop()` — or terminate the returned stream — to tear everything down.
@available(macOS 26.0, iOS 26.0, *)
public actor SpeechPipeline {
    /// `AVAudioPCMBuffer` is not `Sendable`; this wrapper carries a buffer across the
    /// audio-tap → actor boundary. Assumed safe because each buffer is handed off once
    /// and not touched again by the producer — TODO(review): confirm tap buffer reuse.
    private struct UnsafeBuffer: @unchecked Sendable { let buffer: AVAudioPCMBuffer }

    private var engine = AVAudioEngine()
    private var transcriber: SpeechTranscriber?
    private var analyzer: SpeechAnalyzer?
    /// Feeds converted audio into the analyzer.
    private var inputContinuation: AsyncStream<AnalyzerInput>.Continuation?
    /// Receives raw tap buffers; drained by `feedTask` in arrival order.
    private var bufferContinuation: AsyncStream<UnsafeBuffer>.Continuation?
    /// Single consumer of the tap-buffer stream (preserves buffer ordering).
    private var feedTask: Task<Void, Never>?
    /// Forwards transcriber results to the caller's segment stream.
    private var resultTask: Task<Void, Never>?
    private let converter = BufferConverter()

    public init() {}

    /// Starts microphone capture and transcription.
    ///
    /// - Parameters:
    ///   - localeIdentifier: Locale identifier for the transcriber, e.g. "en-US".
    ///   - etiquette: When `true`, enables `.etiquetteReplacements`.
    /// - Returns: A stream of volatile and final `SpeechSegment`s; terminating it stops the pipeline.
    /// - Throws: `SpeechPipelineError.authorizationDenied`,
    ///   `SpeechPipelineError.analyzerFormatUnavailable`, or any error thrown while
    ///   starting the audio engine / analyzer (partial setup is rolled back first).
    public func start(localeIdentifier: String, etiquette: Bool) async throws -> AsyncStream<SpeechSegment> {
        let auth = await requestAuthorizationIfNeeded()
        guard auth == .authorized else { throw SpeechPipelineError.authorizationDenied }

        let transcriberModule = SpeechTranscriber(
            locale: Locale(identifier: localeIdentifier),
            transcriptionOptions: etiquette ? [.etiquetteReplacements] : [],
            reportingOptions: [.volatileResults],
            attributeOptions: [])
        transcriber = transcriberModule

        guard let analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriberModule])
        else {
            throw SpeechPipelineError.analyzerFormatUnavailable
        }
        analyzer = SpeechAnalyzer(modules: [transcriberModule])

        let (analyzerStream, analyzerContinuation) = AsyncStream<AnalyzerInput>.makeStream()
        inputContinuation = analyzerContinuation

        // Tap → stream → single consumer. Yielding into an AsyncStream from the tap
        // and draining it with ONE task preserves buffer order; spawning a Task per
        // buffer (unstructured) would give no ordering guarantee.
        let (bufferStream, tapContinuation) = AsyncStream<UnsafeBuffer>.makeStream()
        bufferContinuation = tapContinuation

        let inputNode = engine.inputNode
        let inputFormat = inputNode.outputFormat(forBus: 0)
        inputNode.removeTap(onBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 2048, format: inputFormat) { buffer, _ in
            tapContinuation.yield(UnsafeBuffer(buffer: buffer))
        }

        feedTask = Task { [weak self] in
            for await boxed in bufferStream {
                await self?.handleBuffer(boxed.buffer, targetFormat: analyzerFormat)
            }
        }

        do {
            engine.prepare()
            try engine.start()
            try await analyzer?.start(inputSequence: analyzerStream)
        } catch {
            // Roll back partial setup so a failed start does not leave the tap
            // installed, the engine running, or continuations leaked.
            await stop()
            throw error
        }

        return AsyncStream { continuation in
            self.resultTask = Task {
                do {
                    for try await result in transcriberModule.results {
                        continuation.yield(SpeechSegment(text: String(result.text.characters), isFinal: result.isFinal))
                    }
                } catch {
                    // A failed results stream ends transcription; surfaced to the
                    // caller as normal stream termination below.
                }
                continuation.finish()
            }
            continuation.onTermination = { _ in
                Task { await self.stop() }
            }
        }
    }

    /// Stops capture, removes the tap, stops the engine, and finalizes the analyzer.
    /// Idempotent: repeated calls are no-ops on already-cleared state.
    public func stop() async {
        resultTask?.cancel()
        resultTask = nil
        bufferContinuation?.finish()
        bufferContinuation = nil
        feedTask?.cancel()
        feedTask = nil
        inputContinuation?.finish()
        inputContinuation = nil
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
        // Best effort: flush any remaining analyzer input. Errors here are not actionable.
        try? await analyzer?.finalizeAndFinishThroughEndOfInput()
        analyzer = nil
        transcriber = nil
    }

    /// Converts one tap buffer to the analyzer's format and forwards it.
    /// Buffers that fail conversion are dropped (best-effort live capture).
    private func handleBuffer(_ buffer: AVAudioPCMBuffer, targetFormat: AVAudioFormat) async {
        do {
            let converted = try converter.convert(buffer, to: targetFormat)
            inputContinuation?.yield(AnalyzerInput(buffer: converted))
        } catch {
            // Drop on conversion failure; a single bad buffer should not kill the stream.
        }
    }

    /// Returns the current speech-recognition authorization, prompting the user
    /// only when the status is still `.notDetermined`.
    private func requestAuthorizationIfNeeded() async -> SFSpeechRecognizerAuthorizationStatus {
        let current = SFSpeechRecognizer.authorizationStatus()
        guard current == .notDetermined else { return current }
        return await withCheckedContinuation { continuation in
            SFSpeechRecognizer.requestAuthorization { status in
                continuation.resume(returning: status)
            }
        }
    }
}