diff --git a/Package.swift b/Package.swift index a374aaf0..6dd9893e 100644 --- a/Package.swift +++ b/Package.swift @@ -92,6 +92,7 @@ let package = Package( ], dependencies: [ .package(url: "https://github.com/vmanot/CorePersistence.git", branch: "main"), + .package(url: "https://github.com/vmanot/Media", branch: "main"), .package(url: "https://github.com/vmanot/Merge.git", branch: "master"), .package(url: "https://github.com/vmanot/NetworkKit.git", branch: "master"), .package(url: "https://github.com/vmanot/Swallow.git", branch: "master"), @@ -115,7 +116,7 @@ let package = Package( "Merge", "NetworkKit", "Swallow", - "SwiftUIX", + "SwiftUIX" ], path: "Sources/LargeLanguageModels", resources: [ @@ -191,7 +192,8 @@ let package = Package( "LargeLanguageModels", "Merge", "NetworkKit", - "Swallow" + "Swallow", + "Media" ], path: "Sources/_Gemini", swiftSettings: [ @@ -389,6 +391,7 @@ let package = Package( "Ollama", "OpenAI", "Swallow", + "NeetsAI", ], path: "Sources/AI", swiftSettings: [ diff --git a/Sources/AI/AnySpeechSynthesisRequestHandling.swift b/Sources/AI/AnySpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..5684958f --- /dev/null +++ b/Sources/AI/AnySpeechSynthesisRequestHandling.swift @@ -0,0 +1,42 @@ +// +// AnySpeechSynthesisRequestHandling.swift +// AI +// +// Created by Jared Davidson on 1/14/25. 
+// + +import ElevenLabs +import LargeLanguageModels +import NeetsAI + +public struct AnySpeechSynthesisRequestHandling: Hashable { + private let _hashValue: Int + + public let base: any CoreMI._ServiceClientProtocol & SpeechSynthesisRequestHandling + + public var displayName: String { + switch base { + case is ElevenLabs.Client: + return "ElevenLabs" + case is NeetsAI.Client: + return "NeetsAI" + default: + fatalError() + } + } + + public init( + _ base: any CoreMI._ServiceClientProtocol & SpeechSynthesisRequestHandling + ) { + self.base = base + self._hashValue = ObjectIdentifier(base as AnyObject).hashValue + } + + public static func == (lhs: AnySpeechSynthesisRequestHandling, rhs: AnySpeechSynthesisRequestHandling) -> Bool { + lhs._hashValue == rhs._hashValue + } + + public func hash(into hasher: inout Hasher) { + hasher.combine(_hashValue) + } +} diff --git a/Sources/ElevenLabs/Intramodular/API/ElevenLabs.APISpecification.swift b/Sources/ElevenLabs/Intramodular/API/ElevenLabs.APISpecification.swift index fa442c34..656255b8 100644 --- a/Sources/ElevenLabs/Intramodular/API/ElevenLabs.APISpecification.swift +++ b/Sources/ElevenLabs/Intramodular/API/ElevenLabs.APISpecification.swift @@ -128,9 +128,6 @@ extension ElevenLabs.APISpecification { context: DecodeOutputContext ) throws -> Output { do { - if Input.self == RequestBodies.EditVoiceInput.self { - print("TEsts") - } try response.validate() } catch { let apiError: Error diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift index 2093a3ea..2d7a03b6 100644 --- a/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift +++ b/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift @@ -10,10 +10,15 @@ import SwiftAPI import Merge import FoundationX import Swallow +import LargeLanguageModels extension ElevenLabs { @RuntimeDiscoverable public final class Client: SwiftAPI.Client, ObservableObject { + public static var persistentTypeRepresentation: 
some IdentityRepresentation { + CoreMI._ServiceVendorIdentifier._ElevenLabs + } + public typealias API = ElevenLabs.APISpecification public typealias Session = HTTPSession @@ -33,6 +38,25 @@ extension ElevenLabs { } } +extension ElevenLabs.Client: CoreMI._ServiceClientProtocol { + public convenience init( + account: (any CoreMI._ServiceAccountProtocol)? + ) async throws { + let account: any CoreMI._ServiceAccountProtocol = try account.unwrap() + let serviceVendorIdentifier: CoreMI._ServiceVendorIdentifier = try account.serviceVendorIdentifier.unwrap() + + guard serviceVendorIdentifier == CoreMI._ServiceVendorIdentifier._ElevenLabs else { + throw CoreMI._ServiceClientError.incompatibleVendor(serviceVendorIdentifier) + } + + guard let credential = try account.credential as? CoreMI._ServiceCredentialTypes.APIKeyCredential else { + throw CoreMI._ServiceClientError.invalidCredential(try account.credential) + } + + self.init(apiKey: credential.apiKey) + } +} + extension ElevenLabs.Client { public func availableVoices() async throws -> [ElevenLabs.Voice] { try await run(\.listVoices).voices @@ -50,7 +74,6 @@ extension ElevenLabs.Client { voiceSettings: voiceSettings, model: model ) - return try await run(\.textToSpeech, with: .init(voiceId: voiceID, requestBody: requestBody)) } @@ -107,3 +130,56 @@ extension ElevenLabs.Client { try await run(\.deleteVoice, with: voice.rawValue) } } + +// MARK: - Conformances + +extension ElevenLabs.Client: SpeechSynthesisRequestHandling { + public func availableVoices() async throws -> [AbstractVoice] { + return try await self.availableVoices().map({try $0.__conversion()}) + } + + public func speech(for text: String, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + try await self.speech( + for: text, + voiceID: voiceID, + voiceSettings: .init(settings: voiceSettings), + model: .init(rawValue: model) ?? 
.MultilingualV1 + ) + } + + public func speechToSpeech(inputAudioURL: URL, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + try await self.speechToSpeech( + inputAudioURL: inputAudioURL, + voiceID: voiceID, + voiceSettings: .init(settings: voiceSettings), + model: .init(rawValue: model) ?? .MultilingualV1 + ) + } + + public func upload(voiceWithName name: String, description: String, fileURL: URL) async throws -> AbstractVoice.ID { + let voice: ElevenLabs.Voice.ID = try await self.upload( + voiceWithName: name, + description: description, + fileURL: fileURL + ) + + return .init(rawValue: voice.rawValue) + } + + public func edit(voice: AbstractVoice.ID, name: String, description: String, fileURL: URL?) async throws -> Bool { + try await self.edit( + voice: ElevenLabs.Voice.ID(rawValue: voice.rawValue), + name: name, + description: description, + fileURL: fileURL + ) + } + + public func delete(voice: AbstractVoice.ID) async throws { + try await self.delete( + voice: ElevenLabs.Voice.ID( + rawValue: voice.rawValue + ) + ) + } +} diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.Voice.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.Voice.swift index 3a54532f..dbe29d63 100644 --- a/Sources/ElevenLabs/Intramodular/ElevenLabs.Voice.swift +++ b/Sources/ElevenLabs/Intramodular/ElevenLabs.Voice.swift @@ -4,6 +4,7 @@ import Foundation import Swift +import LargeLanguageModels extension ElevenLabs { public struct Voice: Hashable, Identifiable, Sendable { @@ -42,3 +43,24 @@ extension ElevenLabs.Voice: Codable { case isOwner } } + +extension ElevenLabs.Voice: AbstractVoiceConvertible { + public func __conversion() throws -> AbstractVoice { + return AbstractVoice( + voiceID: self.voiceID, + name: self.name, + description: self.description + ) + } +} + +extension ElevenLabs.Voice: AbstractVoiceInitiable { + public init(voice: AbstractVoice) throws { + self.init( + voiceID: voice.voiceID, + name: voice.name, + description: 
voice.description, + isOwner: nil + ) + } +} diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift index 1ffb7947..f0a6b825 100644 --- a/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift +++ b/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift @@ -3,6 +3,7 @@ // import Foundation +import LargeLanguageModels extension ElevenLabs { public struct VoiceSettings: Codable, Sendable, Hashable { @@ -98,3 +99,29 @@ extension ElevenLabs.VoiceSettings { ) } } + +// MARK: - Conformances + +extension ElevenLabs.VoiceSettings: AbstractVoiceSettingsConvertible { + public func __conversion() throws -> AbstractVoiceSettings { + return .init( + stability: stability, + similarityBoost: similarityBoost, + styleExaggeration: styleExaggeration, + speakerBoost: speakerBoost, + removeBackgroundNoise: removeBackgroundNoise + ) + } +} + +extension ElevenLabs.VoiceSettings: AbstractVoiceSettingsInitiable { + public init(settings: AbstractVoiceSettings) throws { + self.init( + stability: settings.stability, + similarityBoost: settings.similarityBoost, + styleExaggeration: settings.styleExaggeration, + speakerBoost: settings.speakerBoost, + removeBackgroundNoise: settings.removeBackgroundNoise + ) + } +} diff --git a/Sources/HumeAI/Intramodular/HumeAI.Client+SpeechSynthesisRequestHandling.swift b/Sources/HumeAI/Intramodular/HumeAI.Client+SpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..cd5a4e8f --- /dev/null +++ b/Sources/HumeAI/Intramodular/HumeAI.Client+SpeechSynthesisRequestHandling.swift @@ -0,0 +1,44 @@ +// +// HumeAI+ElevenLabsClientProtocol.swift +// Voice +// +// Created by Jared Davidson on 11/22/24. 
+// + +import Foundation +import SwiftUI +import AVFoundation +import LargeLanguageModels + +extension HumeAI.Client: SpeechSynthesisRequestHandling { + public func availableVoices() async throws -> [AbstractVoice] { + return try await getAllAvailableVoices().map( + { voice in + return AbstractVoice( + voiceID: voice.id, + name: voice.name, + description: nil + ) + }) + } + + public func speech(for text: String, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + throw HumeAI.APIError.unknown(message: "Text to speech not supported") + } + + public func speechToSpeech(inputAudioURL: URL, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + throw HumeAI.APIError.unknown(message: "Speech to speech not supported") + } + + public func upload(voiceWithName name: String, description: String, fileURL: URL) async throws -> AbstractVoice.ID { + throw HumeAI.APIError.unknown(message: "Voice creation is not supported") + } + + public func edit(voice: AbstractVoice.ID, name: String, description: String, fileURL: URL?) async throws -> Bool { + throw HumeAI.APIError.unknown(message: "Voice creation is not supported") + } + + public func delete(voice: AbstractVoice.ID) async throws { + throw HumeAI.APIError.unknown(message: "Voice creation is not supported") + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoice.swift b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoice.swift new file mode 100644 index 00000000..80de4232 --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoice.swift @@ -0,0 +1,41 @@ +// +// AudioStore.swift +// Voice +// +// Created by Jared Davidson on 10/31/24. 
+// + +import CorePersistence +import SwiftUI +import AVFoundation +import UniformTypeIdentifiers + +public struct AbstractVoice: Codable, Hashable, Identifiable, Sendable { + public typealias ID = _TypeAssociatedID + + public let id: ID + public let voiceID: String + public let name: String + public let description: String? + + public init( + voiceID: String, + name: String, + description: String? + ) { + self.id = .init(rawValue: voiceID) + self.voiceID = voiceID + self.name = name + self.description = description + } +} + +// MARK: - Conformances + +public protocol AbstractVoiceInitiable { + init(voice: AbstractVoice) throws +} + +public protocol AbstractVoiceConvertible { + func __conversion() throws -> AbstractVoice +} diff --git a/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoiceSettings.swift b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoiceSettings.swift new file mode 100644 index 00000000..b54b685f --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/AbstractVoiceSettings.swift @@ -0,0 +1,122 @@ +// +// VoiceStore.swift +// Voice +// +// Created by Jared Davidson on 10/30/24. +// + +import SwiftUIX +import CorePersistence + +public struct AbstractVoiceSettings: Codable, Sendable, Initiable, Equatable { + public init() { + self.init(stability: 1.0) + } + + + public enum Setting: String, Codable, Sendable { + case stability + case similarityBoost = "similarity_boost" + case styleExaggeration = "style" + case speakerBoost = "use_speaker_boost" + } + + /// Increasing stability will make the voice more consistent between re-generations, but it can also make it sounds a bit monotone. On longer text fragments it is recommended to lower this value. + /// This is a double between 0 (more variable) and 1 (more stable). + public var stability: Double + + /// Increasing the Similarity Boost setting enhances the overall voice clarity and targets speaker similarity. 
However, very high values can cause artifacts, so it is recommended to adjust this setting to find the optimal value. + /// This is a double between 0 (Low) and 1 (High). + public var similarityBoost: Double + + /// High values are recommended if the style of the speech should be exaggerated compared to the selected voice. Higher values can lead to more instability in the generated speech. Setting this to 0 will greatly increase generation speed and is the default setting. + public var styleExaggeration: Double + + /// Boost the similarity of the synthesized speech and the voice at the cost of some generation speed. + public var speakerBoost: Bool + + public var removeBackgroundNoise: Bool + + public init(stability: Double, + similarityBoost: Double, + styleExaggeration: Double, + speakerBoost: Bool, + removeBackgroundNoise: Bool) { + self.stability = max(0, min(1, stability)) + self.similarityBoost = max(0, min(1, similarityBoost)) + self.styleExaggeration = max(0, min(1, styleExaggeration)) + self.speakerBoost = speakerBoost + self.removeBackgroundNoise = removeBackgroundNoise + } + + public init(stability: Double? = nil, + similarityBoost: Double? = nil, + styleExaggeration: Double? = nil, + speakerBoost: Bool? = nil, + removeBackgroundNoise: Bool? = nil) { + self.stability = stability.map { max(0, min(1, $0)) } ?? 0.5 + self.similarityBoost = similarityBoost.map { max(0, min(1, $0)) } ?? 0.75 + self.styleExaggeration = styleExaggeration.map { max(0, min(1, $0)) } ?? 0 + self.speakerBoost = speakerBoost ?? true + self.removeBackgroundNoise = removeBackgroundNoise ?? 
false + } + + public init(stability: Double) { + self.init( + stability: stability, + similarityBoost: 0.75, + styleExaggeration: 0, + speakerBoost: true, + removeBackgroundNoise: false + ) + } + + public init(similarityBoost: Double) { + self.init( + stability: 0.5, + similarityBoost: similarityBoost, + styleExaggeration: 0, + speakerBoost: true, + removeBackgroundNoise: false + ) + } + + public init(styleExaggeration: Double) { + self.init( + stability: 0.5, + similarityBoost: 0.75, + styleExaggeration: styleExaggeration, + speakerBoost: true, + removeBackgroundNoise: false + ) + } + + public init(speakerBoost: Bool) { + self.init( + stability: 0.5, + similarityBoost: 0.75, + styleExaggeration: 0, + speakerBoost: speakerBoost, + removeBackgroundNoise: false + ) + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + + try container.encode(stability, forKey: .stability) + try container.encode(similarityBoost, forKey: .similarityBoost) + try container.encode(styleExaggeration, forKey: .styleExaggeration) + try container.encode(speakerBoost, forKey: .speakerBoost) + try container.encode(removeBackgroundNoise, forKey: .removeBackgroundNoise) + } +} + + +public protocol AbstractVoiceSettingsInitiable { + init(settings: AbstractVoiceSettings) throws +} + +public protocol AbstractVoiceSettingsConvertible { + func __conversion() throws -> AbstractVoiceSettings +} diff --git a/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/SpeechSynthesisRequestHandling.swift b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/SpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..e8bec05c --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/AbstractVoice (WIP)/SpeechSynthesisRequestHandling.swift @@ -0,0 +1,63 @@ +// +// SpeechSynthesisRequestHandling.swift +// Voice +// +// Created by Jared Davidson on 10/30/24. 
+// + +import Foundation +import SwiftUI + +public protocol SpeechToSpeechRequest { + +} + +public protocol SpeechToSpeechRequestHandling { + +} + +public protocol SpeechSynthesisRequestHandling: AnyObject { + func availableVoices() async throws -> [AbstractVoice] + + func speech( + for text: String, + voiceID: String, + voiceSettings: AbstractVoiceSettings, + model: String + ) async throws -> Data + + func speechToSpeech( + inputAudioURL: URL, + voiceID: String, + voiceSettings: AbstractVoiceSettings, + model: String + ) async throws -> Data + + func upload( + voiceWithName name: String, + description: String, + fileURL: URL + ) async throws -> AbstractVoice.ID + + func edit( + voice: AbstractVoice.ID, + name: String, + description: String, + fileURL: URL? + ) async throws -> Bool + + func delete(voice: AbstractVoice.ID) async throws +} + +// MARK: - Environment Key + +private struct AbstractClientKey: EnvironmentKey { + static let defaultValue: (any SpeechSynthesisRequestHandling)? = nil +} + +extension EnvironmentValues { + public var speechSynthesizer: (any SpeechSynthesisRequestHandling)? { + get { self[AbstractClientKey.self] } + set { self[AbstractClientKey.self] = newValue } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationRequestHandling.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationRequestHandling.swift new file mode 100644 index 00000000..bc82693e --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationRequestHandling.swift @@ -0,0 +1,70 @@ +// +// Copyright (c) Preternatural AI, Inc. 
+// + +import AVFoundation +import Foundation +import SwiftUI + +public protocol VideoGenerationRequestHandling { + func availableModels() async throws -> [VideoModel] + + func textToVideo( + text: String, + model: VideoModel, + settings: VideoGenerationSettings + ) async throws -> Data + + func imageToVideo( + imageURL: URL, + model: VideoModel, + settings: VideoGenerationSettings + ) async throws -> Data + + func videoToVideo( + videoURL: URL, + prompt: String, + model: VideoModel, + settings: VideoGenerationSettings + ) async throws -> Data +} + +private struct VideoGeneratorKey: EnvironmentKey { + public static let defaultValue: (any VideoGenerationRequestHandling)? = nil +} + +extension EnvironmentValues { + public var videoClient: (any VideoGenerationRequestHandling)? { + get { self[VideoGeneratorKey.self] } + set { self[VideoGeneratorKey.self] = newValue } + } +} + +public struct AnyVideoGenerationRequestHandling: Hashable { + public let base: any CoreMI._ServiceClientProtocol & VideoGenerationRequestHandling + private let _hashValue: Int + +// var displayName: String { +// switch base { +// case is FalVideoGenerationRequestHandling: +// return "Fal" +// default: +// fatalError() +// } +// } + + public init( + _ base: any CoreMI._ServiceClientProtocol & VideoGenerationRequestHandling + ) { + self.base = base + self._hashValue = ObjectIdentifier(base as AnyObject).hashValue + } + + public static func == (lhs: AnyVideoGenerationRequestHandling, rhs: AnyVideoGenerationRequestHandling) -> Bool { + lhs._hashValue == rhs._hashValue + } + + public func hash(into hasher: inout Hasher) { + hasher.combine(_hashValue) + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.FrameRate.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.FrameRate.swift new file mode 100644 index 00000000..da61c5dd --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration 
(WIP)/VideoGenerationSettings.FrameRate.swift @@ -0,0 +1,16 @@ +// +// Copyright (c) Preternatural AI, Inc. +// + +import Foundation + +extension VideoGenerationSettings { + public enum FrameRate: Int, Codable, CaseIterable { + case fps8 = 8 + case fps16 = 16 + case fps24 = 24 + case fps30 = 30 + + public var fps: Int { rawValue } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.MotionSettings.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.MotionSettings.swift new file mode 100644 index 00000000..addec423 --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.MotionSettings.swift @@ -0,0 +1,23 @@ +// +// Copyright (c) Preternatural AI, Inc. +// + +import Foundation + +extension VideoGenerationSettings { + public struct MotionSettings: Codable, Hashable { + public var stabilize: Bool + public var motionBucketId: Int // 0-127 + public var conditioningAugmentation: Double // 0.01-0.1 + + public init( + stabilize: Bool = true, + motionBucketId: Int = 127, + conditioningAugmentation: Double = 0.02 + ) { + self.stabilize = stabilize + self.motionBucketId = max(0, min(127, motionBucketId)) + self.conditioningAugmentation = max(0.01, min(0.1, conditioningAugmentation)) + } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Quality.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Quality.swift new file mode 100644 index 00000000..5ce0de27 --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Quality.swift @@ -0,0 +1,29 @@ +// +// Copyright (c) Preternatural AI, Inc. 
+// + +import Foundation + +extension VideoGenerationSettings { + public enum Quality: String, Codable, CaseIterable { + case draft = "draft" // 20 steps + case fast = "fast" // 30 steps + case balanced = "balanced" // 35 steps + case quality = "quality" // 40 steps + case max = "max" // 50 steps + + public var inferenceSteps: Int { + switch self { + case .draft: return 20 + case .fast: return 30 + case .balanced: return 35 + case .quality: return 40 + case .max: return 50 + } + } + + public var qualityValue: Double { + Double(inferenceSteps - 20) / 30 + } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Resolution.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Resolution.swift new file mode 100644 index 00000000..a140a046 --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.Resolution.swift @@ -0,0 +1,163 @@ +// +// Copyright (c) Preternatural AI, Inc. 
+// + +import Foundation + +extension VideoGenerationSettings { + public enum Resolution: Codable, Hashable { + // Square Resolutions + case sd512x512 + case sd768x768 + case sd1024x1024 + + // Landscape HD Resolutions + case hd720p // 1280x720 + case hd1080p // 1920x1080 + case hd1440p // 2560x1440 + case uhd4k // 3840x2160 + + // Social Media Formats + case instagram // 1080x1080 + case story // 1080x1920 + case tiktok // 1080x1920 + case youtube // 1920x1080 + + // Custom Resolution + case custom(width: Int, height: Int) + + public static var allCases: [Resolution] { + [ + .sd512x512, .sd768x768, .sd1024x1024, + .hd720p, .hd1080p, .hd1440p, .uhd4k, + .instagram, .story, .tiktok, .youtube + ] + } + + public var dimensions: (width: Int, height: Int) { + switch self { + // Square Resolutions + case .sd512x512: + return (512, 512) + case .sd768x768: + return (768, 768) + case .sd1024x1024: + return (1024, 1024) + + // Landscape HD Resolutions + case .hd720p: + return (1280, 720) + case .hd1080p: + return (1920, 1080) + case .hd1440p: + return (2560, 1440) + case .uhd4k: + return (3840, 2160) + + // Social Media Formats + case .instagram: + return (1080, 1080) + case .story: + return (1080, 1920) + case .tiktok: + return (1080, 1920) + case .youtube: + return (1920, 1080) + + case .custom(let width, let height): + return (width, height) + } + } + + public var width: Int { dimensions.width } + public var height: Int { dimensions.height } + + public var aspectRatio: String { + let gcd = calculateGCD(width, height) + let simplifiedWidth = width / gcd + let simplifiedHeight = height / gcd + + // Check for common aspect ratios + switch (simplifiedWidth, simplifiedHeight) { + case (1, 1): return "1:1" // Square + case (16, 9): return "16:9" // Standard Widescreen + case (9, 16): return "9:16" // Vertical/Portrait + case (4, 3): return "4:3" // Traditional TV + case (21, 9): return "21:9" // Ultrawide + default: return "\(simplifiedWidth):\(simplifiedHeight)" + } + } + + 
public var resolution: String { + switch self { + case .uhd4k: + return "4K" + case .hd1440p: + return "1440p" + case .hd1080p, .youtube: + return "1080p" + case .hd720p: + return "720p" + case .instagram, .story, .tiktok: + return "1080p" + case .sd512x512: + return "512p" + case .sd768x768: + return "768p" + case .sd1024x1024: + return "1024p" + case .custom(let width, _): + if width >= 3840 { return "4K" } + if width >= 2560 { return "1440p" } + if width >= 1920 { return "1080p" } + if width >= 1280 { return "720p" } + return "\(width)p" + } + } + + public static func detectResolution(width: Int, height: Int) -> Resolution { + switch (width, height) { + case (512, 512): return .sd512x512 + case (768, 768): return .sd768x768 + case (1024, 1024): return .sd1024x1024 + case (1280, 720): return .hd720p + case (1920, 1080): return .hd1080p + case (2560, 1440): return .hd1440p + case (3840, 2160): return .uhd4k + case (1080, 1080): return .instagram + case (1080, 1920): return .story + default: return .custom(width: width, height: height) + } + } + + private func calculateGCD(_ a: Int, _ b: Int) -> Int { + var a = a + var b = b + while b != 0 { + let temp = b + b = a % b + a = temp + } + + return a + } + + public var displayName: String { + switch self { + case .sd512x512: return "512×512" + case .sd768x768: return "768×768" + case .sd1024x1024: return "1024×1024" + case .hd720p: return "HD 720p" + case .hd1080p: return "Full HD 1080p" + case .hd1440p: return "QHD 1440p" + case .uhd4k: return "4K UHD" + case .instagram: return "Instagram Square" + case .story: return "Instagram/TikTok Story" + case .tiktok: return "TikTok Video" + case .youtube: return "YouTube HD" + case .custom(let width, let height): + return "\(width)×\(height)" + } + } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.StyleStrength.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration 
(WIP)/VideoGenerationSettings.StyleStrength.swift new file mode 100644 index 00000000..1fdc10af --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.StyleStrength.swift @@ -0,0 +1,27 @@ +// +// Copyright (c) Preternatural AI, Inc. +// + +import Foundation + +extension VideoGenerationSettings { + public enum StyleStrength: String, Codable, CaseIterable { + case subtle = "subtle" // 1-5 + case balanced = "balanced" // 5-10 + case strong = "strong" // 10-15 + case extreme = "extreme" // 15-20 + + public var guidanceScale: Double { + switch self { + case .subtle: return 3.0 + case .balanced: return 7.5 + case .strong: return 12.5 + case .extreme: return 17.5 + } + } + + public var strengthValue: Double { + (guidanceScale - 1) / 19 + } + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.swift new file mode 100644 index 00000000..81a72cfb --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoGenerationSettings.swift @@ -0,0 +1,43 @@ +// +// Copyright (c) Preternatural AI, Inc. 
+// + +import Foundation + +public struct VideoGenerationSettings: Codable, Hashable, Equatable { + /// Duration of the generated video in seconds (1-60) + public var duration: Double { + didSet { + duration = max(1, min(60, duration)) + } + } + + public var resolution: Resolution + public var frameRate: FrameRate + public var quality: Quality + public var styleStrength: StyleStrength + public var motion: MotionSettings + public var negativePrompt: String + + public var fps: Int { frameRate.fps } + public var numInferenceSteps: Int { quality.inferenceSteps } + public var guidanceScale: Double { styleStrength.guidanceScale } + + public init( + duration: Double = 10.0, + resolution: Resolution = .sd512x512, + frameRate: FrameRate = .fps24, + quality: Quality = .balanced, + styleStrength: StyleStrength = .balanced, + motion: MotionSettings = MotionSettings(), + negativePrompt: String = "" + ) { + self.duration = max(1, min(60, duration)) + self.resolution = resolution + self.frameRate = frameRate + self.quality = quality + self.styleStrength = styleStrength + self.motion = motion + self.negativePrompt = negativePrompt + } +} diff --git a/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoModel.swift b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoModel.swift new file mode 100644 index 00000000..ed63bae8 --- /dev/null +++ b/Sources/LargeLanguageModels/Intramodular/VideoGeneration (WIP)/VideoModel.swift @@ -0,0 +1,35 @@ +// +// Copyright (c) Preternatural AI, Inc. +// + +import CorePersistence +import Foundation + +public struct VideoModel: Codable, Hashable, Identifiable { + public typealias ID = _TypeAssociatedID + + public let id: ID + public let endpoint: String + public let name: String + public let description: String? 
+ public let capabilities: [Capability] + + public enum Capability: String, Codable { + case textToVideo + case imageToVideo + case videoToVideo + } + + public init( + endpoint: String, + name: String, + description: String?, + capabilities: [Capability] + ) { + self.id = .random() + self.endpoint = endpoint + self.name = name + self.description = description + self.capabilities = capabilities + } +} diff --git a/Sources/NeetsAI/Intramodular/Models/NeetsAI.Voice.swift b/Sources/NeetsAI/Intramodular/Models/NeetsAI.Voice.swift index 2f035154..ee23943a 100644 --- a/Sources/NeetsAI/Intramodular/Models/NeetsAI.Voice.swift +++ b/Sources/NeetsAI/Intramodular/Models/NeetsAI.Voice.swift @@ -6,6 +6,7 @@ // import Foundation +import LargeLanguageModels extension NeetsAI { public struct Voice: Codable, Hashable { @@ -15,3 +16,24 @@ extension NeetsAI { public let supportedModels: [String] } } + +extension NeetsAI.Voice: AbstractVoiceConvertible { + public func __conversion() throws -> AbstractVoice { + return AbstractVoice( + voiceID: self.id, + name: self.title ?? 
"", + description: self.aliasOf + ) + } +} + +extension NeetsAI.Voice: AbstractVoiceInitiable { + public init(voice: AbstractVoice) throws { + self.init( + id: .init(voice.voiceID), + title: voice.name, + aliasOf: voice.description, + supportedModels: [] + ) + } +} diff --git a/Sources/NeetsAI/Intramodular/NeetsAI.Client+SpeechSynthesisRequestHandling.swift b/Sources/NeetsAI/Intramodular/NeetsAI.Client+SpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..973024bd --- /dev/null +++ b/Sources/NeetsAI/Intramodular/NeetsAI.Client+SpeechSynthesisRequestHandling.swift @@ -0,0 +1,53 @@ +// +// NeetsAI.Client+SpeechSynthesisRequestHandling.swift +// Voice +// + +import Foundation +import SwiftUI +import AVFoundation +import LargeLanguageModels + +extension NeetsAI.Client: SpeechSynthesisRequestHandling { + public func availableVoices() async throws -> [AbstractVoice] { + let voices = try await getAllAvailableVoices() + .map({ try $0.__conversion() }) + .filter({ !$0.name.isEmpty }) + .unique(by: \.name) + return voices + } + + public func speech(for text: String, voiceID: String, voiceSettings: LargeLanguageModels.AbstractVoiceSettings, model: String) async throws -> Data { + let audio = try await generateSpeech( + text: text, + voiceId: voiceID, + model: .init(rawValue: model) ?? .mistralai + ) + return audio + } + + public func speechToSpeech(inputAudioURL: URL, voiceID: String, voiceSettings: LargeLanguageModels.AbstractVoiceSettings, model: String) async throws -> Data { + throw NeetsAI.APIError.unknown(message: "Speech to speech not supported") + + } + + public func upload(voiceWithName name: String, description: String, fileURL: URL) async throws -> LargeLanguageModels.AbstractVoice.ID { + throw NeetsAI.APIError.unknown(message: "Uploading Voice is not supported") + } + + public func edit(voice: LargeLanguageModels.AbstractVoice.ID, name: String, description: String, fileURL: URL?) 
async throws -> Bool { + throw NeetsAI.APIError.unknown(message: "Editing Voice is not supported") + } + + public func delete(voice: LargeLanguageModels.AbstractVoice.ID) async throws { + throw NeetsAI.APIError.unknown(message: "Deleting Voice is not supported") + } +} + +// FIXME: - REMOVE ME +extension Sequence { + func unique(by keyPath: KeyPath) -> [Element] { + var seen = Set() + return filter { seen.insert($0[keyPath: keyPath]).inserted } + } +} diff --git a/Sources/NeetsAI/module.swift b/Sources/NeetsAI/module.swift index 1c4d3b99..5b26df46 100644 --- a/Sources/NeetsAI/module.swift +++ b/Sources/NeetsAI/module.swift @@ -5,3 +5,5 @@ // Created by Jared Davidson on 11/22/24. // +@_exported import Swallow +@_exported import SwallowMacrosClient diff --git a/Sources/PlayHT/Intramodular/Models/PlayHT.Voice.swift b/Sources/PlayHT/Intramodular/Models/PlayHT.Voice.swift index 3ac8907f..7ee76f26 100644 --- a/Sources/PlayHT/Intramodular/Models/PlayHT.Voice.swift +++ b/Sources/PlayHT/Intramodular/Models/PlayHT.Voice.swift @@ -7,6 +7,7 @@ import Foundation import Swallow +import LargeLanguageModels extension PlayHT { public struct Voice: Codable, Hashable, Identifiable { @@ -16,7 +17,7 @@ extension PlayHT { public let name: String public let language: String? public let languageCode: String? - public let voiceEngine: String + public let voiceEngine: String? public let isCloned: Bool? public let gender: String? public let accent: String? @@ -26,6 +27,39 @@ extension PlayHT { public let texture: String? public let loudness: String? public let tempo: String? + + + init( + id: ID, + name: String, + language: String? = nil, + languageCode: String? = nil, + voiceEngine: String? = nil, + isCloned: Bool? = nil, + gender: String? = nil, + accent: String? = nil, + age: String? = nil, + style: String? = nil, + sample: String? = nil, + texture: String? = nil, + loudness: String? = nil, + tempo: String? 
= nil + ) { + self.id = id + self.name = name + self.language = language + self.languageCode = languageCode + self.voiceEngine = voiceEngine + self.isCloned = isCloned + self.gender = gender + self.accent = accent + self.age = age + self.style = style + self.sample = sample + self.texture = texture + self.loudness = loudness + self.tempo = tempo + } private enum CodingKeys: String, CodingKey { case id, name, language, languageCode, voiceEngine, isCloned @@ -72,3 +106,24 @@ extension PlayHT { case flac = "flac" } } + +// MARK: - Conformances + +extension PlayHT.Voice: AbstractVoiceConvertible { + public func __conversion() throws -> AbstractVoice { + return AbstractVoice( + voiceID: self.id.rawValue, + name: self.name, + description: nil + ) + } +} + +extension PlayHT.Voice: AbstractVoiceInitiable { + public init(voice: AbstractVoice) throws { + self.init( + id: .init(rawValue: voice.id.rawValue), + name: voice.name + ) + } +} diff --git a/Sources/PlayHT/Intramodular/PlayHT.Client+SpeechSynthesisRequestHandling.swift b/Sources/PlayHT/Intramodular/PlayHT.Client+SpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..c987b479 --- /dev/null +++ b/Sources/PlayHT/Intramodular/PlayHT.Client+SpeechSynthesisRequestHandling.swift @@ -0,0 +1,56 @@ +// +// PlayHT+SpeechSynthesisRequestHandling.swift +// Voice +// +// Created by Jared Davidson on 11/20/24. 
+// + +import Foundation +import AI +import ElevenLabs +import SwiftUI +import AVFoundation +import LargeLanguageModels + +extension PlayHT.Client: SpeechSynthesisRequestHandling { + public func availableVoices() async throws -> [AbstractVoice] { + let voices: [AbstractVoice] = try await getAllAvailableVoices().map { try $0.__conversion() } + return voices + } + + public func speech(for text: String, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + let data: Data = try await streamTextToSpeech( + text: text, + voice: voiceID, + settings: .init(), + model: .playHT2Turbo + ) + + return data + } + + public func speechToSpeech(inputAudioURL: URL, voiceID: String, voiceSettings: LargeLanguageModels.AbstractVoiceSettings, model: String) async throws -> Data { + throw PlayHT.APIError.unknown(message: "Speech to speech not supported") + } + + public func upload(voiceWithName name: String, description: String, fileURL: URL) async throws -> AbstractVoice.ID { + let mp4URL = try await fileURL.convertAudioToMP4() + let fileURLString = mp4URL.absoluteString + let voiceID = try await instantCloneVoice( + sampleFileURL: fileURLString, + name: name + ) + + try? FileManager.default.removeItem(at: mp4URL) + + return .init(rawValue: voiceID.rawValue) + } + + public func edit(voice: LargeLanguageModels.AbstractVoice.ID, name: String, description: String, fileURL: URL?) 
async throws -> Bool { + throw PlayHT.APIError.unknown(message: "Voice editing not supported") + } + + public func delete(voice: LargeLanguageModels.AbstractVoice.ID) async throws { + try await deleteClonedVoice(voice: .init(rawValue: voice.rawValue)) + } +} diff --git a/Sources/PlayHT/Intramodular/PlayHT.Client.swift b/Sources/PlayHT/Intramodular/PlayHT.Client.swift index 66e6e80f..eb63bfa8 100644 --- a/Sources/PlayHT/Intramodular/PlayHT.Client.swift +++ b/Sources/PlayHT/Intramodular/PlayHT.Client.swift @@ -59,14 +59,14 @@ extension PlayHT.Client: CoreMI._ServiceClientProtocol { extension PlayHT.Client { public func getAllAvailableVoices() async throws -> [PlayHT.Voice] { - async let htVoices = availableVoices() - async let clonedVoices = clonedVoices() + async let htVoices = self.getAvailableVoices() + async let clonedVoices = self.clonedVoices() let (available, cloned) = try await (htVoices, clonedVoices) return available + cloned } - public func availableVoices() async throws -> [PlayHT.Voice] { + public func getAvailableVoices() async throws -> [PlayHT.Voice] { try await run(\.listVoices).voices } diff --git a/Sources/PlayHT/Intramodular/URL++.swift b/Sources/PlayHT/Intramodular/URL++.swift new file mode 100644 index 00000000..f584da1f --- /dev/null +++ b/Sources/PlayHT/Intramodular/URL++.swift @@ -0,0 +1,79 @@ +// +// URL++.swift +// AI +// +// Created by Jared Davidson on 1/14/25. 
+// + +import AVFoundation +import AudioToolbox + +// FIXME: - This needs to be moved somewhere else (@archetapp) + +extension URL { + func convertAudioToMP4() async throws -> URL { + let outputURL = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString) + .appendingPathExtension("mp4") + + let asset = AVURLAsset(url: self) + + let composition = AVMutableComposition() + guard let compositionTrack = composition.addMutableTrack( + withMediaType: .audio, + preferredTrackID: kCMPersistentTrackID_Invalid + ) else { + throw NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Could not create composition track"]) + } + + guard let audioTrack = try await asset.loadTracks(withMediaType: .audio).first else { + throw NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "No audio track found"]) + } + + let timeRange = CMTimeRange(start: .zero, duration: try await asset.load(.duration)) + for i in 0..<4 { + try compositionTrack.insertTimeRange( + timeRange, + of: audioTrack, + at: CMTime(seconds: Double(i) * timeRange.duration.seconds, preferredTimescale: 600) + ) + } + + guard let exportSession = AVAssetExportSession( + asset: composition, + presetName: AVAssetExportPresetPassthrough + ) else { + throw NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Could not create export session"]) + } + + exportSession.outputURL = outputURL + exportSession.outputFileType = AVFileType.mp4 + exportSession.shouldOptimizeForNetworkUse = true + + // Create a tuple of values we need to check after export + try await withCheckedThrowingContinuation { continuation in + let mainQueue = DispatchQueue.main + exportSession.exportAsynchronously { + mainQueue.async { + switch exportSession.status { + case .completed: + continuation.resume() + case .failed: + continuation.resume(throwing: exportSession.error ?? 
NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Export failed"])) + case .cancelled: + continuation.resume(throwing: NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Export cancelled"])) + default: + continuation.resume(throwing: NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Unknown export error"])) + } + } + } + } + + let fileSize = try FileManager.default.attributesOfItem(atPath: outputURL.path)[.size] as? Int ?? 0 + if fileSize < 5000 { // 5KB minimum + throw NSError(domain: "AudioConversion", code: -1, userInfo: [NSLocalizedDescriptionKey: "Converted file too small"]) + } + + return outputURL + } +} diff --git a/Sources/Rime/Intramodular/Models/Rime.Voice.swift b/Sources/Rime/Intramodular/Models/Rime.Voice.swift index 1a341b41..459a19ba 100644 --- a/Sources/Rime/Intramodular/Models/Rime.Voice.swift +++ b/Sources/Rime/Intramodular/Models/Rime.Voice.swift @@ -6,10 +6,32 @@ // import Foundation +import CorePersistence import Swallow +import LargeLanguageModels extension Rime { public struct Voice: Hashable { + public typealias ID = _TypeAssociatedID + + public init( + name: String, + age: String?, + country: String?, + region: String?, + demographic: String?, + genre: [String]? + ) { + self.id = .init(rawValue: UUID().uuidString) + self.name = name + self.age = age + self.country = country + self.region = region + self.demographic = demographic + self.genre = genre + } + + public let id: ID public let name: String public let age: String? public let country: String? 
@@ -42,5 +64,30 @@ extension Rime.Voice: Codable { self.region = try container.decodeIfPresent(String.self, forKey: Rime.Voice.CodingKeys.region) self.demographic = try container.decodeIfPresent(String.self, forKey: Rime.Voice.CodingKeys.demographic) self.genre = try container.decodeIfPresent([String].self, forKey: Rime.Voice.CodingKeys.genre) + + self.id = .init(rawValue: UUID().uuidString) + } +} + +extension Rime.Voice: AbstractVoiceInitiable { + public init(voice: AbstractVoice) throws { + self.init( + name: voice.name, + age: nil, + country: nil, + region: nil, + demographic: nil, + genre: nil + ) + } +} + +extension Rime.Voice: AbstractVoiceConvertible { + public func __conversion() throws -> AbstractVoice { + return AbstractVoice( + voiceID: self.id.rawValue, + name: self.name, + description: nil + ) } } diff --git a/Sources/Rime/Intramodular/Rime.Client+SpeechSynthesisRequestHandling.swift b/Sources/Rime/Intramodular/Rime.Client+SpeechSynthesisRequestHandling.swift new file mode 100644 index 00000000..93126293 --- /dev/null +++ b/Sources/Rime/Intramodular/Rime.Client+SpeechSynthesisRequestHandling.swift @@ -0,0 +1,55 @@ +// +// Rime+SpeechSynthesisRequestHandling.swift +// Voice +// +// Created by Jared Davidson on 11/21/24. 
+// + +import Foundation +import AI +import ElevenLabs +import SwiftUI +import AVFoundation + +extension Rime.Client: SpeechSynthesisRequestHandling { + public func availableVoices() async throws -> [AbstractVoice] { + return try await getAllAvailableVoiceDetails().map { try $0.__conversion() } + } + + public func speech(for text: String, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + return try await streamTextToSpeech( + text: text, + voice: voiceID, + outputAudio: .MP3, + model: .mist + ) + } + + public func speechToSpeech(inputAudioURL: URL, voiceID: String, voiceSettings: AbstractVoiceSettings, model: String) async throws -> Data { + throw Rime.APIError.unknown(message: "Speech to speech not supported") + } + + public func upload(voiceWithName name: String, description: String, fileURL: URL) async throws -> AbstractVoice.ID { + throw Rime.APIError.unknown(message: "Voice creation is not supported") + } + + public func edit(voice: AbstractVoice.ID, name: String, description: String, fileURL: URL?) 
async throws -> Bool { + throw Rime.APIError.unknown(message: "Voice editing is not supported") + } + + public func delete(voice: AbstractVoice.ID) async throws { + throw Rime.APIError.unknown(message: "Voice deletion is not supported") + } + + public func availableVoices() async throws -> [ElevenLabs.Voice] { + return try await getAllAvailableVoiceDetails().map { voice in + ElevenLabs.Voice( + voiceID: voice.name, + name: voice.name, + description: voice.demographic, + isOwner: false + ) + } + } + +} diff --git a/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.RequestBodies.swift b/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.RequestBodies.swift index 7003aebd..0eaaa91f 100644 --- a/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.RequestBodies.swift +++ b/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.RequestBodies.swift @@ -142,10 +142,31 @@ extension _Gemini.APISpecification { } } - public struct FileUploadInput: Codable, HTTPRequest.Multipart.ContentConvertible { + public struct FinalizeFileUploadInput { + public let data: Data + public let uploadUrl: String + public let fileSize: Int + + public init(data: Data, uploadUrl: String, fileSize: Int) { + self.data = data + self.uploadUrl = uploadUrl + self.fileSize = fileSize + } + } + + public struct StartFileUploadInput: Codable { + public struct UploadMetadata: Codable { + let file: FileMetadata + + struct FileMetadata: Codable { + let display_name: String + } + } + public let fileData: Data + public let mimeType: String + public let displayName: String + public let metadata: UploadMetadata public init( fileData: Data, @@ -155,11 +176,12 @@ extension _Gemini.APISpecification { self.fileData = fileData self.mimeType = mimeType self.displayName = displayName + self.metadata = .init(file: .init(display_name: displayName)) } - + /* public func __conversion() throws -> HTTPRequest.Multipart.Content { var result = HTTPRequest.Multipart.Content() - + // TODO: - Add this to 
`HTTPMediaType` @jared @vmanot let fileExtension: String = { guard let subtype = mimeType.split(separator: "/").last else { @@ -188,17 +210,11 @@ extension _Gemini.APISpecification { } }() - result.append( - .file( - named: "file", - data: fileData, - filename: "\(displayName).\(fileExtension)", - contentType: .init(rawValue: mimeType) - ) - ) + result.ap return result } + */ } public struct DeleteFileInput: Codable { diff --git a/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.swift b/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.swift index fe289d07..587217f2 100644 --- a/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.swift +++ b/Sources/_Gemini/Intramodular/API/_Gemini.APISpecification.swift @@ -70,22 +70,43 @@ extension _Gemini { "/v1beta/models/\(context.input.model):generateContent" }) @Body(json: \.requestBody) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var generateContent = Endpoint() // Initial Upload Request endpoint @POST @Path("/upload/v1beta/files") - @Header([ - "X-Goog-Upload-Command": "start, upload, finalize" - ]) - @Body(multipart: .input) - var uploadFile = Endpoint() + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? 
[:] }) + @Header({ context in + [ + HTTPHeaderField(key: "X-Goog-Upload-Protocol", value: "resumable"), + HTTPHeaderField(key: "X-Goog-Upload-Command", value: "start"), + HTTPHeaderField(key: "X-Goog-Upload-Header-Content-Length", value: "\(context.input.fileData.count)"), + HTTPHeaderField(key: "X-Goog-Upload-Header-Content-Type", value: context.input.mimeType), + HTTPHeaderField.contentType(.json) + ] + }) + @Body(json: \RequestBodies.StartFileUploadInput.metadata) + var startFileUpload = Endpoint() + + @POST + @AbsolutePath({ $0.input.uploadUrl }) + @Header({ context in + [ + HTTPHeaderField(key: "Content-Length", value: "\(context.input.fileSize)"), + HTTPHeaderField(key: "X-Goog-Upload-Offset", value: "0"), + HTTPHeaderField(key: "X-Goog-Upload-Command", value: "upload, finalize") + ] + }) + @Body(data: \RequestBodies.FinalizeFileUploadInput.data) + var finalizeFileUpload = Endpoint() // File Status endpoint @GET @Path({ context -> String in "/v1beta/\(context.input.name.rawValue)" }) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var getFile = Endpoint() @GET @@ -101,8 +122,13 @@ extension _Gemini { parameters["pageToken"] = pageToken } + if let apiKey = context.root.configuration.apiKey { + parameters["key"] = apiKey + } + return parameters }) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var listFiles = Endpoint() // Delete File endpoint @@ -110,24 +136,28 @@ extension _Gemini { @Path({ context -> String in "/\(context.input.fileURL.path)" }) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var deleteFile = Endpoint() - //Fine Tuning + // Fine Tuning @POST @Path("/v1beta/tunedModels") @Body(json: \.requestBody) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var createTunedModel = Endpoint() @GET @Path({ context -> String in "/v1/\(context.input.operationName)" }) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? 
[:] }) var getTuningOperation = Endpoint() @GET @Path({ context -> String in "/v1beta/\(context.input.modelName)" }) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var getTunedModel = Endpoint() @POST @@ -135,6 +165,7 @@ extension _Gemini { "/v1beta/\(context.input.model):generateContent" // Use the model name directly }) @Body(json: \.requestBody) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var generateTunedContent = Endpoint() @POST @@ -142,6 +173,7 @@ extension _Gemini { "/v1beta/models/\(context.input.model):embedContent" }) @Body(json: \.input) + @Query({ $0.root.configuration.apiKey.map { ["key": $0] } ?? [:] }) var generateEmbedding = Endpoint() } } @@ -152,15 +184,11 @@ extension _Gemini.APISpecification { from input: Input, context: BuildRequestContext ) throws -> Request { - var request = try super.buildRequestBase( + let request = try super.buildRequestBase( from: input, context: context ) - if let apiKey = context.root.configuration.apiKey { - request = request.query([.init(name: "key", value: apiKey)]) - } - return request } @@ -168,15 +196,33 @@ extension _Gemini.APISpecification { from response: Request.Response, context: DecodeOutputContext ) throws -> Output { - - print(response) - try response.validate() + if let options: _Gemini.APISpecification.Options = context.options as? _Gemini.APISpecification.Options, let headerKey = options.outputHeaderKey { + let stringValue: String? = response.headerFields.first (where: { $0.key == headerKey })?.value + + switch Output.self { + case String.self: + return (try stringValue.unwrap()) as! Output + case Optional.self: + return stringValue as! Output + default: + throw _Gemini.APIError.invalidContentType + } + } + return try response.decode( Output.self, keyDecodingStrategy: .convertFromSnakeCase ) } } + + public class Options { + var outputHeaderKey: HTTPHeaderField.Key? + + init(outputHeaderKey: HTTPHeaderField.Key? 
= nil) { + self.outputHeaderKey = outputHeaderKey + } + } } diff --git a/Sources/_Gemini/Intramodular/_Gemini.Client+Files.swift b/Sources/_Gemini/Intramodular/_Gemini.Client+Files.swift index b6fa298c..020e3cde 100644 --- a/Sources/_Gemini/Intramodular/_Gemini.Client+Files.swift +++ b/Sources/_Gemini/Intramodular/_Gemini.Client+Files.swift @@ -6,9 +6,21 @@ import CoreMI import Dispatch import FoundationX import Merge +import Media import NetworkKit import Swallow +fileprivate enum TempError: CustomStringError, Error { + case fetchedResponse + + public var description: String { + switch self { + case .fetchedResponse: + return "Got response url from header" + } + } +} + extension _Gemini.Client { public func uploadFile( from data: Data, @@ -20,25 +32,27 @@ extension _Gemini.Client { throw FileProcessingError.invalidFileName } - do { - var mimeType: String? = mimeType?.rawValue ?? _MediaAssetFileType(data)?.mimeType - - if mimeType == nil, let swiftType { - mimeType = HTTPMediaType(_swiftType: swiftType)?.rawValue - } - - let input = _Gemini.APISpecification.RequestBodies.FileUploadInput( - fileData: data, - mimeType: try mimeType.unwrap(), - displayName: displayName - ) - - let response = try await run(\.uploadFile, with: input) - - return response.file - } catch { - throw _Gemini.APIError.unknown(message: "File upload failed: \(error.localizedDescription)") + var mimeType: String? = mimeType?.rawValue ?? 
_MediaAssetFileType(data)?.mimeType + + if mimeType == nil, let swiftType { + mimeType = HTTPMediaType(_swiftType: swiftType)?.rawValue } + + let input = _Gemini.APISpecification.RequestBodies.StartFileUploadInput( + fileData: data, + mimeType: try mimeType.unwrap(), + displayName: displayName + ) + + let uploadURLString: String = try await run(\.startFileUpload, with: input, options: _Gemini.APISpecification.Options(outputHeaderKey: .custom("x-goog-upload-url"))).value + + let result: _Gemini.APISpecification.ResponseBodies.FileUpload = try await run(\.finalizeFileUpload, with: _Gemini.APISpecification.RequestBodies.FinalizeFileUploadInput(data: data, uploadUrl: uploadURLString, fileSize: data.count)) + + return result.file + } + + public func upload(file: any MediaFile) async throws { + try await self.uploadFile(from: file.url, mimeType: HTTPMediaType(fileURL: file.url), displayName: file.name) } public func uploadFile(