Bump mlx to version 2.21.2 (#94)
# Bumps mlx to version 2.21.2

## ♻️ Current situation & Problem
The MLX Swift Examples dependency is currently pinned to 1.18.1 and can be bumped to 2.x to support
newer models.

## ⚙️ Release Notes 
Updates the MLX library to 2.21.2 and migrates code for breaking
changes.
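
As a quick orientation, the sketch below shows the kind of call-site migration this bump implies. It is a minimal, hedged example: the names and signatures are taken from the changed files in this PR (mlx-swift-examples 2.x API as used here), not from a standalone, verified program.

```swift
import MLXLLM
import MLXLMCommon

// Rough sketch of the 1.x -> 2.x call sites touched by this PR (assumed from the
// changed files below; not a complete, verified example).
//
// 1.x: `let container = try await loadModelContainer(configuration: configuration)`
// 2.x: `let container = try await LLMModelFactory.shared.loadContainer(configuration: configuration)`
func generate(
    using modelContainer: ModelContainer,
    input: LMInput,
    parameters: GenerateParameters
) async throws -> GenerateResult {
    try await modelContainer.perform { modelContext in
        // 1.x passed `model` and `tokenizer` separately and called `MLXLLM.generate(promptTokens:...)`;
        // 2.x hands over a single `ModelContext` and generation moves to `MLXLMCommon`.
        try MLXLMCommon.generate(input: input, parameters: parameters, context: modelContext) { tokens in
            // Stream or inspect `tokens` here; return `.stop` to end generation early.
            .more
        }
    }
}
```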

## 📝 Code of Conduct & Contributing Guidelines 

By creating this pull request, you agree to follow our [Code
of
Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md)
and [Contributing
Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md):
- [X] I agree to follow the [Code of
Conduct](https://github.com/StanfordSpezi/.github/blob/main/CODE_OF_CONDUCT.md)
and [Contributing
Guidelines](https://github.com/StanfordSpezi/.github/blob/main/CONTRIBUTING.md).

---------

Co-authored-by: Leon Nissen <>
Co-authored-by: Vishnu Ravi <vishnur@stanford.edu>
Co-authored-by: Paul Schmiedmayer <PSchmiedmayer@users.noreply.github.com>
3 people authored Feb 3, 2025
1 parent 26b1e07 commit fe15019
Showing 8 changed files with 125 additions and 83 deletions.
14 changes: 14 additions & 0 deletions .linkspector.yml
@@ -0,0 +1,14 @@
#
# This source file is part of the Stanford Spezi open source project
#
# SPDX-FileCopyrightText: 2025 Stanford University and the project authors (see CONTRIBUTORS.md)
#
# SPDX-License-Identifier: MIT
#
dirs:
  - .
useGitIgnore: true
ignorePatterns:
  - pattern: '^https://platform.openai.com/docs/guides/.*$' # Causes false positives
  - pattern: '^doc:.*$'
  - pattern: '^http://localhost.*$'
11 changes: 4 additions & 7 deletions Package.swift
@@ -28,14 +28,14 @@ let package = Package(
],
dependencies: [
.package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")),
.package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "1.18.1"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
.package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "2.21.2"), // Pin MLX Swift Examples as it doesn't follow semantic versioning
.package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.14")),
.package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")),
.package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"),
.package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0"),
.package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"),
.package(url: "https://github.com/StanfordSpezi/SpeziOnboarding", from: "1.1.1"),
.package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.1")),
.package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.3")),
.package(url: "https://github.com/StanfordSpezi/SpeziViews", from: "1.3.1")
],
targets: [
@@ -54,12 +54,9 @@
.product(name: "SpeziFoundation", package: "SpeziFoundation"),
.product(name: "Spezi", package: "Spezi"),
.product(name: "MLX", package: "mlx-swift"),
.product(name: "MLXFast", package: "mlx-swift"),
.product(name: "MLXNN", package: "mlx-swift"),
.product(name: "MLXOptimizers", package: "mlx-swift"),
.product(name: "MLXRandom", package: "mlx-swift"),
.product(name: "Transformers", package: "swift-transformers"),
.product(name: "LLM", package: "mlx-swift-examples")
.product(name: "MLXLLM", package: "mlx-swift-examples")
]
),
.target(
@@ -68,7 +65,7 @@
.product(name: "SpeziOnboarding", package: "SpeziOnboarding"),
.product(name: "SpeziViews", package: "SpeziViews"),
.target(name: "SpeziLLMLocal"),
.product(name: "LLM", package: "mlx-swift-examples")
.product(name: "MLXLLM", package: "mlx-swift-examples")
]
),
.target(
4 changes: 2 additions & 2 deletions README.md
@@ -63,7 +63,7 @@ The target enables developers to easily execute medium-size Language Models (LLM
> Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
> [!IMPORTANT]
> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
#### Setup

@@ -147,7 +147,7 @@ class LLMOpenAIAppDelegate: SpeziAppDelegate {
```

> [!IMPORTANT]
> If using `SpeziLLMOpenAI` on macOS, ensure to add the [`Keychain Access Groups` entitlement](https://developer.apple.com/documentation/bundleresources/entitlements/keychain-access-groups) to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
> If using `SpeziLLMOpenAI` on macOS, ensure to add the *`Keychain Access Groups` entitlement* to the enclosing Xcode project via *PROJECT_NAME > Signing&Capabilities > + Capability*. The array of keychain groups can be left empty, only the base entitlement is required.
#### Usage

1 change: 1 addition & 0 deletions Sources/SpeziLLMLocal/LLMLocalSchema.swift
@@ -8,6 +8,7 @@

import Foundation
import MLXLLM
import MLXLMCommon
import SpeziChat
import SpeziLLM

166 changes: 97 additions & 69 deletions Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift
@@ -9,23 +9,32 @@
import Foundation
import MLX
import MLXLLM
import MLXLMCommon
import MLXRandom
import os
import SpeziChat
import SpeziLLM


extension LLMLocalSession {
// swiftlint:disable:next identifier_name function_body_length
private var generationParameters: GenerateParameters {
.init(
temperature: schema.samplingParameters.temperature,
topP: schema.samplingParameters.topP,
repetitionPenalty: schema.samplingParameters.penaltyRepeat,
repetitionContextSize: schema.samplingParameters.repetitionContextSize
)
}

// swiftlint:disable:next identifier_name
internal func _generate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
#if targetEnvironment(simulator)
// swiftlint:disable:next return_value_from_void_function
return await _mockGenerate(continuation: continuation)
await _mockGenerate(continuation: continuation)
return
#endif

guard let modelContainer = await self.modelContainer else {

Check warning on line 36 in Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift — GitHub Actions (all build, test, and UI test jobs across iOS, iPadOS, and visionOS): code after 'return' will never be executed
Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`")
await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation)
await handleError("Failed to load `modelContainer`", error: .modelNotFound, continuation: continuation)
return
}

@@ -35,15 +44,8 @@ extension LLMLocalSession {
await self.context.formattedChat
}

guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in
if let chatTempalte = self.schema.parameters.chatTemplate {
return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte)
} else {
return try tokenizer.applyChatTemplate(messages: messages)
}
}) else {
Self.logger.error("SpeziLLMLocal: Failed to format chat with given context")
await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation)
guard let modelInput: LMInput = try? await prepareModelInput(messages: messages, modelContainer: modelContainer) else {
await handleError("Failed to format chat with given context", error: .illegalContext, continuation: continuation)
return
}

@@ -53,78 +55,104 @@
return
}

let parameters: GenerateParameters = .init(
temperature: schema.samplingParameters.temperature,
topP: schema.samplingParameters.topP,
repetitionPenalty: schema.samplingParameters.penaltyRepeat,
repetitionContextSize: schema.samplingParameters.repetitionContextSize
)

// swiftlint:disable:next closure_body_length
let result = await modelContainer.perform { model, tokenizer in
let result = MLXLLM.generate(
promptTokens: promptTokens,
parameters: parameters,
model: model,
tokenizer: tokenizer,
extraEOSTokens: schema.parameters.extraEOSTokens
) { tokens in
if Task.isCancelled {
return .stop
}

if tokens.count >= self.schema.parameters.maxOutputLength {
Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
return .stop
do {
let result = try await modelContainer.perform { modelContext in
let result = try MLXLMCommon.generate(
input: modelInput,
parameters: generationParameters,
context: modelContext
) { tokens in
processTokens(tokens, modelContext: modelContext, continuation: continuation)
}

if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
let text = tokenizer.decode(tokens: lastTokens)

Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
continuation.yield(text)

if schema.injectIntoContext {
Task { @MainActor in
context.append(assistantOutput: text)
}
}
}

return .more
processRemainingTokens(result: result, modelContext: modelContext, continuation: continuation)
return result
}

// Yielding every Nth token may result in missing the final tokens.
let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
let lastTokens = Array(result.tokens.suffix(reaminingTokens))
let text = tokenizer.decode(tokens: lastTokens)
Self.logger.debug(
"""
SpeziLLMLocal:
Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
"""
)

await MainActor.run {
continuation.finish()
state = .ready
}
} catch {
await handleError("Generation ended with error: \(error)", error: .generationError, continuation: continuation)
return
}
}

private func prepareModelInput(messages: [[String: String]], modelContainer: ModelContainer) async throws -> LMInput {
try await modelContainer.perform { modelContext in
if let chatTemplate = self.schema.parameters.chatTemplate {
let tokens = try modelContext.tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
return LMInput(text: .init(tokens: MLXArray(tokens)))
} else {
return try await modelContext.processor.prepare(input: .init(messages: messages))
}
}
}

private func processTokens(
_ tokens: [Int],
modelContext: ModelContext,
continuation: AsyncThrowingStream<String, any Error>.Continuation
) -> GenerateDisposition {
if Task.isCancelled {
return .stop
}

if tokens.count >= self.schema.parameters.maxOutputLength {
Self.logger.debug("SpeziLLMLocal: Max output length exceeded.")
return .stop
}

if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) {
let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens))
let text = modelContext.tokenizer.decode(tokens: lastTokens)

Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)")
continuation.yield(text)

if schema.injectIntoContext {
Task { @MainActor in
context.append(assistantOutput: text)
context.completeAssistantStreaming()
}
}

return result
}

Self.logger.debug(
"""
SpeziLLMLocal:
Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public)
Generation tokens per second: \(result.tokensPerSecond, privacy: .public)
"""
)
return .more
}

private func processRemainingTokens(
result: GenerateResult,
modelContext: ModelContext,
continuation: AsyncThrowingStream<String, any Error>.Continuation
) {
// Yielding every Nth token may result in missing the final tokens.
let remainingTokens = result.tokens.count % schema.parameters.displayEveryNTokens
let lastTokens = Array(result.tokens.suffix(remainingTokens))
let text = modelContext.tokenizer.decode(tokens: lastTokens)
continuation.yield(text)

await MainActor.run {
continuation.finish()
state = .ready
if schema.injectIntoContext {
Task { @MainActor in
context.append(assistantOutput: text)
context.completeAssistantStreaming()
}
}
}

private func handleError(_ message: String, error: LLMLocalError, continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
Self.logger.error("SpeziLLMLocal: \(message)")
await finishGenerationWithError(error, on: continuation)
}

private func _mockGenerate(continuation: AsyncThrowingStream<String, any Error>.Continuation) async {
let tokens = [
"Mock ", "Message ", "from ", "SpeziLLM! ",
7 changes: 4 additions & 3 deletions Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift
@@ -9,6 +9,7 @@
import Foundation
import Hub
import MLXLLM
import MLXLMCommon


extension LLMLocalSession {
@@ -46,10 +47,10 @@ extension LLMLocalSession {
}

do {
let modelContainer = try await loadModelContainer(configuration: self.schema.configuration)
let modelContainer = try await LLMModelFactory.shared.loadContainer(configuration: self.schema.configuration)

let numParams = await modelContainer.perform { [] model, _ in
model.numParameters()
let numParams = await modelContainer.perform { modelContext in
modelContext.model.numParameters()
}

await MainActor.run {
3 changes: 2 additions & 1 deletion Sources/SpeziLLMLocal/LLMLocalSession.swift
@@ -10,6 +10,7 @@
import Foundation
import MLX
import MLXLLM
import MLXLMCommon
import MLXRandom
import os
import SpeziChat
@@ -83,7 +84,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable {
@MainActor public var customContext: [[String: String]] = []

@MainActor public var numParameters: Int?
@MainActor public var modelConfiguration: ModelConfiguration?
@MainActor public var modelConfiguration: ModelRegistry?
@MainActor public var modelContainer: ModelContainer?


2 changes: 1 addition & 1 deletion Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md
@@ -29,7 +29,7 @@ You need to add the SpeziLLM Swift package to
> Important: Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that.
> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project.
> Important: To use the LLM local target, some LLMs require adding the *Increase Memory Limit* entitlement to the project.
## Spezi LLM Local Components

