4 changes: 2 additions & 2 deletions Directory.Packages.props
@@ -24,7 +24,7 @@
<PackageVersion Include="Microsoft.Extensions.Hosting.Abstractions" Version="8.0.1" />
<PackageVersion Include="Microsoft.Extensions.Http" Version="9.0.3" />
<PackageVersion Include="Microsoft.Extensions.Logging" Version="8.0.0" />
<PackageVersion Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.5.2" />
<PackageVersion Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" />
<PackageVersion Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.3" />
<PackageVersion Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.5.2" />
<PackageVersion Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.5.2" />
@@ -34,7 +34,7 @@
<PackageVersion Include="OllamaSharp" Version="5.1.7" />
<PackageVersion Include="PdfPig" Version="0.1.10" />
<PackageVersion Include="Polly.Core" Version="8.5.2" />
<PackageVersion Include="RabbitMQ.Client" Version="7.0.0" />
<PackageVersion Include="RabbitMQ.Client" Version="7.1.2" />
<PackageVersion Include="ReadLine" Version="2.0.1" />
<PackageVersion Include="Swashbuckle.AspNetCore" Version="8.0.0" />
<PackageVersion Include="System.Linq.Async" Version="6.0.1" />
6 changes: 6 additions & 0 deletions KernelMemory.sln
@@ -224,6 +224,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "InteractiveSetup", "tools\I
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "testapps", "testapps", "{AEF463F6-F813-498C-830A-3B4CED6DC4A7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "213-onnx", "examples\213-onnx\213-onnx.csproj", "{E7ECB0D7-A4AA-4529-B191-3FDFE8674784}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -530,6 +532,9 @@ Global
{D6BC74A5-41C7-4A60-9C2E-F246DC40145A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D6BC74A5-41C7-4A60-9C2E-F246DC40145A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D6BC74A5-41C7-4A60-9C2E-F246DC40145A}.Release|Any CPU.Build.0 = Release|Any CPU
{E7ECB0D7-A4AA-4529-B191-3FDFE8674784}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E7ECB0D7-A4AA-4529-B191-3FDFE8674784}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E7ECB0D7-A4AA-4529-B191-3FDFE8674784}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -630,6 +635,7 @@ Global
{82670921-FDCD-4672-84BD-4353F5AC24A0} = {AEF463F6-F813-498C-830A-3B4CED6DC4A7}
{CCA96699-483E-4B2A-95DF-25F0C98E3BB6} = {AEF463F6-F813-498C-830A-3B4CED6DC4A7}
{AEF463F6-F813-498C-830A-3B4CED6DC4A7} = {5E7DD43D-B5E7-4827-B57D-447E5B428589}
{E7ECB0D7-A4AA-4529-B191-3FDFE8674784} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}
2 changes: 1 addition & 1 deletion examples/212-dotnet-ollama/Program.cs
@@ -48,7 +48,7 @@ public static async Task Main()
// Import some text
await memory.ImportTextAsync("Today is October 32nd, 2476");

// Generate an answer - This uses OpenAI for embeddings and finding relevant data, and LM Studio to generate an answer
// Generate an answer
var answer = await memory.AskAsync("What's the current date (don't check for validity)?");
Console.WriteLine("-------------------");
Console.WriteLine(answer.Question);
13 changes: 13 additions & 0 deletions examples/213-onnx/213-onnx.csproj
@@ -0,0 +1,13 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<ImplicitUsings>enable</ImplicitUsings>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\extensions\KM\KernelMemory\KernelMemory.csproj" />
</ItemGroup>

</Project>
53 changes: 53 additions & 0 deletions examples/213-onnx/Program.cs
@@ -0,0 +1,53 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory;

/* This example shows how to use KM with ONNX models
*
* 1. Download the phi-4 ONNX model from https://huggingface.co/microsoft/phi-4-onnx
*
* 2. Edit appsettings.json (or appsettings.Development.json) and set the model path.
*
* 3. Run the code
*/
public static class Program
{
public static async Task Main()
{
var onnxCfg = new OnnxConfig();
var azureOpenAIEmbeddingConfig = new AzureOpenAIConfig();

new ConfigurationBuilder()
.AddJsonFile("appsettings.json")
.AddJsonFile("appsettings.development.json", optional: true)
.AddJsonFile("appsettings.Development.json", optional: true)
.Build()
.BindSection("KernelMemory:Services:Onnx", onnxCfg)
.BindSection("KernelMemory:Services:AzureOpenAIEmbedding", azureOpenAIEmbeddingConfig);

var memory = new KernelMemoryBuilder()
.WithOnnxTextGeneration(onnxCfg)
.WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig)
.Configure(builder => builder.Services.AddLogging(l =>
{
l.SetMinimumLevel(LogLevel.Warning);
l.AddSimpleConsole(c => c.SingleLine = true);
}))
.Build();

// Import some text
await memory.ImportTextAsync("Yesterday was October 21st, 2476");
await memory.ImportTextAsync("Tomorrow will be October 23rd, 2476");

// Generate an answer
var answer = await memory.AskAsync("What's the current date?");
Console.WriteLine(answer.Result);

/*

-- Output using phi-4-onnx:

Based on the provided information, if yesterday was October 21st, 2476, then today is October 22nd, 2476.
*/
}
}
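The example binds its settings through Kernel Memory's BindSection helper, shown above. For reference, a minimal sketch of the equivalent plain .NET binding, assuming the Microsoft.Extensions.Configuration.Binder package is available; this is an illustration, not part of the PR:

using Microsoft.Extensions.Configuration;
using Microsoft.KernelMemory;

// Minimal sketch: bind the same configuration section without KM's BindSection helper.
var config = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json")
    .Build();

// Get<T>() comes from Microsoft.Extensions.Configuration.Binder (assumed referenced).
var onnxCfg = config.GetSection("KernelMemory:Services:Onnx").Get<OnnxConfig>()
              ?? new OnnxConfig();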
11 changes: 11 additions & 0 deletions examples/213-onnx/Properties/launchSettings.json
@@ -0,0 +1,11 @@
{
"profiles": {
"console": {
"commandName": "Project",
"launchBrowser": false,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
}
}
}
}
46 changes: 46 additions & 0 deletions examples/213-onnx/appsettings.json
@@ -0,0 +1,46 @@
{
"KernelMemory": {
"Services": {
"Onnx": {
// Source: https://huggingface.co/microsoft/phi-4-onnx/tree/main
"TextModelDir": "/tmp/onnx/phi-4-onnx",
"MaxTokens": 16384
},
"AzureOpenAIEmbedding": {
// "ApiKey" or "AzureIdentity"
// AzureIdentity: use automatic Entra (AAD) authentication mechanism.
// You can test locally using the AZURE_TENANT_ID, AZURE_CLIENT_ID, AZURE_CLIENT_SECRET env vars.
"Auth": "AzureIdentity",
// Optional when Auth == AzureIdentity. Leave it null to use the default audience,
// or set it to change the client audience.
"AzureIdentityAudience": null,
"Endpoint": "https://<...>.openai.azure.com/",
"APIKey": "",
// Your Azure Deployment name
"Deployment": "",
// The max number of tokens supported by the deployed model
// See https://learn.microsoft.com/azure/ai-services/openai/concepts/models
"MaxTokenTotal": 8191,
// Which tokenizer to use to correctly measure the size of chunks.
// Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure.
// - Use p50k for the old text-davinci-003 models
// - Use cl100k for the old gpt-3.5 and gpt-4 family, and for text embedding models
// - Use o200k for the most recent gpt-4o family
"Tokenizer": "cl100k",
// The number of dimensions output embeddings should have.
// Only supported in "text-embedding-3" and later models developed with
// MRL, see https://arxiv.org/abs/2205.13147
"EmbeddingDimensions": null,
// How many embeddings to calculate in parallel. The max value depends on
// the model and deployment in use.
// See https://learn.microsoft.com/azure/ai-services/openai/reference#embeddings
"MaxEmbeddingBatchSize": 1,
// How many times to retry in case of throttling.
"MaxRetries": 10,
// Thumbprints of certificates that should be trusted for HTTPS requests when SSL policy errors are detected.
// This should only be used for local development when using a proxy to call the OpenAI endpoints.
"TrustedCertificateThumbprints": []
}
}
}
}
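The Tokenizer setting above only affects how Kernel Memory measures chunk sizes before embedding. A minimal sketch of that measurement, assuming the Microsoft.ML.Tokenizers package and its TiktokenTokenizer API; the snippet is illustrative and not part of this PR:

using Microsoft.ML.Tokenizers;

// Minimal sketch: count tokens with cl100k_base, the encoding selected by "Tokenizer": "cl100k".
// TiktokenTokenizer.CreateForEncoding is assumed available in Microsoft.ML.Tokenizers.
var tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
int tokens = tokenizer.CountTokens("Yesterday was October 21st, 2476");
Console.WriteLine($"{tokens} tokens"); // chunks must stay under MaxTokenTotal (8191)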
1 change: 1 addition & 0 deletions examples/README.md
@@ -21,6 +21,7 @@ Some examples about how to use Kernel Memory.
8. Local models and external connectors
* [Using custom LLMs](104-dotnet-custom-LLM)
* [Using local LLMs with Ollama](212-dotnet-ollama)
* [Using local LLMs with ONNX models](213-onnx)
* [Using local LLMs with llama.cpp via LlamaSharp](105-dotnet-serverless-llamasharp)
* [Using local models with LM Studio](208-dotnet-lmstudio)
* [Using Semantic Kernel LLM connectors](107-dotnet-SemanticKernel-TextCompletion)
1 change: 1 addition & 0 deletions extensions/Chunkers/Chunkers.UnitTests/doc2.md
@@ -450,6 +450,7 @@ Examples and Tools
8. Local models and external connectors
* [Using custom LLMs](examples/104-dotnet-custom-LLM)
* [Using local LLMs with Ollama](examples/212-dotnet-ollama)
* [Using local LLMs with ONNX models](examples/213-onnx)
* [Using local LLMs with llama.cpp via LlamaSharp](examples/105-dotnet-serverless-llamasharp)
* [Using local models with LM Studio](examples/208-dotnet-lmstudio)
* [Using Semantic Kernel LLM connectors](examples/107-dotnet-SemanticKernel-TextCompletion)
88 changes: 42 additions & 46 deletions extensions/ONNX/Onnx/OnnxTextGenerator.cs
@@ -33,7 +33,7 @@ public sealed class OnnxTextGenerator : ITextGenerator, IDisposable
/// Tokenizer used with the Onnx Generator and Model classes to produce tokens.
/// This has the potential to contain a null value, depending on the contents of the Model Directory.
/// </summary>
private readonly Tokenizer? _tokenizer = default;
private readonly Tokenizer _tokenizer;

/// <summary>
/// Tokenizer used for GetTokens() and CountTokens()
@@ -84,15 +84,55 @@ public OnnxTextGenerator(
this._log.LogDebug("Onnx model loaded");
}

/// <inheritdoc/>
public int CountTokens(string text)
{
// TODO: Implement with _tokenizer and remove _textTokenizer
return this._textTokenizer.CountTokens(text);
}

/// <inheritdoc/>
public IReadOnlyList<string> GetTokens(string text)
{
// TODO: Implement with _tokenizer and remove _textTokenizer
return this._textTokenizer.GetTokens(text);
}

/// <inheritdoc/>
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions? options = null,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var tokens = this._tokenizer?.Encode(prompt);
// TODO: the prompt format should be configurable
using var sequences = this._tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>");

using var generatorParams = new GeneratorParams(this._model);
this.SetGeneratorParams(generatorParams, options);

using var tokenizerStream = this._tokenizer.CreateStream();
using var generator = new Generator(this._model, generatorParams);
generator.AppendTokenSequences(sequences);

while (!generator.IsDone())
{
generator.GenerateNextToken();
var x = tokenizerStream.Decode(generator.GetSequence(0)[^1]);
yield return new GeneratedTextContent(x);
}

await Task.CompletedTask.ConfigureAwait(false);
}

/// <inheritdoc/>
public void Dispose()
{
this._model.Dispose();
this._tokenizer.Dispose();
}

private void SetGeneratorParams(GeneratorParams generatorParams, TextGenerationOptions? options)
{
generatorParams.SetSearchOption("max_length", this.MaxTokenTotal);
generatorParams.SetSearchOption("min_length", this._config.MinLength);
generatorParams.SetSearchOption("num_return_sequences", this._config.ResultsPerPrompt);
@@ -145,49 +185,5 @@ public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(

break;
}

generatorParams.SetInputSequences(tokens);

using (var generator = new Generator(this._model, generatorParams))
{
List<int> outputTokens = [];

while (!generator.IsDone() && cancellationToken.IsCancellationRequested == false)
{
generator.ComputeLogits();
generator.GenerateNextToken();

outputTokens.AddRange(generator.GetSequence(0));

if (outputTokens.Count > 0 && this._tokenizer != null)
{
var newToken = outputTokens[^1];
yield return this._tokenizer.Decode([newToken]);
}
}
}

await Task.CompletedTask.ConfigureAwait(false);
}

/// <inheritdoc/>
public int CountTokens(string text)
{
// TODO: Implement with _tokenizer and remove _textTokenizer
return this._textTokenizer.CountTokens(text);
}

/// <inheritdoc/>
public IReadOnlyList<string> GetTokens(string text)
{
// TODO: Implement with _tokenizer and remove _textTokenizer
return this._textTokenizer.GetTokens(text);
}

/// <inheritdoc/>
public void Dispose()
{
this._model?.Dispose();
this._tokenizer?.Dispose();
}
}
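For reference, the 0.6.0 generation loop introduced above, condensed into a standalone sketch: AppendTokenSequences replaces the removed SetInputSequences call, and the separate ComputeLogits step is gone. The model path and prompt template are placeholders taken from this example's configuration; this is an illustration, not the PR's code verbatim:

using Microsoft.ML.OnnxRuntimeGenAI;

// Minimal sketch of the OnnxRuntimeGenAI 0.6.0 streaming loop.
using var model = new Model("/tmp/onnx/phi-4-onnx"); // placeholder path
using var tokenizer = new Tokenizer(model);
using var sequences = tokenizer.Encode("<|user|>What's the current date?<|end|><|assistant|>");

using var generatorParams = new GeneratorParams(model);
generatorParams.SetSearchOption("max_length", 16384);

using var generator = new Generator(model, generatorParams);
generator.AppendTokenSequences(sequences);

using var stream = tokenizer.CreateStream();
while (!generator.IsDone())
{
    generator.GenerateNextToken();
    // Decode only the newest token, enabling incremental streaming output.
    Console.Write(stream.Decode(generator.GetSequence(0)[^1]));
}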