From f455c8ef4f4a2c91b86e0b6652f043649ac1b1c7 Mon Sep 17 00:00:00 2001 From: Stum Huang Date: Thu, 23 Apr 2026 09:15:49 +0800 Subject: [PATCH 1/3] fix(embedding): prevent pipeline() hang in Node/Bun runtime Two independent issues caused pipeline("feature-extraction", ...) to hang indefinitely (35s+) on first call, poisoning initPromise so every subsequent embed() blocked forever. Symptom: web UI blank, /api/search returned "Empty reply from server". 1. ONNX WASM threading deadlock @huggingface/transformers v4 defaults wasm.numThreads > 1, but Node.js and Bun lack SharedArrayBuffer support, so onnxruntime-web deadlocks during pipeline init. Fixed by forcing numThreads=1 in ensureTransformersLoaded(). Ref xenova/transformers.js#488. 2. dtype default mismatch transformers v4 default dtype tries to load model.onnx (fp32, ~500MB). The cached model directory only ships model_quantized.onnx, so pipeline falls back to a network fetch from huggingface.co. In restricted networks this fails with "Unable to connect". Fixed by passing dtype: "q8" to the pipeline() options so the local quantized model is used unconditionally. After both fixes, pipeline ready in ~2.3s and /api/search returns real results (similarity 0.457, vecLen=768). --- src/services/embedding.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/services/embedding.ts b/src/services/embedding.ts index 8473c09..f99c906 100644 --- a/src/services/embedding.ts +++ b/src/services/embedding.ts @@ -17,6 +17,14 @@ async function ensureTransformersLoaded(): Promise<typeof import("@huggingface/transformers")> { + // Node.js and Bun lack SharedArrayBuffer support, so onnxruntime-web + // deadlocks during pipeline init when wasm.numThreads > 1. 
+ // See https://github.com/xenova/transformers.js/pull/488 + try { + (mod.env as any).backends.onnx.wasm.numThreads = 1; + } catch (e) { + log("Failed to set wasm.numThreads", { error: String(e) }); + } _transformers = mod; return _transformers!; } @@ -58,7 +66,11 @@ export class EmbeddingService { const { pipeline } = await ensureTransformersLoaded(); this.pipe = await pipeline("feature-extraction", CONFIG.embeddingModel, { progress_callback: progressCallback, - }); + // Force quantized ONNX. Default is fp32 model.onnx which transformers v4 + // tries to download from huggingface.co; cache only ships model_quantized.onnx + // and HF is unreachable behind GFW, causing init to fail. + dtype: "q8", + } as any); this.isWarmedUp = true; } catch (error) { this.initPromise = null; From 20748bec021e9ba744a6105023e1564c5dfab1ff Mon Sep 17 00:00:00 2001 From: Stum Huang Date: Thu, 23 Apr 2026 09:16:18 +0800 Subject: [PATCH 2/3] fix(api): remove embedding warmup from read-only handlers handleListTags, handleListMemories, and handleStats each awaited embeddingService.warmup() before serving. These handlers only read SQLite/sqlite-vec rows and never compute query embeddings, so the coupling was unnecessary. When warmup() stalled (or simply took a few seconds on cold start), the entire web UI went blank because every read endpoint blocked behind the embedding model load. Removed the warmup() calls from the three read paths. handleSearch still warms up because it needs the query vector. Net effect: /api/stats, /api/tags, and /api/memories now respond immediately even when the embedding model has not been loaded yet, so the dashboard can render before the first search query. 
--- src/services/api-handlers.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/services/api-handlers.ts b/src/services/api-handlers.ts index dc70681..e177bd2 100644 --- a/src/services/api-handlers.ts +++ b/src/services/api-handlers.ts @@ -100,7 +100,10 @@ function getProjectPathFromTag(tag: string): string | undefined { export async function handleListTags(): Promise> { try { - await embeddingService.warmup(); + // Tags are stored as SQLite metadata; embedding model is not needed. + // Calling warmup() here would block on @huggingface/transformers init in + // the worker thread and hang every read API. Only handlers that compute + // similarity (e.g. handleSearch) should warm up the embedding service. const projectShards = shardManager.getAllShards("project", ""); const tagsMap = new Map(); for (const shard of projectShards) { @@ -140,7 +143,8 @@ export async function handleListMemories( includePrompts: boolean = true ): Promise>> { try { - await embeddingService.warmup(); + // Listing only reads SQLite rows; no vector ops happen here. + // See handleListTags comment - keep embedding init out of read paths. let allMemories: any[] = []; if (tag) { const { scope: tagScope, hash } = extractScopeFromTag(tag); @@ -652,7 +656,8 @@ export async function handleStats(): Promise< }> > { try { - await embeddingService.warmup(); + // Stats only counts SQLite rows; no embedding needed. + // See handleListTags comment - keep embedding init out of read paths. 
const projectShards = shardManager.getAllShards("project", ""); let userCount = 0, projectCount = 0; From 9e50c2658699f9b6488221f35dcd440e10005265 Mon Sep 17 00:00:00 2001 From: Stum Huang Date: Thu, 23 Apr 2026 09:34:58 +0800 Subject: [PATCH 3/3] fix(embedding): replace 'as any' with PretrainedModelOptions type Per Copilot review on PR #100: the 'as any' cast on pipeline() options silently dropped compile-time validation of the dtype key, which is the exact protection that prevents an unwanted fp32 model.onnx download. Use the official PretrainedModelOptions type so any future typo in dtype or other option keys fails at tsc time instead of at runtime. --- src/services/embedding.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/services/embedding.ts b/src/services/embedding.ts index f99c906..a63a2d0 100644 --- a/src/services/embedding.ts +++ b/src/services/embedding.ts @@ -1,6 +1,7 @@ import { CONFIG } from "../config.js"; import { log } from "./logger.js"; import { join } from "node:path"; +import type { PretrainedModelOptions } from "@huggingface/transformers"; const TIMEOUT_MS = 30000; const GLOBAL_EMBEDDING_KEY = Symbol.for("opencode-mem.embedding.instance"); @@ -64,13 +65,14 @@ export class EmbeddingService { return; } const { pipeline } = await ensureTransformersLoaded(); - this.pipe = await pipeline("feature-extraction", CONFIG.embeddingModel, { + const pipelineOptions: PretrainedModelOptions = { progress_callback: progressCallback, // Force quantized ONNX. Default is fp32 model.onnx which transformers v4 // tries to download from huggingface.co; cache only ships model_quantized.onnx // and HF is unreachable behind GFW, causing init to fail. dtype: "q8", - } as any); + }; + this.pipe = await pipeline("feature-extraction", CONFIG.embeddingModel, pipelineOptions); this.isWarmedUp = true; } catch (error) { this.initPromise = null;