Merged
54 changes: 54 additions & 0 deletions CLAUDE.md
@@ -10,6 +10,7 @@ This repo ships skills that are installed globally via `npx hyperframes skills`
| ------------------------ | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **hyperframes-compose** | `/hyperframes-compose` | Creating ANY HTML composition — videos, animations, title cards, overlays. Contains required HTML structure, `class="clip"` rules, GSAP timeline patterns, and rendering constraints. |
| **hyperframes-captions** | `/hyperframes-captions` | Any task involving text synced to audio: captions, subtitles, lyrics, lyric videos, karaoke. Also covers transcription strategy (whisper model selection, transcript format). |
| **hyperframes-tts** | `/hyperframes-tts` | Generating speech from text: narration, voiceovers, text-to-speech. Voice selection, speed control, and combining TTS output with compositions. |
| **marker-highlight** | `/marker-highlight` | Animated text highlighting — marker sweeps, hand-drawn circles, burst lines, scribble, sketchout. Use with captions for dynamic emphasis. |

### GSAP Skills (from [greensock/gsap-skills](https://github.com/greensock/gsap-skills))
@@ -32,6 +33,7 @@ The skills encode HyperFrames-specific patterns (e.g., required `class="clip"` o
- When creating or modifying HTML compositions → invoke `/hyperframes-compose` BEFORE writing any code
- When adding captions, subtitles, lyrics, or any text synced to audio → invoke `/hyperframes-captions` BEFORE writing any code
- When transcribing audio or choosing a whisper model → invoke `/hyperframes-captions` BEFORE running any transcription tool
- When generating speech from text (narration, voiceover, TTS) → invoke `/hyperframes-tts` BEFORE running any TTS command
- When creating a video from audio (music video, lyric video, audio visualizer with text) → invoke BOTH `/hyperframes-compose` AND `/hyperframes-captions`
- When writing GSAP animations → invoke `/gsap-core` and `/gsap-timeline` BEFORE writing any code
- When optimizing animation performance → invoke `/gsap-performance` BEFORE making changes
@@ -80,6 +82,15 @@ bunx oxfmt --check <files> # Format (check only, used by pre-commit hook)

Always run both on changed files before committing. The lefthook pre-commit hook runs `bunx oxlint` and `bunx oxfmt --check` automatically.

### Adding CLI Commands

When adding a new CLI command:

1. Define the command in `packages/cli/src/commands/<name>.ts` using `defineCommand` from citty
2. Register it in `packages/cli/src/cli.ts` under `subCommands` (lazy-loaded)
3. **Add examples to `packages/cli/src/help.ts`** in the `COMMAND_EXAMPLES` record — every command must have `--help` examples
4. Validate by running `npx tsx packages/cli/src/cli.ts <name> --help` and verifying the examples section appears

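The registration pattern in step 2 can be sketched as follows — a self-contained stand-in for the dynamic-import thunks in `packages/cli/src/cli.ts` (the inline loaders here replace the real `import("./commands/<name>.js")` calls so the sketch runs on its own):

```typescript
// Minimal sketch of the lazy-loading pattern: each sub-command is a thunk
// that only loads the command's module when that command is invoked.
type Command = { run: () => string };
type CommandLoader = () => Promise<Command>;

const subCommands: Record<string, CommandLoader> = {
  // Inline stand-ins for `() => import("./commands/<name>.js")`.
  tts: async () => ({ run: () => "tts invoked" }),
  transcribe: async () => ({ run: () => "transcribe invoked" }),
};

async function dispatch(name: string): Promise<string> {
  const loader = subCommands[name];
  if (!loader) throw new Error(`Unknown command: ${name}`);
  const cmd = await loader(); // the command module loads only here
  return cmd.run();
}
```

Because the thunks are only called inside `dispatch`, startup cost stays flat as commands are added — the same reason the real CLI registers commands lazily.
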
## Key Concepts

- **Compositions** are HTML files with `data-*` attributes defining timeline, tracks, and media
@@ -131,3 +142,46 @@ If captions are inaccurate (wrong words, bad timing):
3. **Use an external API**: Transcribe via OpenAI or Groq Whisper API, then import the JSON with `hyperframes transcribe response.json`

See the `/hyperframes-captions` skill for full details on model selection and API usage.

## Text-to-Speech

Generate speech audio locally using Kokoro-82M (no API key, runs on CPU). Useful for adding voiceovers to compositions.

### Quick reference

```bash
# Generate speech from text
npx hyperframes tts "Welcome to HyperFrames"

# Choose a voice and output path
npx hyperframes tts "Hello world" --voice am_adam --output narration.wav

# Read text from a file
npx hyperframes tts script.txt --voice bf_emma

# Adjust speech speed
npx hyperframes tts "Fast narration" --speed 1.2

# List available voices
npx hyperframes tts --list
```

### Voices

The default voice is `af_heart`. The model ships 54 voices across 8 languages; a curated English subset:

| Voice ID | Name | Language | Gender |
| ------------ | ------- | -------- | ------ |
| `af_heart` | Heart | en-US | Female |
| `af_nova` | Nova | en-US | Female |
| `am_adam` | Adam | en-US | Male |
| `am_michael` | Michael | en-US | Male |
| `bf_emma` | Emma | en-GB | Female |
| `bm_george` | George | en-GB | Male |

Use `npx hyperframes tts --list` for the full set, or pass any valid Kokoro voice ID. IDs follow a prefix convention: the first letter marks the accent (`a` American, `b` British English), the second the gender (`f` female, `m` male).

### Requirements

- Python 3.8+ (auto-installs `kokoro-onnx` package on first run)
- Model downloads automatically on first use (~311 MB model + ~27 MB voices, cached in `~/.cache/hyperframes/tts/`)
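
The cache layout above can be sketched with a small helper (hypothetical — the real resolution lives in `packages/cli/src/tts/manager.ts`, with separate `models/` and `voices/` subdirectories):

```typescript
import { homedir } from "node:os";
import { join } from "node:path";

// Mirrors the cache location described above: ~/.cache/hyperframes/tts/
const CACHE_DIR = join(homedir(), ".cache", "hyperframes", "tts");

// Hypothetical helper: where a given cached artifact would live.
function cachePathFor(kind: "models" | "voices", file: string): string {
  return join(CACHE_DIR, kind, file);
}

const modelPath = cachePathFor("models", "kokoro-v1.0.onnx");
const voicesPath = cachePathFor("voices", "voices-v1.0.bin");
```

On first run the CLI downloads into these paths; subsequent runs find the files already cached and skip the download.
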
2 changes: 1 addition & 1 deletion packages/cli/package.json
@@ -21,7 +21,7 @@
"build:fonts": "cd ../producer && tsx scripts/generate-font-data.ts",
"build:studio": "cd ../studio && bun run build",
"build:runtime": "tsx scripts/build-runtime.ts",
"build:copy": "mkdir -p dist/studio dist/docs dist/templates dist/skills && cp -r ../studio/dist/* dist/studio/ && cp -r src/templates/blank src/templates/_shared dist/templates/ && cp -r ../../skills/hyperframes-compose ../../skills/hyperframes-captions dist/skills/ && (cp src/docs/*.md dist/docs/ 2>/dev/null || true)",
"build:copy": "mkdir -p dist/studio dist/docs dist/templates dist/skills && cp -r ../studio/dist/* dist/studio/ && cp -r src/templates/blank src/templates/_shared dist/templates/ && cp -r ../../skills/hyperframes-compose ../../skills/hyperframes-captions ../../skills/hyperframes-tts dist/skills/ && (cp src/docs/*.md dist/docs/ 2>/dev/null || true)",
"typecheck": "tsc --noEmit"
},
"dependencies": {
1 change: 1 addition & 0 deletions packages/cli/src/cli.ts
@@ -33,6 +33,7 @@ const subCommands = {
benchmark: () => import("./commands/benchmark.js").then((m) => m.default),
browser: () => import("./commands/browser.js").then((m) => m.default),
transcribe: () => import("./commands/transcribe.js").then((m) => m.default),
tts: () => import("./commands/tts.js").then((m) => m.default),
docs: () => import("./commands/docs.js").then((m) => m.default),
doctor: () => import("./commands/doctor.js").then((m) => m.default),
upgrade: () => import("./commands/upgrade.js").then((m) => m.default),
152 changes: 152 additions & 0 deletions packages/cli/src/commands/tts.ts
@@ -0,0 +1,152 @@
import { defineCommand } from "citty";
import { existsSync, readFileSync } from "node:fs";
import { resolve, extname } from "node:path";
import * as clack from "@clack/prompts";
import { c } from "../ui/colors.js";
import { DEFAULT_VOICE, BUNDLED_VOICES } from "../tts/manager.js";

const voiceList = BUNDLED_VOICES.map((v) => `${v.id} (${v.label})`).join(", ");

export default defineCommand({
meta: {
name: "tts",
description: "Generate speech audio from text using a local AI model (Kokoro-82M)",
},
args: {
input: {
type: "positional",
description: "Text to speak, or path to a .txt file",
required: false,
},
output: {
type: "string",
description: "Output file path (default: speech.wav in current directory)",
alias: "o",
},
voice: {
type: "string",
description: `Voice ID (default: ${DEFAULT_VOICE}). Options: ${voiceList}`,
alias: "v",
},
speed: {
type: "string",
description: "Speech speed multiplier (default: 1.0)",
alias: "s",
},
list: {
type: "boolean",
description: "List available voices and exit",
default: false,
},
json: {
type: "boolean",
description: "Output result as JSON",
default: false,
},
},
async run({ args }) {
// ── List voices mode ──────────────────────────────────────────────
if (args.list) {
return listVoices(args.json);
}

// ── Resolve input text ────────────────────────────────────────────
if (!args.input) {
console.error(c.error("Provide text to speak, or use --list to see available voices."));
process.exit(1);
}

let text: string;
const maybeFile = resolve(args.input);

if (existsSync(maybeFile) && extname(maybeFile).toLowerCase() === ".txt") {
text = readFileSync(maybeFile, "utf-8").trim();
if (!text) {
console.error(c.error("File is empty."));
process.exit(1);
}
} else {
text = args.input;
}

if (!text.trim()) {
console.error(c.error("No text provided."));
process.exit(1);
}

// ── Resolve output path ───────────────────────────────────────────
const output = resolve(args.output ?? "speech.wav");
const voice = args.voice ?? DEFAULT_VOICE;
const speed = args.speed ? parseFloat(args.speed) : 1.0;

if (isNaN(speed) || speed < 0.1 || speed > 3) {
console.error(c.error("Speed must be a number between 0.1 and 3.0"));
process.exit(1);
}

// ── Synthesize ────────────────────────────────────────────────────
const { synthesize } = await import("../tts/synthesize.js");
const spin = args.json ? null : clack.spinner();
spin?.start(`Generating speech with ${c.accent(voice)}...`);

try {
const result = await synthesize(text, output, {
voice,
speed,
onProgress: spin ? (msg) => spin.message(msg) : undefined,
});

if (args.json) {
console.log(
JSON.stringify({
ok: true,
voice,
speed,
durationSeconds: result.durationSeconds,
outputPath: result.outputPath,
}),
);
} else {
spin?.stop(
c.success(
`Generated ${c.accent(result.durationSeconds.toFixed(1) + "s")} of speech → ${c.accent(result.outputPath)}`,
),
);
}
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
if (args.json) {
console.log(JSON.stringify({ ok: false, error: message }));
} else {
spin?.stop(c.error(`Speech synthesis failed: ${message}`));
}
process.exit(1);
}
},
});

// ---------------------------------------------------------------------------
// List voices
// ---------------------------------------------------------------------------

function listVoices(json: boolean): void {
if (json) {
console.log(JSON.stringify(BUNDLED_VOICES));
return;
}

console.log(`\n${c.bold("Available voices")} (Kokoro-82M)\n`);
console.log(
` ${c.dim("ID")} ${c.dim("Name")} ${c.dim("Language")} ${c.dim("Gender")}`,
);
console.log(` ${c.dim("─".repeat(60))}`);
for (const v of BUNDLED_VOICES) {
const id = v.id.padEnd(18);
const label = v.label.padEnd(13);
const lang = v.language.padEnd(10);
console.log(` ${c.accent(id)} ${label} ${lang} ${v.gender}`);
}
console.log(
`\n ${c.dim("Use any Kokoro voice ID — see https://github.com/thewh1teagle/kokoro-onnx for all 54 voices")}\n`,
);
}
8 changes: 8 additions & 0 deletions packages/cli/src/help.ts
@@ -124,6 +124,14 @@ const COMMAND_EXAMPLES: Record<string, Example[]> = {
["Install to Cursor (project-level)", "hyperframes skills --cursor"],
["Install to specific tools", "hyperframes skills --claude --gemini"],
],
tts: [
["Generate speech from text", 'hyperframes tts "Welcome to HyperFrames"'],
["Choose a voice", 'hyperframes tts "Hello world" --voice am_adam'],
["Save to a specific file", 'hyperframes tts "Intro" --voice bf_emma --output narration.wav'],
["Adjust speech speed", 'hyperframes tts "Slow and clear" --speed 0.8'],
["Read text from a file", "hyperframes tts script.txt"],
["List available voices", "hyperframes tts --list"],
],
transcribe: [
["Transcribe an audio file", "hyperframes transcribe audio.mp3"],
["Transcribe a video file", "hyperframes transcribe video.mp4"],
99 changes: 99 additions & 0 deletions packages/cli/src/tts/manager.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { existsSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
import { downloadFile } from "../utils/download.js";

const CACHE_DIR = join(homedir(), ".cache", "hyperframes", "tts");
const MODELS_DIR = join(CACHE_DIR, "models");
const VOICES_DIR = join(CACHE_DIR, "voices");

const DEFAULT_MODEL = "kokoro-v1.0";

const MODEL_URLS: Record<string, string> = {
"kokoro-v1.0":
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx",
};

const VOICES_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";

// ---------------------------------------------------------------------------
// Voices — Kokoro ships 54 voices across 8 languages. We expose a curated
// default set and allow users to specify any valid Kokoro voice ID.
// ---------------------------------------------------------------------------

export interface VoiceInfo {
id: string;
label: string;
language: string;
gender: "female" | "male";
}

export const BUNDLED_VOICES: VoiceInfo[] = [
{ id: "af_heart", label: "Heart", language: "en-US", gender: "female" },
{ id: "af_nova", label: "Nova", language: "en-US", gender: "female" },
{ id: "af_sky", label: "Sky", language: "en-US", gender: "female" },
{ id: "am_adam", label: "Adam", language: "en-US", gender: "male" },
{ id: "am_michael", label: "Michael", language: "en-US", gender: "male" },
{ id: "bf_emma", label: "Emma", language: "en-GB", gender: "female" },
{ id: "bf_isabella", label: "Isabella", language: "en-GB", gender: "female" },
{ id: "bm_george", label: "George", language: "en-GB", gender: "male" },
];

export const DEFAULT_VOICE = "af_heart";

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
* Ensure the Kokoro ONNX model is downloaded and cached.
* Returns the path to the .onnx model file.
*/
export async function ensureModel(
model: string = DEFAULT_MODEL,
options?: { onProgress?: (message: string) => void },
): Promise<string> {
const modelPath = join(MODELS_DIR, `${model}.onnx`);
if (existsSync(modelPath)) return modelPath;

const url = MODEL_URLS[model];
if (!url) {
throw new Error(
`Unknown TTS model: ${model}. Available: ${Object.keys(MODEL_URLS).join(", ")}`,
);
}

mkdirSync(MODELS_DIR, { recursive: true });
options?.onProgress?.(`Downloading TTS model ${model} (~311 MB)...`);
await downloadFile(url, modelPath);

if (!existsSync(modelPath)) {
throw new Error(`Model download failed: ${model}`);
}

return modelPath;
}

/**
* Ensure the Kokoro voices bundle is downloaded and cached.
* Returns the path to the voices .bin file.
*/
export async function ensureVoices(options?: {
onProgress?: (message: string) => void;
}): Promise<string> {
const voicesPath = join(VOICES_DIR, "voices-v1.0.bin");
if (existsSync(voicesPath)) return voicesPath;

mkdirSync(VOICES_DIR, { recursive: true });
options?.onProgress?.("Downloading voice data (~27 MB)...");
await downloadFile(VOICES_URL, voicesPath);

if (!existsSync(voicesPath)) {
throw new Error("Voice data download failed");
}

return voicesPath;
}

export { MODELS_DIR, VOICES_DIR, DEFAULT_MODEL };