chore: initialize recovered claude workspace
This commit is contained in:
106
src/services/voiceKeyterms.ts
Normal file
106
src/services/voiceKeyterms.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
// Voice keyterms for improving STT accuracy in the voice_stream endpoint.
|
||||
//
|
||||
// Provides domain-specific vocabulary hints (Deepgram "keywords") so the STT
|
||||
// engine correctly recognises coding terminology, project names, and branch
|
||||
// names that would otherwise be misheard.
|
||||
|
||||
import { basename } from 'path'
|
||||
import { getProjectRoot } from '../bootstrap/state.js'
|
||||
import { getBranch } from '../utils/git.js'
|
||||
|
||||
// ─── Global keyterms ────────────────────────────────────────────────
|
||||
|
||||
const GLOBAL_KEYTERMS: readonly string[] = [
|
||||
// Terms Deepgram consistently mangles without keyword hints.
|
||||
// Note: "Claude" and "Anthropic" are already server-side base keyterms.
|
||||
// Avoid terms nobody speaks aloud as-spelled (stdout → "standard out").
|
||||
'MCP',
|
||||
'symlink',
|
||||
'grep',
|
||||
'regex',
|
||||
'localhost',
|
||||
'codebase',
|
||||
'TypeScript',
|
||||
'JSON',
|
||||
'OAuth',
|
||||
'webhook',
|
||||
'gRPC',
|
||||
'dotfiles',
|
||||
'subagent',
|
||||
'worktree',
|
||||
]
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Split an identifier (camelCase, PascalCase, kebab-case, snake_case, or
|
||||
* path segments) into individual words. Fragments of 2 chars or fewer are
|
||||
* discarded to avoid noise.
|
||||
*/
|
||||
export function splitIdentifier(name: string): string[] {
|
||||
return name
|
||||
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
||||
.split(/[-_./\s]+/)
|
||||
.map(w => w.trim())
|
||||
.filter(w => w.length > 2 && w.length <= 20)
|
||||
}
|
||||
|
||||
function fileNameWords(filePath: string): string[] {
|
||||
const stem = basename(filePath).replace(/\.[^.]+$/, '')
|
||||
return splitIdentifier(stem)
|
||||
}
|
||||
|
||||
// ─── Public API ─────────────────────────────────────────────────────
|
||||
|
||||
const MAX_KEYTERMS = 50
|
||||
|
||||
/**
|
||||
* Build a list of keyterms for the voice_stream STT endpoint.
|
||||
*
|
||||
* Combines hardcoded global coding terms with session context (project name,
|
||||
* git branch, recent files) without any model calls.
|
||||
*/
|
||||
export async function getVoiceKeyterms(
|
||||
recentFiles?: ReadonlySet<string>,
|
||||
): Promise<string[]> {
|
||||
const terms = new Set<string>(GLOBAL_KEYTERMS)
|
||||
|
||||
// Project root basename as a single term — users say "claude CLI internal"
|
||||
// as a phrase, not isolated words. Keeping the whole basename lets the
|
||||
// STT's keyterm boosting match the phrase regardless of separator.
|
||||
try {
|
||||
const projectRoot = getProjectRoot()
|
||||
if (projectRoot) {
|
||||
const name = basename(projectRoot)
|
||||
if (name.length > 2 && name.length <= 50) {
|
||||
terms.add(name)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// getProjectRoot() may throw if not initialised yet — ignore
|
||||
}
|
||||
|
||||
// Git branch words (e.g. "feat/voice-keyterms" → "feat", "voice", "keyterms")
|
||||
try {
|
||||
const branch = await getBranch()
|
||||
if (branch) {
|
||||
for (const word of splitIdentifier(branch)) {
|
||||
terms.add(word)
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// getBranch() may fail if not in a git repo — ignore
|
||||
}
|
||||
|
||||
// Recent file names — only scan enough to fill remaining slots
|
||||
if (recentFiles) {
|
||||
for (const filePath of recentFiles) {
|
||||
if (terms.size >= MAX_KEYTERMS) break
|
||||
for (const word of fileNameWords(filePath)) {
|
||||
terms.add(word)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...terms].slice(0, MAX_KEYTERMS)
|
||||
}
|
||||
Reference in New Issue
Block a user