chore: initialize recovered claude workspace
This commit is contained in:
491
src/utils/telemetry/betaSessionTracing.ts
Normal file
491
src/utils/telemetry/betaSessionTracing.ts
Normal file
@@ -0,0 +1,491 @@
|
||||
/**
|
||||
* Beta Session Tracing for Claude Code
|
||||
*
|
||||
* This module contains beta tracing features enabled when
|
||||
* ENABLE_BETA_TRACING_DETAILED=1 and BETA_TRACING_ENDPOINT are set.
|
||||
*
|
||||
* For external users, tracing is enabled in SDK/headless mode, or in
|
||||
* interactive mode when the org is allowlisted via the
|
||||
* tengu_trace_lantern GrowthBook gate.
|
||||
* For ant users, tracing is enabled in all modes.
|
||||
*
|
||||
* Visibility Rules:
|
||||
* | Content | External | Ant |
|
||||
* |------------------|----------|------|
|
||||
* | System prompts | ✅ | ✅ |
|
||||
* | Model output | ✅ | ✅ |
|
||||
* | Thinking output | ❌ | ✅ |
|
||||
* | Tools | ✅ | ✅ |
|
||||
* | new_context | ✅ | ✅ |
|
||||
*
|
||||
* Features:
|
||||
* - Per-agent message tracking with hash-based deduplication
|
||||
* - System prompt logging (once per unique hash)
|
||||
* - Hook execution spans
|
||||
* - Detailed new_context attributes for LLM requests
|
||||
*/
|
||||
|
||||
import type { Span } from '@opentelemetry/api'
|
||||
import { createHash } from 'crypto'
|
||||
import { getIsNonInteractiveSession } from '../../bootstrap/state.js'
|
||||
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
|
||||
import { sanitizeToolNameForAnalytics } from '../../services/analytics/metadata.js'
|
||||
import type { AssistantMessage, UserMessage } from '../../types/message.js'
|
||||
import { isEnvTruthy } from '../envUtils.js'
|
||||
import { jsonParse, jsonStringify } from '../slowOperations.js'
|
||||
import { logOTelEvent } from './events.js'
|
||||
|
||||
// Message type for API calls (UserMessage or AssistantMessage)
|
||||
type APIMessage = UserMessage | AssistantMessage
|
||||
|
||||
/**
|
||||
* Track hashes we've already logged this session (system prompts, tools, etc).
|
||||
*
|
||||
* WHY: System prompts and tool schemas are large and rarely change within a session.
|
||||
* Sending full content on every request would be wasteful. Instead, we hash and
|
||||
* only log the full content once per unique hash.
|
||||
*/
|
||||
const seenHashes = new Set<string>()
|
||||
|
||||
/**
|
||||
* Track the last reported message hash per querySource (agent) for incremental context.
|
||||
*
|
||||
* WHY: When debugging traces, we want to see what NEW information was added each turn,
|
||||
* not the entire conversation history (which can be huge). By tracking the last message
|
||||
* we reported per agent, we can compute and send only the delta (new messages since
|
||||
* the last request). This is tracked per-agent (querySource) because different agents
|
||||
* (main thread, subagents, warmup requests) have independent conversation contexts.
|
||||
*/
|
||||
const lastReportedMessageHash = new Map<string, string>()
|
||||
|
||||
/**
|
||||
* Clear tracking state after compaction.
|
||||
* Old hashes are irrelevant once messages have been replaced.
|
||||
*/
|
||||
export function clearBetaTracingState(): void {
|
||||
seenHashes.clear()
|
||||
lastReportedMessageHash.clear()
|
||||
}
|
||||
|
||||
const MAX_CONTENT_SIZE = 60 * 1024 // 60KB (Honeycomb limit is 64KB, staying safe)
|
||||
|
||||
/**
|
||||
* Check if beta detailed tracing is enabled.
|
||||
* - Requires ENABLE_BETA_TRACING_DETAILED=1 and BETA_TRACING_ENDPOINT
|
||||
* - For external users, enabled in SDK/headless mode OR when org is
|
||||
* allowlisted via the tengu_trace_lantern GrowthBook gate
|
||||
*/
|
||||
export function isBetaTracingEnabled(): boolean {
|
||||
const baseEnabled =
|
||||
isEnvTruthy(process.env.ENABLE_BETA_TRACING_DETAILED) &&
|
||||
Boolean(process.env.BETA_TRACING_ENDPOINT)
|
||||
|
||||
if (!baseEnabled) {
|
||||
return false
|
||||
}
|
||||
|
||||
// For external users, enable in SDK/headless mode OR when org is allowlisted.
|
||||
// Gate reads from disk cache, so first run after allowlisting returns false;
|
||||
// works from second run onward (same behavior as enhanced_telemetry_beta).
|
||||
if (process.env.USER_TYPE !== 'ant') {
|
||||
return (
|
||||
getIsNonInteractiveSession() ||
|
||||
getFeatureValue_CACHED_MAY_BE_STALE('tengu_trace_lantern', false)
|
||||
)
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate content to fit within Honeycomb limits.
|
||||
*/
|
||||
export function truncateContent(
|
||||
content: string,
|
||||
maxSize: number = MAX_CONTENT_SIZE,
|
||||
): { content: string; truncated: boolean } {
|
||||
if (content.length <= maxSize) {
|
||||
return { content, truncated: false }
|
||||
}
|
||||
|
||||
return {
|
||||
content:
|
||||
content.slice(0, maxSize) +
|
||||
'\n\n[TRUNCATED - Content exceeds 60KB limit]',
|
||||
truncated: true,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a short hash (first 12 hex chars of SHA-256).
|
||||
*/
|
||||
function shortHash(content: string): string {
|
||||
return createHash('sha256').update(content).digest('hex').slice(0, 12)
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a hash for a system prompt.
|
||||
*/
|
||||
function hashSystemPrompt(systemPrompt: string): string {
|
||||
return `sp_${shortHash(systemPrompt)}`
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a hash for a message based on its content.
|
||||
*/
|
||||
function hashMessage(message: APIMessage): string {
|
||||
const content = jsonStringify(message.message.content)
|
||||
return `msg_${shortHash(content)}`
|
||||
}
|
||||
|
||||
// Regex to detect content wrapped in <system-reminder> tags
|
||||
const SYSTEM_REMINDER_REGEX =
|
||||
/^<system-reminder>\n?([\s\S]*?)\n?<\/system-reminder>$/
|
||||
|
||||
/**
|
||||
* Check if text is entirely a system reminder (wrapped in <system-reminder> tags).
|
||||
* Returns the inner content if it is, null otherwise.
|
||||
*/
|
||||
function extractSystemReminderContent(text: string): string | null {
|
||||
const match = text.trim().match(SYSTEM_REMINDER_REGEX)
|
||||
return match && match[1] ? match[1].trim() : null
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of formatting messages - separates regular content from system reminders.
|
||||
*/
|
||||
interface FormattedMessages {
|
||||
contextParts: string[]
|
||||
systemReminders: string[]
|
||||
}
|
||||
|
||||
/**
|
||||
* Format user messages for new_context display, separating system reminders.
|
||||
* Only handles user messages (assistant messages are filtered out before this is called).
|
||||
*/
|
||||
function formatMessagesForContext(messages: UserMessage[]): FormattedMessages {
|
||||
const contextParts: string[] = []
|
||||
const systemReminders: string[] = []
|
||||
|
||||
for (const message of messages) {
|
||||
const content = message.message.content
|
||||
if (typeof content === 'string') {
|
||||
const reminderContent = extractSystemReminderContent(content)
|
||||
if (reminderContent) {
|
||||
systemReminders.push(reminderContent)
|
||||
} else {
|
||||
contextParts.push(`[USER]\n${content}`)
|
||||
}
|
||||
} else if (Array.isArray(content)) {
|
||||
for (const block of content) {
|
||||
if (block.type === 'text') {
|
||||
const reminderContent = extractSystemReminderContent(block.text)
|
||||
if (reminderContent) {
|
||||
systemReminders.push(reminderContent)
|
||||
} else {
|
||||
contextParts.push(`[USER]\n${block.text}`)
|
||||
}
|
||||
} else if (block.type === 'tool_result') {
|
||||
const resultContent =
|
||||
typeof block.content === 'string'
|
||||
? block.content
|
||||
: jsonStringify(block.content)
|
||||
// Tool results can also contain system reminders (e.g., malware warning)
|
||||
const reminderContent = extractSystemReminderContent(resultContent)
|
||||
if (reminderContent) {
|
||||
systemReminders.push(reminderContent)
|
||||
} else {
|
||||
contextParts.push(
|
||||
`[TOOL RESULT: ${block.tool_use_id}]\n${resultContent}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { contextParts, systemReminders }
|
||||
}
|
||||
|
||||
export interface LLMRequestNewContext {
|
||||
/** System prompt (typically only on first request or if changed) */
|
||||
systemPrompt?: string
|
||||
/** Query source identifying the agent/purpose (e.g., 'repl_main_thread', 'agent:builtin') */
|
||||
querySource?: string
|
||||
/** Tool schemas sent with the request */
|
||||
tools?: string
|
||||
}
|
||||
|
||||
/**
|
||||
* Add beta attributes to an interaction span.
|
||||
* Adds new_context with the user prompt.
|
||||
*/
|
||||
export function addBetaInteractionAttributes(
|
||||
span: Span,
|
||||
userPrompt: string,
|
||||
): void {
|
||||
if (!isBetaTracingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const { content: truncatedPrompt, truncated } = truncateContent(
|
||||
`[USER PROMPT]\n${userPrompt}`,
|
||||
)
|
||||
span.setAttributes({
|
||||
new_context: truncatedPrompt,
|
||||
...(truncated && {
|
||||
new_context_truncated: true,
|
||||
new_context_original_length: userPrompt.length,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Add beta attributes to an LLM request span.
|
||||
* Handles system prompt logging and new_context computation.
|
||||
*/
|
||||
export function addBetaLLMRequestAttributes(
  span: Span,
  newContext?: LLMRequestNewContext,
  messagesForAPI?: APIMessage[],
): void {
  // All beta attributes are gated on the env/gate check; cheap early exit.
  if (!isBetaTracingEnabled()) {
    return
  }

  // Add system prompt info to the span.
  // The span always gets hash + preview + length; the FULL prompt is only
  // emitted as a separate event once per unique hash (dedup via seenHashes).
  if (newContext?.systemPrompt) {
    const promptHash = hashSystemPrompt(newContext.systemPrompt)
    const preview = newContext.systemPrompt.slice(0, 500)

    // Always add hash, preview, and length to the span
    span.setAttribute('system_prompt_hash', promptHash)
    span.setAttribute('system_prompt_preview', preview)
    span.setAttribute('system_prompt_length', newContext.systemPrompt.length)

    // Log the full system prompt only once per unique hash this session
    if (!seenHashes.has(promptHash)) {
      seenHashes.add(promptHash)

      // Truncate for the log if needed
      const { content: truncatedPrompt, truncated } = truncateContent(
        newContext.systemPrompt,
      )

      // Fire-and-forget; event delivery must not block the request path.
      void logOTelEvent('system_prompt', {
        system_prompt_hash: promptHash,
        system_prompt: truncatedPrompt,
        system_prompt_length: String(newContext.systemPrompt.length),
        ...(truncated && { system_prompt_truncated: 'true' }),
      })
    }
  }

  // Add tools info to the span.
  // newContext.tools is a JSON string; the span only carries {name, hash}
  // pairs, while each tool's full schema is logged once per unique hash.
  if (newContext?.tools) {
    try {
      const toolsArray = jsonParse(newContext.tools) as Record<
        string,
        unknown
      >[]

      // Build array of {name, hash} for each tool
      const toolsWithHashes = toolsArray.map(tool => {
        const toolJson = jsonStringify(tool)
        const toolHash = shortHash(toolJson)
        return {
          name: typeof tool.name === 'string' ? tool.name : 'unknown',
          hash: toolHash,
          json: toolJson,
        }
      })

      // Set span attribute with array of name/hash pairs
      span.setAttribute(
        'tools',
        jsonStringify(
          toolsWithHashes.map(({ name, hash }) => ({ name, hash })),
        ),
      )
      span.setAttribute('tools_count', toolsWithHashes.length)

      // Log each tool's full description once per unique hash
      for (const { name, hash, json } of toolsWithHashes) {
        if (!seenHashes.has(`tool_${hash}`)) {
          seenHashes.add(`tool_${hash}`)

          const { content: truncatedTool, truncated } = truncateContent(json)

          void logOTelEvent('tool', {
            tool_name: sanitizeToolNameForAnalytics(name),
            tool_hash: hash,
            tool: truncatedTool,
            ...(truncated && { tool_truncated: 'true' }),
          })
        }
      }
    } catch {
      // If parsing fails, log the raw tools string
      // (only the flag goes on the span; the raw string itself is not attached).
      span.setAttribute('tools_parse_error', true)
    }
  }

  // Add new_context using hash-based tracking (visible to all users).
  // Only the DELTA since the last reported message for this querySource is
  // sent, so traces show what each turn added rather than the full history.
  if (messagesForAPI && messagesForAPI.length > 0 && newContext?.querySource) {
    const querySource = newContext.querySource
    const lastHash = lastReportedMessageHash.get(querySource)

    // Find where the last reported message is in the array
    let startIndex = 0
    if (lastHash) {
      for (let i = 0; i < messagesForAPI.length; i++) {
        const msg = messagesForAPI[i]
        if (msg && hashMessage(msg) === lastHash) {
          startIndex = i + 1 // Start after the last reported message
          break
        }
      }
      // If lastHash not found, startIndex stays 0 (send everything)
    }

    // Get new messages (filter out assistant messages - we only want user input/tool results)
    const newMessages = messagesForAPI
      .slice(startIndex)
      .filter((m): m is UserMessage => m.type === 'user')

    // NOTE(review): when every new message is an assistant message,
    // lastReportedMessageHash is NOT advanced here; those messages are
    // re-scanned (and re-filtered) on the next request. Harmless but worth
    // confirming it's intentional.
    if (newMessages.length > 0) {
      // Format new messages, separating system reminders from regular content
      const { contextParts, systemReminders } =
        formatMessagesForContext(newMessages)

      // Set new_context (regular user content and tool results)
      if (contextParts.length > 0) {
        const fullContext = contextParts.join('\n\n---\n\n')
        const { content: truncatedContext, truncated } =
          truncateContent(fullContext)

        span.setAttributes({
          new_context: truncatedContext,
          new_context_message_count: newMessages.length,
          ...(truncated && {
            new_context_truncated: true,
            new_context_original_length: fullContext.length,
          }),
        })
      }

      // Set system_reminders as a separate attribute
      if (systemReminders.length > 0) {
        const fullReminders = systemReminders.join('\n\n---\n\n')
        const { content: truncatedReminders, truncated: remindersTruncated } =
          truncateContent(fullReminders)

        span.setAttributes({
          system_reminders: truncatedReminders,
          system_reminders_count: systemReminders.length,
          ...(remindersTruncated && {
            system_reminders_truncated: true,
            system_reminders_original_length: fullReminders.length,
          }),
        })
      }

      // Update last reported hash to the last message in the array
      // (the last overall message, not the last user message, so the next
      // delta starts after everything already sent to the API).
      const lastMessage = messagesForAPI[messagesForAPI.length - 1]
      if (lastMessage) {
        lastReportedMessageHash.set(querySource, hashMessage(lastMessage))
      }
    }
  }
}
|
||||
|
||||
/**
|
||||
* Add beta attributes to endLLMRequestSpan.
|
||||
* Handles model_output and thinking_output truncation.
|
||||
*/
|
||||
export function addBetaLLMResponseAttributes(
|
||||
endAttributes: Record<string, string | number | boolean>,
|
||||
metadata?: {
|
||||
modelOutput?: string
|
||||
thinkingOutput?: string
|
||||
},
|
||||
): void {
|
||||
if (!isBetaTracingEnabled() || !metadata) {
|
||||
return
|
||||
}
|
||||
|
||||
// Add model_output (text content) - visible to all users
|
||||
if (metadata.modelOutput !== undefined) {
|
||||
const { content: modelOutput, truncated: outputTruncated } =
|
||||
truncateContent(metadata.modelOutput)
|
||||
endAttributes['response.model_output'] = modelOutput
|
||||
if (outputTruncated) {
|
||||
endAttributes['response.model_output_truncated'] = true
|
||||
endAttributes['response.model_output_original_length'] =
|
||||
metadata.modelOutput.length
|
||||
}
|
||||
}
|
||||
|
||||
// Add thinking_output - ant-only
|
||||
if (
|
||||
process.env.USER_TYPE === 'ant' &&
|
||||
metadata.thinkingOutput !== undefined
|
||||
) {
|
||||
const { content: thinkingOutput, truncated: thinkingTruncated } =
|
||||
truncateContent(metadata.thinkingOutput)
|
||||
endAttributes['response.thinking_output'] = thinkingOutput
|
||||
if (thinkingTruncated) {
|
||||
endAttributes['response.thinking_output_truncated'] = true
|
||||
endAttributes['response.thinking_output_original_length'] =
|
||||
metadata.thinkingOutput.length
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add beta attributes to startToolSpan.
|
||||
* Adds tool_input with the serialized tool input.
|
||||
*/
|
||||
export function addBetaToolInputAttributes(
|
||||
span: Span,
|
||||
toolName: string,
|
||||
toolInput: string,
|
||||
): void {
|
||||
if (!isBetaTracingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const { content: truncatedInput, truncated } = truncateContent(
|
||||
`[TOOL INPUT: ${toolName}]\n${toolInput}`,
|
||||
)
|
||||
span.setAttributes({
|
||||
tool_input: truncatedInput,
|
||||
...(truncated && {
|
||||
tool_input_truncated: true,
|
||||
tool_input_original_length: toolInput.length,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Add beta attributes to endToolSpan.
|
||||
* Adds new_context with the tool result.
|
||||
*/
|
||||
export function addBetaToolResultAttributes(
|
||||
endAttributes: Record<string, string | number | boolean>,
|
||||
toolName: string | number | boolean,
|
||||
toolResult: string,
|
||||
): void {
|
||||
if (!isBetaTracingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const { content: truncatedResult, truncated } = truncateContent(
|
||||
`[TOOL RESULT: ${toolName}]\n${toolResult}`,
|
||||
)
|
||||
endAttributes['new_context'] = truncatedResult
|
||||
if (truncated) {
|
||||
endAttributes['new_context_truncated'] = true
|
||||
endAttributes['new_context_original_length'] = toolResult.length
|
||||
}
|
||||
}
|
||||
252
src/utils/telemetry/bigqueryExporter.ts
Normal file
252
src/utils/telemetry/bigqueryExporter.ts
Normal file
@@ -0,0 +1,252 @@
|
||||
import type { Attributes, HrTime } from '@opentelemetry/api'
|
||||
import { type ExportResult, ExportResultCode } from '@opentelemetry/core'
|
||||
import {
|
||||
AggregationTemporality,
|
||||
type MetricData,
|
||||
type DataPoint as OTelDataPoint,
|
||||
type PushMetricExporter,
|
||||
type ResourceMetrics,
|
||||
} from '@opentelemetry/sdk-metrics'
|
||||
import axios from 'axios'
|
||||
import { checkMetricsEnabled } from 'src/services/api/metricsOptOut.js'
|
||||
import { getIsNonInteractiveSession } from '../../bootstrap/state.js'
|
||||
import { getSubscriptionType, isClaudeAISubscriber } from '../auth.js'
|
||||
import { checkHasTrustDialogAccepted } from '../config.js'
|
||||
import { logForDebugging } from '../debug.js'
|
||||
import { errorMessage, toError } from '../errors.js'
|
||||
import { getAuthHeaders } from '../http.js'
|
||||
import { logError } from '../log.js'
|
||||
import { jsonStringify } from '../slowOperations.js'
|
||||
import { getClaudeCodeUserAgent } from '../userAgent.js'
|
||||
|
||||
type DataPoint = {
|
||||
attributes: Record<string, string>
|
||||
value: number
|
||||
timestamp: string
|
||||
}
|
||||
|
||||
type Metric = {
|
||||
name: string
|
||||
description?: string
|
||||
unit?: string
|
||||
data_points: DataPoint[]
|
||||
}
|
||||
|
||||
type InternalMetricsPayload = {
|
||||
resource_attributes: Record<string, string>
|
||||
metrics: Metric[]
|
||||
}
|
||||
|
||||
/**
 * OpenTelemetry PushMetricExporter that ships metrics to Anthropic's
 * internal claude_code/metrics endpoint (BigQuery-backed pipeline).
 *
 * Export is best-effort: trust-dialog and org-level opt-out checks cause a
 * silent SUCCESS (metrics dropped), while auth/network failures report
 * FAILED through the OTel result callback.
 */
export class BigQueryMetricsExporter implements PushMetricExporter {
  // Resolved once in the constructor; never changes afterwards.
  private readonly endpoint: string
  // Per-request HTTP timeout in milliseconds.
  private readonly timeout: number
  // In-flight export promises, tracked so forceFlush/shutdown can await them.
  private pendingExports: Promise<void>[] = []
  // Once set, all further export() calls fail fast.
  private isShutdown = false

  /**
   * @param options.timeout - HTTP timeout in ms (default 5000).
   * Ant users may override the endpoint host via
   * ANT_CLAUDE_CODE_METRICS_ENDPOINT; everyone else uses api.anthropic.com.
   */
  constructor(options: { timeout?: number } = {}) {
    const defaultEndpoint = 'https://api.anthropic.com/api/claude_code/metrics'

    if (
      process.env.USER_TYPE === 'ant' &&
      process.env.ANT_CLAUDE_CODE_METRICS_ENDPOINT
    ) {
      this.endpoint =
        process.env.ANT_CLAUDE_CODE_METRICS_ENDPOINT +
        '/api/claude_code/metrics'
    } else {
      this.endpoint = defaultEndpoint
    }

    // NOTE: `||` (not `??`), so an explicit timeout of 0 falls back to 5000.
    this.timeout = options.timeout || 5000
  }

  /**
   * PushMetricExporter entry point. Kicks off an async export and returns
   * immediately; the outcome is delivered via `resultCallback` from within
   * doExport. The in-flight promise is tracked for forceFlush/shutdown.
   */
  async export(
    metrics: ResourceMetrics,
    resultCallback: (result: ExportResult) => void,
  ): Promise<void> {
    if (this.isShutdown) {
      resultCallback({
        code: ExportResultCode.FAILED,
        error: new Error('Exporter has been shutdown'),
      })
      return
    }

    const exportPromise = this.doExport(metrics, resultCallback)
    this.pendingExports.push(exportPromise)

    // Clean up completed exports
    void exportPromise.finally(() => {
      const index = this.pendingExports.indexOf(exportPromise)
      if (index > -1) {
        void this.pendingExports.splice(index, 1)
      }
    })
  }

  /**
   * Performs one export: gate checks (trust, org opt-out), payload
   * transform, auth header resolution, then the HTTP POST. Always invokes
   * `resultCallback` exactly once. Gate-check skips report SUCCESS so the
   * SDK does not retry/buffer dropped metrics.
   */
  private async doExport(
    metrics: ResourceMetrics,
    resultCallback: (result: ExportResult) => void,
  ): Promise<void> {
    try {
      // Skip if trust not established in interactive mode
      // This prevents triggering apiKeyHelper before trust dialog
      const hasTrust =
        checkHasTrustDialogAccepted() || getIsNonInteractiveSession()
      if (!hasTrust) {
        logForDebugging(
          'BigQuery metrics export: trust not established, skipping',
        )
        resultCallback({ code: ExportResultCode.SUCCESS })
        return
      }

      // Check organization-level metrics opt-out
      const metricsStatus = await checkMetricsEnabled()
      if (!metricsStatus.enabled) {
        logForDebugging('Metrics export disabled by organization setting')
        resultCallback({ code: ExportResultCode.SUCCESS })
        return
      }

      const payload = this.transformMetricsForInternal(metrics)

      const authResult = getAuthHeaders()
      if (authResult.error) {
        logForDebugging(`Metrics export failed: ${authResult.error}`)
        resultCallback({
          code: ExportResultCode.FAILED,
          error: new Error(authResult.error),
        })
        return
      }

      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
        'User-Agent': getClaudeCodeUserAgent(),
        ...authResult.headers,
      }

      const response = await axios.post(this.endpoint, payload, {
        timeout: this.timeout,
        headers,
      })

      logForDebugging('BigQuery metrics exported successfully')
      logForDebugging(
        `BigQuery API Response: ${jsonStringify(response.data, null, 2)}`,
      )
      resultCallback({ code: ExportResultCode.SUCCESS })
    } catch (error) {
      logForDebugging(`BigQuery metrics export failed: ${errorMessage(error)}`)
      logError(error)
      resultCallback({
        code: ExportResultCode.FAILED,
        error: toError(error),
      })
    }
  }

  /**
   * Convert OTel ResourceMetrics into the internal payload shape:
   * flattened resource attributes (with defaults) plus one entry per
   * metric with its numeric data points.
   */
  private transformMetricsForInternal(
    metrics: ResourceMetrics,
  ): InternalMetricsPayload {
    const attrs = metrics.resource.attributes

    const resourceAttributes: Record<string, string> = {
      'service.name': (attrs['service.name'] as string) || 'claude-code',
      'service.version': (attrs['service.version'] as string) || 'unknown',
      'os.type': (attrs['os.type'] as string) || 'unknown',
      'os.version': (attrs['os.version'] as string) || 'unknown',
      'host.arch': (attrs['host.arch'] as string) || 'unknown',
      'aggregation.temporality':
        this.selectAggregationTemporality() === AggregationTemporality.DELTA
          ? 'delta'
          : 'cumulative',
    }

    // Only add wsl.version if it exists (omit instead of default)
    if (attrs['wsl.version']) {
      resourceAttributes['wsl.version'] = attrs['wsl.version'] as string
    }

    // Add customer type and subscription type
    if (isClaudeAISubscriber()) {
      resourceAttributes['user.customer_type'] = 'claude_ai'
      const subscriptionType = getSubscriptionType()
      if (subscriptionType) {
        resourceAttributes['user.subscription_type'] = subscriptionType
      }
    } else {
      resourceAttributes['user.customer_type'] = 'api'
    }

    const transformed = {
      resource_attributes: resourceAttributes,
      metrics: metrics.scopeMetrics.flatMap(scopeMetric =>
        scopeMetric.metrics.map(metric => ({
          name: metric.descriptor.name,
          description: metric.descriptor.description,
          unit: metric.descriptor.unit,
          data_points: this.extractDataPoints(metric),
        })),
      ),
    }

    return transformed
  }

  /**
   * Flatten a metric's data points to the internal DataPoint shape.
   * Non-numeric points (e.g. histograms) are silently dropped by the
   * type-predicate filter.
   */
  private extractDataPoints(metric: MetricData): DataPoint[] {
    const dataPoints = metric.dataPoints || []

    return dataPoints
      .filter(
        (point): point is OTelDataPoint<number> =>
          typeof point.value === 'number',
      )
      .map(point => ({
        attributes: this.convertAttributes(point.attributes),
        value: point.value,
        // Prefer endTime; fall back to startTime, then "now".
        timestamp: this.hrTimeToISOString(
          point.endTime || point.startTime || [Date.now() / 1000, 0],
        ),
      }))
  }

  /** Stop accepting new exports, then wait for in-flight ones to settle. */
  async shutdown(): Promise<void> {
    this.isShutdown = true
    await this.forceFlush()
    logForDebugging('BigQuery metrics exporter shutdown complete')
  }

  /** Wait for all currently in-flight exports to settle. */
  async forceFlush(): Promise<void> {
    await Promise.all(this.pendingExports)
    logForDebugging('BigQuery metrics exporter flush complete')
  }

  /**
   * Stringify OTel attribute values (dropping null/undefined) so the
   * payload is a flat string-to-string map.
   */
  private convertAttributes(
    attributes: Attributes | undefined,
  ): Record<string, string> {
    const result: Record<string, string> = {}
    if (attributes) {
      for (const [key, value] of Object.entries(attributes)) {
        if (value !== undefined && value !== null) {
          result[key] = String(value)
        }
      }
    }
    return result
  }

  /** Convert an OTel HrTime ([seconds, nanoseconds]) to an ISO-8601 string. */
  private hrTimeToISOString(hrTime: HrTime): string {
    const [seconds, nanoseconds] = hrTime
    const date = new Date(seconds * 1000 + nanoseconds / 1000000)
    return date.toISOString()
  }

  selectAggregationTemporality(): AggregationTemporality {
    // DO NOT CHANGE THIS TO CUMULATIVE
    // It would mess up the aggregation of metrics
    // for CC Productivity metrics dashboard
    return AggregationTemporality.DELTA
  }
}
|
||||
75
src/utils/telemetry/events.ts
Normal file
75
src/utils/telemetry/events.ts
Normal file
@@ -0,0 +1,75 @@
|
||||
import type { Attributes } from '@opentelemetry/api'
|
||||
import { getEventLogger, getPromptId } from 'src/bootstrap/state.js'
|
||||
import { logForDebugging } from '../debug.js'
|
||||
import { isEnvTruthy } from '../envUtils.js'
|
||||
import { getTelemetryAttributes } from '../telemetryAttributes.js'
|
||||
|
||||
// Monotonically increasing counter for ordering events within a session
|
||||
let eventSequence = 0
|
||||
|
||||
// Track whether we've already warned about a null event logger to avoid spamming
|
||||
let hasWarnedNoEventLogger = false
|
||||
|
||||
function isUserPromptLoggingEnabled() {
|
||||
return isEnvTruthy(process.env.OTEL_LOG_USER_PROMPTS)
|
||||
}
|
||||
|
||||
export function redactIfDisabled(content: string): string {
|
||||
return isUserPromptLoggingEnabled() ? content : '<REDACTED>'
|
||||
}
|
||||
|
||||
export async function logOTelEvent(
|
||||
eventName: string,
|
||||
metadata: { [key: string]: string | undefined } = {},
|
||||
): Promise<void> {
|
||||
const eventLogger = getEventLogger()
|
||||
if (!eventLogger) {
|
||||
if (!hasWarnedNoEventLogger) {
|
||||
hasWarnedNoEventLogger = true
|
||||
logForDebugging(
|
||||
`[3P telemetry] Event dropped (no event logger initialized): ${eventName}`,
|
||||
{ level: 'warn' },
|
||||
)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Skip logging in test environment
|
||||
if (process.env.NODE_ENV === 'test') {
|
||||
return
|
||||
}
|
||||
|
||||
const attributes: Attributes = {
|
||||
...getTelemetryAttributes(),
|
||||
'event.name': eventName,
|
||||
'event.timestamp': new Date().toISOString(),
|
||||
'event.sequence': eventSequence++,
|
||||
}
|
||||
|
||||
// Add prompt ID to events (but not metrics, where it would cause unbounded cardinality)
|
||||
const promptId = getPromptId()
|
||||
if (promptId) {
|
||||
attributes['prompt.id'] = promptId
|
||||
}
|
||||
|
||||
// Workspace directory from the desktop app (host path). Events only —
|
||||
// filesystem paths are too high-cardinality for metric dimensions, and
|
||||
// the BQ metrics pipeline must never see them.
|
||||
const workspaceDir = process.env.CLAUDE_CODE_WORKSPACE_HOST_PATHS
|
||||
if (workspaceDir) {
|
||||
attributes['workspace.host_paths'] = workspaceDir.split('|')
|
||||
}
|
||||
|
||||
// Add metadata as attributes - all values are already strings
|
||||
for (const [key, value] of Object.entries(metadata)) {
|
||||
if (value !== undefined) {
|
||||
attributes[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
// Emit log record as an event
|
||||
eventLogger.emit({
|
||||
body: `claude_code.${eventName}`,
|
||||
attributes,
|
||||
})
|
||||
}
|
||||
825
src/utils/telemetry/instrumentation.ts
Normal file
825
src/utils/telemetry/instrumentation.ts
Normal file
@@ -0,0 +1,825 @@
|
||||
import { DiagLogLevel, diag, trace } from '@opentelemetry/api'
|
||||
import { logs } from '@opentelemetry/api-logs'
|
||||
// OTLP/Prometheus exporters are dynamically imported inside the protocol
|
||||
// switch statements below. A process uses at most one protocol variant per
|
||||
// signal, but static imports would load all 6 (~1.2MB) on every startup.
|
||||
import {
|
||||
envDetector,
|
||||
hostDetector,
|
||||
osDetector,
|
||||
resourceFromAttributes,
|
||||
} from '@opentelemetry/resources'
|
||||
import {
|
||||
BatchLogRecordProcessor,
|
||||
ConsoleLogRecordExporter,
|
||||
LoggerProvider,
|
||||
} from '@opentelemetry/sdk-logs'
|
||||
import {
|
||||
ConsoleMetricExporter,
|
||||
MeterProvider,
|
||||
PeriodicExportingMetricReader,
|
||||
} from '@opentelemetry/sdk-metrics'
|
||||
import {
|
||||
BasicTracerProvider,
|
||||
BatchSpanProcessor,
|
||||
ConsoleSpanExporter,
|
||||
} from '@opentelemetry/sdk-trace-base'
|
||||
import {
|
||||
ATTR_SERVICE_NAME,
|
||||
ATTR_SERVICE_VERSION,
|
||||
SEMRESATTRS_HOST_ARCH,
|
||||
} from '@opentelemetry/semantic-conventions'
|
||||
import { HttpsProxyAgent } from 'https-proxy-agent'
|
||||
import {
|
||||
getLoggerProvider,
|
||||
getMeterProvider,
|
||||
getTracerProvider,
|
||||
setEventLogger,
|
||||
setLoggerProvider,
|
||||
setMeterProvider,
|
||||
setTracerProvider,
|
||||
} from 'src/bootstrap/state.js'
|
||||
import {
|
||||
getOtelHeadersFromHelper,
|
||||
getSubscriptionType,
|
||||
is1PApiCustomer,
|
||||
isClaudeAISubscriber,
|
||||
} from 'src/utils/auth.js'
|
||||
import { getPlatform, getWslVersion } from 'src/utils/platform.js'
|
||||
|
||||
import { getCACertificates } from '../caCerts.js'
|
||||
import { registerCleanup } from '../cleanupRegistry.js'
|
||||
import { getHasFormattedOutput, logForDebugging } from '../debug.js'
|
||||
import { isEnvTruthy } from '../envUtils.js'
|
||||
import { errorMessage } from '../errors.js'
|
||||
import { getMTLSConfig } from '../mtls.js'
|
||||
import { getProxyUrl, shouldBypassProxy } from '../proxy.js'
|
||||
import { getSettings_DEPRECATED } from '../settings/settings.js'
|
||||
import { jsonStringify } from '../slowOperations.js'
|
||||
import { profileCheckpoint } from '../startupProfiler.js'
|
||||
import { isBetaTracingEnabled } from './betaSessionTracing.js'
|
||||
import { BigQueryMetricsExporter } from './bigqueryExporter.js'
|
||||
import { ClaudeCodeDiagLogger } from './logger.js'
|
||||
import { initializePerfettoTracing } from './perfettoTracing.js'
|
||||
import {
|
||||
endInteractionSpan,
|
||||
isEnhancedTelemetryEnabled,
|
||||
} from './sessionTracing.js'
|
||||
|
||||
const DEFAULT_METRICS_EXPORT_INTERVAL_MS = 60000
|
||||
const DEFAULT_LOGS_EXPORT_INTERVAL_MS = 5000
|
||||
const DEFAULT_TRACES_EXPORT_INTERVAL_MS = 5000
|
||||
|
||||
class TelemetryTimeoutError extends Error {}
|
||||
|
||||
function telemetryTimeout(ms: number, message: string): Promise<never> {
|
||||
return new Promise((_, reject) => {
|
||||
setTimeout(
|
||||
(rej: (e: Error) => void, msg: string) =>
|
||||
rej(new TelemetryTimeoutError(msg)),
|
||||
ms,
|
||||
reject,
|
||||
message,
|
||||
).unref()
|
||||
})
|
||||
}
|
||||
|
||||
export function bootstrapTelemetry() {
|
||||
if (process.env.USER_TYPE === 'ant') {
|
||||
// Read from ANT_ prefixed variables that are defined at build time
|
||||
if (process.env.ANT_OTEL_METRICS_EXPORTER) {
|
||||
process.env.OTEL_METRICS_EXPORTER = process.env.ANT_OTEL_METRICS_EXPORTER
|
||||
}
|
||||
if (process.env.ANT_OTEL_LOGS_EXPORTER) {
|
||||
process.env.OTEL_LOGS_EXPORTER = process.env.ANT_OTEL_LOGS_EXPORTER
|
||||
}
|
||||
if (process.env.ANT_OTEL_TRACES_EXPORTER) {
|
||||
process.env.OTEL_TRACES_EXPORTER = process.env.ANT_OTEL_TRACES_EXPORTER
|
||||
}
|
||||
if (process.env.ANT_OTEL_EXPORTER_OTLP_PROTOCOL) {
|
||||
process.env.OTEL_EXPORTER_OTLP_PROTOCOL =
|
||||
process.env.ANT_OTEL_EXPORTER_OTLP_PROTOCOL
|
||||
}
|
||||
if (process.env.ANT_OTEL_EXPORTER_OTLP_ENDPOINT) {
|
||||
process.env.OTEL_EXPORTER_OTLP_ENDPOINT =
|
||||
process.env.ANT_OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
}
|
||||
if (process.env.ANT_OTEL_EXPORTER_OTLP_HEADERS) {
|
||||
process.env.OTEL_EXPORTER_OTLP_HEADERS =
|
||||
process.env.ANT_OTEL_EXPORTER_OTLP_HEADERS
|
||||
}
|
||||
}
|
||||
|
||||
// Set default tempoality to 'delta' because it's the more sane default
|
||||
if (!process.env.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE) {
|
||||
process.env.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE = 'delta'
|
||||
}
|
||||
}
|
||||
|
||||
// Per OTEL spec, "none" means "no automatically configured exporter for this signal".
|
||||
// https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#exporter-selection
|
||||
export function parseExporterTypes(value: string | undefined): string[] {
|
||||
return (value || '')
|
||||
.trim()
|
||||
.split(',')
|
||||
.filter(Boolean)
|
||||
.map(t => t.trim())
|
||||
.filter(t => t !== 'none')
|
||||
}
|
||||
|
||||
async function getOtlpReaders() {
|
||||
const exporterTypes = parseExporterTypes(process.env.OTEL_METRICS_EXPORTER)
|
||||
const exportInterval = parseInt(
|
||||
process.env.OTEL_METRIC_EXPORT_INTERVAL ||
|
||||
DEFAULT_METRICS_EXPORT_INTERVAL_MS.toString(),
|
||||
)
|
||||
|
||||
const exporters = []
|
||||
for (const exporterType of exporterTypes) {
|
||||
if (exporterType === 'console') {
|
||||
// Custom console exporter that shows resource attributes
|
||||
const consoleExporter = new ConsoleMetricExporter()
|
||||
const originalExport = consoleExporter.export.bind(consoleExporter)
|
||||
|
||||
consoleExporter.export = (metrics, callback) => {
|
||||
// Log resource attributes once at the start
|
||||
if (metrics.resource && metrics.resource.attributes) {
|
||||
// The console exporter is for debugging, so console output is intentional here
|
||||
|
||||
logForDebugging('\n=== Resource Attributes ===')
|
||||
logForDebugging(jsonStringify(metrics.resource.attributes))
|
||||
logForDebugging('===========================\n')
|
||||
}
|
||||
|
||||
return originalExport(metrics, callback)
|
||||
}
|
||||
|
||||
exporters.push(consoleExporter)
|
||||
} else if (exporterType === 'otlp') {
|
||||
const protocol =
|
||||
process.env.OTEL_EXPORTER_OTLP_METRICS_PROTOCOL?.trim() ||
|
||||
process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
|
||||
|
||||
const httpConfig = getOTLPExporterConfig()
|
||||
|
||||
switch (protocol) {
|
||||
case 'grpc': {
|
||||
// Lazy-import to keep @grpc/grpc-js (~700KB) out of the telemetry chunk
|
||||
// when the protocol is http/protobuf (ant default) or http/json.
|
||||
const { OTLPMetricExporter } = await import(
|
||||
'@opentelemetry/exporter-metrics-otlp-grpc'
|
||||
)
|
||||
exporters.push(new OTLPMetricExporter())
|
||||
break
|
||||
}
|
||||
case 'http/json': {
|
||||
const { OTLPMetricExporter } = await import(
|
||||
'@opentelemetry/exporter-metrics-otlp-http'
|
||||
)
|
||||
exporters.push(new OTLPMetricExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
case 'http/protobuf': {
|
||||
const { OTLPMetricExporter } = await import(
|
||||
'@opentelemetry/exporter-metrics-otlp-proto'
|
||||
)
|
||||
exporters.push(new OTLPMetricExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
default:
|
||||
throw new Error(
|
||||
`Unknown protocol set in OTEL_EXPORTER_OTLP_METRICS_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
|
||||
)
|
||||
}
|
||||
} else if (exporterType === 'prometheus') {
|
||||
const { PrometheusExporter } = await import(
|
||||
'@opentelemetry/exporter-prometheus'
|
||||
)
|
||||
exporters.push(new PrometheusExporter())
|
||||
} else {
|
||||
throw new Error(
|
||||
`Unknown exporter type set in OTEL_EXPORTER_OTLP_METRICS_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${exporterType}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return exporters.map(exporter => {
|
||||
if ('export' in exporter) {
|
||||
return new PeriodicExportingMetricReader({
|
||||
exporter,
|
||||
exportIntervalMillis: exportInterval,
|
||||
})
|
||||
}
|
||||
return exporter
|
||||
})
|
||||
}
|
||||
|
||||
async function getOtlpLogExporters() {
|
||||
const exporterTypes = parseExporterTypes(process.env.OTEL_LOGS_EXPORTER)
|
||||
|
||||
const protocol =
|
||||
process.env.OTEL_EXPORTER_OTLP_LOGS_PROTOCOL?.trim() ||
|
||||
process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
|
||||
const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
|
||||
logForDebugging(
|
||||
`[3P telemetry] getOtlpLogExporters: types=${jsonStringify(exporterTypes)}, protocol=${protocol}, endpoint=${endpoint}`,
|
||||
)
|
||||
|
||||
const exporters = []
|
||||
for (const exporterType of exporterTypes) {
|
||||
if (exporterType === 'console') {
|
||||
exporters.push(new ConsoleLogRecordExporter())
|
||||
} else if (exporterType === 'otlp') {
|
||||
const httpConfig = getOTLPExporterConfig()
|
||||
|
||||
switch (protocol) {
|
||||
case 'grpc': {
|
||||
const { OTLPLogExporter } = await import(
|
||||
'@opentelemetry/exporter-logs-otlp-grpc'
|
||||
)
|
||||
exporters.push(new OTLPLogExporter())
|
||||
break
|
||||
}
|
||||
case 'http/json': {
|
||||
const { OTLPLogExporter } = await import(
|
||||
'@opentelemetry/exporter-logs-otlp-http'
|
||||
)
|
||||
exporters.push(new OTLPLogExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
case 'http/protobuf': {
|
||||
const { OTLPLogExporter } = await import(
|
||||
'@opentelemetry/exporter-logs-otlp-proto'
|
||||
)
|
||||
exporters.push(new OTLPLogExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
default:
|
||||
throw new Error(
|
||||
`Unknown protocol set in OTEL_EXPORTER_OTLP_LOGS_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
throw new Error(
|
||||
`Unknown exporter type set in OTEL_LOGS_EXPORTER env var: ${exporterType}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return exporters
|
||||
}
|
||||
|
||||
async function getOtlpTraceExporters() {
|
||||
const exporterTypes = parseExporterTypes(process.env.OTEL_TRACES_EXPORTER)
|
||||
|
||||
const exporters = []
|
||||
for (const exporterType of exporterTypes) {
|
||||
if (exporterType === 'console') {
|
||||
exporters.push(new ConsoleSpanExporter())
|
||||
} else if (exporterType === 'otlp') {
|
||||
const protocol =
|
||||
process.env.OTEL_EXPORTER_OTLP_TRACES_PROTOCOL?.trim() ||
|
||||
process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
|
||||
|
||||
const httpConfig = getOTLPExporterConfig()
|
||||
|
||||
switch (protocol) {
|
||||
case 'grpc': {
|
||||
const { OTLPTraceExporter } = await import(
|
||||
'@opentelemetry/exporter-trace-otlp-grpc'
|
||||
)
|
||||
exporters.push(new OTLPTraceExporter())
|
||||
break
|
||||
}
|
||||
case 'http/json': {
|
||||
const { OTLPTraceExporter } = await import(
|
||||
'@opentelemetry/exporter-trace-otlp-http'
|
||||
)
|
||||
exporters.push(new OTLPTraceExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
case 'http/protobuf': {
|
||||
const { OTLPTraceExporter } = await import(
|
||||
'@opentelemetry/exporter-trace-otlp-proto'
|
||||
)
|
||||
exporters.push(new OTLPTraceExporter(httpConfig))
|
||||
break
|
||||
}
|
||||
default:
|
||||
throw new Error(
|
||||
`Unknown protocol set in OTEL_EXPORTER_OTLP_TRACES_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
throw new Error(
|
||||
`Unknown exporter type set in OTEL_TRACES_EXPORTER env var: ${exporterType}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return exporters
|
||||
}
|
||||
|
||||
export function isTelemetryEnabled() {
|
||||
return isEnvTruthy(process.env.CLAUDE_CODE_ENABLE_TELEMETRY)
|
||||
}
|
||||
|
||||
function getBigQueryExportingReader() {
|
||||
const bigqueryExporter = new BigQueryMetricsExporter()
|
||||
return new PeriodicExportingMetricReader({
|
||||
exporter: bigqueryExporter,
|
||||
exportIntervalMillis: 5 * 60 * 1000, // 5mins for BigQuery metrics exporter to reduce load
|
||||
})
|
||||
}
|
||||
|
||||
function isBigQueryMetricsEnabled() {
|
||||
// BigQuery metrics are enabled for:
|
||||
// 1. API customers (excluding Claude.ai subscribers and Bedrock/Vertex)
|
||||
// 2. Claude for Enterprise (C4E) users
|
||||
// 3. Claude for Teams users
|
||||
const subscriptionType = getSubscriptionType()
|
||||
const isC4EOrTeamUser =
|
||||
isClaudeAISubscriber() &&
|
||||
(subscriptionType === 'enterprise' || subscriptionType === 'team')
|
||||
|
||||
return is1PApiCustomer() || isC4EOrTeamUser
|
||||
}
|
||||
|
||||
/**
 * Initialize beta tracing - a separate code path for detailed debugging.
 * Uses BETA_TRACING_ENDPOINT instead of OTEL_EXPORTER_OTLP_ENDPOINT.
 *
 * No-op when BETA_TRACING_ENDPOINT is unset. Otherwise installs global
 * tracer and logger providers that batch-export to `${endpoint}/v1/traces`
 * and `${endpoint}/v1/logs`, registers the shared event logger, and adds
 * exit-time flush handlers for both signals.
 */
async function initializeBetaTracing(
  resource: ReturnType<typeof resourceFromAttributes>,
): Promise<void> {
  const endpoint = process.env.BETA_TRACING_ENDPOINT
  if (!endpoint) {
    return
  }

  // Lazy-import the OTLP http exporters so they stay off the startup path
  // when beta tracing is not in use.
  const [{ OTLPTraceExporter }, { OTLPLogExporter }] = await Promise.all([
    import('@opentelemetry/exporter-trace-otlp-http'),
    import('@opentelemetry/exporter-logs-otlp-http'),
  ])

  const httpConfig = {
    url: `${endpoint}/v1/traces`,
  }

  const logHttpConfig = {
    url: `${endpoint}/v1/logs`,
  }

  // Initialize trace exporter
  const traceExporter = new OTLPTraceExporter(httpConfig)
  const spanProcessor = new BatchSpanProcessor(traceExporter, {
    scheduledDelayMillis: DEFAULT_TRACES_EXPORT_INTERVAL_MS,
  })

  const tracerProvider = new BasicTracerProvider({
    resource,
    spanProcessors: [spanProcessor],
  })

  // Register both globally (for OTEL API callers) and in bootstrap state
  // (so the shutdown/flush helpers can reach the provider).
  trace.setGlobalTracerProvider(tracerProvider)
  setTracerProvider(tracerProvider)

  // Initialize log exporter
  const logExporter = new OTLPLogExporter(logHttpConfig)
  const loggerProvider = new LoggerProvider({
    resource,
    processors: [
      new BatchLogRecordProcessor(logExporter, {
        scheduledDelayMillis: DEFAULT_LOGS_EXPORT_INTERVAL_MS,
      }),
    ],
  })

  logs.setGlobalLoggerProvider(loggerProvider)
  setLoggerProvider(loggerProvider)

  // Initialize event logger
  const eventLogger = logs.getLogger(
    'com.anthropic.claude_code.events',
    MACRO.VERSION,
  )
  setEventLogger(eventLogger)

  // Setup flush handlers - flush both logs AND traces.
  // 'beforeExit' can await async work; 'exit' cannot, so the second
  // handler is a fire-and-forget best effort.
  process.on('beforeExit', async () => {
    await loggerProvider?.forceFlush()
    await tracerProvider?.forceFlush()
  })

  process.on('exit', () => {
    void loggerProvider?.forceFlush()
    void tracerProvider?.forceFlush()
  })
}
|
||||
|
||||
/**
 * One-time OpenTelemetry bootstrap for the process.
 *
 * Order of operations: env-var defaults (bootstrapTelemetry), console
 * exporter stripping for stream-json mode, diag logger, Perfetto, metric
 * readers (customer OTLP + internal BigQuery), resource detection, then
 * either the beta-tracing path (which returns early) or the standard
 * logs/traces setup. Always registers a shutdown cleanup and returns the
 * shared Meter.
 */
export async function initializeTelemetry() {
  profileCheckpoint('telemetry_init_start')
  bootstrapTelemetry()

  // Console exporters call console.dir on a timer (5s logs/traces, 60s
  // metrics), writing pretty-printed objects to stdout. In stream-json
  // mode stdout is the SDK message channel; the first line (`{`) breaks
  // the SDK's line reader. Stripped here (not main.tsx) because init.ts
  // re-runs applyConfigEnvironmentVariables() inside initializeTelemetry-
  // AfterTrust for remote-managed-settings users, and bootstrapTelemetry
  // above copies ANT_OTEL_* for ant users — both would undo an earlier strip.
  if (getHasFormattedOutput()) {
    for (const key of [
      'OTEL_METRICS_EXPORTER',
      'OTEL_LOGS_EXPORTER',
      'OTEL_TRACES_EXPORTER',
    ] as const) {
      const v = process.env[key]
      if (v?.includes('console')) {
        // Remove only the 'console' entry; other exporter types survive.
        process.env[key] = v
          .split(',')
          .map(s => s.trim())
          .filter(s => s !== 'console')
          .join(',')
      }
    }
  }

  // Route OTEL's internal diagnostics (errors only) through our logger.
  diag.setLogger(new ClaudeCodeDiagLogger(), DiagLogLevel.ERROR)

  // Initialize Perfetto tracing (independent of OTEL)
  // Enable via CLAUDE_CODE_PERFETTO_TRACE=1 or CLAUDE_CODE_PERFETTO_TRACE=<path>
  initializePerfettoTracing()

  const readers = []

  // Add customer exporters (if enabled)
  const telemetryEnabled = isTelemetryEnabled()
  logForDebugging(
    `[3P telemetry] isTelemetryEnabled=${telemetryEnabled} (CLAUDE_CODE_ENABLE_TELEMETRY=${process.env.CLAUDE_CODE_ENABLE_TELEMETRY})`,
  )
  if (telemetryEnabled) {
    readers.push(...(await getOtlpReaders()))
  }

  // Add BigQuery exporter (for API customers, C4E users, and internal users)
  if (isBigQueryMetricsEnabled()) {
    readers.push(getBigQueryExportingReader())
  }

  // Create base resource with service attributes
  const platform = getPlatform()
  const baseAttributes: Record<string, string> = {
    [ATTR_SERVICE_NAME]: 'claude-code',
    [ATTR_SERVICE_VERSION]: MACRO.VERSION,
  }

  // Add WSL-specific attributes if running on WSL
  if (platform === 'wsl') {
    const wslVersion = getWslVersion()
    if (wslVersion) {
      baseAttributes['wsl.version'] = wslVersion
    }
  }

  const baseResource = resourceFromAttributes(baseAttributes)

  // Use OpenTelemetry detectors
  const osResource = resourceFromAttributes(
    osDetector.detect().attributes || {},
  )

  // Extract only host.arch from hostDetector
  const hostDetected = hostDetector.detect()
  const hostArchAttributes = hostDetected.attributes?.[SEMRESATTRS_HOST_ARCH]
    ? {
        [SEMRESATTRS_HOST_ARCH]: hostDetected.attributes[SEMRESATTRS_HOST_ARCH],
      }
    : {}
  const hostArchResource = resourceFromAttributes(hostArchAttributes)

  const envResource = resourceFromAttributes(
    envDetector.detect().attributes || {},
  )

  // Merge resources - later resources take precedence
  const resource = baseResource
    .merge(osResource)
    .merge(hostArchResource)
    .merge(envResource)

  // Check if beta tracing is enabled - this is a separate code path
  // Available to all users who set ENABLE_BETA_TRACING_DETAILED=1 and BETA_TRACING_ENDPOINT
  if (isBetaTracingEnabled()) {
    // Fire-and-forget: a beta tracing failure must never block startup.
    void initializeBetaTracing(resource).catch(e =>
      logForDebugging(`Beta tracing init failed: ${e}`, { level: 'error' }),
    )
    // Still set up meter provider for metrics (but skip regular logs/traces setup)
    const meterProvider = new MeterProvider({
      resource,
      views: [],
      readers,
    })
    setMeterProvider(meterProvider)

    // Register shutdown for beta tracing
    const shutdownTelemetry = async () => {
      const timeoutMs = parseInt(
        process.env.CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS || '2000',
      )
      try {
        endInteractionSpan()

        // Force flush + shutdown together inside the timeout. Previously forceFlush
        // was awaited unbounded BEFORE the race, blocking exit on slow OTLP endpoints.
        // Each provider's flush→shutdown is chained independently so a slow logger
        // flush doesn't delay meterProvider/tracerProvider shutdown (no waterfall).
        const loggerProvider = getLoggerProvider()
        const tracerProvider = getTracerProvider()

        const chains: Promise<void>[] = [meterProvider.shutdown()]
        if (loggerProvider) {
          chains.push(
            loggerProvider.forceFlush().then(() => loggerProvider.shutdown()),
          )
        }
        if (tracerProvider) {
          chains.push(
            tracerProvider.forceFlush().then(() => tracerProvider.shutdown()),
          )
        }

        await Promise.race([
          Promise.all(chains),
          telemetryTimeout(timeoutMs, 'OpenTelemetry shutdown timeout'),
        ])
      } catch {
        // Ignore shutdown errors
      }
    }
    registerCleanup(shutdownTelemetry)

    // Beta path ends here: regular logs/traces setup below is skipped.
    return meterProvider.getMeter('com.anthropic.claude_code', MACRO.VERSION)
  }

  const meterProvider = new MeterProvider({
    resource,
    views: [],
    readers,
  })

  // Store reference in state for flushing
  setMeterProvider(meterProvider)

  // Initialize logs if telemetry is enabled
  if (telemetryEnabled) {
    const logExporters = await getOtlpLogExporters()
    logForDebugging(
      `[3P telemetry] Created ${logExporters.length} log exporter(s)`,
    )

    if (logExporters.length > 0) {
      const loggerProvider = new LoggerProvider({
        resource,
        // Add batch processors for each exporter
        processors: logExporters.map(
          exporter =>
            new BatchLogRecordProcessor(exporter, {
              scheduledDelayMillis: parseInt(
                process.env.OTEL_LOGS_EXPORT_INTERVAL ||
                  DEFAULT_LOGS_EXPORT_INTERVAL_MS.toString(),
              ),
            }),
        ),
      })

      // Register the logger provider globally
      logs.setGlobalLoggerProvider(loggerProvider)
      setLoggerProvider(loggerProvider)

      // Initialize event logger
      const eventLogger = logs.getLogger(
        'com.anthropic.claude_code.events',
        MACRO.VERSION,
      )
      setEventLogger(eventLogger)
      logForDebugging('[3P telemetry] Event logger set successfully')

      // 'beforeExit' is emitted when Node.js empties its event loop and has no additional work to schedule.
      // Unlike 'exit', it allows us to perform async operations, so it works well for letting
      // network requests complete before the process exits naturally.
      process.on('beforeExit', async () => {
        await loggerProvider?.forceFlush()
        // Also flush traces - they use BatchSpanProcessor which needs explicit flush
        const tracerProvider = getTracerProvider()
        await tracerProvider?.forceFlush()
      })

      process.on('exit', () => {
        // Final attempt to flush logs and traces
        void loggerProvider?.forceFlush()
        void getTracerProvider()?.forceFlush()
      })
    }
  }

  // Initialize tracing if enhanced telemetry is enabled (BETA)
  if (telemetryEnabled && isEnhancedTelemetryEnabled()) {
    const traceExporters = await getOtlpTraceExporters()
    if (traceExporters.length > 0) {
      // Create span processors for each exporter
      const spanProcessors = traceExporters.map(
        exporter =>
          new BatchSpanProcessor(exporter, {
            scheduledDelayMillis: parseInt(
              process.env.OTEL_TRACES_EXPORT_INTERVAL ||
                DEFAULT_TRACES_EXPORT_INTERVAL_MS.toString(),
            ),
          }),
      )

      const tracerProvider = new BasicTracerProvider({
        resource,
        spanProcessors,
      })

      // Register the tracer provider globally
      trace.setGlobalTracerProvider(tracerProvider)
      setTracerProvider(tracerProvider)
    }
  }

  // Shutdown metrics and logs on exit (flushes and closes exporters)
  const shutdownTelemetry = async () => {
    const timeoutMs = parseInt(
      process.env.CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS || '2000',
    )

    try {
      // End any active interaction span before shutdown
      endInteractionSpan()

      const shutdownPromises = [meterProvider.shutdown()]
      const loggerProvider = getLoggerProvider()
      if (loggerProvider) {
        shutdownPromises.push(loggerProvider.shutdown())
      }
      const tracerProvider = getTracerProvider()
      if (tracerProvider) {
        shutdownPromises.push(tracerProvider.shutdown())
      }

      await Promise.race([
        Promise.all(shutdownPromises),
        telemetryTimeout(timeoutMs, 'OpenTelemetry shutdown timeout'),
      ])
    } catch (error) {
      if (error instanceof Error && error.message.includes('timeout')) {
        logForDebugging(
          `
OpenTelemetry telemetry flush timed out after ${timeoutMs}ms

To resolve this issue, you can:
1. Increase the timeout by setting CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS env var (e.g., 5000 for 5 seconds)
2. Check if your OpenTelemetry backend is experiencing scalability issues
3. Disable OpenTelemetry by unsetting CLAUDE_CODE_ENABLE_TELEMETRY env var

Current timeout: ${timeoutMs}ms
`,
          { level: 'error' },
        )
      }
      throw error
    }
  }

  // Always register shutdown (internal metrics are always enabled)
  registerCleanup(shutdownTelemetry)

  return meterProvider.getMeter('com.anthropic.claude_code', MACRO.VERSION)
}
|
||||
|
||||
/**
|
||||
* Flush all pending telemetry data immediately.
|
||||
* This should be called before logout or org switching to prevent data leakage.
|
||||
*/
|
||||
export async function flushTelemetry(): Promise<void> {
|
||||
const meterProvider = getMeterProvider()
|
||||
if (!meterProvider) {
|
||||
return
|
||||
}
|
||||
|
||||
const timeoutMs = parseInt(
|
||||
process.env.CLAUDE_CODE_OTEL_FLUSH_TIMEOUT_MS || '5000',
|
||||
)
|
||||
|
||||
try {
|
||||
const flushPromises = [meterProvider.forceFlush()]
|
||||
const loggerProvider = getLoggerProvider()
|
||||
if (loggerProvider) {
|
||||
flushPromises.push(loggerProvider.forceFlush())
|
||||
}
|
||||
const tracerProvider = getTracerProvider()
|
||||
if (tracerProvider) {
|
||||
flushPromises.push(tracerProvider.forceFlush())
|
||||
}
|
||||
|
||||
await Promise.race([
|
||||
Promise.all(flushPromises),
|
||||
telemetryTimeout(timeoutMs, 'OpenTelemetry flush timeout'),
|
||||
])
|
||||
|
||||
logForDebugging('Telemetry flushed successfully')
|
||||
} catch (error) {
|
||||
if (error instanceof TelemetryTimeoutError) {
|
||||
logForDebugging(
|
||||
`Telemetry flush timed out after ${timeoutMs}ms. Some metrics may not be exported.`,
|
||||
{ level: 'warn' },
|
||||
)
|
||||
} else {
|
||||
logForDebugging(`Telemetry flush failed: ${errorMessage(error)}`, {
|
||||
level: 'error',
|
||||
})
|
||||
}
|
||||
// Don't throw - allow logout to continue even if flush fails
|
||||
}
|
||||
}
|
||||
|
||||
function parseOtelHeadersEnvVar(): Record<string, string> {
|
||||
const headers: Record<string, string> = {}
|
||||
const envHeaders = process.env.OTEL_EXPORTER_OTLP_HEADERS
|
||||
if (envHeaders) {
|
||||
for (const pair of envHeaders.split(',')) {
|
||||
const [key, ...valueParts] = pair.split('=')
|
||||
if (key && valueParts.length > 0) {
|
||||
headers[key.trim()] = valueParts.join('=').trim()
|
||||
}
|
||||
}
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
||||
/**
|
||||
* Get configuration for OTLP exporters including:
|
||||
* - HTTP agent options (proxy, mTLS)
|
||||
* - Dynamic headers via otelHeadersHelper or static headers from env var
|
||||
*/
|
||||
function getOTLPExporterConfig() {
|
||||
const proxyUrl = getProxyUrl()
|
||||
const mtlsConfig = getMTLSConfig()
|
||||
const settings = getSettings_DEPRECATED()
|
||||
|
||||
// Build base config
|
||||
const config: Record<string, unknown> = {}
|
||||
|
||||
// Parse static headers from env var once (doesn't change at runtime)
|
||||
const staticHeaders = parseOtelHeadersEnvVar()
|
||||
|
||||
// If otelHeadersHelper is configured, use async headers function for dynamic refresh
|
||||
// Otherwise just return static headers if any exist
|
||||
if (settings?.otelHeadersHelper) {
|
||||
config.headers = async (): Promise<Record<string, string>> => {
|
||||
const dynamicHeaders = getOtelHeadersFromHelper()
|
||||
return { ...staticHeaders, ...dynamicHeaders }
|
||||
}
|
||||
} else if (Object.keys(staticHeaders).length > 0) {
|
||||
config.headers = async (): Promise<Record<string, string>> => staticHeaders
|
||||
}
|
||||
|
||||
// Check if we should bypass proxy for OTEL endpoint
|
||||
const otelEndpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
if (!proxyUrl || (otelEndpoint && shouldBypassProxy(otelEndpoint))) {
|
||||
// No proxy configured or OTEL endpoint should bypass proxy
|
||||
const caCerts = getCACertificates()
|
||||
if (mtlsConfig || caCerts) {
|
||||
config.httpAgentOptions = {
|
||||
...mtlsConfig,
|
||||
...(caCerts && { ca: caCerts }),
|
||||
}
|
||||
}
|
||||
return config
|
||||
}
|
||||
|
||||
// Return an HttpAgentFactory function that creates our proxy agent
|
||||
const caCerts = getCACertificates()
|
||||
const agentFactory = (_protocol: string) => {
|
||||
// Create and return the proxy agent with mTLS and CA cert config
|
||||
const proxyAgent =
|
||||
mtlsConfig || caCerts
|
||||
? new HttpsProxyAgent(proxyUrl, {
|
||||
...(mtlsConfig && {
|
||||
cert: mtlsConfig.cert,
|
||||
key: mtlsConfig.key,
|
||||
passphrase: mtlsConfig.passphrase,
|
||||
}),
|
||||
...(caCerts && { ca: caCerts }),
|
||||
})
|
||||
: new HttpsProxyAgent(proxyUrl)
|
||||
|
||||
return proxyAgent
|
||||
}
|
||||
|
||||
config.httpAgentOptions = agentFactory
|
||||
return config
|
||||
}
|
||||
26
src/utils/telemetry/logger.ts
Normal file
26
src/utils/telemetry/logger.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import type { DiagLogger } from '@opentelemetry/api'
|
||||
import { logForDebugging } from '../debug.js'
|
||||
import { logError } from '../log.js'
|
||||
export class ClaudeCodeDiagLogger implements DiagLogger {
|
||||
error(message: string, ..._: unknown[]) {
|
||||
logError(new Error(message))
|
||||
logForDebugging(`[3P telemetry] OTEL diag error: ${message}`, {
|
||||
level: 'error',
|
||||
})
|
||||
}
|
||||
warn(message: string, ..._: unknown[]) {
|
||||
logError(new Error(message))
|
||||
logForDebugging(`[3P telemetry] OTEL diag warn: ${message}`, {
|
||||
level: 'warn',
|
||||
})
|
||||
}
|
||||
info(_message: string, ..._args: unknown[]) {
|
||||
return
|
||||
}
|
||||
debug(_message: string, ..._args: unknown[]) {
|
||||
return
|
||||
}
|
||||
verbose(_message: string, ..._args: unknown[]) {
|
||||
return
|
||||
}
|
||||
}
|
||||
1120
src/utils/telemetry/perfettoTracing.ts
Normal file
1120
src/utils/telemetry/perfettoTracing.ts
Normal file
File diff suppressed because it is too large
Load Diff
289
src/utils/telemetry/pluginTelemetry.ts
Normal file
289
src/utils/telemetry/pluginTelemetry.ts
Normal file
@@ -0,0 +1,289 @@
|
||||
/**
|
||||
* Plugin telemetry helpers — shared field builders for plugin lifecycle events.
|
||||
*
|
||||
* Implements the twin-column privacy pattern: every user-defined-name field
|
||||
* emits both a raw value (routed to PII-tagged _PROTO_* BQ columns) and a
|
||||
* redacted twin (real name iff marketplace ∈ allowlist, else 'third-party').
|
||||
*
|
||||
* plugin_id_hash provides an opaque per-plugin aggregation key with no privacy
|
||||
* dependency — sha256(name@marketplace + FIXED_SALT) truncated to 16 chars.
|
||||
* This answers distinct-count and per-plugin-trend questions that the
|
||||
* redacted column can't, without exposing user-defined names.
|
||||
*/
|
||||
|
||||
import { createHash } from 'crypto'
|
||||
import { sep } from 'path'
|
||||
import {
|
||||
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
type AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
|
||||
logEvent,
|
||||
} from '../../services/analytics/index.js'
|
||||
import type {
|
||||
LoadedPlugin,
|
||||
PluginError,
|
||||
PluginManifest,
|
||||
} from '../../types/plugin.js'
|
||||
import {
|
||||
isOfficialMarketplaceName,
|
||||
parsePluginIdentifier,
|
||||
} from '../plugins/pluginIdentifier.js'
|
||||
|
||||
// builtinPlugins.ts:BUILTIN_MARKETPLACE_NAME — inlined to avoid the cycle
// through commands.js. Marketplace schemas.ts enforces 'builtin' is reserved.
// Plugins whose marketplace equals this name are bucketed as
// scope='default-bundle' by getTelemetryPluginScope.
const BUILTIN_MARKETPLACE_NAME = 'builtin'
|
||||
|
||||
// Fixed salt for plugin_id_hash. Same constant across all repos and emission
|
||||
// sites. Not per-org, not rotated — per-org salt would defeat cross-org
|
||||
// distinct-count, rotation would break trend lines. Customers can compute the
|
||||
// same hash on their known plugin names to reverse-match their own telemetry.
|
||||
const PLUGIN_ID_HASH_SALT = 'claude-plugin-telemetry-v1'
|
||||
|
||||
/**
|
||||
* Opaque per-plugin aggregation key. Input is the name@marketplace string as
|
||||
* it appears in enabledPlugins keys, lowercased on the marketplace suffix for
|
||||
* reproducibility. 16-char truncation keeps BQ GROUP BY cardinality manageable
|
||||
* while making collisions negligible at projected 10k-plugin scale. Name case
|
||||
* is preserved in both branches (enabledPlugins keys are case-sensitive).
|
||||
*/
|
||||
export function hashPluginId(name: string, marketplace?: string): string {
|
||||
const key = marketplace ? `${name}@${marketplace.toLowerCase()}` : name
|
||||
return createHash('sha256')
|
||||
.update(key + PLUGIN_ID_HASH_SALT)
|
||||
.digest('hex')
|
||||
.slice(0, 16)
|
||||
}
|
||||
|
||||
/**
 * 4-value scope enum for plugin origin. Distinct from PluginScope
 * (managed/user/project/local) which is installation-target — this is
 * marketplace-origin.
 *
 * - official: from an allowlisted Anthropic marketplace
 * - default-bundle: ships with product (@builtin), auto-enabled
 * - org: enterprise admin-pushed via managed settings (policySettings)
 * - user-local: user added marketplace or local plugin
 *
 * See getTelemetryPluginScope for the precedence applied when a plugin
 * could match more than one bucket.
 */
export type TelemetryPluginScope =
  | 'official'
  | 'org'
  | 'user-local'
  | 'default-bundle'
|
||||
|
||||
export function getTelemetryPluginScope(
|
||||
name: string,
|
||||
marketplace: string | undefined,
|
||||
managedNames: Set<string> | null,
|
||||
): TelemetryPluginScope {
|
||||
if (marketplace === BUILTIN_MARKETPLACE_NAME) return 'default-bundle'
|
||||
if (isOfficialMarketplaceName(marketplace)) return 'official'
|
||||
if (managedNames?.has(name)) return 'org'
|
||||
return 'user-local'
|
||||
}
|
||||
|
||||
/**
 * How a plugin arrived in the session. Splits self-selected from org-pushed
 * — plugin_scope alone doesn't (an official plugin can be user-installed OR
 * org-pushed; both are scope='official').
 */
export type EnabledVia =
  | 'user-install'
  | 'org-policy'
  | 'default-enable'
  | 'seed-mount'

/** How a skill/command invocation was triggered. Emitted as an analytics metadata field. */
export type InvocationTrigger =
  | 'user-slash'
  | 'claude-proactive'
  | 'nested-skill'

/** Where a skill invocation executes. Emitted as an analytics metadata field. */
export type SkillExecutionContext = 'fork' | 'inline' | 'remote'

/** How a plugin install was initiated. Emitted as an analytics metadata field. */
export type InstallSource =
  | 'cli-explicit'
  | 'ui-discover'
  | 'ui-suggestion'
  | 'deep-link'
|
||||
|
||||
export function getEnabledVia(
|
||||
plugin: LoadedPlugin,
|
||||
managedNames: Set<string> | null,
|
||||
seedDirs: string[],
|
||||
): EnabledVia {
|
||||
if (plugin.isBuiltin) return 'default-enable'
|
||||
if (managedNames?.has(plugin.name)) return 'org-policy'
|
||||
// Trailing sep: /opt/plugins must not match /opt/plugins-extra
|
||||
if (
|
||||
seedDirs.some(dir =>
|
||||
plugin.path.startsWith(dir.endsWith(sep) ? dir : dir + sep),
|
||||
)
|
||||
) {
|
||||
return 'seed-mount'
|
||||
}
|
||||
return 'user-install'
|
||||
}
|
||||
|
||||
/**
|
||||
* Common plugin telemetry fields keyed off name@marketplace. Returns the
|
||||
* hash, scope enum, and the redacted-twin columns. Callers add the raw
|
||||
* _PROTO_* fields separately (those require the PII-tagged marker type).
|
||||
*/
|
||||
export function buildPluginTelemetryFields(
|
||||
name: string,
|
||||
marketplace: string | undefined,
|
||||
managedNames: Set<string> | null = null,
|
||||
): {
|
||||
plugin_id_hash: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
|
||||
plugin_scope: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
|
||||
plugin_name_redacted: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
|
||||
marketplace_name_redacted: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
|
||||
is_official_plugin: boolean
|
||||
} {
|
||||
const scope = getTelemetryPluginScope(name, marketplace, managedNames)
|
||||
// Both official marketplaces and builtin plugins are Anthropic-controlled
|
||||
// — safe to expose real names in the redacted columns.
|
||||
const isAnthropicControlled =
|
||||
scope === 'official' || scope === 'default-bundle'
|
||||
return {
|
||||
plugin_id_hash: hashPluginId(
|
||||
name,
|
||||
marketplace,
|
||||
) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
plugin_scope:
|
||||
scope as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
plugin_name_redacted: (isAnthropicControlled
|
||||
? name
|
||||
: 'third-party') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
marketplace_name_redacted: (isAnthropicControlled && marketplace
|
||||
? marketplace
|
||||
: 'third-party') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
is_official_plugin: isAnthropicControlled,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Per-invocation callers (SkillTool, processSlashCommand) pass
|
||||
* managedNames=null — the session-level tengu_plugin_enabled_for_session
|
||||
* event carries the authoritative plugin_scope, and per-invocation rows can
|
||||
* join on plugin_id_hash to recover it. This keeps hot-path call sites free
|
||||
* of the extra settings read.
|
||||
*/
|
||||
export function buildPluginCommandTelemetryFields(
|
||||
pluginInfo: { pluginManifest: PluginManifest; repository: string },
|
||||
managedNames: Set<string> | null = null,
|
||||
): ReturnType<typeof buildPluginTelemetryFields> {
|
||||
const { marketplace } = parsePluginIdentifier(pluginInfo.repository)
|
||||
return buildPluginTelemetryFields(
|
||||
pluginInfo.pluginManifest.name,
|
||||
marketplace,
|
||||
managedNames,
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Emit tengu_plugin_enabled_for_session once per enabled plugin at session
 * start. Supplements tengu_skill_loaded (which still fires per-skill) — use
 * this for plugin-level aggregates instead of DISTINCT-on-prefix hacks.
 * A plugin with 5 skills emits 5 skill_loaded rows but 1 of these.
 *
 * @param plugins - All plugins enabled for this session
 * @param managedNames - Enterprise-pushed plugin names (feeds scope/enabled_via)
 * @param seedDirs - Directories whose plugins count as 'seed-mount'
 */
export function logPluginsEnabledForSession(
  plugins: LoadedPlugin[],
  managedNames: Set<string> | null,
  seedDirs: string[],
): void {
  for (const plugin of plugins) {
    const { marketplace } = parsePluginIdentifier(plugin.repository)

    logEvent('tengu_plugin_enabled_for_session', {
      // Raw name/marketplace go out PII-tagged; the redacted twins come from
      // buildPluginTelemetryFields below.
      _PROTO_plugin_name:
        plugin.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      ...(marketplace && {
        _PROTO_marketplace_name:
          marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      }),
      ...buildPluginTelemetryFields(plugin.name, marketplace, managedNames),
      enabled_via: getEnabledVia(
        plugin,
        managedNames,
        seedDirs,
      ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      // Legacy single-path fields count as 1; newer plural arrays add on top.
      skill_path_count:
        (plugin.skillsPath ? 1 : 0) + (plugin.skillsPaths?.length ?? 0),
      command_path_count:
        (plugin.commandsPath ? 1 : 0) + (plugin.commandsPaths?.length ?? 0),
      has_mcp: plugin.manifest.mcpServers !== undefined,
      has_hooks: plugin.hooksConfig !== undefined,
      // Version is optional in the manifest; omit the column when absent.
      ...(plugin.manifest.version && {
        version: plugin.manifest
          .version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      }),
    })
  }
}
|
||||
|
||||
/**
|
||||
* Bounded-cardinality error bucket for CLI plugin operation failures.
|
||||
* Maps free-form error messages to 5 stable categories so dashboard
|
||||
* GROUP BY stays tractable.
|
||||
*/
|
||||
export type PluginCommandErrorCategory =
|
||||
| 'network'
|
||||
| 'not-found'
|
||||
| 'permission'
|
||||
| 'validation'
|
||||
| 'unknown'
|
||||
|
||||
export function classifyPluginCommandError(
|
||||
error: unknown,
|
||||
): PluginCommandErrorCategory {
|
||||
const msg = String((error as { message?: unknown })?.message ?? error)
|
||||
if (
|
||||
/ENOTFOUND|ECONNREFUSED|EAI_AGAIN|ETIMEDOUT|ECONNRESET|network|Could not resolve|Connection refused|timed out/i.test(
|
||||
msg,
|
||||
)
|
||||
) {
|
||||
return 'network'
|
||||
}
|
||||
if (/\b404\b|not found|does not exist|no such plugin/i.test(msg)) {
|
||||
return 'not-found'
|
||||
}
|
||||
if (/\b40[13]\b|EACCES|EPERM|permission denied|unauthorized/i.test(msg)) {
|
||||
return 'permission'
|
||||
}
|
||||
if (/invalid|malformed|schema|validation|parse error/i.test(msg)) {
|
||||
return 'validation'
|
||||
}
|
||||
return 'unknown'
|
||||
}
|
||||
|
||||
/**
 * Emit tengu_plugin_load_failed once per error surfaced by session-start
 * plugin loading. Pairs with tengu_plugin_enabled_for_session so dashboards
 * can compute a load-success rate. PluginError.type is already a bounded
 * enum — use it directly as error_category.
 *
 * @param errors - Errors collected during plugin loading
 * @param managedNames - Enterprise-pushed plugin names (feeds plugin_scope)
 */
export function logPluginLoadErrors(
  errors: PluginError[],
  managedNames: Set<string> | null,
): void {
  for (const err of errors) {
    const { name, marketplace } = parsePluginIdentifier(err.source)
    // Not all PluginError variants carry a plugin name (some have pluginId,
    // some are marketplace-level). Use the 'plugin' property if present,
    // fall back to the name parsed from err.source.
    const pluginName = 'plugin' in err && err.plugin ? err.plugin : name
    logEvent('tengu_plugin_load_failed', {
      error_category:
        err.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      // Raw name is PII-tagged; redacted twin comes from the shared builder.
      _PROTO_plugin_name:
        pluginName as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      ...(marketplace && {
        _PROTO_marketplace_name:
          marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      }),
      ...buildPluginTelemetryFields(pluginName, marketplace, managedNames),
    })
  }
}
|
||||
927
src/utils/telemetry/sessionTracing.ts
Normal file
927
src/utils/telemetry/sessionTracing.ts
Normal file
@@ -0,0 +1,927 @@
|
||||
/**
|
||||
* Session Tracing for Claude Code using OpenTelemetry (BETA)
|
||||
*
|
||||
* This module provides a high-level API for creating and managing spans
|
||||
* to trace Claude Code workflows. Each user interaction creates a root
|
||||
* interaction span, which contains operation spans (LLM requests, tool calls, etc.).
|
||||
*
|
||||
* Requirements:
|
||||
* - Enhanced telemetry is enabled via feature('ENHANCED_TELEMETRY_BETA')
|
||||
* - Configure OTEL_TRACES_EXPORTER (console, otlp, etc.)
|
||||
*/
|
||||
|
||||
import { feature } from 'bun:bundle'
|
||||
import { context as otelContext, type Span, trace } from '@opentelemetry/api'
|
||||
import { AsyncLocalStorage } from 'async_hooks'
|
||||
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
|
||||
import type { AssistantMessage, UserMessage } from '../../types/message.js'
|
||||
import { isEnvDefinedFalsy, isEnvTruthy } from '../envUtils.js'
|
||||
import { getTelemetryAttributes } from '../telemetryAttributes.js'
|
||||
import {
|
||||
addBetaInteractionAttributes,
|
||||
addBetaLLMRequestAttributes,
|
||||
addBetaLLMResponseAttributes,
|
||||
addBetaToolInputAttributes,
|
||||
addBetaToolResultAttributes,
|
||||
isBetaTracingEnabled,
|
||||
type LLMRequestNewContext,
|
||||
truncateContent,
|
||||
} from './betaSessionTracing.js'
|
||||
import {
|
||||
endInteractionPerfettoSpan,
|
||||
endLLMRequestPerfettoSpan,
|
||||
endToolPerfettoSpan,
|
||||
endUserInputPerfettoSpan,
|
||||
isPerfettoTracingEnabled,
|
||||
startInteractionPerfettoSpan,
|
||||
startLLMRequestPerfettoSpan,
|
||||
startToolPerfettoSpan,
|
||||
startUserInputPerfettoSpan,
|
||||
} from './perfettoTracing.js'
|
||||
|
||||
// Re-export for callers
export type { Span }
export { isBetaTracingEnabled, type LLMRequestNewContext }

// Message type for API calls (UserMessage or AssistantMessage)
type APIMessage = UserMessage | AssistantMessage

// Kinds of spans this module creates. Stored under the 'span.type' attribute
// so the end* functions can locate a span of a given kind in activeSpans.
type SpanType =
  | 'interaction'
  | 'llm_request'
  | 'tool'
  | 'tool.blocked_on_user'
  | 'tool.execution'
  | 'hook'

// Book-keeping wrapper around an in-flight OTel span.
interface SpanContext {
  span: Span
  startTime: number // Date.now() at start; used for duration_ms and TTL eviction
  attributes: Record<string, string | number | boolean>
  ended?: boolean // guards against double span.end()
  perfettoSpanId?: string // paired Perfetto span id, when Perfetto tracing is on
}

// ALS stores SpanContext directly so it holds a strong reference while a span
// is active. With that, activeSpans can use WeakRef — when ALS is cleared
// (enterWith(undefined)) and no other code holds the SpanContext, GC can collect
// it and the WeakRef goes stale.
const interactionContext = new AsyncLocalStorage<SpanContext | undefined>()
const toolContext = new AsyncLocalStorage<SpanContext | undefined>()
const activeSpans = new Map<string, WeakRef<SpanContext>>()
// Spans not stored in ALS (LLM request, blocked-on-user, tool execution, hook)
// need a strong reference to prevent GC from collecting the SpanContext before
// the corresponding end* function retrieves it.
const strongSpans = new Map<string, SpanContext>()
// Monotonic counter: ordinal of each interaction within this process.
let interactionSequence = 0
let _cleanupIntervalStarted = false

// Spans older than this are force-ended and evicted by the cleanup interval.
const SPAN_TTL_MS = 30 * 60 * 1000 // 30 minutes
|
||||
|
||||
function getSpanId(span: Span): string {
|
||||
return span.spanContext().spanId || ''
|
||||
}
|
||||
|
||||
/**
 * Lazily start a background interval that evicts orphaned spans from activeSpans.
 *
 * Normal teardown calls endInteractionSpan / endToolSpan, which delete spans
 * immediately. This interval is a safety net for spans that were never ended
 * (e.g. aborted streams, uncaught exceptions mid-query) — without it they
 * accumulate in activeSpans indefinitely, holding references to Span objects
 * and the OpenTelemetry context chain.
 *
 * Initialized on the first startInteractionSpan call (not at module load) to
 * avoid triggering the no-top-level-side-effects lint rule and to keep the
 * interval from running in processes that never start a span.
 * unref() prevents the timer from keeping the process alive after all other
 * work is done.
 */
function ensureCleanupInterval(): void {
  if (_cleanupIntervalStarted) return
  _cleanupIntervalStarted = true
  const interval = setInterval(() => {
    const cutoff = Date.now() - SPAN_TTL_MS
    for (const [spanId, weakRef] of activeSpans) {
      const ctx = weakRef.deref()
      if (ctx === undefined) {
        // GC already collected the SpanContext — just drop the stale entries.
        activeSpans.delete(spanId)
        strongSpans.delete(spanId)
      } else if (ctx.startTime < cutoff) {
        // Orphaned but still alive: older than the TTL, never explicitly ended.
        if (!ctx.ended) ctx.span.end() // flush any recorded attributes to the exporter
        activeSpans.delete(spanId)
        strongSpans.delete(spanId)
      }
    }
  }, 60_000)
  if (typeof interval.unref === 'function') {
    interval.unref() // Node.js / Bun: don't block process exit
  }
}
|
||||
|
||||
/**
|
||||
* Check if enhanced telemetry is enabled.
|
||||
* Priority: env var override > ant build > GrowthBook gate
|
||||
*/
|
||||
export function isEnhancedTelemetryEnabled(): boolean {
|
||||
if (feature('ENHANCED_TELEMETRY_BETA')) {
|
||||
const env =
|
||||
process.env.CLAUDE_CODE_ENHANCED_TELEMETRY_BETA ??
|
||||
process.env.ENABLE_ENHANCED_TELEMETRY_BETA
|
||||
if (isEnvTruthy(env)) {
|
||||
return true
|
||||
}
|
||||
if (isEnvDefinedFalsy(env)) {
|
||||
return false
|
||||
}
|
||||
return (
|
||||
process.env.USER_TYPE === 'ant' ||
|
||||
getFeatureValue_CACHED_MAY_BE_STALE('enhanced_telemetry_beta', false)
|
||||
)
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if any tracing is enabled (either standard enhanced telemetry OR beta tracing)
|
||||
*/
|
||||
function isAnyTracingEnabled(): boolean {
|
||||
return isEnhancedTelemetryEnabled() || isBetaTracingEnabled()
|
||||
}
|
||||
|
||||
function getTracer() {
|
||||
return trace.getTracer('com.anthropic.claude_code.tracing', '1.0.0')
|
||||
}
|
||||
|
||||
function createSpanAttributes(
|
||||
spanType: SpanType,
|
||||
customAttributes: Record<string, string | number | boolean> = {},
|
||||
): Record<string, string | number | boolean> {
|
||||
const baseAttributes = getTelemetryAttributes()
|
||||
|
||||
const attributes: Record<string, string | number | boolean> = {
|
||||
...baseAttributes,
|
||||
'span.type': spanType,
|
||||
...customAttributes,
|
||||
}
|
||||
|
||||
return attributes
|
||||
}
|
||||
|
||||
/**
 * Start an interaction span. This wraps a user request -> Claude response cycle.
 * This is now a root span that includes all session-level attributes.
 * Sets the interaction context for all subsequent operations.
 *
 * @param userPrompt - The raw user prompt; only logged verbatim when
 *   OTEL_LOG_USER_PROMPTS is truthy, otherwise recorded as '<REDACTED>'
 * @returns The started span, or a pre-existing/dummy span when tracing is off
 */
export function startInteractionSpan(userPrompt: string): Span {
  ensureCleanupInterval()

  // Start Perfetto span regardless of OTel tracing state
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startInteractionPerfettoSpan(userPrompt)
    : undefined

  if (!isAnyTracingEnabled()) {
    // Still track Perfetto span even if OTel is disabled
    if (perfettoSpanId) {
      // A dummy OTel span acts as the carrier for the Perfetto id so the
      // matching end* call can find it via the usual maps/ALS.
      const dummySpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
      const spanId = getSpanId(dummySpan)
      const spanContextObj: SpanContext = {
        span: dummySpan,
        startTime: Date.now(),
        attributes: {},
        perfettoSpanId,
      }
      activeSpans.set(spanId, new WeakRef(spanContextObj))
      interactionContext.enterWith(spanContextObj)
      return dummySpan
    }
    return trace.getActiveSpan() || getTracer().startSpan('dummy')
  }

  const tracer = getTracer()
  // Prompt text is opt-in: redacted unless OTEL_LOG_USER_PROMPTS is set.
  const isUserPromptLoggingEnabled = isEnvTruthy(
    process.env.OTEL_LOG_USER_PROMPTS,
  )
  const promptToLog = isUserPromptLoggingEnabled ? userPrompt : '<REDACTED>'

  interactionSequence++

  const attributes = createSpanAttributes('interaction', {
    user_prompt: promptToLog,
    user_prompt_length: userPrompt.length,
    'interaction.sequence': interactionSequence,
  })

  const span = tracer.startSpan('claude_code.interaction', {
    attributes,
  })

  // Add experimental attributes (new_context)
  addBetaInteractionAttributes(span, userPrompt)

  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(spanId, new WeakRef(spanContextObj))

  // ALS holds the strong reference for the lifetime of the interaction.
  interactionContext.enterWith(spanContextObj)

  return span
}
|
||||
|
||||
/**
 * End the current interaction span (looked up via AsyncLocalStorage).
 * No-op when there is no active interaction or it was already ended.
 * Always ends the paired Perfetto span, even when OTel tracing is off.
 */
export function endInteractionSpan(): void {
  const spanContext = interactionContext.getStore()
  if (!spanContext) {
    return
  }

  if (spanContext.ended) {
    return
  }

  // End Perfetto span
  if (spanContext.perfettoSpanId) {
    endInteractionPerfettoSpan(spanContext.perfettoSpanId)
  }

  if (!isAnyTracingEnabled()) {
    spanContext.ended = true
    activeSpans.delete(getSpanId(spanContext.span))
    // Clear the store so async continuations created after this point (timers,
    // promise callbacks, I/O) do not inherit a reference to the ended span.
    // enterWith(undefined) is intentional: exit(() => {}) is a no-op because it
    // only suppresses the store inside the callback and returns immediately.
    interactionContext.enterWith(undefined)
    return
  }

  const duration = Date.now() - spanContext.startTime
  spanContext.span.setAttributes({
    'interaction.duration_ms': duration,
  })

  spanContext.span.end()
  spanContext.ended = true
  activeSpans.delete(getSpanId(spanContext.span))
  interactionContext.enterWith(undefined)
}
|
||||
|
||||
/**
 * Start a span for one LLM API request, parented to the current interaction
 * span when one is active. The returned span should be passed back to
 * endLLMRequestSpan — especially when requests run in parallel.
 *
 * @param model - Model identifier for this request
 * @param newContext - Optional detailed request context (querySource etc.)
 * @param messagesForAPI - Messages being sent; forwarded to beta attributes
 * @param fastMode - Recorded as speed='fast' vs 'normal'
 */
export function startLLMRequestSpan(
  model: string,
  newContext?: LLMRequestNewContext,
  messagesForAPI?: APIMessage[],
  fastMode?: boolean,
): Span {
  // Start Perfetto span regardless of OTel tracing state
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startLLMRequestPerfettoSpan({
        model,
        querySource: newContext?.querySource,
        messageId: undefined, // Will be set in endLLMRequestSpan
      })
    : undefined

  if (!isAnyTracingEnabled()) {
    // Still track Perfetto span even if OTel is disabled
    if (perfettoSpanId) {
      const dummySpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
      const spanId = getSpanId(dummySpan)
      const spanContextObj: SpanContext = {
        span: dummySpan,
        startTime: Date.now(),
        attributes: { model },
        perfettoSpanId,
      }
      activeSpans.set(spanId, new WeakRef(spanContextObj))
      strongSpans.set(spanId, spanContextObj)
      return dummySpan
    }
    return trace.getActiveSpan() || getTracer().startSpan('dummy')
  }

  const tracer = getTracer()
  const parentSpanCtx = interactionContext.getStore()

  const attributes = createSpanAttributes('llm_request', {
    model: model,
    'llm_request.context': parentSpanCtx ? 'interaction' : 'standalone',
    speed: fastMode ? 'fast' : 'normal',
  })

  // Parent under the interaction span when one is active.
  const ctx = parentSpanCtx
    ? trace.setSpan(otelContext.active(), parentSpanCtx.span)
    : otelContext.active()
  const span = tracer.startSpan('claude_code.llm_request', { attributes }, ctx)

  // Add query_source (agent name) if provided
  if (newContext?.querySource) {
    span.setAttribute('query_source', newContext.querySource)
  }

  // Add experimental attributes (system prompt, new_context)
  addBetaLLMRequestAttributes(span, newContext, messagesForAPI)

  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(spanId, new WeakRef(spanContextObj))
  // Strong ref: LLM request spans are not held in ALS, so without this the
  // WeakRef could go stale before endLLMRequestSpan retrieves the context.
  strongSpans.set(spanId, spanContextObj)

  return span
}
|
||||
|
||||
/**
 * End an LLM request span and attach response metadata.
 *
 * @param span - Optional. The exact span returned by startLLMRequestSpan().
 * IMPORTANT: When multiple LLM requests run in parallel (e.g., warmup requests,
 * topic classifier, file path extractor, main thread), you MUST pass the specific span
 * to ensure responses are attached to the correct request. Without it, responses may be
 * incorrectly attached to whichever span happens to be "last" in the activeSpans map.
 *
 * If not provided, falls back to finding the most recent llm_request span (legacy behavior).
 *
 * @param metadata - Token counts, status, timing, and model output details;
 * all fields optional. Perfetto receives the timing/token subset even when
 * OTel tracing is disabled.
 */
export function endLLMRequestSpan(
  span?: Span,
  metadata?: {
    inputTokens?: number
    outputTokens?: number
    cacheReadTokens?: number
    cacheCreationTokens?: number
    success?: boolean
    statusCode?: number
    error?: string
    attempt?: number
    modelResponse?: string
    /** Text output from the model (non-thinking content) */
    modelOutput?: string
    /** Thinking/reasoning output from the model */
    thinkingOutput?: string
    /** Whether the output included tool calls (look at tool spans for details) */
    hasToolCall?: boolean
    /** Time to first token in milliseconds */
    ttftMs?: number
    /** Time spent in pre-request setup before the successful attempt */
    requestSetupMs?: number
    /** Timestamps (Date.now()) of each attempt start — used to emit retry sub-spans */
    attemptStartTimes?: number[]
  },
): void {
  let llmSpanContext: SpanContext | undefined

  if (span) {
    // Use the provided span directly - this is the correct approach for parallel requests
    const spanId = getSpanId(span)
    llmSpanContext = activeSpans.get(spanId)?.deref()
  } else {
    // Legacy fallback: find the most recent llm_request span
    // WARNING: This can cause mismatched responses when multiple requests are in flight
    llmSpanContext = Array.from(activeSpans.values())
      .findLast(r => {
        const ctx = r.deref()
        // The 'model' check catches dummy spans from the Perfetto-only path,
        // whose attributes are just { model }.
        return (
          ctx?.attributes['span.type'] === 'llm_request' ||
          ctx?.attributes['model']
        )
      })
      ?.deref()
  }

  if (!llmSpanContext) {
    // Span was already ended or never tracked
    return
  }

  const duration = Date.now() - llmSpanContext.startTime

  // End Perfetto span with full metadata
  if (llmSpanContext.perfettoSpanId) {
    endLLMRequestPerfettoSpan(llmSpanContext.perfettoSpanId, {
      ttftMs: metadata?.ttftMs,
      ttltMs: duration, // Time to last token is the total duration
      promptTokens: metadata?.inputTokens,
      outputTokens: metadata?.outputTokens,
      cacheReadTokens: metadata?.cacheReadTokens,
      cacheCreationTokens: metadata?.cacheCreationTokens,
      success: metadata?.success,
      error: metadata?.error,
      requestSetupMs: metadata?.requestSetupMs,
      attemptStartTimes: metadata?.attemptStartTimes,
    })
  }

  if (!isAnyTracingEnabled()) {
    // Perfetto-only path: just drop the bookkeeping entries.
    const spanId = getSpanId(llmSpanContext.span)
    activeSpans.delete(spanId)
    strongSpans.delete(spanId)
    return
  }

  const endAttributes: Record<string, string | number | boolean> = {
    duration_ms: duration,
  }

  if (metadata) {
    if (metadata.inputTokens !== undefined)
      endAttributes['input_tokens'] = metadata.inputTokens
    if (metadata.outputTokens !== undefined)
      endAttributes['output_tokens'] = metadata.outputTokens
    if (metadata.cacheReadTokens !== undefined)
      endAttributes['cache_read_tokens'] = metadata.cacheReadTokens
    if (metadata.cacheCreationTokens !== undefined)
      endAttributes['cache_creation_tokens'] = metadata.cacheCreationTokens
    if (metadata.success !== undefined)
      endAttributes['success'] = metadata.success
    if (metadata.statusCode !== undefined)
      endAttributes['status_code'] = metadata.statusCode
    if (metadata.error !== undefined) endAttributes['error'] = metadata.error
    if (metadata.attempt !== undefined)
      endAttributes['attempt'] = metadata.attempt
    if (metadata.hasToolCall !== undefined)
      endAttributes['response.has_tool_call'] = metadata.hasToolCall
    if (metadata.ttftMs !== undefined)
      endAttributes['ttft_ms'] = metadata.ttftMs

    // Add experimental response attributes (model_output, thinking_output)
    addBetaLLMResponseAttributes(endAttributes, metadata)
  }

  llmSpanContext.span.setAttributes(endAttributes)
  llmSpanContext.span.end()

  const spanId = getSpanId(llmSpanContext.span)
  activeSpans.delete(spanId)
  strongSpans.delete(spanId)
}
|
||||
|
||||
/**
 * Start a span for one tool call, parented to the current interaction span.
 * Also sets the tool context (ALS) so nested spans — blocked_on_user,
 * execution — can parent under it. Pair with endToolSpan.
 *
 * @param toolName - Name of the tool being invoked
 * @param toolAttributes - Extra span attributes to merge in
 * @param toolInput - Serialized tool input; forwarded to beta attributes only
 */
export function startToolSpan(
  toolName: string,
  toolAttributes?: Record<string, string | number | boolean>,
  toolInput?: string,
): Span {
  // Start Perfetto span regardless of OTel tracing state
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startToolPerfettoSpan(toolName, toolAttributes)
    : undefined

  if (!isAnyTracingEnabled()) {
    // Still track Perfetto span even if OTel is disabled
    if (perfettoSpanId) {
      // Dummy OTel span carries the Perfetto id so endToolSpan can close it.
      const dummySpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
      const spanId = getSpanId(dummySpan)
      const spanContextObj: SpanContext = {
        span: dummySpan,
        startTime: Date.now(),
        attributes: { 'span.type': 'tool', tool_name: toolName },
        perfettoSpanId,
      }
      activeSpans.set(spanId, new WeakRef(spanContextObj))
      toolContext.enterWith(spanContextObj)
      return dummySpan
    }
    return trace.getActiveSpan() || getTracer().startSpan('dummy')
  }

  const tracer = getTracer()
  const parentSpanCtx = interactionContext.getStore()

  const attributes = createSpanAttributes('tool', {
    tool_name: toolName,
    ...toolAttributes,
  })

  // Parent under the interaction span when one is active.
  const ctx = parentSpanCtx
    ? trace.setSpan(otelContext.active(), parentSpanCtx.span)
    : otelContext.active()
  const span = tracer.startSpan('claude_code.tool', { attributes }, ctx)

  // Add experimental tool input attributes
  if (toolInput) {
    addBetaToolInputAttributes(span, toolName, toolInput)
  }

  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(spanId, new WeakRef(spanContextObj))

  // ALS holds the strong reference until endToolSpan clears it.
  toolContext.enterWith(spanContextObj)

  return span
}
|
||||
|
||||
/**
 * Start a span covering time spent waiting on the user (permission prompt).
 * Parented to the current tool span when one is active. Pair with
 * endToolBlockedOnUserSpan, which looks the span up by 'span.type'.
 */
export function startToolBlockedOnUserSpan(): Span {
  // Start Perfetto span regardless of OTel tracing state
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startUserInputPerfettoSpan('tool_permission')
    : undefined

  if (!isAnyTracingEnabled()) {
    // Still track Perfetto span even if OTel is disabled
    if (perfettoSpanId) {
      const dummySpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
      const spanId = getSpanId(dummySpan)
      const spanContextObj: SpanContext = {
        span: dummySpan,
        startTime: Date.now(),
        attributes: { 'span.type': 'tool.blocked_on_user' },
        perfettoSpanId,
      }
      activeSpans.set(spanId, new WeakRef(spanContextObj))
      strongSpans.set(spanId, spanContextObj)
      return dummySpan
    }
    return trace.getActiveSpan() || getTracer().startSpan('dummy')
  }

  const tracer = getTracer()
  const parentSpanCtx = toolContext.getStore()

  const attributes = createSpanAttributes('tool.blocked_on_user')

  const ctx = parentSpanCtx
    ? trace.setSpan(otelContext.active(), parentSpanCtx.span)
    : otelContext.active()
  const span = tracer.startSpan(
    'claude_code.tool.blocked_on_user',
    { attributes },
    ctx,
  )

  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(spanId, new WeakRef(spanContextObj))
  // Strong ref: this span is not held in ALS, so without it the WeakRef could
  // go stale before endToolBlockedOnUserSpan retrieves the context.
  strongSpans.set(spanId, spanContextObj)

  return span
}
|
||||
|
||||
/**
 * End the most recent tool.blocked_on_user span (found by 'span.type' in
 * activeSpans — these spans are assumed not to nest). No-op when none is
 * active.
 *
 * @param decision - What the user decided, recorded when provided
 * @param source - Where the decision came from, recorded when provided
 */
export function endToolBlockedOnUserSpan(
  decision?: string,
  source?: string,
): void {
  const blockedSpanContext = Array.from(activeSpans.values())
    .findLast(
      r => r.deref()?.attributes['span.type'] === 'tool.blocked_on_user',
    )
    ?.deref()

  if (!blockedSpanContext) {
    return
  }

  // End Perfetto span
  if (blockedSpanContext.perfettoSpanId) {
    endUserInputPerfettoSpan(blockedSpanContext.perfettoSpanId, {
      decision,
      source,
    })
  }

  if (!isAnyTracingEnabled()) {
    // Perfetto-only path: just drop the bookkeeping entries.
    const spanId = getSpanId(blockedSpanContext.span)
    activeSpans.delete(spanId)
    strongSpans.delete(spanId)
    return
  }

  const duration = Date.now() - blockedSpanContext.startTime
  const attributes: Record<string, string | number | boolean> = {
    duration_ms: duration,
  }

  if (decision) {
    attributes['decision'] = decision
  }
  if (source) {
    attributes['source'] = source
  }

  blockedSpanContext.span.setAttributes(attributes)
  blockedSpanContext.span.end()

  const spanId = getSpanId(blockedSpanContext.span)
  activeSpans.delete(spanId)
  strongSpans.delete(spanId)
}
|
||||
|
||||
export function startToolExecutionSpan(): Span {
|
||||
if (!isAnyTracingEnabled()) {
|
||||
return trace.getActiveSpan() || getTracer().startSpan('dummy')
|
||||
}
|
||||
|
||||
const tracer = getTracer()
|
||||
const parentSpanCtx = toolContext.getStore()
|
||||
|
||||
const attributes = createSpanAttributes('tool.execution')
|
||||
|
||||
const ctx = parentSpanCtx
|
||||
? trace.setSpan(otelContext.active(), parentSpanCtx.span)
|
||||
: otelContext.active()
|
||||
const span = tracer.startSpan(
|
||||
'claude_code.tool.execution',
|
||||
{ attributes },
|
||||
ctx,
|
||||
)
|
||||
|
||||
const spanId = getSpanId(span)
|
||||
const spanContextObj: SpanContext = {
|
||||
span,
|
||||
startTime: Date.now(),
|
||||
attributes,
|
||||
}
|
||||
activeSpans.set(spanId, new WeakRef(spanContextObj))
|
||||
strongSpans.set(spanId, spanContextObj)
|
||||
|
||||
return span
|
||||
}
|
||||
|
||||
export function endToolExecutionSpan(metadata?: {
|
||||
success?: boolean
|
||||
error?: string
|
||||
}): void {
|
||||
if (!isAnyTracingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const executionSpanContext = Array.from(activeSpans.values())
|
||||
.findLast(r => r.deref()?.attributes['span.type'] === 'tool.execution')
|
||||
?.deref()
|
||||
|
||||
if (!executionSpanContext) {
|
||||
return
|
||||
}
|
||||
|
||||
const duration = Date.now() - executionSpanContext.startTime
|
||||
const attributes: Record<string, string | number | boolean> = {
|
||||
duration_ms: duration,
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
if (metadata.success !== undefined) attributes['success'] = metadata.success
|
||||
if (metadata.error !== undefined) attributes['error'] = metadata.error
|
||||
}
|
||||
|
||||
executionSpanContext.span.setAttributes(attributes)
|
||||
executionSpanContext.span.end()
|
||||
|
||||
const spanId = getSpanId(executionSpanContext.span)
|
||||
activeSpans.delete(spanId)
|
||||
strongSpans.delete(spanId)
|
||||
}
|
||||
|
||||
/**
 * End the current tool span (looked up via AsyncLocalStorage) and clear the
 * tool context. No-op when no tool span is active.
 *
 * @param toolResult - Serialized tool result; forwarded to beta attributes
 * @param resultTokens - Token count of the result, recorded when provided
 */
export function endToolSpan(toolResult?: string, resultTokens?: number): void {
  const toolSpanContext = toolContext.getStore()

  if (!toolSpanContext) {
    return
  }

  // End Perfetto span
  if (toolSpanContext.perfettoSpanId) {
    endToolPerfettoSpan(toolSpanContext.perfettoSpanId, {
      success: true,
      resultTokens,
    })
  }

  if (!isAnyTracingEnabled()) {
    const spanId = getSpanId(toolSpanContext.span)
    activeSpans.delete(spanId)
    // Same reasoning as interactionContext above: clear so subsequent async
    // work doesn't hold a stale reference to the ended tool span.
    toolContext.enterWith(undefined)
    return
  }

  const duration = Date.now() - toolSpanContext.startTime
  const endAttributes: Record<string, string | number | boolean> = {
    duration_ms: duration,
  }

  // Add experimental tool result attributes (new_context)
  if (toolResult) {
    const toolName = toolSpanContext.attributes['tool_name'] || 'unknown'
    addBetaToolResultAttributes(endAttributes, toolName, toolResult)
  }

  if (resultTokens !== undefined) {
    endAttributes['result_tokens'] = resultTokens
  }

  toolSpanContext.span.setAttributes(endAttributes)
  toolSpanContext.span.end()

  const spanId = getSpanId(toolSpanContext.span)
  activeSpans.delete(spanId)
  toolContext.enterWith(undefined)
}
|
||||
|
||||
function isToolContentLoggingEnabled(): boolean {
|
||||
return isEnvTruthy(process.env.OTEL_LOG_TOOL_CONTENT)
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a span event with tool content/output data.
|
||||
* Only logs if OTEL_LOG_TOOL_CONTENT=1 is set.
|
||||
* Truncates content if it exceeds MAX_CONTENT_SIZE.
|
||||
*/
|
||||
export function addToolContentEvent(
|
||||
eventName: string,
|
||||
attributes: Record<string, string | number | boolean>,
|
||||
): void {
|
||||
if (!isAnyTracingEnabled() || !isToolContentLoggingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const currentSpanCtx = toolContext.getStore()
|
||||
if (!currentSpanCtx) {
|
||||
return
|
||||
}
|
||||
|
||||
// Truncate string attributes that might be large
|
||||
const processedAttributes: Record<string, string | number | boolean> = {}
|
||||
for (const [key, value] of Object.entries(attributes)) {
|
||||
if (typeof value === 'string') {
|
||||
const { content, truncated } = truncateContent(value)
|
||||
processedAttributes[key] = content
|
||||
if (truncated) {
|
||||
processedAttributes[`${key}_truncated`] = true
|
||||
processedAttributes[`${key}_original_length`] = value.length
|
||||
}
|
||||
} else {
|
||||
processedAttributes[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
currentSpanCtx.span.addEvent(eventName, processedAttributes)
|
||||
}
|
||||
|
||||
export function getCurrentSpan(): Span | null {
|
||||
if (!isAnyTracingEnabled()) {
|
||||
return null
|
||||
}
|
||||
|
||||
return (
|
||||
toolContext.getStore()?.span ?? interactionContext.getStore()?.span ?? null
|
||||
)
|
||||
}
|
||||
|
||||
export async function executeInSpan<T>(
|
||||
spanName: string,
|
||||
fn: (span: Span) => Promise<T>,
|
||||
attributes?: Record<string, string | number | boolean>,
|
||||
): Promise<T> {
|
||||
if (!isAnyTracingEnabled()) {
|
||||
return fn(trace.getActiveSpan() || getTracer().startSpan('dummy'))
|
||||
}
|
||||
|
||||
const tracer = getTracer()
|
||||
const parentSpanCtx = toolContext.getStore() ?? interactionContext.getStore()
|
||||
|
||||
const finalAttributes = createSpanAttributes('tool', {
|
||||
...attributes,
|
||||
})
|
||||
|
||||
const ctx = parentSpanCtx
|
||||
? trace.setSpan(otelContext.active(), parentSpanCtx.span)
|
||||
: otelContext.active()
|
||||
const span = tracer.startSpan(spanName, { attributes: finalAttributes }, ctx)
|
||||
|
||||
const spanId = getSpanId(span)
|
||||
const spanContextObj: SpanContext = {
|
||||
span,
|
||||
startTime: Date.now(),
|
||||
attributes: finalAttributes,
|
||||
}
|
||||
activeSpans.set(spanId, new WeakRef(spanContextObj))
|
||||
strongSpans.set(spanId, spanContextObj)
|
||||
|
||||
try {
|
||||
const result = await fn(span)
|
||||
span.end()
|
||||
activeSpans.delete(spanId)
|
||||
strongSpans.delete(spanId)
|
||||
return result
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
span.recordException(error)
|
||||
}
|
||||
span.end()
|
||||
activeSpans.delete(spanId)
|
||||
strongSpans.delete(spanId)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start a hook execution span.
|
||||
* Only creates a span when beta tracing is enabled.
|
||||
* @param hookEvent The hook event type (e.g., 'PreToolUse', 'PostToolUse')
|
||||
* @param hookName The full hook name (e.g., 'PreToolUse:Write')
|
||||
* @param numHooks The number of hooks being executed
|
||||
* @param hookDefinitions JSON string of hook definitions for tracing
|
||||
* @returns The span (or a dummy span if tracing is disabled)
|
||||
*/
|
||||
export function startHookSpan(
|
||||
hookEvent: string,
|
||||
hookName: string,
|
||||
numHooks: number,
|
||||
hookDefinitions: string,
|
||||
): Span {
|
||||
if (!isBetaTracingEnabled()) {
|
||||
return trace.getActiveSpan() || getTracer().startSpan('dummy')
|
||||
}
|
||||
|
||||
const tracer = getTracer()
|
||||
const parentSpanCtx = toolContext.getStore() ?? interactionContext.getStore()
|
||||
|
||||
const attributes = createSpanAttributes('hook', {
|
||||
hook_event: hookEvent,
|
||||
hook_name: hookName,
|
||||
num_hooks: numHooks,
|
||||
hook_definitions: hookDefinitions,
|
||||
})
|
||||
|
||||
const ctx = parentSpanCtx
|
||||
? trace.setSpan(otelContext.active(), parentSpanCtx.span)
|
||||
: otelContext.active()
|
||||
const span = tracer.startSpan('claude_code.hook', { attributes }, ctx)
|
||||
|
||||
const spanId = getSpanId(span)
|
||||
const spanContextObj: SpanContext = {
|
||||
span,
|
||||
startTime: Date.now(),
|
||||
attributes,
|
||||
}
|
||||
activeSpans.set(spanId, new WeakRef(spanContextObj))
|
||||
strongSpans.set(spanId, spanContextObj)
|
||||
|
||||
return span
|
||||
}
|
||||
|
||||
/**
|
||||
* End a hook execution span with outcome metadata.
|
||||
* Only does work when beta tracing is enabled.
|
||||
* @param span The span to end (returned from startHookSpan)
|
||||
* @param metadata The outcome metadata for the hook execution
|
||||
*/
|
||||
export function endHookSpan(
|
||||
span: Span,
|
||||
metadata?: {
|
||||
numSuccess?: number
|
||||
numBlocking?: number
|
||||
numNonBlockingError?: number
|
||||
numCancelled?: number
|
||||
},
|
||||
): void {
|
||||
if (!isBetaTracingEnabled()) {
|
||||
return
|
||||
}
|
||||
|
||||
const spanId = getSpanId(span)
|
||||
const spanContext = activeSpans.get(spanId)?.deref()
|
||||
|
||||
if (!spanContext) {
|
||||
return
|
||||
}
|
||||
|
||||
const duration = Date.now() - spanContext.startTime
|
||||
const endAttributes: Record<string, string | number | boolean> = {
|
||||
duration_ms: duration,
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
if (metadata.numSuccess !== undefined)
|
||||
endAttributes['num_success'] = metadata.numSuccess
|
||||
if (metadata.numBlocking !== undefined)
|
||||
endAttributes['num_blocking'] = metadata.numBlocking
|
||||
if (metadata.numNonBlockingError !== undefined)
|
||||
endAttributes['num_non_blocking_error'] = metadata.numNonBlockingError
|
||||
if (metadata.numCancelled !== undefined)
|
||||
endAttributes['num_cancelled'] = metadata.numCancelled
|
||||
}
|
||||
|
||||
spanContext.span.setAttributes(endAttributes)
|
||||
spanContext.span.end()
|
||||
activeSpans.delete(spanId)
|
||||
strongSpans.delete(spanId)
|
||||
}
|
||||
39
src/utils/telemetry/skillLoadedEvent.ts
Normal file
39
src/utils/telemetry/skillLoadedEvent.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import { getSkillToolCommands } from '../../commands.js'
|
||||
import {
|
||||
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
type AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
|
||||
logEvent,
|
||||
} from '../../services/analytics/index.js'
|
||||
import { getCharBudget } from '../../tools/SkillTool/prompt.js'
|
||||
|
||||
/**
|
||||
* Logs a tengu_skill_loaded event for each skill available at session startup.
|
||||
* This enables analytics on which skills are available across sessions.
|
||||
*/
|
||||
export async function logSkillsLoaded(
|
||||
cwd: string,
|
||||
contextWindowTokens: number,
|
||||
): Promise<void> {
|
||||
const skills = await getSkillToolCommands(cwd)
|
||||
const skillBudget = getCharBudget(contextWindowTokens)
|
||||
|
||||
for (const skill of skills) {
|
||||
if (skill.type !== 'prompt') continue
|
||||
|
||||
logEvent('tengu_skill_loaded', {
|
||||
// _PROTO_skill_name routes to the privileged skill_name BQ column.
|
||||
// Unredacted names don't go in additional_metadata.
|
||||
_PROTO_skill_name:
|
||||
skill.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
|
||||
skill_source:
|
||||
skill.source as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
skill_loaded_from:
|
||||
skill.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
skill_budget: skillBudget,
|
||||
...(skill.kind && {
|
||||
skill_kind:
|
||||
skill.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
||||
}),
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user