chore: initialize recovered claude workspace

This commit is contained in:
2026-04-02 15:29:01 +08:00
commit a10efa3b4b
1940 changed files with 506426 additions and 0 deletions

View File

@@ -0,0 +1,491 @@
/**
* Beta Session Tracing for Claude Code
*
* This module contains beta tracing features enabled when
* ENABLE_BETA_TRACING_DETAILED=1 and BETA_TRACING_ENDPOINT are set.
*
* For external users, tracing is enabled in SDK/headless mode, or in
* interactive mode when the org is allowlisted via the
* tengu_trace_lantern GrowthBook gate.
* For ant users, tracing is enabled in all modes.
*
* Visibility Rules:
* | Content | External | Ant |
* |------------------|----------|------|
* | System prompts | ✅ | ✅ |
* | Model output | ✅ | ✅ |
* | Thinking output | ❌ | ✅ |
* | Tools | ✅ | ✅ |
* | new_context | ✅ | ✅ |
*
* Features:
* - Per-agent message tracking with hash-based deduplication
* - System prompt logging (once per unique hash)
* - Hook execution spans
* - Detailed new_context attributes for LLM requests
*/
import type { Span } from '@opentelemetry/api'
import { createHash } from 'crypto'
import { getIsNonInteractiveSession } from '../../bootstrap/state.js'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
import { sanitizeToolNameForAnalytics } from '../../services/analytics/metadata.js'
import type { AssistantMessage, UserMessage } from '../../types/message.js'
import { isEnvTruthy } from '../envUtils.js'
import { jsonParse, jsonStringify } from '../slowOperations.js'
import { logOTelEvent } from './events.js'
// Message type for API calls (UserMessage or AssistantMessage)
type APIMessage = UserMessage | AssistantMessage
/**
 * Track hashes we've already logged this session (system prompts, tools, etc).
 *
 * WHY: System prompts and tool schemas are large and rarely change within a session.
 * Sending full content on every request would be wasteful. Instead, we hash and
 * only log the full content once per unique hash.
 *
 * Cleared by clearBetaTracingState() after compaction.
 */
const seenHashes = new Set<string>()
/**
 * Track the last reported message hash per querySource (agent) for incremental context.
 *
 * WHY: When debugging traces, we want to see what NEW information was added each turn,
 * not the entire conversation history (which can be huge). By tracking the last message
 * we reported per agent, we can compute and send only the delta (new messages since
 * the last request). This is tracked per-agent (querySource) because different agents
 * (main thread, subagents, warmup requests) have independent conversation contexts.
 *
 * Cleared by clearBetaTracingState() after compaction.
 */
const lastReportedMessageHash = new Map<string, string>()
/**
 * Reset all per-session dedup/delta tracking.
 *
 * Called after compaction: once the conversation history has been rewritten,
 * the previously recorded hashes no longer correspond to live messages.
 */
export function clearBetaTracingState(): void {
  seenHashes.clear()
  lastReportedMessageHash.clear()
  // Nothing else to reset: all beta-tracing state lives in these two stores.
}
// Honeycomb rejects attribute values over 64KB; cap at 60KB to leave headroom.
const MAX_CONTENT_SIZE = 60 * 1024 // 60KB (Honeycomb limit is 64KB, staying safe)
/**
 * Truncate content to fit within Honeycomb attribute-size limits.
 *
 * @param content - Raw string to (possibly) truncate.
 * @param maxSize - Maximum number of characters to keep (defaults to 60KB).
 * @returns The (possibly truncated) content and whether truncation occurred.
 */
export function truncateContent(
  content: string,
  maxSize: number = MAX_CONTENT_SIZE,
): { content: string; truncated: boolean } {
  if (content.length <= maxSize) {
    return { content, truncated: false }
  }
  // Derive the limit label from maxSize instead of hard-coding "60KB", so a
  // caller-supplied maxSize does not produce a misleading marker. For the
  // default maxSize this renders the same "60KB limit" text as before.
  const limitKb = Math.round(maxSize / 1024)
  return {
    content:
      content.slice(0, maxSize) +
      `\n\n[TRUNCATED - Content exceeds ${limitKb}KB limit]`,
    truncated: true,
  }
}
/**
 * Produce a compact content fingerprint: the first 12 hex characters (48 bits)
 * of the SHA-256 digest — plenty for per-session dedup purposes.
 */
function shortHash(content: string): string {
  const fullDigest = createHash('sha256').update(content).digest('hex')
  return fullDigest.substring(0, 12)
}
/**
 * Namespaced hash for a system prompt. The "sp_" prefix keeps these
 * distinguishable from message/tool hashes stored in the same dedup set.
 */
function hashSystemPrompt(systemPrompt: string): string {
  return 'sp_' + shortHash(systemPrompt)
}
/**
 * Namespaced hash for an API message, keyed off its serialized content blocks
 * (role and other envelope fields are not part of the hash).
 */
function hashMessage(message: APIMessage): string {
  const serialized = jsonStringify(message.message.content)
  return 'msg_' + shortHash(serialized)
}
// Matches text that consists solely of a single <system-reminder>…</system-reminder>
// wrapper (optionally with one newline just inside each tag).
const SYSTEM_REMINDER_REGEX =
  /^<system-reminder>\n?([\s\S]*?)\n?<\/system-reminder>$/
/**
 * If `text` is entirely a system reminder, return the trimmed inner content;
 * otherwise return null. An empty reminder body also yields null.
 */
function extractSystemReminderContent(text: string): string | null {
  const match = SYSTEM_REMINDER_REGEX.exec(text.trim())
  if (!match || !match[1]) {
    return null
  }
  return match[1].trim()
}
/**
 * Result of formatting messages - separates regular content from system reminders.
 */
interface FormattedMessages {
  // User text and tool results, formatted with [USER]/[TOOL RESULT: id] headers.
  contextParts: string[]
  // Inner text of <system-reminder> blocks, reported as a separate attribute.
  systemReminders: string[]
}
/**
 * Format user messages for new_context display, separating system reminders.
 * Only handles user messages (assistant messages are filtered out before this is called).
 */
function formatMessagesForContext(messages: UserMessage[]): FormattedMessages {
  const result: FormattedMessages = { contextParts: [], systemReminders: [] }

  // Route one piece of text: reminders go to systemReminders, everything else
  // goes to contextParts under the supplied header line.
  const route = (text: string, header: string): void => {
    const reminder = extractSystemReminderContent(text)
    if (reminder) {
      result.systemReminders.push(reminder)
    } else {
      result.contextParts.push(`${header}\n${text}`)
    }
  }

  for (const message of messages) {
    const content = message.message.content
    if (typeof content === 'string') {
      route(content, '[USER]')
      continue
    }
    if (!Array.isArray(content)) {
      continue
    }
    for (const block of content) {
      if (block.type === 'text') {
        route(block.text, '[USER]')
      } else if (block.type === 'tool_result') {
        const resultContent =
          typeof block.content === 'string'
            ? block.content
            : jsonStringify(block.content)
        // Tool results can also contain system reminders (e.g., malware warning)
        route(resultContent, `[TOOL RESULT: ${block.tool_use_id}]`)
      }
    }
  }
  return result
}
/** Context describing what is new in an LLM request, for span attribution. */
export interface LLMRequestNewContext {
  /** System prompt (typically only on first request or if changed) */
  systemPrompt?: string
  /** Query source identifying the agent/purpose (e.g., 'repl_main_thread', 'agent:builtin') */
  querySource?: string
  /** Tool schemas sent with the request, as a JSON-serialized array */
  tools?: string
}
/**
 * Add beta attributes to an interaction span.
 * Adds new_context containing the raw user prompt, truncated to size limits.
 */
export function addBetaInteractionAttributes(
  span: Span,
  userPrompt: string,
): void {
  if (!isBetaTracingEnabled()) {
    return
  }
  const result = truncateContent(`[USER PROMPT]\n${userPrompt}`)
  const attributes: Record<string, string | number | boolean> = {
    new_context: result.content,
  }
  if (result.truncated) {
    attributes.new_context_truncated = true
    attributes.new_context_original_length = userPrompt.length
  }
  span.setAttributes(attributes)
}
/**
 * Add beta attributes to an LLM request span.
 * Handles system prompt logging and new_context computation.
 *
 * @param span - Active LLM request span to annotate.
 * @param newContext - Optional system prompt / query source / serialized tools
 *   for this request.
 * @param messagesForAPI - Full message array sent to the API; used to compute
 *   the per-agent delta since the last reported request.
 *
 * Side effects: mutates module-level `seenHashes` (prompt/tool dedup) and
 * `lastReportedMessageHash` (per-querySource delta tracking), and fires
 * fire-and-forget logOTelEvent calls for unseen prompts/tools.
 */
export function addBetaLLMRequestAttributes(
  span: Span,
  newContext?: LLMRequestNewContext,
  messagesForAPI?: APIMessage[],
): void {
  if (!isBetaTracingEnabled()) {
    return
  }
  // Add system prompt info to the span
  if (newContext?.systemPrompt) {
    const promptHash = hashSystemPrompt(newContext.systemPrompt)
    const preview = newContext.systemPrompt.slice(0, 500)
    // Always add hash, preview, and length to the span
    span.setAttribute('system_prompt_hash', promptHash)
    span.setAttribute('system_prompt_preview', preview)
    span.setAttribute('system_prompt_length', newContext.systemPrompt.length)
    // Log the full system prompt only once per unique hash this session
    if (!seenHashes.has(promptHash)) {
      seenHashes.add(promptHash)
      // Truncate for the log if needed
      const { content: truncatedPrompt, truncated } = truncateContent(
        newContext.systemPrompt,
      )
      // Fire-and-forget: event logging must not block the request path
      void logOTelEvent('system_prompt', {
        system_prompt_hash: promptHash,
        system_prompt: truncatedPrompt,
        system_prompt_length: String(newContext.systemPrompt.length),
        ...(truncated && { system_prompt_truncated: 'true' }),
      })
    }
  }
  // Add tools info to the span
  if (newContext?.tools) {
    try {
      const toolsArray = jsonParse(newContext.tools) as Record<
        string,
        unknown
      >[]
      // Build array of {name, hash} for each tool
      const toolsWithHashes = toolsArray.map(tool => {
        const toolJson = jsonStringify(tool)
        const toolHash = shortHash(toolJson)
        return {
          name: typeof tool.name === 'string' ? tool.name : 'unknown',
          hash: toolHash,
          json: toolJson,
        }
      })
      // Set span attribute with array of name/hash pairs
      span.setAttribute(
        'tools',
        jsonStringify(
          toolsWithHashes.map(({ name, hash }) => ({ name, hash })),
        ),
      )
      span.setAttribute('tools_count', toolsWithHashes.length)
      // Log each tool's full description once per unique hash
      for (const { name, hash, json } of toolsWithHashes) {
        // "tool_" prefix keeps tool hashes from colliding with prompt hashes
        if (!seenHashes.has(`tool_${hash}`)) {
          seenHashes.add(`tool_${hash}`)
          const { content: truncatedTool, truncated } = truncateContent(json)
          void logOTelEvent('tool', {
            tool_name: sanitizeToolNameForAnalytics(name),
            tool_hash: hash,
            tool: truncatedTool,
            ...(truncated && { tool_truncated: 'true' }),
          })
        }
      }
    } catch {
      // If parsing fails, log the raw tools string
      span.setAttribute('tools_parse_error', true)
    }
  }
  // Add new_context using hash-based tracking (visible to all users)
  if (messagesForAPI && messagesForAPI.length > 0 && newContext?.querySource) {
    const querySource = newContext.querySource
    const lastHash = lastReportedMessageHash.get(querySource)
    // Find where the last reported message is in the array
    let startIndex = 0
    if (lastHash) {
      for (let i = 0; i < messagesForAPI.length; i++) {
        const msg = messagesForAPI[i]
        if (msg && hashMessage(msg) === lastHash) {
          startIndex = i + 1 // Start after the last reported message
          break
        }
      }
      // If lastHash not found, startIndex stays 0 (send everything)
    }
    // Get new messages (filter out assistant messages - we only want user input/tool results)
    const newMessages = messagesForAPI
      .slice(startIndex)
      .filter((m): m is UserMessage => m.type === 'user')
    if (newMessages.length > 0) {
      // Format new messages, separating system reminders from regular content
      const { contextParts, systemReminders } =
        formatMessagesForContext(newMessages)
      // Set new_context (regular user content and tool results)
      if (contextParts.length > 0) {
        const fullContext = contextParts.join('\n\n---\n\n')
        const { content: truncatedContext, truncated } =
          truncateContent(fullContext)
        span.setAttributes({
          new_context: truncatedContext,
          new_context_message_count: newMessages.length,
          ...(truncated && {
            new_context_truncated: true,
            new_context_original_length: fullContext.length,
          }),
        })
      }
      // Set system_reminders as a separate attribute
      if (systemReminders.length > 0) {
        const fullReminders = systemReminders.join('\n\n---\n\n')
        const { content: truncatedReminders, truncated: remindersTruncated } =
          truncateContent(fullReminders)
        span.setAttributes({
          system_reminders: truncatedReminders,
          system_reminders_count: systemReminders.length,
          ...(remindersTruncated && {
            system_reminders_truncated: true,
            system_reminders_original_length: fullReminders.length,
          }),
        })
      }
      // Update last reported hash to the last message in the array
      // (NOTE: updated only when something new was reported, so an all-assistant
      // delta will be re-examined on the next request)
      const lastMessage = messagesForAPI[messagesForAPI.length - 1]
      if (lastMessage) {
        lastReportedMessageHash.set(querySource, hashMessage(lastMessage))
      }
    }
  }
}
/**
 * Add beta attributes to endLLMRequestSpan.
 * Handles model_output and thinking_output truncation.
 */
export function addBetaLLMResponseAttributes(
  endAttributes: Record<string, string | number | boolean>,
  metadata?: {
    modelOutput?: string
    thinkingOutput?: string
  },
): void {
  if (!isBetaTracingEnabled() || !metadata) {
    return
  }

  // Write one response field into endAttributes, with truncation markers.
  const record = (key: string, raw: string): void => {
    const { content, truncated } = truncateContent(raw)
    endAttributes[`response.${key}`] = content
    if (truncated) {
      endAttributes[`response.${key}_truncated`] = true
      endAttributes[`response.${key}_original_length`] = raw.length
    }
  }

  // model_output (text content) is visible to all users.
  if (metadata.modelOutput !== undefined) {
    record('model_output', metadata.modelOutput)
  }
  // thinking_output is restricted to ant users (see visibility table above).
  if (
    process.env.USER_TYPE === 'ant' &&
    metadata.thinkingOutput !== undefined
  ) {
    record('thinking_output', metadata.thinkingOutput)
  }
}
/**
 * Add beta attributes to startToolSpan.
 * Adds tool_input with the serialized tool input, truncated to size limits.
 */
export function addBetaToolInputAttributes(
  span: Span,
  toolName: string,
  toolInput: string,
): void {
  if (!isBetaTracingEnabled()) {
    return
  }
  const labeled = `[TOOL INPUT: ${toolName}]\n${toolInput}`
  const { content, truncated } = truncateContent(labeled)
  const attributes: Record<string, string | number | boolean> = {
    tool_input: content,
  }
  if (truncated) {
    attributes.tool_input_truncated = true
    attributes.tool_input_original_length = toolInput.length
  }
  span.setAttributes(attributes)
}
/**
 * Add beta attributes to endToolSpan.
 * Adds new_context with the tool result, truncated to size limits.
 */
export function addBetaToolResultAttributes(
  endAttributes: Record<string, string | number | boolean>,
  toolName: string | number | boolean,
  toolResult: string,
): void {
  if (!isBetaTracingEnabled()) {
    return
  }
  const { content, truncated } = truncateContent(
    `[TOOL RESULT: ${toolName}]\n${toolResult}`,
  )
  endAttributes.new_context = content
  if (!truncated) {
    return
  }
  endAttributes.new_context_truncated = true
  endAttributes.new_context_original_length = toolResult.length
}

View File

@@ -0,0 +1,252 @@
import type { Attributes, HrTime } from '@opentelemetry/api'
import { type ExportResult, ExportResultCode } from '@opentelemetry/core'
import {
AggregationTemporality,
type MetricData,
type DataPoint as OTelDataPoint,
type PushMetricExporter,
type ResourceMetrics,
} from '@opentelemetry/sdk-metrics'
import axios from 'axios'
import { checkMetricsEnabled } from 'src/services/api/metricsOptOut.js'
import { getIsNonInteractiveSession } from '../../bootstrap/state.js'
import { getSubscriptionType, isClaudeAISubscriber } from '../auth.js'
import { checkHasTrustDialogAccepted } from '../config.js'
import { logForDebugging } from '../debug.js'
import { errorMessage, toError } from '../errors.js'
import { getAuthHeaders } from '../http.js'
import { logError } from '../log.js'
import { jsonStringify } from '../slowOperations.js'
import { getClaudeCodeUserAgent } from '../userAgent.js'
// A single exported data point; attribute values are stringified for BigQuery.
type DataPoint = {
  attributes: Record<string, string>
  value: number
  timestamp: string // ISO-8601, derived from the OTel HrTime
}
// One metric stream with its collected data points.
type Metric = {
  name: string
  description?: string
  unit?: string
  data_points: DataPoint[]
}
// Request body for the internal /api/claude_code/metrics endpoint.
type InternalMetricsPayload = {
  resource_attributes: Record<string, string>
  metrics: Metric[]
}
/**
 * Pushes OTel metrics to Anthropic's internal (BigQuery-backed) metrics
 * endpoint via HTTP POST.
 *
 * Export is skipped (reported as SUCCESS) when trust is not yet established
 * in interactive mode or when the organization has opted out of metrics.
 * In-flight exports are tracked so forceFlush/shutdown can await them.
 */
export class BigQueryMetricsExporter implements PushMetricExporter {
  private readonly endpoint: string
  private readonly timeout: number
  /** Promises for exports still in flight; awaited by forceFlush(). */
  private pendingExports: Promise<void>[] = []
  private isShutdown = false

  constructor(options: { timeout?: number } = {}) {
    const defaultEndpoint = 'https://api.anthropic.com/api/claude_code/metrics'
    // Ant users may redirect metrics to an internal endpoint via env override.
    if (
      process.env.USER_TYPE === 'ant' &&
      process.env.ANT_CLAUDE_CODE_METRICS_ENDPOINT
    ) {
      this.endpoint =
        process.env.ANT_CLAUDE_CODE_METRICS_ENDPOINT +
        '/api/claude_code/metrics'
    } else {
      this.endpoint = defaultEndpoint
    }
    // ?? instead of ||: an explicit timeout of 0 (axios: "no timeout") is
    // honored rather than silently replaced with the 5s default.
    this.timeout = options.timeout ?? 5000
  }

  /**
   * PushMetricExporter entry point. Kicks off an async export and tracks the
   * promise; the result is delivered via resultCallback from doExport().
   */
  async export(
    metrics: ResourceMetrics,
    resultCallback: (result: ExportResult) => void,
  ): Promise<void> {
    if (this.isShutdown) {
      resultCallback({
        code: ExportResultCode.FAILED,
        error: new Error('Exporter has been shutdown'),
      })
      return
    }
    const exportPromise = this.doExport(metrics, resultCallback)
    this.pendingExports.push(exportPromise)
    // Clean up completed exports
    void exportPromise.finally(() => {
      const index = this.pendingExports.indexOf(exportPromise)
      if (index > -1) {
        void this.pendingExports.splice(index, 1)
      }
    })
  }

  /** Performs one export: gate checks, payload transform, authenticated POST. */
  private async doExport(
    metrics: ResourceMetrics,
    resultCallback: (result: ExportResult) => void,
  ): Promise<void> {
    try {
      // Skip if trust not established in interactive mode
      // This prevents triggering apiKeyHelper before trust dialog
      const hasTrust =
        checkHasTrustDialogAccepted() || getIsNonInteractiveSession()
      if (!hasTrust) {
        logForDebugging(
          'BigQuery metrics export: trust not established, skipping',
        )
        resultCallback({ code: ExportResultCode.SUCCESS })
        return
      }
      // Check organization-level metrics opt-out
      const metricsStatus = await checkMetricsEnabled()
      if (!metricsStatus.enabled) {
        logForDebugging('Metrics export disabled by organization setting')
        resultCallback({ code: ExportResultCode.SUCCESS })
        return
      }
      const payload = this.transformMetricsForInternal(metrics)
      const authResult = getAuthHeaders()
      if (authResult.error) {
        logForDebugging(`Metrics export failed: ${authResult.error}`)
        resultCallback({
          code: ExportResultCode.FAILED,
          error: new Error(authResult.error),
        })
        return
      }
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
        'User-Agent': getClaudeCodeUserAgent(),
        ...authResult.headers,
      }
      const response = await axios.post(this.endpoint, payload, {
        timeout: this.timeout,
        headers,
      })
      logForDebugging('BigQuery metrics exported successfully')
      logForDebugging(
        `BigQuery API Response: ${jsonStringify(response.data, null, 2)}`,
      )
      resultCallback({ code: ExportResultCode.SUCCESS })
    } catch (error) {
      logForDebugging(`BigQuery metrics export failed: ${errorMessage(error)}`)
      logError(error)
      resultCallback({
        code: ExportResultCode.FAILED,
        error: toError(error),
      })
    }
  }

  /**
   * Convert OTel ResourceMetrics into the internal payload shape:
   * a flat resource-attribute map plus flattened metric streams.
   */
  private transformMetricsForInternal(
    metrics: ResourceMetrics,
  ): InternalMetricsPayload {
    const attrs = metrics.resource.attributes
    const resourceAttributes: Record<string, string> = {
      'service.name': (attrs['service.name'] as string) || 'claude-code',
      'service.version': (attrs['service.version'] as string) || 'unknown',
      'os.type': (attrs['os.type'] as string) || 'unknown',
      'os.version': (attrs['os.version'] as string) || 'unknown',
      'host.arch': (attrs['host.arch'] as string) || 'unknown',
      'aggregation.temporality':
        this.selectAggregationTemporality() === AggregationTemporality.DELTA
          ? 'delta'
          : 'cumulative',
    }
    // Only add wsl.version if it exists (omit instead of default)
    if (attrs['wsl.version']) {
      resourceAttributes['wsl.version'] = attrs['wsl.version'] as string
    }
    // Add customer type and subscription type
    if (isClaudeAISubscriber()) {
      resourceAttributes['user.customer_type'] = 'claude_ai'
      const subscriptionType = getSubscriptionType()
      if (subscriptionType) {
        resourceAttributes['user.subscription_type'] = subscriptionType
      }
    } else {
      resourceAttributes['user.customer_type'] = 'api'
    }
    const transformed = {
      resource_attributes: resourceAttributes,
      metrics: metrics.scopeMetrics.flatMap(scopeMetric =>
        scopeMetric.metrics.map(metric => ({
          name: metric.descriptor.name,
          description: metric.descriptor.description,
          unit: metric.descriptor.unit,
          data_points: this.extractDataPoints(metric),
        })),
      ),
    }
    return transformed
  }

  /**
   * Flatten a metric's data points. Non-numeric points (e.g. histograms) are
   * dropped — only scalar values are supported by the internal endpoint.
   */
  private extractDataPoints(metric: MetricData): DataPoint[] {
    const dataPoints = metric.dataPoints || []
    return dataPoints
      .filter(
        (point): point is OTelDataPoint<number> =>
          typeof point.value === 'number',
      )
      .map(point => ({
        attributes: this.convertAttributes(point.attributes),
        value: point.value,
        timestamp: this.hrTimeToISOString(
          point.endTime || point.startTime || [Date.now() / 1000, 0],
        ),
      }))
  }

  /** Marks the exporter shut down, then flushes any in-flight exports. */
  async shutdown(): Promise<void> {
    this.isShutdown = true
    await this.forceFlush()
    logForDebugging('BigQuery metrics exporter shutdown complete')
  }

  /** Waits for all in-flight exports to settle. */
  async forceFlush(): Promise<void> {
    await Promise.all(this.pendingExports)
    logForDebugging('BigQuery metrics exporter flush complete')
  }

  /** Stringify OTel attribute values, dropping null/undefined entries. */
  private convertAttributes(
    attributes: Attributes | undefined,
  ): Record<string, string> {
    const result: Record<string, string> = {}
    if (attributes) {
      for (const [key, value] of Object.entries(attributes)) {
        if (value !== undefined && value !== null) {
          result[key] = String(value)
        }
      }
    }
    return result
  }

  /** Convert an OTel [seconds, nanoseconds] HrTime to an ISO-8601 string. */
  private hrTimeToISOString(hrTime: HrTime): string {
    const [seconds, nanoseconds] = hrTime
    const date = new Date(seconds * 1000 + nanoseconds / 1000000)
    return date.toISOString()
  }

  selectAggregationTemporality(): AggregationTemporality {
    // DO NOT CHANGE THIS TO CUMULATIVE
    // It would mess up the aggregation of metrics
    // for CC Productivity metrics dashboard
    return AggregationTemporality.DELTA
  }
}

View File

@@ -0,0 +1,75 @@
import type { Attributes } from '@opentelemetry/api'
import { getEventLogger, getPromptId } from 'src/bootstrap/state.js'
import { logForDebugging } from '../debug.js'
import { isEnvTruthy } from '../envUtils.js'
import { getTelemetryAttributes } from '../telemetryAttributes.js'
// Monotonically increasing counter for ordering events within a session
let eventSequence = 0
// Track whether we've already warned about a null event logger to avoid spamming
let hasWarnedNoEventLogger = false
// True when the user has opted in (via OTEL_LOG_USER_PROMPTS) to exporting
// raw prompt content with telemetry events.
function isUserPromptLoggingEnabled() {
  return isEnvTruthy(process.env.OTEL_LOG_USER_PROMPTS)
}
// Returns content unchanged when prompt logging is enabled; otherwise a
// '<REDACTED>' placeholder so event shape stays stable without leaking text.
export function redactIfDisabled(content: string): string {
  return isUserPromptLoggingEnabled() ? content : '<REDACTED>'
}
/**
 * Emit a single OTel log record named `claude_code.<eventName>`.
 *
 * Attaches shared telemetry attributes, a timestamp, a per-session sequence
 * number, the current prompt id (events only — too high-cardinality for
 * metrics), optional workspace paths, and caller-supplied string metadata.
 * Silently drops events when no event logger is initialized (warning once)
 * or when running under NODE_ENV=test.
 */
export async function logOTelEvent(
  eventName: string,
  metadata: { [key: string]: string | undefined } = {},
): Promise<void> {
  const eventLogger = getEventLogger()
  if (!eventLogger) {
    // Warn exactly once per process so a missing logger doesn't spam output.
    if (!hasWarnedNoEventLogger) {
      hasWarnedNoEventLogger = true
      logForDebugging(
        `[3P telemetry] Event dropped (no event logger initialized): ${eventName}`,
        { level: 'warn' },
      )
    }
    return
  }
  // Skip logging in test environment
  if (process.env.NODE_ENV === 'test') {
    return
  }

  const attributes: Attributes = {
    ...getTelemetryAttributes(),
    'event.name': eventName,
    'event.timestamp': new Date().toISOString(),
    'event.sequence': eventSequence++,
  }

  // Workspace directory from the desktop app (host path). Events only —
  // filesystem paths are too high-cardinality for metric dimensions, and
  // the BQ metrics pipeline must never see them.
  const workspaceDir = process.env.CLAUDE_CODE_WORKSPACE_HOST_PATHS
  if (workspaceDir) {
    attributes['workspace.host_paths'] = workspaceDir.split('|')
  }
  // Add prompt ID to events (but not metrics, where it would cause unbounded cardinality)
  const promptId = getPromptId()
  if (promptId) {
    attributes['prompt.id'] = promptId
  }
  // Merge caller metadata - all values are already strings
  Object.entries(metadata).forEach(([key, value]) => {
    if (value !== undefined) {
      attributes[key] = value
    }
  })

  // Emit log record as an event
  eventLogger.emit({
    body: `claude_code.${eventName}`,
    attributes,
  })
}

View File

@@ -0,0 +1,825 @@
import { DiagLogLevel, diag, trace } from '@opentelemetry/api'
import { logs } from '@opentelemetry/api-logs'
// OTLP/Prometheus exporters are dynamically imported inside the protocol
// switch statements below. A process uses at most one protocol variant per
// signal, but static imports would load all 6 (~1.2MB) on every startup.
import {
envDetector,
hostDetector,
osDetector,
resourceFromAttributes,
} from '@opentelemetry/resources'
import {
BatchLogRecordProcessor,
ConsoleLogRecordExporter,
LoggerProvider,
} from '@opentelemetry/sdk-logs'
import {
ConsoleMetricExporter,
MeterProvider,
PeriodicExportingMetricReader,
} from '@opentelemetry/sdk-metrics'
import {
BasicTracerProvider,
BatchSpanProcessor,
ConsoleSpanExporter,
} from '@opentelemetry/sdk-trace-base'
import {
ATTR_SERVICE_NAME,
ATTR_SERVICE_VERSION,
SEMRESATTRS_HOST_ARCH,
} from '@opentelemetry/semantic-conventions'
import { HttpsProxyAgent } from 'https-proxy-agent'
import {
getLoggerProvider,
getMeterProvider,
getTracerProvider,
setEventLogger,
setLoggerProvider,
setMeterProvider,
setTracerProvider,
} from 'src/bootstrap/state.js'
import {
getOtelHeadersFromHelper,
getSubscriptionType,
is1PApiCustomer,
isClaudeAISubscriber,
} from 'src/utils/auth.js'
import { getPlatform, getWslVersion } from 'src/utils/platform.js'
import { getCACertificates } from '../caCerts.js'
import { registerCleanup } from '../cleanupRegistry.js'
import { getHasFormattedOutput, logForDebugging } from '../debug.js'
import { isEnvTruthy } from '../envUtils.js'
import { errorMessage } from '../errors.js'
import { getMTLSConfig } from '../mtls.js'
import { getProxyUrl, shouldBypassProxy } from '../proxy.js'
import { getSettings_DEPRECATED } from '../settings/settings.js'
import { jsonStringify } from '../slowOperations.js'
import { profileCheckpoint } from '../startupProfiler.js'
import { isBetaTracingEnabled } from './betaSessionTracing.js'
import { BigQueryMetricsExporter } from './bigqueryExporter.js'
import { ClaudeCodeDiagLogger } from './logger.js'
import { initializePerfettoTracing } from './perfettoTracing.js'
import {
endInteractionSpan,
isEnhancedTelemetryEnabled,
} from './sessionTracing.js'
// Default export cadences (ms) for each signal, used when the corresponding
// OTEL export-interval env vars / batch-processor delays are not configured.
const DEFAULT_METRICS_EXPORT_INTERVAL_MS = 60000
const DEFAULT_LOGS_EXPORT_INTERVAL_MS = 5000
const DEFAULT_TRACES_EXPORT_INTERVAL_MS = 5000
// Distinguishes telemetry-init timeouts from other failures.
class TelemetryTimeoutError extends Error {}

/**
 * Returns a promise that rejects with a TelemetryTimeoutError after `ms`.
 * The timer is unref'd so it never keeps the process alive on its own.
 */
function telemetryTimeout(ms: number, message: string): Promise<never> {
  return new Promise((_, reject) => {
    const timer = setTimeout(() => {
      reject(new TelemetryTimeoutError(message))
    }, ms)
    timer.unref()
  })
}
/**
 * Apply environment-derived telemetry defaults before SDK initialization.
 *
 * For ant users, copies build-time ANT_-prefixed OTEL settings onto the
 * standard OTEL env vars (only when the ANT_ variant is set). Then defaults
 * the metrics temporality preference to 'delta' when unset.
 */
export function bootstrapTelemetry() {
  if (process.env.USER_TYPE === 'ant') {
    // Standard OTEL vars that may be overridden by ANT_-prefixed build-time values.
    const overridable = [
      'OTEL_METRICS_EXPORTER',
      'OTEL_LOGS_EXPORTER',
      'OTEL_TRACES_EXPORTER',
      'OTEL_EXPORTER_OTLP_PROTOCOL',
      'OTEL_EXPORTER_OTLP_ENDPOINT',
      'OTEL_EXPORTER_OTLP_HEADERS',
    ]
    for (const name of overridable) {
      const antValue = process.env[`ANT_${name}`]
      if (antValue) {
        process.env[name] = antValue
      }
    }
  }
  // Default temporality to 'delta' because it's the saner default.
  if (!process.env.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE) {
    process.env.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE = 'delta'
  }
}
// Per OTEL spec, "none" means "no automatically configured exporter for this signal".
// https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#exporter-selection
/**
 * Parse a comma-separated OTEL_*_EXPORTER value into exporter type names.
 * Trims whitespace around each entry, drops empty entries and the special
 * "none" value. Returns [] for undefined/empty input.
 */
export function parseExporterTypes(value: string | undefined): string[] {
  return (value || '')
    .split(',')
    .map(t => t.trim())
    // Drop empty tokens AFTER trimming, so inputs like "otlp, ,console"
    // cannot yield a bogus '' exporter type (the old order filtered the
    // whitespace-only token before trimming and let '' through).
    .filter(t => t !== '' && t !== 'none')
}
/**
 * Build metric readers from OTEL_METRICS_EXPORTER ('console', 'otlp',
 * and/or 'prometheus'). OTLP protocol selection honors
 * OTEL_EXPORTER_OTLP_METRICS_PROTOCOL first, falling back to
 * OTEL_EXPORTER_OTLP_PROTOCOL. Exporter modules are lazy-imported so only
 * the selected protocol variant is loaded.
 *
 * @returns Metric readers; push exporters are wrapped in a
 *   PeriodicExportingMetricReader using OTEL_METRIC_EXPORT_INTERVAL.
 * @throws on an unknown exporter type or protocol.
 */
async function getOtlpReaders() {
  const exporterTypes = parseExporterTypes(process.env.OTEL_METRICS_EXPORTER)
  const exportInterval = parseInt(
    process.env.OTEL_METRIC_EXPORT_INTERVAL ||
      DEFAULT_METRICS_EXPORT_INTERVAL_MS.toString(),
  )
  const exporters = []
  for (const exporterType of exporterTypes) {
    if (exporterType === 'console') {
      // Custom console exporter that shows resource attributes
      const consoleExporter = new ConsoleMetricExporter()
      const originalExport = consoleExporter.export.bind(consoleExporter)
      consoleExporter.export = (metrics, callback) => {
        // Log resource attributes once at the start
        if (metrics.resource && metrics.resource.attributes) {
          // The console exporter is for debugging, so console output is intentional here
          logForDebugging('\n=== Resource Attributes ===')
          logForDebugging(jsonStringify(metrics.resource.attributes))
          logForDebugging('===========================\n')
        }
        return originalExport(metrics, callback)
      }
      exporters.push(consoleExporter)
    } else if (exporterType === 'otlp') {
      const protocol =
        process.env.OTEL_EXPORTER_OTLP_METRICS_PROTOCOL?.trim() ||
        process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
      const httpConfig = getOTLPExporterConfig()
      switch (protocol) {
        case 'grpc': {
          // Lazy-import to keep @grpc/grpc-js (~700KB) out of the telemetry chunk
          // when the protocol is http/protobuf (ant default) or http/json.
          const { OTLPMetricExporter } = await import(
            '@opentelemetry/exporter-metrics-otlp-grpc'
          )
          exporters.push(new OTLPMetricExporter())
          break
        }
        case 'http/json': {
          const { OTLPMetricExporter } = await import(
            '@opentelemetry/exporter-metrics-otlp-http'
          )
          exporters.push(new OTLPMetricExporter(httpConfig))
          break
        }
        case 'http/protobuf': {
          const { OTLPMetricExporter } = await import(
            '@opentelemetry/exporter-metrics-otlp-proto'
          )
          exporters.push(new OTLPMetricExporter(httpConfig))
          break
        }
        default:
          throw new Error(
            `Unknown protocol set in OTEL_EXPORTER_OTLP_METRICS_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
          )
      }
    } else if (exporterType === 'prometheus') {
      const { PrometheusExporter } = await import(
        '@opentelemetry/exporter-prometheus'
      )
      exporters.push(new PrometheusExporter())
    } else {
      // Fixed: this message previously blamed the *_PROTOCOL vars (copy-paste
      // from the protocol switch); the exporter type comes from
      // OTEL_METRICS_EXPORTER, matching the logs/traces variants.
      throw new Error(
        `Unknown exporter type set in OTEL_METRICS_EXPORTER env var: ${exporterType}`,
      )
    }
  }
  return exporters.map(exporter => {
    // Push exporters need a periodic reader; PrometheusExporter is itself a
    // pull-based MetricReader (no `export` method) and is returned as-is.
    if ('export' in exporter) {
      return new PeriodicExportingMetricReader({
        exporter,
        exportIntervalMillis: exportInterval,
      })
    }
    return exporter
  })
}
/**
 * Build log exporters from OTEL_LOGS_EXPORTER ('console' and/or 'otlp').
 * OTLP protocol selection honors OTEL_EXPORTER_OTLP_LOGS_PROTOCOL first,
 * falling back to OTEL_EXPORTER_OTLP_PROTOCOL. Protocol-specific exporter
 * modules are lazy-imported so only the selected variant is loaded.
 * @throws on an unknown exporter type or protocol.
 */
async function getOtlpLogExporters() {
  const exporterTypes = parseExporterTypes(process.env.OTEL_LOGS_EXPORTER)
  const protocol =
    process.env.OTEL_EXPORTER_OTLP_LOGS_PROTOCOL?.trim() ||
    process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT
  logForDebugging(
    `[3P telemetry] getOtlpLogExporters: types=${jsonStringify(exporterTypes)}, protocol=${protocol}, endpoint=${endpoint}`,
  )
  const exporters = []
  for (const exporterType of exporterTypes) {
    if (exporterType === 'console') {
      exporters.push(new ConsoleLogRecordExporter())
    } else if (exporterType === 'otlp') {
      const httpConfig = getOTLPExporterConfig()
      switch (protocol) {
        case 'grpc': {
          // Lazy-import: keeps gRPC deps out of the bundle for HTTP protocols
          const { OTLPLogExporter } = await import(
            '@opentelemetry/exporter-logs-otlp-grpc'
          )
          exporters.push(new OTLPLogExporter())
          break
        }
        case 'http/json': {
          const { OTLPLogExporter } = await import(
            '@opentelemetry/exporter-logs-otlp-http'
          )
          exporters.push(new OTLPLogExporter(httpConfig))
          break
        }
        case 'http/protobuf': {
          const { OTLPLogExporter } = await import(
            '@opentelemetry/exporter-logs-otlp-proto'
          )
          exporters.push(new OTLPLogExporter(httpConfig))
          break
        }
        default:
          throw new Error(
            `Unknown protocol set in OTEL_EXPORTER_OTLP_LOGS_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
          )
      }
    } else {
      throw new Error(
        `Unknown exporter type set in OTEL_LOGS_EXPORTER env var: ${exporterType}`,
      )
    }
  }
  return exporters
}
/**
 * Build trace exporters from OTEL_TRACES_EXPORTER ('console' and/or 'otlp').
 * OTLP protocol selection honors OTEL_EXPORTER_OTLP_TRACES_PROTOCOL first,
 * falling back to OTEL_EXPORTER_OTLP_PROTOCOL. Protocol-specific exporter
 * modules are lazy-imported so only the selected variant is loaded.
 * @throws on an unknown exporter type or protocol.
 */
async function getOtlpTraceExporters() {
  const exporterTypes = parseExporterTypes(process.env.OTEL_TRACES_EXPORTER)
  const exporters = []
  for (const exporterType of exporterTypes) {
    if (exporterType === 'console') {
      exporters.push(new ConsoleSpanExporter())
    } else if (exporterType === 'otlp') {
      const protocol =
        process.env.OTEL_EXPORTER_OTLP_TRACES_PROTOCOL?.trim() ||
        process.env.OTEL_EXPORTER_OTLP_PROTOCOL?.trim()
      const httpConfig = getOTLPExporterConfig()
      switch (protocol) {
        case 'grpc': {
          // Lazy-import: keeps gRPC deps out of the bundle for HTTP protocols
          const { OTLPTraceExporter } = await import(
            '@opentelemetry/exporter-trace-otlp-grpc'
          )
          exporters.push(new OTLPTraceExporter())
          break
        }
        case 'http/json': {
          const { OTLPTraceExporter } = await import(
            '@opentelemetry/exporter-trace-otlp-http'
          )
          exporters.push(new OTLPTraceExporter(httpConfig))
          break
        }
        case 'http/protobuf': {
          const { OTLPTraceExporter } = await import(
            '@opentelemetry/exporter-trace-otlp-proto'
          )
          exporters.push(new OTLPTraceExporter(httpConfig))
          break
        }
        default:
          throw new Error(
            `Unknown protocol set in OTEL_EXPORTER_OTLP_TRACES_PROTOCOL or OTEL_EXPORTER_OTLP_PROTOCOL env var: ${protocol}`,
          )
      }
    } else {
      throw new Error(
        `Unknown exporter type set in OTEL_TRACES_EXPORTER env var: ${exporterType}`,
      )
    }
  }
  return exporters
}
/**
 * Whether the customer-facing (3P) OTLP telemetry pipeline is on.
 * Driven solely by the CLAUDE_CODE_ENABLE_TELEMETRY env var.
 */
export function isTelemetryEnabled() {
  const flag = process.env.CLAUDE_CODE_ENABLE_TELEMETRY
  return isEnvTruthy(flag)
}
function getBigQueryExportingReader() {
const bigqueryExporter = new BigQueryMetricsExporter()
return new PeriodicExportingMetricReader({
exporter: bigqueryExporter,
exportIntervalMillis: 5 * 60 * 1000, // 5mins for BigQuery metrics exporter to reduce load
})
}
function isBigQueryMetricsEnabled() {
// BigQuery metrics are enabled for:
// 1. API customers (excluding Claude.ai subscribers and Bedrock/Vertex)
// 2. Claude for Enterprise (C4E) users
// 3. Claude for Teams users
const subscriptionType = getSubscriptionType()
const isC4EOrTeamUser =
isClaudeAISubscriber() &&
(subscriptionType === 'enterprise' || subscriptionType === 'team')
return is1PApiCustomer() || isC4EOrTeamUser
}
/**
* Initialize beta tracing - a separate code path for detailed debugging.
* Uses BETA_TRACING_ENDPOINT instead of OTEL_EXPORTER_OTLP_ENDPOINT.
*/
async function initializeBetaTracing(
resource: ReturnType<typeof resourceFromAttributes>,
): Promise<void> {
const endpoint = process.env.BETA_TRACING_ENDPOINT
if (!endpoint) {
return
}
const [{ OTLPTraceExporter }, { OTLPLogExporter }] = await Promise.all([
import('@opentelemetry/exporter-trace-otlp-http'),
import('@opentelemetry/exporter-logs-otlp-http'),
])
const httpConfig = {
url: `${endpoint}/v1/traces`,
}
const logHttpConfig = {
url: `${endpoint}/v1/logs`,
}
// Initialize trace exporter
const traceExporter = new OTLPTraceExporter(httpConfig)
const spanProcessor = new BatchSpanProcessor(traceExporter, {
scheduledDelayMillis: DEFAULT_TRACES_EXPORT_INTERVAL_MS,
})
const tracerProvider = new BasicTracerProvider({
resource,
spanProcessors: [spanProcessor],
})
trace.setGlobalTracerProvider(tracerProvider)
setTracerProvider(tracerProvider)
// Initialize log exporter
const logExporter = new OTLPLogExporter(logHttpConfig)
const loggerProvider = new LoggerProvider({
resource,
processors: [
new BatchLogRecordProcessor(logExporter, {
scheduledDelayMillis: DEFAULT_LOGS_EXPORT_INTERVAL_MS,
}),
],
})
logs.setGlobalLoggerProvider(loggerProvider)
setLoggerProvider(loggerProvider)
// Initialize event logger
const eventLogger = logs.getLogger(
'com.anthropic.claude_code.events',
MACRO.VERSION,
)
setEventLogger(eventLogger)
// Setup flush handlers - flush both logs AND traces
process.on('beforeExit', async () => {
await loggerProvider?.forceFlush()
await tracerProvider?.forceFlush()
})
process.on('exit', () => {
void loggerProvider?.forceFlush()
void tracerProvider?.forceFlush()
})
}
/**
 * Top-level OpenTelemetry bootstrap for the process.
 *
 * Builds the OTel resource (service attributes plus OS/host/env detectors),
 * assembles metric readers (customer OTLP exporters when
 * CLAUDE_CODE_ENABLE_TELEMETRY is set, BigQuery for eligible users), then
 * either hands off to the beta tracing path (isBetaTracingEnabled) or sets
 * up the standard logs/traces pipelines. Registers a bounded-timeout
 * shutdown hook via registerCleanup and returns this module's Meter.
 */
export async function initializeTelemetry() {
  profileCheckpoint('telemetry_init_start')
  bootstrapTelemetry()
  // Console exporters call console.dir on a timer (5s logs/traces, 60s
  // metrics), writing pretty-printed objects to stdout. In stream-json
  // mode stdout is the SDK message channel; the first line (`{`) breaks
  // the SDK's line reader. Stripped here (not main.tsx) because init.ts
  // re-runs applyConfigEnvironmentVariables() inside initializeTelemetry-
  // AfterTrust for remote-managed-settings users, and bootstrapTelemetry
  // above copies ANT_OTEL_* for ant users — both would undo an earlier strip.
  if (getHasFormattedOutput()) {
    for (const key of [
      'OTEL_METRICS_EXPORTER',
      'OTEL_LOGS_EXPORTER',
      'OTEL_TRACES_EXPORTER',
    ] as const) {
      const v = process.env[key]
      if (v?.includes('console')) {
        // Drop only the 'console' entry; any other exporters stay active.
        process.env[key] = v
          .split(',')
          .map(s => s.trim())
          .filter(s => s !== 'console')
          .join(',')
      }
    }
  }
  // Route OTel's internal errors/warnings into our own logging.
  diag.setLogger(new ClaudeCodeDiagLogger(), DiagLogLevel.ERROR)
  // Initialize Perfetto tracing (independent of OTEL)
  // Enable via CLAUDE_CODE_PERFETTO_TRACE=1 or CLAUDE_CODE_PERFETTO_TRACE=<path>
  initializePerfettoTracing()
  const readers = []
  // Add customer exporters (if enabled)
  const telemetryEnabled = isTelemetryEnabled()
  logForDebugging(
    `[3P telemetry] isTelemetryEnabled=${telemetryEnabled} (CLAUDE_CODE_ENABLE_TELEMETRY=${process.env.CLAUDE_CODE_ENABLE_TELEMETRY})`,
  )
  if (telemetryEnabled) {
    readers.push(...(await getOtlpReaders()))
  }
  // Add BigQuery exporter (for API customers, C4E users, and internal users)
  if (isBigQueryMetricsEnabled()) {
    readers.push(getBigQueryExportingReader())
  }
  // Create base resource with service attributes
  const platform = getPlatform()
  const baseAttributes: Record<string, string> = {
    [ATTR_SERVICE_NAME]: 'claude-code',
    [ATTR_SERVICE_VERSION]: MACRO.VERSION,
  }
  // Add WSL-specific attributes if running on WSL
  if (platform === 'wsl') {
    const wslVersion = getWslVersion()
    if (wslVersion) {
      baseAttributes['wsl.version'] = wslVersion
    }
  }
  const baseResource = resourceFromAttributes(baseAttributes)
  // Use OpenTelemetry detectors
  const osResource = resourceFromAttributes(
    osDetector.detect().attributes || {},
  )
  // Extract only host.arch from hostDetector
  const hostDetected = hostDetector.detect()
  const hostArchAttributes = hostDetected.attributes?.[SEMRESATTRS_HOST_ARCH]
    ? {
        [SEMRESATTRS_HOST_ARCH]: hostDetected.attributes[SEMRESATTRS_HOST_ARCH],
      }
    : {}
  const hostArchResource = resourceFromAttributes(hostArchAttributes)
  const envResource = resourceFromAttributes(
    envDetector.detect().attributes || {},
  )
  // Merge resources - later resources take precedence
  const resource = baseResource
    .merge(osResource)
    .merge(hostArchResource)
    .merge(envResource)
  // Check if beta tracing is enabled - this is a separate code path
  // Available to all users who set ENABLE_BETA_TRACING_DETAILED=1 and BETA_TRACING_ENDPOINT
  if (isBetaTracingEnabled()) {
    // Fire-and-forget: a failed beta-tracing init must not block startup.
    void initializeBetaTracing(resource).catch(e =>
      logForDebugging(`Beta tracing init failed: ${e}`, { level: 'error' }),
    )
    // Still set up meter provider for metrics (but skip regular logs/traces setup)
    const meterProvider = new MeterProvider({
      resource,
      views: [], // no custom metric views
      readers,
    })
    setMeterProvider(meterProvider)
    // Register shutdown for beta tracing
    const shutdownTelemetry = async () => {
      // Default budget: 2 seconds, overridable via env var.
      const timeoutMs = parseInt(
        process.env.CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS || '2000',
      )
      try {
        endInteractionSpan()
        // Force flush + shutdown together inside the timeout. Previously forceFlush
        // was awaited unbounded BEFORE the race, blocking exit on slow OTLP endpoints.
        // Each provider's flush→shutdown is chained independently so a slow logger
        // flush doesn't delay meterProvider/tracerProvider shutdown (no waterfall).
        const loggerProvider = getLoggerProvider()
        const tracerProvider = getTracerProvider()
        const chains: Promise<void>[] = [meterProvider.shutdown()]
        if (loggerProvider) {
          chains.push(
            loggerProvider.forceFlush().then(() => loggerProvider.shutdown()),
          )
        }
        if (tracerProvider) {
          chains.push(
            tracerProvider.forceFlush().then(() => tracerProvider.shutdown()),
          )
        }
        await Promise.race([
          Promise.all(chains),
          telemetryTimeout(timeoutMs, 'OpenTelemetry shutdown timeout'),
        ])
      } catch {
        // Ignore shutdown errors
      }
    }
    registerCleanup(shutdownTelemetry)
    return meterProvider.getMeter('com.anthropic.claude_code', MACRO.VERSION)
  }
  // Standard (non-beta) path from here on.
  const meterProvider = new MeterProvider({
    resource,
    views: [], // no custom metric views
    readers,
  })
  // Store reference in state for flushing
  setMeterProvider(meterProvider)
  // Initialize logs if telemetry is enabled
  if (telemetryEnabled) {
    const logExporters = await getOtlpLogExporters()
    logForDebugging(
      `[3P telemetry] Created ${logExporters.length} log exporter(s)`,
    )
    if (logExporters.length > 0) {
      const loggerProvider = new LoggerProvider({
        resource,
        // Add batch processors for each exporter
        processors: logExporters.map(
          exporter =>
            new BatchLogRecordProcessor(exporter, {
              scheduledDelayMillis: parseInt(
                process.env.OTEL_LOGS_EXPORT_INTERVAL ||
                  DEFAULT_LOGS_EXPORT_INTERVAL_MS.toString(),
              ),
            }),
        ),
      })
      // Register the logger provider globally
      logs.setGlobalLoggerProvider(loggerProvider)
      setLoggerProvider(loggerProvider)
      // Initialize event logger
      const eventLogger = logs.getLogger(
        'com.anthropic.claude_code.events',
        MACRO.VERSION,
      )
      setEventLogger(eventLogger)
      logForDebugging('[3P telemetry] Event logger set successfully')
      // 'beforeExit' is emitted when Node.js empties its event loop and has no additional work to schedule.
      // Unlike 'exit', it allows us to perform async operations, so it works well for letting
      // network requests complete before the process exits naturally.
      process.on('beforeExit', async () => {
        await loggerProvider?.forceFlush()
        // Also flush traces - they use BatchSpanProcessor which needs explicit flush
        const tracerProvider = getTracerProvider()
        await tracerProvider?.forceFlush()
      })
      process.on('exit', () => {
        // Final attempt to flush logs and traces
        void loggerProvider?.forceFlush()
        void getTracerProvider()?.forceFlush()
      })
    }
  }
  // Initialize tracing if enhanced telemetry is enabled (BETA)
  if (telemetryEnabled && isEnhancedTelemetryEnabled()) {
    const traceExporters = await getOtlpTraceExporters()
    if (traceExporters.length > 0) {
      // Create span processors for each exporter
      const spanProcessors = traceExporters.map(
        exporter =>
          new BatchSpanProcessor(exporter, {
            scheduledDelayMillis: parseInt(
              process.env.OTEL_TRACES_EXPORT_INTERVAL ||
                DEFAULT_TRACES_EXPORT_INTERVAL_MS.toString(),
            ),
          }),
      )
      const tracerProvider = new BasicTracerProvider({
        resource,
        spanProcessors,
      })
      // Register the tracer provider globally
      trace.setGlobalTracerProvider(tracerProvider)
      setTracerProvider(tracerProvider)
    }
  }
  // Shutdown metrics and logs on exit (flushes and closes exporters)
  const shutdownTelemetry = async () => {
    // Default budget: 2 seconds, overridable via env var.
    const timeoutMs = parseInt(
      process.env.CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS || '2000',
    )
    try {
      // End any active interaction span before shutdown
      endInteractionSpan()
      const shutdownPromises = [meterProvider.shutdown()]
      const loggerProvider = getLoggerProvider()
      if (loggerProvider) {
        shutdownPromises.push(loggerProvider.shutdown())
      }
      const tracerProvider = getTracerProvider()
      if (tracerProvider) {
        shutdownPromises.push(tracerProvider.shutdown())
      }
      // Bound the whole shutdown by the timeout; telemetryTimeout rejects.
      await Promise.race([
        Promise.all(shutdownPromises),
        telemetryTimeout(timeoutMs, 'OpenTelemetry shutdown timeout'),
      ])
    } catch (error) {
      // NOTE(review): this matches by message substring while flushTelemetry
      // uses `instanceof TelemetryTimeoutError` — consider unifying.
      if (error instanceof Error && error.message.includes('timeout')) {
        logForDebugging(
          `
OpenTelemetry telemetry flush timed out after ${timeoutMs}ms
To resolve this issue, you can:
1. Increase the timeout by setting CLAUDE_CODE_OTEL_SHUTDOWN_TIMEOUT_MS env var (e.g., 5000 for 5 seconds)
2. Check if your OpenTelemetry backend is experiencing scalability issues
3. Disable OpenTelemetry by unsetting CLAUDE_CODE_ENABLE_TELEMETRY env var
Current timeout: ${timeoutMs}ms
`,
          { level: 'error' },
        )
      }
      throw error
    }
  }
  // Always register shutdown (internal metrics are always enabled)
  registerCleanup(shutdownTelemetry)
  return meterProvider.getMeter('com.anthropic.claude_code', MACRO.VERSION)
}
/**
 * Flush all pending telemetry data immediately.
 *
 * Called before logout or org switching so buffered metrics/logs/spans are
 * exported under the correct identity rather than leaking across accounts.
 * Never throws: a failed or timed-out flush is logged and swallowed so the
 * caller's logout flow can proceed.
 */
export async function flushTelemetry(): Promise<void> {
  const meterProvider = getMeterProvider()
  if (!meterProvider) {
    return
  }
  const timeoutMs = parseInt(
    process.env.CLAUDE_CODE_OTEL_FLUSH_TIMEOUT_MS || '5000',
  )
  try {
    // Flush every provider that exists; the three flushes run concurrently
    // and share one timeout budget.
    const pending = [meterProvider.forceFlush()]
    const loggerProvider = getLoggerProvider()
    if (loggerProvider) {
      pending.push(loggerProvider.forceFlush())
    }
    const tracerProvider = getTracerProvider()
    if (tracerProvider) {
      pending.push(tracerProvider.forceFlush())
    }
    await Promise.race([
      Promise.all(pending),
      telemetryTimeout(timeoutMs, 'OpenTelemetry flush timeout'),
    ])
    logForDebugging('Telemetry flushed successfully')
  } catch (error) {
    if (error instanceof TelemetryTimeoutError) {
      logForDebugging(
        `Telemetry flush timed out after ${timeoutMs}ms. Some metrics may not be exported.`,
        { level: 'warn' },
      )
    } else {
      logForDebugging(`Telemetry flush failed: ${errorMessage(error)}`, {
        level: 'error',
      })
    }
    // Don't throw - allow logout to continue even if flush fails
  }
}
/**
 * Parse OTEL_EXPORTER_OTLP_HEADERS ("k1=v1,k2=v2") into a header map.
 *
 * Entries without an '=' (or with an empty key) are dropped; only the first
 * '=' splits key from value, so values containing '=' survive intact. Keys
 * and values are trimmed of surrounding whitespace.
 */
function parseOtelHeadersEnvVar(): Record<string, string> {
  const raw = process.env.OTEL_EXPORTER_OTLP_HEADERS
  if (!raw) {
    return {}
  }
  const headers: Record<string, string> = {}
  for (const entry of raw.split(',')) {
    const eq = entry.indexOf('=')
    if (eq < 0) {
      continue // no '=' at all
    }
    const key = entry.slice(0, eq)
    if (!key) {
      continue // empty key, e.g. "=value"
    }
    headers[key.trim()] = entry.slice(eq + 1).trim()
  }
  return headers
}
/**
* Get configuration for OTLP exporters including:
* - HTTP agent options (proxy, mTLS)
* - Dynamic headers via otelHeadersHelper or static headers from env var
*/
function getOTLPExporterConfig() {
const proxyUrl = getProxyUrl()
const mtlsConfig = getMTLSConfig()
const settings = getSettings_DEPRECATED()
// Build base config
const config: Record<string, unknown> = {}
// Parse static headers from env var once (doesn't change at runtime)
const staticHeaders = parseOtelHeadersEnvVar()
// If otelHeadersHelper is configured, use async headers function for dynamic refresh
// Otherwise just return static headers if any exist
if (settings?.otelHeadersHelper) {
config.headers = async (): Promise<Record<string, string>> => {
const dynamicHeaders = getOtelHeadersFromHelper()
return { ...staticHeaders, ...dynamicHeaders }
}
} else if (Object.keys(staticHeaders).length > 0) {
config.headers = async (): Promise<Record<string, string>> => staticHeaders
}
// Check if we should bypass proxy for OTEL endpoint
const otelEndpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT
if (!proxyUrl || (otelEndpoint && shouldBypassProxy(otelEndpoint))) {
// No proxy configured or OTEL endpoint should bypass proxy
const caCerts = getCACertificates()
if (mtlsConfig || caCerts) {
config.httpAgentOptions = {
...mtlsConfig,
...(caCerts && { ca: caCerts }),
}
}
return config
}
// Return an HttpAgentFactory function that creates our proxy agent
const caCerts = getCACertificates()
const agentFactory = (_protocol: string) => {
// Create and return the proxy agent with mTLS and CA cert config
const proxyAgent =
mtlsConfig || caCerts
? new HttpsProxyAgent(proxyUrl, {
...(mtlsConfig && {
cert: mtlsConfig.cert,
key: mtlsConfig.key,
passphrase: mtlsConfig.passphrase,
}),
...(caCerts && { ca: caCerts }),
})
: new HttpsProxyAgent(proxyUrl)
return proxyAgent
}
config.httpAgentOptions = agentFactory
return config
}

View File

@@ -0,0 +1,26 @@
import type { DiagLogger } from '@opentelemetry/api'
import { logForDebugging } from '../debug.js'
import { logError } from '../log.js'
/**
 * OpenTelemetry diagnostics bridge: forwards OTel-internal errors and
 * warnings into Claude Code's error log and debug channel, and drops the
 * noisier info/debug/verbose levels entirely.
 */
export class ClaudeCodeDiagLogger implements DiagLogger {
  // Shared path for the two levels we care about.
  private forward(level: 'error' | 'warn', message: string) {
    logError(new Error(message))
    logForDebugging(`[3P telemetry] OTEL diag ${level}: ${message}`, {
      level,
    })
  }
  error(message: string, ..._: unknown[]) {
    this.forward('error', message)
  }
  warn(message: string, ..._: unknown[]) {
    this.forward('warn', message)
  }
  // Lower severities are intentionally ignored.
  info(_message: string, ..._args: unknown[]) {
    return
  }
  debug(_message: string, ..._args: unknown[]) {
    return
  }
  verbose(_message: string, ..._args: unknown[]) {
    return
  }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,289 @@
/**
* Plugin telemetry helpers — shared field builders for plugin lifecycle events.
*
* Implements the twin-column privacy pattern: every user-defined-name field
* emits both a raw value (routed to PII-tagged _PROTO_* BQ columns) and a
* redacted twin (real name iff marketplace ∈ allowlist, else 'third-party').
*
* plugin_id_hash provides an opaque per-plugin aggregation key with no privacy
* dependency — sha256(name@marketplace + FIXED_SALT) truncated to 16 chars.
* This answers distinct-count and per-plugin-trend questions that the
* redacted column can't, without exposing user-defined names.
*/
import { createHash } from 'crypto'
import { sep } from 'path'
import {
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
type AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
logEvent,
} from '../../services/analytics/index.js'
import type {
LoadedPlugin,
PluginError,
PluginManifest,
} from '../../types/plugin.js'
import {
isOfficialMarketplaceName,
parsePluginIdentifier,
} from '../plugins/pluginIdentifier.js'
// builtinPlugins.ts:BUILTIN_MARKETPLACE_NAME — inlined to avoid the cycle
// through commands.js. Marketplace schemas.ts enforces 'builtin' is reserved.
const BUILTIN_MARKETPLACE_NAME = 'builtin'
// Fixed salt for plugin_id_hash. Same constant across all repos and emission
// sites. Deliberately not per-org (per-org salt would defeat cross-org
// distinct-count) and never rotated (rotation would break trend lines).
// Customers can hash their own known plugin names to reverse-match their
// telemetry.
const PLUGIN_ID_HASH_SALT = 'claude-plugin-telemetry-v1'
/**
 * Opaque per-plugin aggregation key.
 *
 * Hashes the name@marketplace identifier (marketplace lowercased for
 * reproducibility; plugin-name case preserved, since enabledPlugins keys
 * are case-sensitive) with a fixed salt, then truncates the sha256 hex to
 * 16 chars — keeps BigQuery GROUP BY cardinality manageable while leaving
 * collisions negligible at projected 10k-plugin scale.
 */
export function hashPluginId(name: string, marketplace?: string): string {
  const identifier = marketplace
    ? `${name}@${marketplace.toLowerCase()}`
    : name
  const digest = createHash('sha256')
    .update(identifier + PLUGIN_ID_HASH_SALT)
    .digest('hex')
  return digest.slice(0, 16)
}
/**
 * 4-value scope enum for plugin origin. Distinct from PluginScope
 * (managed/user/project/local), which is installation-target — this is
 * marketplace-origin.
 *
 * - official: from an allowlisted Anthropic marketplace
 * - default-bundle: ships with the product (@builtin), auto-enabled
 * - org: enterprise admin-pushed via managed settings (policySettings)
 * - user-local: user added the marketplace or a local plugin
 */
export type TelemetryPluginScope =
  | 'official'
  | 'org'
  | 'user-local'
  | 'default-bundle'
/**
 * Resolve a plugin's marketplace-origin scope. Precedence: builtin bundle,
 * then official marketplaces, then org-managed names, else user-local.
 */
export function getTelemetryPluginScope(
  name: string,
  marketplace: string | undefined,
  managedNames: Set<string> | null,
): TelemetryPluginScope {
  if (marketplace === BUILTIN_MARKETPLACE_NAME) {
    return 'default-bundle'
  }
  if (isOfficialMarketplaceName(marketplace)) {
    return 'official'
  }
  return managedNames?.has(name) ? 'org' : 'user-local'
}
/**
 * How a plugin arrived in the session. Splits self-selected from org-pushed
 * — plugin_scope alone doesn't (an official plugin can be user-installed OR
 * org-pushed; both are scope='official').
 */
export type EnabledVia =
  | 'user-install' // user installed it themselves
  | 'org-policy' // pushed by enterprise managed settings
  | 'default-enable' // builtin plugin, on by default
  | 'seed-mount' // loaded from a seed directory mount
/** How a skill/command invocation was triggered. */
export type InvocationTrigger =
  | 'user-slash' // explicit slash command from the user
  | 'claude-proactive' // model-initiated
  | 'nested-skill' // invoked from within another skill
/** Where a skill invocation executes. */
export type SkillExecutionContext = 'fork' | 'inline' | 'remote'
/** How a plugin install was initiated. */
export type InstallSource =
  | 'cli-explicit' // explicit CLI install command
  | 'ui-discover' // found via the discovery UI
  | 'ui-suggestion' // accepted a UI suggestion
  | 'deep-link' // followed an install deep link
/**
 * Classify how a plugin came to be active in this session.
 *
 * Precedence: builtin > org-policy (name in managed settings) > seed-mount
 * (plugin path lives under one of the seed directories) > user-install.
 */
export function getEnabledVia(
  plugin: LoadedPlugin,
  managedNames: Set<string> | null,
  seedDirs: string[],
): EnabledVia {
  if (plugin.isBuiltin) {
    return 'default-enable'
  }
  if (managedNames?.has(plugin.name)) {
    return 'org-policy'
  }
  // Compare against "<dir><sep>" so /opt/plugins never matches
  // /opt/plugins-extra.
  const isSeeded = seedDirs.some(dir => {
    const prefix = dir.endsWith(sep) ? dir : dir + sep
    return plugin.path.startsWith(prefix)
  })
  return isSeeded ? 'seed-mount' : 'user-install'
}
/**
 * Common plugin telemetry fields keyed off name@marketplace: the opaque
 * hash, the scope enum, and the redacted-twin name columns. Callers add the
 * raw _PROTO_* fields separately (those require the PII-tagged marker type).
 *
 * Redaction rule: real names appear only for Anthropic-controlled plugins
 * (official marketplace or the builtin bundle); everything else collapses
 * to 'third-party'.
 */
export function buildPluginTelemetryFields(
  name: string,
  marketplace: string | undefined,
  managedNames: Set<string> | null = null,
): {
  plugin_id_hash: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
  plugin_scope: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
  plugin_name_redacted: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
  marketplace_name_redacted: AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
  is_official_plugin: boolean
} {
  const scope = getTelemetryPluginScope(name, marketplace, managedNames)
  const anthropicControlled = scope === 'official' || scope === 'default-bundle'
  const redactedName = anthropicControlled ? name : 'third-party'
  const redactedMarketplace =
    anthropicControlled && marketplace ? marketplace : 'third-party'
  return {
    plugin_id_hash: hashPluginId(
      name,
      marketplace,
    ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    plugin_scope:
      scope as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    plugin_name_redacted:
      redactedName as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    marketplace_name_redacted:
      redactedMarketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    is_official_plugin: anthropicControlled,
  }
}
/**
 * Variant of buildPluginTelemetryFields for invocation sites that hold a
 * manifest + repository pair rather than a bare name/marketplace.
 *
 * Per-invocation callers (SkillTool, processSlashCommand) pass
 * managedNames=null: the session-level tengu_plugin_enabled_for_session
 * event carries the authoritative plugin_scope, and per-invocation rows can
 * join on plugin_id_hash to recover it — keeping hot-path call sites free
 * of the extra settings read.
 */
export function buildPluginCommandTelemetryFields(
  pluginInfo: { pluginManifest: PluginManifest; repository: string },
  managedNames: Set<string> | null = null,
): ReturnType<typeof buildPluginTelemetryFields> {
  const { marketplace } = parsePluginIdentifier(pluginInfo.repository)
  const pluginName = pluginInfo.pluginManifest.name
  return buildPluginTelemetryFields(pluginName, marketplace, managedNames)
}
/**
 * Emit tengu_plugin_enabled_for_session once per enabled plugin at session
 * start. Supplements tengu_skill_loaded (which still fires per-skill) — use
 * this event for plugin-level aggregates instead of DISTINCT-on-prefix
 * hacks: a plugin with 5 skills emits 5 skill_loaded rows but only 1 here.
 */
export function logPluginsEnabledForSession(
  plugins: LoadedPlugin[],
  managedNames: Set<string> | null,
  seedDirs: string[],
): void {
  for (const plugin of plugins) {
    const { marketplace } = parsePluginIdentifier(plugin.repository)
    // Counts cover both the singular legacy path fields and the plural ones.
    const skillPathCount =
      (plugin.skillsPath ? 1 : 0) + (plugin.skillsPaths?.length ?? 0)
    const commandPathCount =
      (plugin.commandsPath ? 1 : 0) + (plugin.commandsPaths?.length ?? 0)
    logEvent('tengu_plugin_enabled_for_session', {
      _PROTO_plugin_name:
        plugin.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      ...(marketplace && {
        _PROTO_marketplace_name:
          marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      }),
      ...buildPluginTelemetryFields(plugin.name, marketplace, managedNames),
      enabled_via: getEnabledVia(
        plugin,
        managedNames,
        seedDirs,
      ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      skill_path_count: skillPathCount,
      command_path_count: commandPathCount,
      has_mcp: plugin.manifest.mcpServers !== undefined,
      has_hooks: plugin.hooksConfig !== undefined,
      ...(plugin.manifest.version && {
        version: plugin.manifest
          .version as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      }),
    })
  }
}
/**
 * Bounded-cardinality error bucket for CLI plugin operation failures.
 * Maps free-form error messages onto 5 stable categories so dashboard
 * GROUP BY stays tractable.
 */
export type PluginCommandErrorCategory =
  | 'network'
  | 'not-found'
  | 'permission'
  | 'validation'
  | 'unknown'
/**
 * Classify an arbitrary thrown value by pattern-matching its message.
 * Rules are checked in order and the first match wins; anything
 * unrecognized falls through to 'unknown'.
 */
export function classifyPluginCommandError(
  error: unknown,
): PluginCommandErrorCategory {
  const message = String((error as { message?: unknown })?.message ?? error)
  const rules: Array<[PluginCommandErrorCategory, RegExp]> = [
    [
      'network',
      /ENOTFOUND|ECONNREFUSED|EAI_AGAIN|ETIMEDOUT|ECONNRESET|network|Could not resolve|Connection refused|timed out/i,
    ],
    ['not-found', /\b404\b|not found|does not exist|no such plugin/i],
    ['permission', /\b40[13]\b|EACCES|EPERM|permission denied|unauthorized/i],
    ['validation', /invalid|malformed|schema|validation|parse error/i],
  ]
  for (const [category, pattern] of rules) {
    if (pattern.test(message)) {
      return category
    }
  }
  return 'unknown'
}
/**
 * Emit tengu_plugin_load_failed once per error surfaced by session-start
 * plugin loading. Pairs with tengu_plugin_enabled_for_session so dashboards
 * can compute a load-success rate. PluginError.type is already a bounded
 * enum, so it feeds error_category directly.
 */
export function logPluginLoadErrors(
  errors: PluginError[],
  managedNames: Set<string> | null,
): void {
  for (const err of errors) {
    const parsed = parsePluginIdentifier(err.source)
    // Not every PluginError variant carries a plugin name (some have
    // pluginId, some are marketplace-level): prefer the 'plugin' property
    // when present, else fall back to the name parsed from err.source.
    const pluginName = 'plugin' in err && err.plugin ? err.plugin : parsed.name
    logEvent('tengu_plugin_load_failed', {
      error_category:
        err.type as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      _PROTO_plugin_name:
        pluginName as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      ...(parsed.marketplace && {
        _PROTO_marketplace_name:
          parsed.marketplace as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      }),
      ...buildPluginTelemetryFields(
        pluginName,
        parsed.marketplace,
        managedNames,
      ),
    })
  }
}

View File

@@ -0,0 +1,927 @@
/**
* Session Tracing for Claude Code using OpenTelemetry (BETA)
*
* This module provides a high-level API for creating and managing spans
* to trace Claude Code workflows. Each user interaction creates a root
* interaction span, which contains operation spans (LLM requests, tool calls, etc.).
*
* Requirements:
* - Enhanced telemetry is enabled via feature('ENHANCED_TELEMETRY_BETA')
* - Configure OTEL_TRACES_EXPORTER (console, otlp, etc.)
*/
import { feature } from 'bun:bundle'
import { context as otelContext, type Span, trace } from '@opentelemetry/api'
import { AsyncLocalStorage } from 'async_hooks'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
import type { AssistantMessage, UserMessage } from '../../types/message.js'
import { isEnvDefinedFalsy, isEnvTruthy } from '../envUtils.js'
import { getTelemetryAttributes } from '../telemetryAttributes.js'
import {
addBetaInteractionAttributes,
addBetaLLMRequestAttributes,
addBetaLLMResponseAttributes,
addBetaToolInputAttributes,
addBetaToolResultAttributes,
isBetaTracingEnabled,
type LLMRequestNewContext,
truncateContent,
} from './betaSessionTracing.js'
import {
endInteractionPerfettoSpan,
endLLMRequestPerfettoSpan,
endToolPerfettoSpan,
endUserInputPerfettoSpan,
isPerfettoTracingEnabled,
startInteractionPerfettoSpan,
startLLMRequestPerfettoSpan,
startToolPerfettoSpan,
startUserInputPerfettoSpan,
} from './perfettoTracing.js'
// Re-export for callers
export type { Span }
export { isBetaTracingEnabled, type LLMRequestNewContext }
// Message type for API calls (UserMessage or AssistantMessage)
type APIMessage = UserMessage | AssistantMessage
// Discriminator written into every span as the 'span.type' attribute.
type SpanType =
  | 'interaction'
  | 'llm_request'
  | 'tool'
  | 'tool.blocked_on_user'
  | 'tool.execution'
  | 'hook'
// Book-keeping wrapper stored per active span.
interface SpanContext {
  span: Span
  startTime: number // Date.now() at span start; used for duration + TTL eviction
  attributes: Record<string, string | number | boolean>
  ended?: boolean // guards against double span.end()
  perfettoSpanId?: string // set when Perfetto tracing is also active
}
// ALS stores SpanContext directly so it holds a strong reference while a span
// is active. With that, activeSpans can use WeakRef — when ALS is cleared
// (enterWith(undefined)) and no other code holds the SpanContext, GC can collect
// it and the WeakRef goes stale.
const interactionContext = new AsyncLocalStorage<SpanContext | undefined>()
const toolContext = new AsyncLocalStorage<SpanContext | undefined>()
const activeSpans = new Map<string, WeakRef<SpanContext>>()
// Spans not stored in ALS (LLM request, blocked-on-user, tool execution, hook)
// need a strong reference to prevent GC from collecting the SpanContext before
// the corresponding end* function retrieves it.
const strongSpans = new Map<string, SpanContext>()
// Monotonic counter emitted as 'interaction.sequence' on interaction spans.
let interactionSequence = 0
let _cleanupIntervalStarted = false
const SPAN_TTL_MS = 30 * 60 * 1000 // 30 minutes
/** Hex span id of an OTel span, or '' when the span context has none. */
function getSpanId(span: Span): string {
  const { spanId } = span.spanContext()
  return spanId || ''
}
/**
 * Lazily start a background sweep that evicts orphaned spans from activeSpans.
 *
 * Normal teardown (endInteractionSpan / endToolSpan) deletes spans
 * immediately; this interval is the safety net for spans that were never
 * ended (aborted streams, uncaught exceptions mid-query), which would
 * otherwise sit in activeSpans forever holding Span objects and their
 * OpenTelemetry context chain alive.
 *
 * Started on the first startInteractionSpan call rather than at module load
 * (avoids the no-top-level-side-effects lint rule and keeps the timer out
 * of processes that never start a span), and unref()'d so the timer never
 * keeps the process alive once all other work is done.
 */
function ensureCleanupInterval(): void {
  if (_cleanupIntervalStarted) {
    return
  }
  _cleanupIntervalStarted = true
  const timer = setInterval(() => {
    const expiredBefore = Date.now() - SPAN_TTL_MS
    for (const [spanId, ref] of activeSpans) {
      const ctx = ref.deref()
      if (ctx !== undefined && ctx.startTime >= expiredBefore) {
        continue // still alive and within TTL
      }
      if (ctx !== undefined && !ctx.ended) {
        ctx.span.end() // flush any recorded attributes to the exporter
      }
      activeSpans.delete(spanId)
      strongSpans.delete(spanId)
    }
  }, 60_000)
  // Node.js / Bun: don't let this timer block process exit.
  if (typeof timer.unref === 'function') {
    timer.unref()
  }
}
/**
 * Check if enhanced telemetry is enabled.
 * Priority: env var override > ant build > GrowthBook gate.
 * Always false when the ENHANCED_TELEMETRY_BETA build feature is off.
 */
export function isEnhancedTelemetryEnabled(): boolean {
  if (!feature('ENHANCED_TELEMETRY_BETA')) {
    return false
  }
  const override =
    process.env.CLAUDE_CODE_ENHANCED_TELEMETRY_BETA ??
    process.env.ENABLE_ENHANCED_TELEMETRY_BETA
  // An explicit env setting wins in either direction.
  if (isEnvTruthy(override)) {
    return true
  }
  if (isEnvDefinedFalsy(override)) {
    return false
  }
  if (process.env.USER_TYPE === 'ant') {
    return true
  }
  return getFeatureValue_CACHED_MAY_BE_STALE('enhanced_telemetry_beta', false)
}
/**
 * True when either tracing pipeline is active: standard enhanced telemetry
 * or the beta detailed-tracing path.
 */
function isAnyTracingEnabled(): boolean {
  if (isEnhancedTelemetryEnabled()) {
    return true
  }
  return isBetaTracingEnabled()
}
function getTracer() {
return trace.getTracer('com.anthropic.claude_code.tracing', '1.0.0')
}
/**
 * Merge the session-level telemetry attributes with the span-type tag and
 * any caller-provided attributes (caller values win on key collisions).
 */
function createSpanAttributes(
  spanType: SpanType,
  customAttributes: Record<string, string | number | boolean> = {},
): Record<string, string | number | boolean> {
  return {
    ...getTelemetryAttributes(),
    'span.type': spanType,
    ...customAttributes,
  }
}
/**
 * Start an interaction span wrapping one user request -> Claude response
 * cycle. This is a root span carrying all session-level attributes, and it
 * becomes the ALS interaction context for subsequent operations.
 *
 * The raw prompt is only recorded when OTEL_LOG_USER_PROMPTS is truthy;
 * otherwise the attribute holds '<REDACTED>' (the length is always logged).
 * When OTel tracing is off, a dummy span is returned so callers always get
 * a Span; an active Perfetto span (if enabled) is still tracked through it.
 */
export function startInteractionSpan(userPrompt: string): Span {
  ensureCleanupInterval()
  // Perfetto tracing is independent of the OTel pipelines.
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startInteractionPerfettoSpan(userPrompt)
    : undefined
  if (!isAnyTracingEnabled()) {
    const fallbackSpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
    if (!perfettoSpanId) {
      return fallbackSpan
    }
    // Track the Perfetto span through a dummy OTel span context so
    // endInteractionSpan can find and close it later.
    const ctx: SpanContext = {
      span: fallbackSpan,
      startTime: Date.now(),
      attributes: {},
      perfettoSpanId,
    }
    activeSpans.set(getSpanId(fallbackSpan), new WeakRef(ctx))
    interactionContext.enterWith(ctx)
    return fallbackSpan
  }
  const promptToLog = isEnvTruthy(process.env.OTEL_LOG_USER_PROMPTS)
    ? userPrompt
    : '<REDACTED>'
  interactionSequence++
  const attributes = createSpanAttributes('interaction', {
    user_prompt: promptToLog,
    user_prompt_length: userPrompt.length,
    'interaction.sequence': interactionSequence,
  })
  const span = getTracer().startSpan('claude_code.interaction', {
    attributes,
  })
  // Add experimental attributes (new_context)
  addBetaInteractionAttributes(span, userPrompt)
  const ctx: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(getSpanId(span), new WeakRef(ctx))
  interactionContext.enterWith(ctx)
  return span
}
export function endInteractionSpan(): void {
const spanContext = interactionContext.getStore()
if (!spanContext) {
return
}
if (spanContext.ended) {
return
}
// End Perfetto span
if (spanContext.perfettoSpanId) {
endInteractionPerfettoSpan(spanContext.perfettoSpanId)
}
if (!isAnyTracingEnabled()) {
spanContext.ended = true
activeSpans.delete(getSpanId(spanContext.span))
// Clear the store so async continuations created after this point (timers,
// promise callbacks, I/O) do not inherit a reference to the ended span.
// enterWith(undefined) is intentional: exit(() => {}) is a no-op because it
// only suppresses the store inside the callback and returns immediately.
interactionContext.enterWith(undefined)
return
}
const duration = Date.now() - spanContext.startTime
spanContext.span.setAttributes({
'interaction.duration_ms': duration,
})
spanContext.span.end()
spanContext.ended = true
activeSpans.delete(getSpanId(spanContext.span))
interactionContext.enterWith(undefined)
}
/**
 * Start a span for a single LLM API request.
 *
 * Parents the span to the current interaction when one is active in the
 * interactionContext store; otherwise marks it 'standalone'. Keep the
 * returned span and pass it back to endLLMRequestSpan() so parallel requests
 * resolve to the correct span.
 *
 * @param model Model identifier recorded as the 'model' attribute
 * @param newContext Optional request context; querySource (agent name) and
 *   beta new_context details are recorded from it
 * @param messagesForAPI Messages sent to the API, used only for beta tracing
 *   attributes
 * @param fastMode Recorded as the 'speed' attribute ('fast' vs 'normal')
 */
export function startLLMRequestSpan(
  model: string,
  newContext?: LLMRequestNewContext,
  messagesForAPI?: APIMessage[],
  fastMode?: boolean,
): Span {
  // Start Perfetto span regardless of OTel tracing state
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startLLMRequestPerfettoSpan({
        model,
        querySource: newContext?.querySource,
        messageId: undefined, // Will be set in endLLMRequestSpan
      })
    : undefined
  if (!isAnyTracingEnabled()) {
    // Still track Perfetto span even if OTel is disabled
    if (perfettoSpanId) {
      // Dummy OTel span anchors a SpanContext carrying the Perfetto id so
      // endLLMRequestSpan can close the Perfetto span later.
      const dummySpan = trace.getActiveSpan() || getTracer().startSpan('dummy')
      const spanId = getSpanId(dummySpan)
      const spanContextObj: SpanContext = {
        span: dummySpan,
        startTime: Date.now(),
        attributes: { model },
        perfettoSpanId,
      }
      activeSpans.set(spanId, new WeakRef(spanContextObj))
      strongSpans.set(spanId, spanContextObj)
      return dummySpan
    }
    return trace.getActiveSpan() || getTracer().startSpan('dummy')
  }
  const tracer = getTracer()
  const parentSpanCtx = interactionContext.getStore()
  const attributes = createSpanAttributes('llm_request', {
    model: model,
    'llm_request.context': parentSpanCtx ? 'interaction' : 'standalone',
    speed: fastMode ? 'fast' : 'normal',
  })
  const ctx = parentSpanCtx
    ? trace.setSpan(otelContext.active(), parentSpanCtx.span)
    : otelContext.active()
  const span = tracer.startSpan('claude_code.llm_request', { attributes }, ctx)
  // Add query_source (agent name) if provided
  if (newContext?.querySource) {
    span.setAttribute('query_source', newContext.querySource)
  }
  // Add experimental attributes (system prompt, new_context)
  addBetaLLMRequestAttributes(span, newContext, messagesForAPI)
  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  // strongSpans holds the context alive until endLLMRequestSpan deletes it;
  // LLM request spans have no async-local store of their own.
  activeSpans.set(spanId, new WeakRef(spanContextObj))
  strongSpans.set(spanId, spanContextObj)
  return span
}
/**
* End an LLM request span and attach response metadata.
*
* @param span - Optional. The exact span returned by startLLMRequestSpan().
* IMPORTANT: When multiple LLM requests run in parallel (e.g., warmup requests,
* topic classifier, file path extractor, main thread), you MUST pass the specific span
* to ensure responses are attached to the correct request. Without it, responses may be
* incorrectly attached to whichever span happens to be "last" in the activeSpans map.
*
* If not provided, falls back to finding the most recent llm_request span (legacy behavior).
*/
export function endLLMRequestSpan(
span?: Span,
metadata?: {
inputTokens?: number
outputTokens?: number
cacheReadTokens?: number
cacheCreationTokens?: number
success?: boolean
statusCode?: number
error?: string
attempt?: number
modelResponse?: string
/** Text output from the model (non-thinking content) */
modelOutput?: string
/** Thinking/reasoning output from the model */
thinkingOutput?: string
/** Whether the output included tool calls (look at tool spans for details) */
hasToolCall?: boolean
/** Time to first token in milliseconds */
ttftMs?: number
/** Time spent in pre-request setup before the successful attempt */
requestSetupMs?: number
/** Timestamps (Date.now()) of each attempt start — used to emit retry sub-spans */
attemptStartTimes?: number[]
},
): void {
let llmSpanContext: SpanContext | undefined
if (span) {
// Use the provided span directly - this is the correct approach for parallel requests
const spanId = getSpanId(span)
llmSpanContext = activeSpans.get(spanId)?.deref()
} else {
// Legacy fallback: find the most recent llm_request span
// WARNING: This can cause mismatched responses when multiple requests are in flight
llmSpanContext = Array.from(activeSpans.values())
.findLast(r => {
const ctx = r.deref()
return (
ctx?.attributes['span.type'] === 'llm_request' ||
ctx?.attributes['model']
)
})
?.deref()
}
if (!llmSpanContext) {
// Span was already ended or never tracked
return
}
const duration = Date.now() - llmSpanContext.startTime
// End Perfetto span with full metadata
if (llmSpanContext.perfettoSpanId) {
endLLMRequestPerfettoSpan(llmSpanContext.perfettoSpanId, {
ttftMs: metadata?.ttftMs,
ttltMs: duration, // Time to last token is the total duration
promptTokens: metadata?.inputTokens,
outputTokens: metadata?.outputTokens,
cacheReadTokens: metadata?.cacheReadTokens,
cacheCreationTokens: metadata?.cacheCreationTokens,
success: metadata?.success,
error: metadata?.error,
requestSetupMs: metadata?.requestSetupMs,
attemptStartTimes: metadata?.attemptStartTimes,
})
}
if (!isAnyTracingEnabled()) {
const spanId = getSpanId(llmSpanContext.span)
activeSpans.delete(spanId)
strongSpans.delete(spanId)
return
}
const endAttributes: Record<string, string | number | boolean> = {
duration_ms: duration,
}
if (metadata) {
if (metadata.inputTokens !== undefined)
endAttributes['input_tokens'] = metadata.inputTokens
if (metadata.outputTokens !== undefined)
endAttributes['output_tokens'] = metadata.outputTokens
if (metadata.cacheReadTokens !== undefined)
endAttributes['cache_read_tokens'] = metadata.cacheReadTokens
if (metadata.cacheCreationTokens !== undefined)
endAttributes['cache_creation_tokens'] = metadata.cacheCreationTokens
if (metadata.success !== undefined)
endAttributes['success'] = metadata.success
if (metadata.statusCode !== undefined)
endAttributes['status_code'] = metadata.statusCode
if (metadata.error !== undefined) endAttributes['error'] = metadata.error
if (metadata.attempt !== undefined)
endAttributes['attempt'] = metadata.attempt
if (metadata.hasToolCall !== undefined)
endAttributes['response.has_tool_call'] = metadata.hasToolCall
if (metadata.ttftMs !== undefined)
endAttributes['ttft_ms'] = metadata.ttftMs
// Add experimental response attributes (model_output, thinking_output)
addBetaLLMResponseAttributes(endAttributes, metadata)
}
llmSpanContext.span.setAttributes(endAttributes)
llmSpanContext.span.end()
const spanId = getSpanId(llmSpanContext.span)
activeSpans.delete(spanId)
strongSpans.delete(spanId)
}
/**
 * Start a span covering a single tool invocation.
 *
 * Parents to the current interaction span when one is active, registers the
 * new context in activeSpans, and enters the toolContext store so nested
 * spans (execution, blocked-on-user) and endToolSpan can find it.
 */
export function startToolSpan(
  toolName: string,
  toolAttributes?: Record<string, string | number | boolean>,
  toolInput?: string,
): Span {
  // Perfetto tracing runs independently of OTel tracing state.
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startToolPerfettoSpan(toolName, toolAttributes)
    : undefined
  if (!isAnyTracingEnabled()) {
    if (!perfettoSpanId) {
      return trace.getActiveSpan() || getTracer().startSpan('dummy')
    }
    // OTel off but Perfetto on: anchor the Perfetto id to a placeholder span
    // so endToolSpan can close it via the toolContext store.
    const placeholder = trace.getActiveSpan() || getTracer().startSpan('dummy')
    const placeholderCtx: SpanContext = {
      span: placeholder,
      startTime: Date.now(),
      attributes: { 'span.type': 'tool', tool_name: toolName },
      perfettoSpanId,
    }
    activeSpans.set(getSpanId(placeholder), new WeakRef(placeholderCtx))
    toolContext.enterWith(placeholderCtx)
    return placeholder
  }
  const parent = interactionContext.getStore()
  const attributes = createSpanAttributes('tool', {
    tool_name: toolName,
    ...toolAttributes,
  })
  const parentCtx = parent
    ? trace.setSpan(otelContext.active(), parent.span)
    : otelContext.active()
  const span = getTracer().startSpan('claude_code.tool', { attributes }, parentCtx)
  // Beta-only: record the (possibly truncated) tool input on the span.
  if (toolInput) {
    addBetaToolInputAttributes(span, toolName, toolInput)
  }
  const trackedCtx: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(getSpanId(span), new WeakRef(trackedCtx))
  toolContext.enterWith(trackedCtx)
  return span
}
/**
 * Start a span measuring time a tool spends waiting on the user for a
 * permission decision. Parents to the active tool span when one exists.
 */
export function startToolBlockedOnUserSpan(): Span {
  // Perfetto tracing runs independently of OTel tracing state.
  const perfettoSpanId = isPerfettoTracingEnabled()
    ? startUserInputPerfettoSpan('tool_permission')
    : undefined
  if (!isAnyTracingEnabled()) {
    if (!perfettoSpanId) {
      return trace.getActiveSpan() || getTracer().startSpan('dummy')
    }
    // OTel off but Perfetto on: anchor the Perfetto id to a placeholder span
    // so endToolBlockedOnUserSpan can close it.
    const placeholder = trace.getActiveSpan() || getTracer().startSpan('dummy')
    const placeholderId = getSpanId(placeholder)
    const placeholderCtx: SpanContext = {
      span: placeholder,
      startTime: Date.now(),
      attributes: { 'span.type': 'tool.blocked_on_user' },
      perfettoSpanId,
    }
    activeSpans.set(placeholderId, new WeakRef(placeholderCtx))
    strongSpans.set(placeholderId, placeholderCtx)
    return placeholder
  }
  const parent = toolContext.getStore()
  const attributes = createSpanAttributes('tool.blocked_on_user')
  const parentCtx = parent
    ? trace.setSpan(otelContext.active(), parent.span)
    : otelContext.active()
  const span = getTracer().startSpan(
    'claude_code.tool.blocked_on_user',
    { attributes },
    parentCtx,
  )
  const spanId = getSpanId(span)
  const trackedCtx: SpanContext = {
    span,
    startTime: Date.now(),
    attributes,
    perfettoSpanId,
  }
  activeSpans.set(spanId, new WeakRef(trackedCtx))
  strongSpans.set(spanId, trackedCtx)
  return span
}
export function endToolBlockedOnUserSpan(
decision?: string,
source?: string,
): void {
const blockedSpanContext = Array.from(activeSpans.values())
.findLast(
r => r.deref()?.attributes['span.type'] === 'tool.blocked_on_user',
)
?.deref()
if (!blockedSpanContext) {
return
}
// End Perfetto span
if (blockedSpanContext.perfettoSpanId) {
endUserInputPerfettoSpan(blockedSpanContext.perfettoSpanId, {
decision,
source,
})
}
if (!isAnyTracingEnabled()) {
const spanId = getSpanId(blockedSpanContext.span)
activeSpans.delete(spanId)
strongSpans.delete(spanId)
return
}
const duration = Date.now() - blockedSpanContext.startTime
const attributes: Record<string, string | number | boolean> = {
duration_ms: duration,
}
if (decision) {
attributes['decision'] = decision
}
if (source) {
attributes['source'] = source
}
blockedSpanContext.span.setAttributes(attributes)
blockedSpanContext.span.end()
const spanId = getSpanId(blockedSpanContext.span)
activeSpans.delete(spanId)
strongSpans.delete(spanId)
}
export function startToolExecutionSpan(): Span {
if (!isAnyTracingEnabled()) {
return trace.getActiveSpan() || getTracer().startSpan('dummy')
}
const tracer = getTracer()
const parentSpanCtx = toolContext.getStore()
const attributes = createSpanAttributes('tool.execution')
const ctx = parentSpanCtx
? trace.setSpan(otelContext.active(), parentSpanCtx.span)
: otelContext.active()
const span = tracer.startSpan(
'claude_code.tool.execution',
{ attributes },
ctx,
)
const spanId = getSpanId(span)
const spanContextObj: SpanContext = {
span,
startTime: Date.now(),
attributes,
}
activeSpans.set(spanId, new WeakRef(spanContextObj))
strongSpans.set(spanId, spanContextObj)
return span
}
export function endToolExecutionSpan(metadata?: {
success?: boolean
error?: string
}): void {
if (!isAnyTracingEnabled()) {
return
}
const executionSpanContext = Array.from(activeSpans.values())
.findLast(r => r.deref()?.attributes['span.type'] === 'tool.execution')
?.deref()
if (!executionSpanContext) {
return
}
const duration = Date.now() - executionSpanContext.startTime
const attributes: Record<string, string | number | boolean> = {
duration_ms: duration,
}
if (metadata) {
if (metadata.success !== undefined) attributes['success'] = metadata.success
if (metadata.error !== undefined) attributes['error'] = metadata.error
}
executionSpanContext.span.setAttributes(attributes)
executionSpanContext.span.end()
const spanId = getSpanId(executionSpanContext.span)
activeSpans.delete(spanId)
strongSpans.delete(spanId)
}
/**
 * End the active tool span (from the toolContext store) and attach result
 * metadata. No-op when no tool span is active. Always clears the toolContext
 * store so later async continuations don't see the ended span.
 *
 * @param toolResult Raw tool output; recorded only via beta new_context
 *   attributes
 * @param resultTokens Token count of the result, if known
 */
export function endToolSpan(toolResult?: string, resultTokens?: number): void {
  const toolSpanContext = toolContext.getStore()
  if (!toolSpanContext) {
    return
  }
  // End Perfetto span
  // NOTE(review): success is reported as true unconditionally here — tool
  // failures are not distinguished in the Perfetto trace.
  if (toolSpanContext.perfettoSpanId) {
    endToolPerfettoSpan(toolSpanContext.perfettoSpanId, {
      success: true,
      resultTokens,
    })
  }
  if (!isAnyTracingEnabled()) {
    const spanId = getSpanId(toolSpanContext.span)
    activeSpans.delete(spanId)
    // Same reasoning as interactionContext above: clear so subsequent async
    // work doesn't hold a stale reference to the ended tool span.
    toolContext.enterWith(undefined)
    return
  }
  const duration = Date.now() - toolSpanContext.startTime
  const endAttributes: Record<string, string | number | boolean> = {
    duration_ms: duration,
  }
  // Add experimental tool result attributes (new_context)
  if (toolResult) {
    const toolName = toolSpanContext.attributes['tool_name'] || 'unknown'
    addBetaToolResultAttributes(endAttributes, toolName, toolResult)
  }
  if (resultTokens !== undefined) {
    endAttributes['result_tokens'] = resultTokens
  }
  toolSpanContext.span.setAttributes(endAttributes)
  toolSpanContext.span.end()
  const spanId = getSpanId(toolSpanContext.span)
  activeSpans.delete(spanId)
  toolContext.enterWith(undefined)
}
/** True when the user opted into raw tool content logging via OTEL_LOG_TOOL_CONTENT. */
function isToolContentLoggingEnabled(): boolean {
  const optIn = process.env.OTEL_LOG_TOOL_CONTENT
  return isEnvTruthy(optIn)
}
/**
 * Add a span event with tool content/output data.
 * Only logs if OTEL_LOG_TOOL_CONTENT=1 is set.
 * Truncates content if it exceeds MAX_CONTENT_SIZE.
 */
export function addToolContentEvent(
  eventName: string,
  attributes: Record<string, string | number | boolean>,
): void {
  if (!isAnyTracingEnabled() || !isToolContentLoggingEnabled()) {
    return
  }
  const spanCtx = toolContext.getStore()
  if (!spanCtx) {
    return
  }
  // Large string values are truncated; when that happens, the truncation
  // flag and original size are recorded alongside the clipped content.
  const processed: Record<string, string | number | boolean> = {}
  for (const [key, value] of Object.entries(attributes)) {
    if (typeof value !== 'string') {
      processed[key] = value
      continue
    }
    const { content, truncated } = truncateContent(value)
    processed[key] = content
    if (truncated) {
      processed[`${key}_truncated`] = true
      processed[`${key}_original_length`] = value.length
    }
  }
  spanCtx.span.addEvent(eventName, processed)
}
/**
 * Return the innermost active span — an active tool span wins over the
 * interaction span — or null when tracing is off or nothing is active.
 */
export function getCurrentSpan(): Span | null {
  if (!isAnyTracingEnabled()) {
    return null
  }
  const toolSpan = toolContext.getStore()?.span
  if (toolSpan) {
    return toolSpan
  }
  return interactionContext.getStore()?.span ?? null
}
/**
 * Run an async function inside a new span parented to the current tool or
 * interaction context.
 *
 * The span is always ended and removed from tracking, whether fn resolves or
 * throws; Error rejections are recorded on the span before rethrowing.
 *
 * @param spanName Name for the created span
 * @param fn Work to run; receives the span (a dummy when tracing is off)
 * @param attributes Extra attributes merged into the span's attribute set
 * @returns The value fn resolves with
 */
export async function executeInSpan<T>(
  spanName: string,
  fn: (span: Span) => Promise<T>,
  attributes?: Record<string, string | number | boolean>,
): Promise<T> {
  if (!isAnyTracingEnabled()) {
    return fn(trace.getActiveSpan() || getTracer().startSpan('dummy'))
  }
  const tracer = getTracer()
  const parentSpanCtx = toolContext.getStore() ?? interactionContext.getStore()
  const finalAttributes = createSpanAttributes('tool', {
    ...attributes,
  })
  const ctx = parentSpanCtx
    ? trace.setSpan(otelContext.active(), parentSpanCtx.span)
    : otelContext.active()
  const span = tracer.startSpan(spanName, { attributes: finalAttributes }, ctx)
  const spanId = getSpanId(span)
  const spanContextObj: SpanContext = {
    span,
    startTime: Date.now(),
    attributes: finalAttributes,
  }
  activeSpans.set(spanId, new WeakRef(spanContextObj))
  strongSpans.set(spanId, spanContextObj)
  try {
    return await fn(span)
  } catch (error) {
    if (error instanceof Error) {
      span.recordException(error)
    }
    throw error
  } finally {
    // Single cleanup path for both success and failure — previously this
    // span.end() + map-delete sequence was duplicated in each branch.
    span.end()
    activeSpans.delete(spanId)
    strongSpans.delete(spanId)
  }
}
/**
* Start a hook execution span.
* Only creates a span when beta tracing is enabled.
* @param hookEvent The hook event type (e.g., 'PreToolUse', 'PostToolUse')
* @param hookName The full hook name (e.g., 'PreToolUse:Write')
* @param numHooks The number of hooks being executed
* @param hookDefinitions JSON string of hook definitions for tracing
* @returns The span (or a dummy span if tracing is disabled)
*/
export function startHookSpan(
hookEvent: string,
hookName: string,
numHooks: number,
hookDefinitions: string,
): Span {
if (!isBetaTracingEnabled()) {
return trace.getActiveSpan() || getTracer().startSpan('dummy')
}
const tracer = getTracer()
const parentSpanCtx = toolContext.getStore() ?? interactionContext.getStore()
const attributes = createSpanAttributes('hook', {
hook_event: hookEvent,
hook_name: hookName,
num_hooks: numHooks,
hook_definitions: hookDefinitions,
})
const ctx = parentSpanCtx
? trace.setSpan(otelContext.active(), parentSpanCtx.span)
: otelContext.active()
const span = tracer.startSpan('claude_code.hook', { attributes }, ctx)
const spanId = getSpanId(span)
const spanContextObj: SpanContext = {
span,
startTime: Date.now(),
attributes,
}
activeSpans.set(spanId, new WeakRef(spanContextObj))
strongSpans.set(spanId, spanContextObj)
return span
}
/**
 * End a hook execution span with outcome metadata.
 * Only does work when beta tracing is enabled.
 * @param span The span to end (returned from startHookSpan)
 * @param metadata The outcome metadata for the hook execution
 */
export function endHookSpan(
  span: Span,
  metadata?: {
    numSuccess?: number
    numBlocking?: number
    numNonBlockingError?: number
    numCancelled?: number
  },
): void {
  if (!isBetaTracingEnabled()) {
    return
  }
  const spanId = getSpanId(span)
  const tracked = activeSpans.get(spanId)?.deref()
  if (!tracked) {
    return
  }
  const endAttributes: Record<string, string | number | boolean> = {
    duration_ms: Date.now() - tracked.startTime,
  }
  // Copy only the counters that were actually reported.
  const counters: Array<[string, number | undefined]> = [
    ['num_success', metadata?.numSuccess],
    ['num_blocking', metadata?.numBlocking],
    ['num_non_blocking_error', metadata?.numNonBlockingError],
    ['num_cancelled', metadata?.numCancelled],
  ]
  for (const [key, value] of counters) {
    if (value !== undefined) {
      endAttributes[key] = value
    }
  }
  tracked.span.setAttributes(endAttributes)
  tracked.span.end()
  activeSpans.delete(spanId)
  strongSpans.delete(spanId)
}

View File

@@ -0,0 +1,39 @@
import { getSkillToolCommands } from '../../commands.js'
import {
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
type AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
logEvent,
} from '../../services/analytics/index.js'
import { getCharBudget } from '../../tools/SkillTool/prompt.js'
/**
 * Logs a tengu_skill_loaded event for each skill available at session startup.
 * This enables analytics on which skills are available across sessions.
 */
export async function logSkillsLoaded(
  cwd: string,
  contextWindowTokens: number,
): Promise<void> {
  const availableSkills = await getSkillToolCommands(cwd)
  const budget = getCharBudget(contextWindowTokens)
  for (const skill of availableSkills) {
    // Only prompt-type skills are reported.
    if (skill.type !== 'prompt') {
      continue
    }
    logEvent('tengu_skill_loaded', {
      // _PROTO_skill_name routes to the privileged skill_name BQ column.
      // Unredacted names don't go in additional_metadata.
      _PROTO_skill_name:
        skill.name as AnalyticsMetadata_I_VERIFIED_THIS_IS_PII_TAGGED,
      skill_source:
        skill.source as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      skill_loaded_from:
        skill.loadedFrom as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      skill_budget: budget,
      ...(skill.kind
        ? {
            skill_kind:
              skill.kind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
          }
        : {}),
    })
  }
}