forked from CodebuffAI/codebuff
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark-providers.ts
More file actions
644 lines (567 loc) Β· 32.6 KB
/
Copy pathbenchmark-providers.ts
File metadata and controls
644 lines (567 loc) Β· 32.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
#!/usr/bin/env bun
/**
* Combined benchmark: runs Fireworks, SiliconFlow, and CanopyWave
* 10-turn conversation caching tests in parallel, then prints a
* unified comparison table.
*
* Usage:
* bun scripts/benchmark-providers.ts
*/
export {}
// ── Pricing (same model across all providers) ──
/** Token count that the per-million dollar prices below are quoted against. */
const TOKENS_PER_MILLION = 1_000_000
/** Dollar price of one uncached prompt token ($0.30 per million). */
const INPUT_COST_PER_TOKEN = 0.3 / TOKENS_PER_MILLION
/** Dollar price of one prompt token served from the provider's cache ($0.03 per million). */
const CACHED_INPUT_COST_PER_TOKEN = 0.03 / TOKENS_PER_MILLION
/** Dollar price of one generated token ($1.20 per million). */
const OUTPUT_COST_PER_TOKEN = 1.2 / TOKENS_PER_MILLION
/** Value sent as `max_tokens` with every request. */
const MAX_TOKENS = 100
/** Number of conversation turns run against each provider. */
const NUM_TURNS = 10
// ── Provider configs ──
/** Connection details for one inference provider under test. */
interface ProviderConfig {
/** Provider name shown in console output. */
name: string
/** API root; `/chat/completions` is appended to it when a request is sent. */
baseUrl: string
/** Model identifier sent in the request body's `model` field. */
model: string
/** Name of the environment variable that holds this provider's API key. */
apiKeyEnvVar: string
}
/**
 * Providers benchmarked by this script. At startup an entry is skipped when
 * the environment variable named by its `apiKeyEnvVar` is not set.
 */
const PROVIDERS: ProviderConfig[] = [
{
name: 'Fireworks',
baseUrl: 'https://api.fireworks.ai/inference/v1',
model: 'accounts/fireworks/models/minimax-m2p5',
apiKeyEnvVar: 'FIREWORKS_API_KEY',
},
{
name: 'SiliconFlow',
baseUrl: 'https://api.siliconflow.com/v1',
model: 'MiniMaxAI/MiniMax-M2.5',
apiKeyEnvVar: 'SILICON_FLOW_API_KEY',
},
{
name: 'CanopyWave',
baseUrl: 'https://inference.canopywave.io/v1',
model: 'minimax/minimax-m2.5',
apiKeyEnvVar: 'CANOPYWAVE_API_KEY',
},
]
// ── Shared system prompt (single seed so all providers get identical input) ──
// Random token generated once at module load and interpolated into the system
// prompt, so every provider receives the same prompt text within a single run.
// NOTE(review): presumably the randomness also keeps a new run from hitting a
// provider cache warmed by an earlier run — confirm.
const SEED_STRING = `Seed: ${Math.random().toString(36).slice(2, 10)}`
const SYSTEM_PROMPT = `You are an expert software architect, technical writer, and senior engineering consultant.
${SEED_STRING}
You always respond with brief, concise answers β one or two sentences at most.
You provide practical advice grounded in real-world engineering experience.
Your areas of expertise include:
- Distributed systems design and architecture patterns (microservices, event-driven, CQRS, saga patterns, choreography vs orchestration, bulkhead pattern, circuit breaker, retry with exponential backoff, sidecar pattern, ambassador pattern, strangler fig pattern, anti-corruption layer)
- Database design and optimization (relational databases including PostgreSQL, MySQL, SQL Server; document databases including MongoDB, CouchDB, DynamoDB; graph databases including Neo4j, ArangoDB, JanusGraph; time-series databases including InfluxDB, TimescaleDB, QuestDB; wide-column stores including Cassandra, ScyllaDB, HBase; sharding strategies including hash-based, range-based, geographic; replication topologies including primary-replica, multi-primary, chain replication; connection pooling with PgBouncer, ProxySQL; query optimization techniques including index selection, query plan analysis, materialized views, covering indexes, partial indexes, expression indexes)
- Cloud infrastructure and deployment (AWS services including EC2, ECS, EKS, Lambda, S3, DynamoDB, RDS, Aurora, ElastiCache, CloudFront, Route53, IAM, VPC, SQS, SNS, Kinesis, Step Functions; GCP services including GKE, Cloud Run, Cloud Functions, BigQuery, Spanner, Pub/Sub, Cloud Storage; Azure services including AKS, Azure Functions, Cosmos DB, Azure SQL; container orchestration with Kubernetes including deployments, stateful sets, daemon sets, jobs, CronJobs, custom resource definitions, operators, Helm charts, Kustomize; infrastructure as code with Terraform, Pulumi, CloudFormation, CDK; service mesh with Istio, Linkerd, Consul Connect; load balancers including ALB, NLB, HAProxy, Nginx, Envoy; auto-scaling including HPA, VPA, KEDA, cluster autoscaler)
- Programming languages and their ecosystems (TypeScript/JavaScript with Node.js, Deno, Bun; Python with FastAPI, Django, Flask, SQLAlchemy, Pydantic; Rust with Tokio, Actix, Axum, Serde; Go with Gin, Echo, GORM; Java with Spring Boot, Quarkus, Micronaut, Hibernate; C++ with Boost, gRPC, Abseil; Kotlin with Ktor, Spring; Scala with Akka, ZIO, Cats Effect; Elixir with Phoenix, Ecto, LiveView; Haskell with Servant, Yesod, Persistent)
- API design principles (REST architectural constraints, Richardson Maturity Model, HATEOAS, content negotiation; GraphQL including schema design, resolvers, DataLoader, subscriptions, federation; gRPC including protobuf schema design, streaming patterns, interceptors, deadline propagation; WebSocket patterns for real-time communication; Server-Sent Events for unidirectional streaming; OpenAPI/Swagger specification; API versioning strategies including URL path, header, query parameter; pagination patterns including cursor-based, offset, keyset; rate limiting algorithms including token bucket, leaky bucket, sliding window; API gateway patterns)
- Security best practices (authentication protocols including OAuth 2.0, OIDC, SAML, WebAuthn, FIDO2; authorization models including RBAC, ABAC, ReBAC, PBAC; encryption at rest with AES-256, at transit with TLS 1.3; OWASP Top 10 including injection, broken authentication, sensitive data exposure, XXE, broken access control, security misconfiguration, XSS, insecure deserialization, known vulnerabilities, insufficient logging; Content Security Policy headers; CORS configuration; DDoS mitigation with WAF, rate limiting, geo-blocking; secret management with HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager; certificate management including Let's Encrypt, cert-manager, mTLS; supply chain security with SBOM, Sigstore, dependency scanning)
- Performance optimization and profiling (caching strategies including write-through, write-behind, read-through, cache-aside, refresh-ahead; cache invalidation patterns; CDN configuration with CloudFront, Fastly, Cloudflare; connection pooling for HTTP, database, Redis; async patterns including event loops, worker threads, thread pools, coroutines; WebAssembly for compute-intensive operations; JIT compilation optimization; memory profiling with heap snapshots, allocation tracking; CPU profiling with flame graphs, perf, async-profiler; load testing with k6, Locust, Artillery, Gatling; performance budgets and real user monitoring)
- Testing methodologies (unit testing with Jest, Vitest, pytest, Go testing; integration testing with Testcontainers, Docker Compose; end-to-end testing with Playwright, Cypress, Selenium; property-based testing with fast-check, Hypothesis, QuickCheck; mutation testing with Stryker, PITest; snapshot testing; contract testing with Pact, Spring Cloud Contract; chaos engineering with Chaos Monkey, Litmus, Gremlin; load testing; fuzz testing with AFL, LibFuzzer; visual regression testing; accessibility testing)
- CI/CD pipelines and DevOps practices (GitHub Actions workflows, Jenkins pipelines, GitLab CI, CircleCI; ArgoCD for GitOps; deployment strategies including blue-green, canary, rolling update, recreate; feature flag systems with LaunchDarkly, Flagsmith, Unleash; trunk-based development; semantic versioning and conventional commits; artifact management with Artifactory, Nexus, ECR, GCR; infrastructure pipeline including Terraform plan/apply, drift detection; security scanning in CI including SAST, DAST, SCA, secret scanning; release management including changelogs, release notes, semantic-release)
- Monitoring and observability (metrics collection with Prometheus, StatsD, Datadog; visualization with Grafana, Kibana; distributed tracing with Jaeger, Zipkin, Tempo, OpenTelemetry; log aggregation with Elasticsearch, Loki, CloudWatch; alerting with PagerDuty, OpsGenie, VictorOps; SLO/SLI definition and error budgets; synthetic monitoring; real user monitoring; custom business metrics; incident management processes; postmortem culture; runbook automation)
- Data engineering and analytics (stream processing with Apache Kafka, Flink, Spark Streaming, Kinesis; batch processing with Spark, Hadoop, dbt; data warehousing with Snowflake, BigQuery, Redshift, ClickHouse; data lake architecture with Delta Lake, Apache Iceberg, Apache Hudi; ETL/ELT patterns; data quality frameworks with Great Expectations, dbt tests; schema evolution and backward compatibility; data governance and lineage tracking; real-time analytics with materialized views, OLAP cubes)
- Machine learning operations (model serving with TensorFlow Serving, TorchServe, Triton; MLOps pipelines with MLflow, Kubeflow, Metaflow; feature stores with Feast, Tecton; model monitoring for drift detection; A/B testing for ML models; experiment tracking; model versioning and registry; GPU cluster management; inference optimization with quantization, pruning, distillation)
When providing responses, you follow these conventions:
- Keep answers extremely brief β one or two sentences maximum
- Be direct and actionable
- Use concrete examples over abstract advice
- Reference specific tools, libraries, or patterns by name
Additional context for this conversation:
- We are working on a high-traffic web application that serves 50 million requests per day across 3 regions
- The system needs to handle bursty traffic patterns with 10x spikes during peak hours and flash sales
- Data consistency is important but eventual consistency is acceptable for most read paths with a 5-second staleness budget
- The team is experienced with TypeScript and Node.js but open to other technologies for specific use cases
- We use PostgreSQL 16 as our primary database with logical replication to read replicas and Redis 7 Cluster for caching
- The application is deployed on Kubernetes 1.29 in a multi-region setup across US-East-1, US-West-2, and EU-West-1
- We need to maintain 99.95% uptime SLA with a target p99 latency of 150ms for API endpoints and 50ms for cached reads
- Cost optimization is a secondary concern after reliability and developer experience, but we spend $2.5M/year on infrastructure
- The codebase is approximately 750k lines of TypeScript across 80+ microservices with an additional 200k lines of Python for ML services
- We use an event-driven architecture with Kafka (3 clusters, 500+ topics) for inter-service communication with exactly-once semantics
- All services expose both REST (OpenAPI 3.1) and gRPC (protobuf v3) endpoints with automatic code generation
- We have a comprehensive monitoring stack with Prometheus (50M time series), Grafana (200+ dashboards), Jaeger, and PagerDuty
- Database migrations are managed with Drizzle ORM with automated rollback capabilities and zero-downtime schema changes
- The frontend is a Next.js 15 application with React Server Components, streaming SSR, and partial prerendering
- We use feature flags extensively via LaunchDarkly with 500+ active flags and automated cleanup for stale flags
- The CI/CD pipeline runs 5000+ tests (unit, integration, e2e) with a target of under 8 minutes using distributed execution on BuildKite
- We practice trunk-based development with short-lived feature branches, PR previews, and automated merge queues
- The team consists of 60 engineers across 10 squads, each owning 5-12 services with clear domain boundaries
- We use a mono-repo structure managed with Turborepo and Bun workspaces with remote caching
- All inter-service communication uses Protocol Buffers for serialization with a shared schema registry and backward compatibility enforcement
- We have a custom API gateway built on Envoy that handles authentication, rate limiting, request routing, and observability injection
- The system processes approximately 100TB of data per day through our analytics pipeline (Kafka β Flink β ClickHouse + BigQuery)
- Mobile clients communicate via a BFF (Backend for Frontend) layer with GraphQL federation across 12 subgraphs
- We have a custom feature flag evaluation engine that supports complex targeting rules including percentage rollouts, user segments, and geographic targeting
- The deployment pipeline supports multi-region blue-green deployments with automated rollback on SLO violation detection
- We use HashiCorp Vault for secret management with automatic rotation policies for database credentials, API keys, and certificates
- Our observability stack includes custom instrumentation for business metrics including revenue, conversion, engagement, and error rates
- The team follows an RFC process for architectural decisions with ADRs stored in the repo and reviewed by the architecture guild
- We have a dedicated platform team of 8 engineers that maintains shared infrastructure, developer tooling, and internal SDKs
- All services implement health checks (liveness + readiness), graceful shutdown handlers, and circuit breakers via a shared middleware library
- We use PgBouncer in transaction mode for PostgreSQL connection pooling (max 500 connections per region) and Redis Cluster with 6 shards per region
- The system supports multi-tenancy with tenant isolation at the database level using row-level security and per-tenant connection pools
- We have a custom schema registry for Kafka topic schemas with backward/forward compatibility validation and automated consumer migration
- Our error handling follows a structured error taxonomy with 200+ error codes, retry policies, and dead-letter queues for unprocessable messages
- We use structured logging with JSON format, correlation IDs, and trace context propagation across all services via OpenTelemetry
- The frontend uses a design system with 300+ components maintained by a dedicated UI platform team with visual regression testing via Chromatic
- We have automated performance regression testing that runs nightly against production-like data with 10% traffic replay
- Our incident response process includes automated runbook execution, escalation policies, and post-incident review within 48 hours
- We maintain a service catalog with dependency graphs, SLO definitions, on-call schedules, and cost attribution per service
- The platform supports A/B testing with Bayesian statistical significance calculations, multi-armed bandit allocation, and segment analysis
- We use GitOps for all infrastructure management with Terraform modules in a dedicated repo and Atlantis for plan/apply workflows
- Our security posture includes weekly penetration testing, continuous dependency scanning with Snyk, SAST with Semgrep, and DAST with OWASP ZAP
- We have a data mesh architecture for analytics with 15 domain-owned data products, each with defined SLAs and data contracts
- The system supports webhook delivery with at-least-once semantics, configurable retry policies (exponential backoff up to 24h), and delivery status tracking
- We use OpenTelemetry Collector for telemetry pipeline with custom processors for PII redaction, sampling, and cost-based routing
- Our caching strategy uses L1 (in-process LRU, 100MB per pod), L2 (Redis Cluster, 500GB), and L3 (CloudFront, 30+ edge locations) with coordinated invalidation
- We maintain backward compatibility for 3 API versions simultaneously with automated deprecation notices, usage tracking, and migration guides
- The platform includes a developer portal with API documentation, SDK generation, sandbox environments, and usage analytics
- We use Temporal for workflow orchestration across 20+ long-running business processes including order fulfillment, payment processing, and user onboarding
- Our ML platform serves 50+ models in production with A/B testing, shadow mode deployment, and automated retraining pipelines
- The search infrastructure uses Elasticsearch clusters with 500M+ documents, custom analyzers, and learning-to-rank models
- We have a notification system that delivers 10M+ messages daily across email, push, SMS, and in-app channels with template management and delivery optimization
- The billing system processes $50M+ in monthly transactions with Stripe integration, usage-based billing, and revenue recognition
- We use Crossplane for provisioning cloud resources as Kubernetes custom resources with drift detection and reconciliation
- Our edge computing layer uses Cloudflare Workers for geo-routing, A/B test assignment, and personalization at the edge
- The platform includes a custom query builder for internal dashboards that generates optimized SQL for ClickHouse and PostgreSQL
- We maintain a shared protobuf definition repository with 500+ message types, automated code generation for 6 languages, and breaking change detection`
/**
 * User message for each conversation turn, in the order the turns are run.
 * Every question carries the same brevity instruction as a prefix.
 */
const TURN_PROMPTS = [
  'What is the single most important principle when designing distributed systems?',
  'What is the biggest mistake teams make when adopting microservices?',
  'When should you choose eventual consistency over strong consistency?',
  'What is the most underrated database optimization technique?',
  'What is the best approach to handle cascading failures in a microservice architecture?',
  'When is it better to use gRPC over REST?',
  'What is the most effective caching strategy for a read-heavy workload?',
  'What is the key to successful trunk-based development at scale?',
  'What metric best predicts production reliability?',
  'What is the most important thing to get right in an observability stack?',
].map((question) => `Give a brief one-sentence answer: ${question}`)
// ── Types ──
/** One chat message as sent in the request body's `messages` array. */
interface ConversationMessage {
/** Speaker of the message; this script uses 'system', 'user' and 'assistant'. */
role: string
/** Message text. */
content: string
}
/** Measurements recorded for a single conversation turn. */
interface TurnResult {
/** 1-based turn number. */
turn: number
/** Milliseconds from just before the request was sent until the turn finished or failed. */
elapsedMs: number
/** Milliseconds from request start to the first streamed chunk that carried content; unset if none arrived. */
ttftMs?: number
/** `prompt_tokens` from the reported usage (0 when not reported). */
inputTokens: number
/** `prompt_tokens_details.cached_tokens` from the reported usage (0 when not reported). */
cachedTokens: number
/** `completion_tokens` from the reported usage (0 when not reported). */
outputTokens: number
/** Output tokens divided by the seconds elapsed since the first content chunk. */
outputTokensPerSec: number
/** Dollar cost computed from the reported usage (0 when no usage was reported). */
cost: number
/** Concatenation of all streamed content deltas; empty on failure. */
responseContent: string
/** Failure description; set only when the turn did not complete. */
error?: string
}
/** All turn results for one provider plus timing totals. */
interface ProviderResult {
/** The provider these results belong to. */
provider: ProviderConfig
/** Per-turn results in execution order. */
turns: TurnResult[]
/** Sum of the `elapsedMs` of every turn. */
totalElapsedMs: number
/** Milliseconds from just before the first turn until after the last turn returned. */
wallClockMs: number
}
// ── Helpers ──
/**
 * Prices one response from its usage record.
 *
 * Reads `prompt_tokens`, `completion_tokens` and
 * `prompt_tokens_details.cached_tokens`; any of them that is not a number
 * counts as 0. Cached prompt tokens are billed at the cached rate and the
 * remaining prompt tokens (never fewer than 0) at the regular input rate.
 *
 * @param usage Usage object taken from the provider's response.
 * @returns Total dollar cost of the response.
 */
function computeCost(usage: Record<string, unknown>): number {
  const numberOrZero = (value: unknown): number => (typeof value === 'number' ? value : 0)
  const details = usage.prompt_tokens_details as Record<string, unknown> | undefined

  const promptTokens = numberOrZero(usage.prompt_tokens)
  const completionTokens = numberOrZero(usage.completion_tokens)
  const cacheHits = numberOrZero(details?.cached_tokens)

  const uncachedPromptTokens = Math.max(0, promptTokens - cacheHits)
  const inputCost = uncachedPromptTokens * INPUT_COST_PER_TOKEN
  const cachedCost = cacheHits * CACHED_INPUT_COST_PER_TOKEN
  const outputCost = completionTokens * OUTPUT_COST_PER_TOKEN
  return inputCost + cachedCost + outputCost
}
/**
 * Pulls the three token counters out of a usage record.
 *
 * @param usage Usage object taken from the provider's response.
 * @returns `prompt_tokens`, `prompt_tokens_details.cached_tokens` and
 *   `completion_tokens`, each replaced by 0 when it is not a number.
 */
function extractUsageFields(usage: Record<string, unknown>): { inputTokens: number; cachedTokens: number; outputTokens: number } {
  const numberOrZero = (value: unknown): number => (typeof value === 'number' ? value : 0)
  const details = usage.prompt_tokens_details as Record<string, unknown> | undefined
  return {
    inputTokens: numberOrZero(usage.prompt_tokens),
    cachedTokens: numberOrZero(details?.cached_tokens),
    outputTokens: numberOrZero(usage.completion_tokens),
  }
}
/** Builds the zero-valued TurnResult recorded for a turn that failed. */
function failedTurnResult(turnIndex: number, startTime: number, error: string): TurnResult {
  return {
    turn: turnIndex + 1,
    elapsedMs: Date.now() - startTime,
    inputTokens: 0,
    cachedTokens: 0,
    outputTokens: 0,
    outputTokensPerSec: 0,
    cost: 0,
    responseContent: '',
    error,
  }
}

/**
 * Sends one streaming chat-completion request and measures it.
 *
 * The response is consumed as a server-sent-event stream. Content deltas are
 * concatenated, the time of the first content delta gives TTFT, and the last
 * chunk carrying a `usage` object supplies token counts and cost.
 *
 * Failures never throw: a non-2xx status, a missing body, a network error or
 * a stream that breaks mid-read all yield a TurnResult with `error` set and
 * zeroed counters, so one bad turn cannot abort the whole benchmark.
 *
 * @param config Provider to call.
 * @param apiKey Bearer token for the provider.
 * @param messages Full conversation so far, ending with the new user message.
 * @param turnIndex 0-based turn index; reported as `turnIndex + 1`.
 * @returns Timing, token and cost measurements for the turn.
 */
async function runTurn(
  config: ProviderConfig,
  apiKey: string,
  messages: ConversationMessage[],
  turnIndex: number,
): Promise<TurnResult> {
  const startTime = Date.now()
  try {
    const response = await fetch(`${config.baseUrl}/chat/completions`, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: config.model,
        messages,
        max_tokens: MAX_TOKENS,
        stream: true,
        stream_options: { include_usage: true },
      }),
    })
    if (!response.ok) {
      const errorText = await response.text()
      return failedTurnResult(turnIndex, startTime, `${response.status}: ${errorText.slice(0, 200)}`)
    }
    const reader = response.body?.getReader()
    if (!reader) {
      return failedTurnResult(turnIndex, startTime, 'No response body reader')
    }

    const decoder = new TextDecoder()
    let streamContent = ''
    let streamUsage: Record<string, unknown> | null = null
    let firstContentChunkTime: number | undefined
    let ttftMs: number | undefined
    // Holds the tail of the stream that has not yet been terminated by '\n'.
    let pending = ''

    // Handles one complete SSE line; anything that is not a `data:` field is ignored.
    const handleLine = (line: string): void => {
      if (!line.startsWith('data:')) return
      // trim() drops the optional space after the colon and any trailing '\r'.
      const raw = line.slice('data:'.length).trim()
      if (raw === '' || raw === '[DONE]') return
      try {
        const chunk = JSON.parse(raw) as {
          choices?: Array<{ delta?: { content?: unknown } }>
          usage?: unknown
        }
        const content = chunk.choices?.[0]?.delta?.content
        if (typeof content === 'string' && content.length > 0) {
          if (firstContentChunkTime === undefined) {
            firstContentChunkTime = Date.now()
            ttftMs = firstContentChunkTime - startTime
          }
          streamContent += content
        }
        if (chunk.usage && typeof chunk.usage === 'object') {
          streamUsage = chunk.usage as Record<string, unknown>
        }
      } catch {
        // Skip payloads that are not valid JSON.
      }
    }

    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      // A network chunk can end in the middle of a line, so only lines that
      // are already newline-terminated are parsed; the rest waits in `pending`.
      pending += decoder.decode(value, { stream: true })
      const lines = pending.split('\n')
      pending = lines.pop() ?? ''
      for (const line of lines) handleLine(line)
    }
    // Flush bytes still buffered in the decoder and a final unterminated line.
    pending += decoder.decode()
    if (pending !== '') handleLine(pending)

    const endTime = Date.now()
    const elapsedMs = endTime - startTime
    const { inputTokens, cachedTokens, outputTokens } = streamUsage
      ? extractUsageFields(streamUsage)
      : { inputTokens: 0, cachedTokens: 0, outputTokens: 0 }
    // Generation speed is measured from the first content chunk when there is one.
    const generationTimeMs = firstContentChunkTime !== undefined
      ? endTime - firstContentChunkTime
      : elapsedMs
    const outputTokensPerSec = generationTimeMs > 0
      ? outputTokens / (generationTimeMs / 1000)
      : 0
    const cost = streamUsage ? computeCost(streamUsage) : 0
    return {
      turn: turnIndex + 1,
      elapsedMs,
      ttftMs,
      inputTokens,
      cachedTokens,
      outputTokens,
      outputTokensPerSec,
      cost,
      responseContent: streamContent,
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    return failedTurnResult(turnIndex, startTime, `Request failed: ${reason}`)
  }
}
/**
 * Runs the whole multi-turn conversation against one provider.
 *
 * Turns run sequentially. Each turn appends the next user prompt to the
 * history, sends a copy of the history, and appends the assistant reply when
 * the turn produced content. A turn without content leaves its user message
 * in the history with no reply after it.
 *
 * Prompts are taken from TURN_PROMPTS cyclically, so NUM_TURNS may exceed the
 * number of prompts without sending a message that has no content.
 *
 * @param config Provider to benchmark.
 * @param apiKey Bearer token for the provider.
 * @returns Per-turn results, the sum of turn durations and the wall-clock time.
 * @throws Error if TURN_PROMPTS is empty.
 */
async function runProviderBenchmark(config: ProviderConfig, apiKey: string): Promise<ProviderResult> {
  if (TURN_PROMPTS.length === 0) {
    throw new Error('TURN_PROMPTS must contain at least one prompt')
  }
  const conversationHistory: ConversationMessage[] = [
    { role: 'system', content: SYSTEM_PROMPT },
  ]
  const turns: TurnResult[] = []
  const wallStart = Date.now()
  let totalElapsedMs = 0
  for (let i = 0; i < NUM_TURNS; i++) {
    const prompt = TURN_PROMPTS[i % TURN_PROMPTS.length] ?? ''
    conversationHistory.push({ role: 'user', content: prompt })
    // Pass a copy so the request payload cannot be affected by later pushes.
    const result = await runTurn(config, apiKey, [...conversationHistory], i)
    turns.push(result)
    totalElapsedMs += result.elapsedMs
    if (result.responseContent) {
      conversationHistory.push({ role: 'assistant', content: result.responseContent })
    }
  }
  return {
    provider: config,
    turns,
    totalElapsedMs,
    wallClockMs: Date.now() - wallStart,
  }
}
// ── Formatting helpers ──
/** Right-aligns `s` in a field of `n` characters by left-padding with spaces. */
function pad(s: string, n: number): string {
  return s.padStart(n, ' ')
}
/**
 * Formats `n / d` as a percentage with one decimal place.
 * Returns '0.0%' whenever the denominator is not greater than zero.
 */
function pct(n: number, d: number): string {
  if (!(d > 0)) {
    return '0.0%'
  }
  const percentage = (n / d) * 100
  return `${percentage.toFixed(1)}%`
}
/**
 * Prints the per-turn table for one provider, followed by a totals row, the
 * cost saved by caching and TTFT statistics.
 *
 * Failed turns are printed with their error text instead of token columns and
 * contribute zeros to the totals.
 *
 * @param r Benchmark result for the provider.
 */
function printProviderSummary(r: ProviderResult) {
  const { provider, turns, totalElapsedMs } = r
  const banner = 'β'.repeat(100)
  const tableRule = ' ' + 'β'.repeat(105)
  const seconds = (ms: number): string => `${(ms / 1000).toFixed(2)}s`

  console.log()
  console.log(banner)
  console.log(` ${provider.name} | Model: ${provider.model} | Base URL: ${provider.baseUrl}`)
  console.log(banner)
  console.log()

  const headerCells = [
    'Turn'.padEnd(25),
    pad('Time', 8),
    pad('TTFT', 7),
    pad('Input', 6),
    pad('Cached', 6),
    pad('Cache%', 7),
    pad('Output', 6),
    pad('tok/s', 6),
    pad('e2e t/s', 7),
    'Cost',
  ]
  console.log(` ${headerCells.join(' | ')}`)
  console.log(tableRule)

  for (const t of turns) {
    const label = `Turn ${t.turn}/${NUM_TURNS}${t.turn === 1 ? ' (cold)' : ''}`
    const ttft = t.ttftMs !== undefined ? seconds(t.ttftMs) : 'n/a'
    const leadingCells = [label.padEnd(25), pad(seconds(t.elapsedMs), 8), pad(ttft, 7)]
    if (t.error) {
      console.log(` ${leadingCells.join(' | ')} | β ${t.error.slice(0, 60)}`)
      continue
    }
    const e2eTokSec = t.elapsedMs > 0 ? (t.outputTokens / (t.elapsedMs / 1000)).toFixed(1) : 'n/a'
    const cells = [
      ...leadingCells,
      pad(String(t.inputTokens), 6),
      pad(String(t.cachedTokens), 6),
      pad(pct(t.cachedTokens, t.inputTokens), 7),
      pad(String(t.outputTokens), 6),
      pad(t.outputTokensPerSec.toFixed(1), 6),
      pad(e2eTokSec, 7),
      `$${t.cost.toFixed(6)}`,
    ]
    console.log(` ${cells.join(' | ')}`)
  }
  console.log(tableRule)

  // Totals cover every turn; failed turns add zeros.
  const totalCost = turns.reduce((sum, t) => sum + t.cost, 0)
  const totalInput = turns.reduce((sum, t) => sum + t.inputTokens, 0)
  const totalCached = turns.reduce((sum, t) => sum + t.cachedTokens, 0)
  const totalOutput = turns.reduce((sum, t) => sum + t.outputTokens, 0)

  const totalTimeStr = seconds(totalElapsedMs)
  const overallCacheRate = pct(totalCached, totalInput)
  const overallTokSec = totalElapsedMs > 0 ? (totalOutput / (totalElapsedMs / 1000)).toFixed(1) : 'n/a'
  console.log(` ${'TOTAL'.padEnd(25)} | ${pad(totalTimeStr, 8)} | | ${pad(String(totalInput), 6)} | ${pad(String(totalCached), 6)} | ${pad(overallCacheRate, 7)} | ${pad(String(totalOutput), 6)} | | ${pad(overallTokSec, 7)} | $${totalCost.toFixed(6)}`)
  console.log()

  // Baseline: the same tokens with every prompt token billed at the uncached rate.
  const costWithoutCaching = totalInput * INPUT_COST_PER_TOKEN + totalOutput * OUTPUT_COST_PER_TOKEN
  const savings = costWithoutCaching - totalCost
  const savingsPct = costWithoutCaching > 0 ? ((savings / costWithoutCaching) * 100).toFixed(1) : '0.0'
  console.log(` Cost savings from caching: $${savings.toFixed(6)} (${savingsPct}%)`)

  const ttfts = turns.flatMap((t) => (t.ttftMs !== undefined ? [t.ttftMs] : []))
  if (ttfts.length > 0) {
    const avgTtft = ttfts.reduce((a, b) => a + b, 0) / ttfts.length
    console.log(` TTFT β avg: ${seconds(avgTtft)}, min: ${seconds(Math.min(...ttfts))}, max: ${seconds(Math.max(...ttfts))}`)
  }
}
/** Aggregate figures for one provider, as shown in the comparison table. */
interface ProviderSummary {
/** Provider name. */
name: string
/** Sum of per-turn elapsed times, in milliseconds. */
totalTime: number
/** Wall-clock duration of the provider's whole run, in milliseconds. */
wallClock: number
/** Cached input tokens as a percentage (0-100) of all input tokens. */
cacheHitRate: number
/** Percentage saved relative to `costWithoutCaching`. */
costSavings: number
/** Sum of per-turn costs, in dollars. */
totalCost: number
/** Cost of the same tokens with every input token billed at the uncached rate. */
costWithoutCaching: number
/** Mean TTFT in milliseconds over turns that reported one; null if none did. */
avgTtft: number | null
/** Same as `avgTtft` but excluding the first turn; null if no later turn reported a TTFT. */
avgWarmTtft: number | null
/** Total output tokens divided by total elapsed seconds. */
e2eTokSec: number
/** Sum of input tokens over all turns. */
totalInput: number
/** Sum of cached input tokens over all turns. */
totalCached: number
/** Sum of output tokens over all turns. */
totalOutput: number
/** Number of turns without an error that reported zero cached tokens. */
cacheMissTurns: number
/** Number of turns that ended with an error. */
errorTurns: number
}
/**
 * Collapses one provider's turn results into the figures used by the
 * comparison table.
 *
 * Token and cost totals include every turn. A turn counts as an error turn
 * when it has an error, otherwise as a cache miss when it reported zero
 * cached tokens. The "warm" TTFT average ignores the first turn.
 *
 * @param r Benchmark result for the provider.
 * @returns Aggregated summary for the provider.
 */
function summarize(r: ProviderResult): ProviderSummary {
  const allTurns: ProviderResult['turns'] = r.turns
  const total = (pick: (t: ProviderResult['turns'][number]) => number): number =>
    allTurns.reduce((sum, t) => sum + pick(t), 0)
  const meanTtft = (subset: ProviderResult['turns']): number | null => {
    const samples = subset.flatMap((t) => (t.ttftMs !== undefined ? [t.ttftMs] : []))
    return samples.length > 0 ? samples.reduce((a, b) => a + b, 0) / samples.length : null
  }

  const totalInput = total((t) => t.inputTokens)
  const totalCached = total((t) => t.cachedTokens)
  const totalOutput = total((t) => t.outputTokens)
  const totalCost = total((t) => t.cost)
  const errorTurns = allTurns.filter((t) => t.error).length
  const cacheMissTurns = allTurns.filter((t) => !t.error && t.cachedTokens === 0).length

  // Baseline: every input token billed at the uncached rate.
  const costWithoutCaching = totalInput * INPUT_COST_PER_TOKEN + totalOutput * OUTPUT_COST_PER_TOKEN
  const costSavings = costWithoutCaching > 0
    ? ((costWithoutCaching - totalCost) / costWithoutCaching) * 100
    : 0

  return {
    name: r.provider.name,
    totalTime: r.totalElapsedMs,
    wallClock: r.wallClockMs,
    cacheHitRate: totalInput > 0 ? (totalCached / totalInput) * 100 : 0,
    costSavings,
    totalCost,
    costWithoutCaching,
    avgTtft: meanTtft(allTurns),
    avgWarmTtft: meanTtft(allTurns.slice(1)),
    e2eTokSec: r.totalElapsedMs > 0 ? totalOutput / (r.totalElapsedMs / 1000) : 0,
    totalInput,
    totalCached,
    totalOutput,
    cacheMissTurns,
    errorTurns,
  }
}
/**
 * Names the provider with the best value for one numeric summary field.
 *
 * Providers are left out of the comparison when
 * - the field is null, undefined, NaN or not a number, or
 * - they recorded errors and produced no output tokens at all. Such a run has
 *   near-zero time and zero cost and would otherwise win every
 *   lower-is-better row.
 *
 * @param summaries Provider summaries to compare.
 * @param key Field to compare; expected to be one of the numeric fields.
 * @param higherIsBetter True when a larger value wins, false when a smaller one does.
 * @returns The winner label, 'tie' when several providers share the best
 *   value, or 'n/a' when no provider is eligible.
 */
function pickWinner(summaries: ProviderSummary[], key: keyof ProviderSummary, higherIsBetter: boolean): string {
  let best: ProviderSummary | null = null
  let bestVal = 0
  let tied = false
  for (const s of summaries) {
    // A run that only failed has nothing meaningful to compare.
    if (s.errorTurns > 0 && s.totalOutput === 0) continue
    const val = s[key]
    if (typeof val !== 'number' || Number.isNaN(val)) continue
    if (best === null) {
      best = s
      bestVal = val
      continue
    }
    if (val === bestVal) {
      tied = true
      continue
    }
    if (higherIsBetter ? val > bestVal : val < bestVal) {
      best = s
      bestVal = val
      tied = false
    }
  }
  if (best === null) return 'n/a'
  return tied ? 'tie' : `π ${best.name}`
}
/**
 * Prints the head-to-head table: one row per metric, one column per provider,
 * and a final column naming the winner of that metric.
 *
 * Every winner, including the two TTFT rows, is chosen by `pickWinner`, which
 * ignores providers whose value for the metric is null. The token-total rows
 * are informational and have no winner.
 *
 * @param summaries One summary per provider, in column order.
 */
function printComparisonTable(summaries: ProviderSummary[]) {
  const colWidth = 16
  const asSeconds = (ms: number): string => `${(ms / 1000).toFixed(2)}s`
  const asOptionalSeconds = (ms: number | null): string => (ms !== null ? asSeconds(ms) : 'n/a')
  const asPercent = (value: number): string => `${value.toFixed(1)}%`
  // Builds one table row from a per-provider formatter and a winner label.
  const row = (
    label: string,
    format: (s: ProviderSummary) => string,
    winner: string,
  ): { label: string; values: string[]; winner: string } => ({
    label,
    values: summaries.map(format),
    winner,
  })

  console.log()
  console.log('β'.repeat(100))
  console.log(' HEAD-TO-HEAD COMPARISON')
  console.log('β'.repeat(100))
  console.log()

  const header = ` ${'Metric'.padEnd(24)} | ${summaries.map((s) => s.name.padStart(colWidth)).join(' | ')} | Winner`
  console.log(header)
  console.log(' ' + 'β'.repeat(header.length - 3))

  const rows = [
    row('Total time', (s) => asSeconds(s.totalTime), pickWinner(summaries, 'totalTime', false)),
    row('Wall clock', (s) => asSeconds(s.wallClock), pickWinner(summaries, 'wallClock', false)),
    row('Cache hit rate', (s) => asPercent(s.cacheHitRate), pickWinner(summaries, 'cacheHitRate', true)),
    row('Cost savings', (s) => asPercent(s.costSavings), pickWinner(summaries, 'costSavings', true)),
    row('Total cost', (s) => `$${s.totalCost.toFixed(6)}`, pickWinner(summaries, 'totalCost', false)),
    row('Avg TTFT', (s) => asOptionalSeconds(s.avgTtft), pickWinner(summaries, 'avgTtft', false)),
    row('Avg warm TTFT', (s) => asOptionalSeconds(s.avgWarmTtft), pickWinner(summaries, 'avgWarmTtft', false)),
    row('e2e tok/s', (s) => s.e2eTokSec.toFixed(1), pickWinner(summaries, 'e2eTokSec', true)),
    row('Cache miss turns', (s) => `${s.cacheMissTurns}/${NUM_TURNS}`, pickWinner(summaries, 'cacheMissTurns', false)),
    row('Error turns', (s) => `${s.errorTurns}/${NUM_TURNS}`, pickWinner(summaries, 'errorTurns', false)),
    row('Total input tokens', (s) => String(s.totalInput), ''),
    row('Total output tokens', (s) => String(s.totalOutput), ''),
  ]

  for (const { label, values, winner } of rows) {
    const vals = values.map((v) => v.padStart(colWidth)).join(' | ')
    console.log(` ${label.padEnd(24)} | ${vals} | ${winner}`)
  }
  console.log()
}
// ── Main ──
/**
 * Entry point: prints the run configuration, benchmarks every provider that
 * has an API key in the environment (all in parallel), then prints the
 * per-provider tables, the comparison table and a closing summary.
 *
 * Exits with status 1 when no provider has an API key. When a provider's
 * benchmark throws, the failure is reported, the other providers' results are
 * still printed, and the process exit code is set to 1.
 */
async function main() {
  console.log(`π Combined Provider Benchmark β ${NUM_TURNS}-Turn Conversation Caching Test`)
  console.log('='.repeat(100))
  console.log(`Turns: ${NUM_TURNS}`)
  console.log(`Max tokens: ${MAX_TOKENS} per turn`)
  console.log(`Pricing: $0.30/M input, $0.03/M cached, $1.20/M output`)
  console.log(`Seed: ${SEED_STRING}`)
  console.log(`Providers: ${PROVIDERS.map((p) => p.name).join(', ')}`)
  console.log('='.repeat(100))
  console.log()

  // Validate API keys
  const validProviders: Array<{ config: ProviderConfig; apiKey: string }> = []
  const skippedProviders: string[] = []
  for (const config of PROVIDERS) {
    const apiKey = process.env[config.apiKeyEnvVar]
    if (!apiKey) {
      console.log(`β οΈ Skipping ${config.name}: ${config.apiKeyEnvVar} not set`)
      skippedProviders.push(config.name)
    } else {
      validProviders.push({ config, apiKey })
      console.log(`β ${config.name}: API key found`)
    }
  }
  if (validProviders.length === 0) {
    // Derive the list from PROVIDERS so it cannot drift from the configuration.
    const envVars = PROVIDERS.map((p) => p.apiKeyEnvVar).join(', ')
    console.error(`\nβ No API keys found. Set at least one of: ${envVars}`)
    process.exit(1)
  }

  console.log()
  console.log(`π Running ${validProviders.length} provider(s) in parallel...`)
  console.log()
  const benchmarkStart = Date.now()

  // allSettled keeps the results of healthy providers even if another one throws.
  const settled = await Promise.allSettled(
    validProviders.map(({ config, apiKey }) => runProviderBenchmark(config, apiKey)),
  )
  const benchmarkElapsed = Date.now() - benchmarkStart

  const results = settled.flatMap((outcome) => (outcome.status === 'fulfilled' ? [outcome.value] : []))
  const crashedProviders: string[] = []
  settled.forEach((outcome, index) => {
    if (outcome.status !== 'rejected') return
    const name = validProviders[index]?.config.name ?? `provider #${index + 1}`
    const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason)
    console.error(`${name} benchmark crashed: ${reason}`)
    crashedProviders.push(name)
  })

  // Print individual provider summaries
  for (const result of results) {
    printProviderSummary(result)
  }

  // Print comparison table
  if (results.length > 1) {
    printComparisonTable(results.map(summarize))
  }

  // Final summary
  console.log('β'.repeat(100))
  console.log(` Benchmark complete in ${(benchmarkElapsed / 1000).toFixed(1)}s wall clock (all providers ran in parallel)`)
  if (skippedProviders.length > 0) {
    console.log(` Skipped: ${skippedProviders.join(', ')}`)
  }
  if (crashedProviders.length > 0) {
    console.log(` Crashed: ${crashedProviders.join(', ')}`)
    process.exitCode = 1
  }
  console.log('β'.repeat(100))
  console.log()
  console.log('Done!')
}

// Report unexpected failures instead of leaving an unhandled promise rejection.
main().catch((err: unknown) => {
  console.error(err)
  process.exit(1)
})