langfuse

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation, RegressionError, RunnerContext
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from ._version import __version__
32from .span_filter import (
33    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
34    is_default_export_span,
35    is_genai_span,
36    is_known_llm_instrumentor,
37    is_langfuse_span,
38)
39from .types import (
40    MaskOtelSpansFunction,
41    MaskOtelSpansParams,
42    MaskOtelSpansResult,
43    OtelSpanData,
44    OtelSpanIdentifier,
45    OtelSpanPatch,
46)
47
# Re-export the client class at package level by reading it off the imported
# `_client.client` module object.
Langfuse = _client_module.Langfuse

# Public API of the package. The order of the entries is preserved as-is;
# the comments below only indicate where each group of names is imported from.
__all__ = [
    # Client class and helpers imported from the `_client` subpackage.
    "Langfuse",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    # Observation wrapper classes from `._client.span`
    # (plus the span attribute constants from `._client.attributes`).
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    # Names imported from `langfuse.experiment` and `langfuse.batch_evaluation`.
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    # Span-export filter helpers from `.span_filter`.
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    # OTEL span masking types from `.types`.
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # NOTE(review): no name called `experiment` or `api` is bound explicitly in
    # this module; presumably these refer to the `langfuse.experiment` and
    # `langfuse.api` submodules loaded as a side effect of the imports above —
    # confirm.
    "experiment",
    "api",
]
class Langfuse:
 147class Langfuse:
 148    """Main client for Langfuse tracing and platform features.
 149
 150    This class provides an interface for creating and managing traces, spans,
 151    and generations in Langfuse as well as interacting with the Langfuse API.
 152
 153    The client features a thread-safe singleton pattern for each unique public API key,
 154    ensuring consistent trace context propagation across your application. It implements
 155    efficient batching of spans with configurable flush settings and includes background
 156    thread management for media uploads and score ingestion.
 157
 158    Configuration is flexible through either direct parameters or environment variables,
 159    with graceful fallbacks and runtime configuration updates.
 160
 161    Attributes:
 162        api: Synchronous API client for Langfuse backend communication
 163        async_api: Asynchronous API client for Langfuse backend communication
 164        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 165
 166    Parameters:
 167        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 168        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 169        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 170        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds. Can also be set via LANGFUSE_TIMEOUT environment variable.
 172        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 173        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 174        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 175        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 176        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 177        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release. Can also be set via LANGFUSE_RELEASE environment variable.
 179        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 180        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 181        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
 182        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
 183
 184            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.
 185
 186            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.
 187
 188            Example:
 189                ```python
 190                from typing import Optional
 191
 192                from langfuse import Langfuse
 193                from langfuse.types import (
 194                    MaskOtelSpansParams,
 195                    MaskOtelSpansResult,
 196                    OtelSpanPatch,
 197                )
 198
 199                def mask_otel_spans(
 200                    *, params: MaskOtelSpansParams
 201                ) -> Optional[MaskOtelSpansResult]:
 202                    patches = {}
 203
 204                    for identifier, span in params.spans.items():
 205                        if "gen_ai.prompt.0.content" in span.attributes:
 206                            patches[identifier] = OtelSpanPatch(
 207                                delete_attributes=("gen_ai.prompt.0.content",),
 208                                set_attributes={"masking.applied": True},
 209                            )
 210
 211                    return MaskOtelSpansResult(span_patches=patches)
 212
 213                langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
 214                ```
 215        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
 216            ```python
 217            from langfuse.span_filter import is_default_export_span
 218            blocked = {"sqlite", "requests"}
 219
 220            should_export_span = lambda span: (
 221                is_default_export_span(span)
 222                and (
 223                    span.instrumentation_scope is None
 224                    or span.instrumentation_scope.name not in blocked
 225                )
 226            )
 227            ```
 228        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
 229        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
 230        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
 231        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
 232
 233    Example:
 234        ```python
        from langfuse import Langfuse
 236
 237        # Initialize the client (reads from env vars if not provided)
 238        langfuse = Langfuse(
 239            public_key="your-public-key",
 240            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
 242        )
 243
 244        # Create a trace span
 245        with langfuse.start_as_current_observation(name="process-query") as span:
 246            # Your application code here
 247
 248            # Create a nested generation span for an LLM call
 249            with span.start_as_current_generation(
 250                name="generate-response",
 251                model="gpt-4",
 252                input={"query": "Tell me about AI"},
 253                model_parameters={"temperature": 0.7, "max_tokens": 500}
 254            ) as generation:
 255                # Generate response here
 256                response = "AI is a field of computer science..."
 257
 258                generation.update(
 259                    output=response,
 260                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 261                    cost_details={"total_cost": 0.0023}
 262                )
 263
 264                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 265                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 266        ```
 267    """
 268
 269    _resources: Optional[LangfuseResourceManager] = None
 270    _mask: Optional[MaskFunction] = None
 271    _otel_tracer: otel_trace_api.Tracer
 272
 273    def __init__(
 274        self,
 275        *,
 276        public_key: Optional[str] = None,
 277        secret_key: Optional[str] = None,
 278        base_url: Optional[str] = None,
 279        host: Optional[str] = None,
 280        timeout: Optional[int] = None,
 281        httpx_client: Optional[httpx.Client] = None,
 282        debug: bool = False,
 283        tracing_enabled: Optional[bool] = True,
 284        flush_at: Optional[int] = None,
 285        flush_interval: Optional[float] = None,
 286        environment: Optional[str] = None,
 287        release: Optional[str] = None,
 288        media_upload_thread_count: Optional[int] = None,
 289        sample_rate: Optional[float] = None,
 290        mask: Optional[MaskFunction] = None,
 291        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
 292        blocked_instrumentation_scopes: Optional[List[str]] = None,
 293        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 294        additional_headers: Optional[Dict[str, str]] = None,
 295        tracer_provider: Optional[TracerProvider] = None,
 296        span_exporter: Optional[SpanExporter] = None,
 297    ):
 298        self._base_url = (
 299            base_url
 300            or os.environ.get(LANGFUSE_BASE_URL)
 301            or host
 302            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 303        )
 304        self._environment = environment or cast(
 305            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 306        )
 307        self._release = (
 308            release
 309            or os.environ.get(LANGFUSE_RELEASE, None)
 310            or get_common_release_envs()
 311        )
 312        self._project_id: Optional[str] = None
 313        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 314        if not 0.0 <= sample_rate <= 1.0:
 315            raise ValueError(
 316                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 317            )
 318
 319        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 320
 321        self._tracing_enabled = (
 322            tracing_enabled
 323            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 324        )
 325        if not self._tracing_enabled:
 326            langfuse_logger.info(
 327                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 328            )
 329
 330        debug = (
 331            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 332        )
 333        if debug:
 334            logging.basicConfig(
 335                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 336            )
 337            langfuse_logger.setLevel(logging.DEBUG)
 338
 339        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 340        if public_key is None:
 341            langfuse_logger.warning(
 342                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 343                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 344            )
 345            self._otel_tracer = otel_trace_api.NoOpTracer()
 346            return
 347
 348        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 349        if secret_key is None:
 350            langfuse_logger.warning(
 351                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 352                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 353            )
 354            self._otel_tracer = otel_trace_api.NoOpTracer()
 355            return
 356
 357        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 358            langfuse_logger.warning(
 359                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 360            )
 361
 362        if blocked_instrumentation_scopes is not None:
 363            warnings.warn(
 364                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 365                "Use `should_export_span` instead. Example: "
 366                "from langfuse.span_filter import is_default_export_span; "
 367                'blocked={"scope"}; should_export_span=lambda span: '
 368                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 369                "span.instrumentation_scope.name not in blocked).",
 370                DeprecationWarning,
 371                stacklevel=2,
 372            )
 373
 374        # Initialize api and tracer if requirements are met
 375        self._resources = LangfuseResourceManager(
 376            public_key=public_key,
 377            secret_key=secret_key,
 378            base_url=self._base_url,
 379            timeout=timeout,
 380            environment=self._environment,
 381            release=release,
 382            flush_at=flush_at,
 383            flush_interval=flush_interval,
 384            httpx_client=httpx_client,
 385            media_upload_thread_count=media_upload_thread_count,
 386            sample_rate=sample_rate,
 387            mask=mask,
 388            mask_otel_spans=mask_otel_spans,
 389            tracing_enabled=self._tracing_enabled,
 390            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 391            should_export_span=should_export_span,
 392            additional_headers=additional_headers,
 393            tracer_provider=tracer_provider,
 394            span_exporter=span_exporter,
 395        )
 396        self._mask = self._resources.mask
 397
 398        self._otel_tracer = (
 399            self._resources.tracer
 400            if self._tracing_enabled and self._resources.tracer is not None
 401            else otel_trace_api.NoOpTracer()
 402        )
 403        self.api = self._resources.api
 404        self.async_api = self._resources.async_api
 405
 406    @overload
 407    def start_observation(
 408        self,
 409        *,
 410        trace_context: Optional[TraceContext] = None,
 411        name: str,
 412        as_type: Literal["generation"],
 413        input: Optional[Any] = None,
 414        output: Optional[Any] = None,
 415        metadata: Optional[Any] = None,
 416        version: Optional[str] = None,
 417        level: Optional[SpanLevel] = None,
 418        status_message: Optional[str] = None,
 419        completion_start_time: Optional[datetime] = None,
 420        model: Optional[str] = None,
 421        model_parameters: Optional[Dict[str, MapValue]] = None,
 422        usage_details: Optional[Dict[str, int]] = None,
 423        cost_details: Optional[Dict[str, float]] = None,
 424        prompt: Optional[PromptClient] = None,
 425    ) -> LangfuseGeneration: ...
 426
 427    @overload
 428    def start_observation(
 429        self,
 430        *,
 431        trace_context: Optional[TraceContext] = None,
 432        name: str,
 433        as_type: Literal["span"] = "span",
 434        input: Optional[Any] = None,
 435        output: Optional[Any] = None,
 436        metadata: Optional[Any] = None,
 437        version: Optional[str] = None,
 438        level: Optional[SpanLevel] = None,
 439        status_message: Optional[str] = None,
 440    ) -> LangfuseSpan: ...
 441
 442    @overload
 443    def start_observation(
 444        self,
 445        *,
 446        trace_context: Optional[TraceContext] = None,
 447        name: str,
 448        as_type: Literal["agent"],
 449        input: Optional[Any] = None,
 450        output: Optional[Any] = None,
 451        metadata: Optional[Any] = None,
 452        version: Optional[str] = None,
 453        level: Optional[SpanLevel] = None,
 454        status_message: Optional[str] = None,
 455    ) -> LangfuseAgent: ...
 456
 457    @overload
 458    def start_observation(
 459        self,
 460        *,
 461        trace_context: Optional[TraceContext] = None,
 462        name: str,
 463        as_type: Literal["tool"],
 464        input: Optional[Any] = None,
 465        output: Optional[Any] = None,
 466        metadata: Optional[Any] = None,
 467        version: Optional[str] = None,
 468        level: Optional[SpanLevel] = None,
 469        status_message: Optional[str] = None,
 470    ) -> LangfuseTool: ...
 471
 472    @overload
 473    def start_observation(
 474        self,
 475        *,
 476        trace_context: Optional[TraceContext] = None,
 477        name: str,
 478        as_type: Literal["chain"],
 479        input: Optional[Any] = None,
 480        output: Optional[Any] = None,
 481        metadata: Optional[Any] = None,
 482        version: Optional[str] = None,
 483        level: Optional[SpanLevel] = None,
 484        status_message: Optional[str] = None,
 485    ) -> LangfuseChain: ...
 486
 487    @overload
 488    def start_observation(
 489        self,
 490        *,
 491        trace_context: Optional[TraceContext] = None,
 492        name: str,
 493        as_type: Literal["retriever"],
 494        input: Optional[Any] = None,
 495        output: Optional[Any] = None,
 496        metadata: Optional[Any] = None,
 497        version: Optional[str] = None,
 498        level: Optional[SpanLevel] = None,
 499        status_message: Optional[str] = None,
 500    ) -> LangfuseRetriever: ...
 501
 502    @overload
 503    def start_observation(
 504        self,
 505        *,
 506        trace_context: Optional[TraceContext] = None,
 507        name: str,
 508        as_type: Literal["evaluator"],
 509        input: Optional[Any] = None,
 510        output: Optional[Any] = None,
 511        metadata: Optional[Any] = None,
 512        version: Optional[str] = None,
 513        level: Optional[SpanLevel] = None,
 514        status_message: Optional[str] = None,
 515    ) -> LangfuseEvaluator: ...
 516
 517    @overload
 518    def start_observation(
 519        self,
 520        *,
 521        trace_context: Optional[TraceContext] = None,
 522        name: str,
 523        as_type: Literal["embedding"],
 524        input: Optional[Any] = None,
 525        output: Optional[Any] = None,
 526        metadata: Optional[Any] = None,
 527        version: Optional[str] = None,
 528        level: Optional[SpanLevel] = None,
 529        status_message: Optional[str] = None,
 530        completion_start_time: Optional[datetime] = None,
 531        model: Optional[str] = None,
 532        model_parameters: Optional[Dict[str, MapValue]] = None,
 533        usage_details: Optional[Dict[str, int]] = None,
 534        cost_details: Optional[Dict[str, float]] = None,
 535        prompt: Optional[PromptClient] = None,
 536    ) -> LangfuseEmbedding: ...
 537
 538    @overload
 539    def start_observation(
 540        self,
 541        *,
 542        trace_context: Optional[TraceContext] = None,
 543        name: str,
 544        as_type: Literal["guardrail"],
 545        input: Optional[Any] = None,
 546        output: Optional[Any] = None,
 547        metadata: Optional[Any] = None,
 548        version: Optional[str] = None,
 549        level: Optional[SpanLevel] = None,
 550        status_message: Optional[str] = None,
 551    ) -> LangfuseGuardrail: ...
 552
 553    def start_observation(
 554        self,
 555        *,
 556        trace_context: Optional[TraceContext] = None,
 557        name: str,
 558        as_type: ObservationTypeLiteralNoEvent = "span",
 559        input: Optional[Any] = None,
 560        output: Optional[Any] = None,
 561        metadata: Optional[Any] = None,
 562        version: Optional[str] = None,
 563        level: Optional[SpanLevel] = None,
 564        status_message: Optional[str] = None,
 565        completion_start_time: Optional[datetime] = None,
 566        model: Optional[str] = None,
 567        model_parameters: Optional[Dict[str, MapValue]] = None,
 568        usage_details: Optional[Dict[str, int]] = None,
 569        cost_details: Optional[Dict[str, float]] = None,
 570        prompt: Optional[PromptClient] = None,
 571    ) -> Union[
 572        LangfuseSpan,
 573        LangfuseGeneration,
 574        LangfuseAgent,
 575        LangfuseTool,
 576        LangfuseChain,
 577        LangfuseRetriever,
 578        LangfuseEvaluator,
 579        LangfuseEmbedding,
 580        LangfuseGuardrail,
 581    ]:
 582        """Create a new observation of the specified type.
 583
 584        This method creates a new observation but does not set it as the current span in the
 585        context. To create and use an observation within a context, use start_as_current_observation().
 586
 587        Args:
 588            trace_context: Optional context for connecting to an existing trace
 589            name: Name of the observation
 590            as_type: Type of observation to create (defaults to "span")
 591            input: Input data for the operation
 592            output: Output data from the operation
 593            metadata: Additional metadata to associate with the observation
 594            version: Version identifier for the code or component
 595            level: Importance level of the observation
 596            status_message: Optional status message for the observation
 597            completion_start_time: When the model started generating (for generation types)
 598            model: Name/identifier of the AI model used (for generation types)
 599            model_parameters: Parameters used for the model (for generation types)
 600            usage_details: Token usage information (for generation types)
 601            cost_details: Cost information (for generation types)
 602            prompt: Associated prompt template (for generation types)
 603
 604        Returns:
 605            An observation object of the appropriate type that must be ended with .end()
 606        """
 607        if trace_context:
 608            trace_id = trace_context.get("trace_id", None)
 609            parent_span_id = trace_context.get("parent_span_id", None)
 610
 611            if trace_id:
 612                remote_parent_span = self._create_remote_parent_span(
 613                    trace_id=trace_id, parent_span_id=parent_span_id
 614                )
 615
 616                with otel_trace_api.use_span(
 617                    cast(otel_trace_api.Span, remote_parent_span)
 618                ):
 619                    otel_span = self._otel_tracer.start_span(name=name)
 620                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 621
 622                    return self._create_observation_from_otel_span(
 623                        otel_span=otel_span,
 624                        as_type=as_type,
 625                        input=input,
 626                        output=output,
 627                        metadata=metadata,
 628                        version=version,
 629                        level=level,
 630                        status_message=status_message,
 631                        completion_start_time=completion_start_time,
 632                        model=model,
 633                        model_parameters=model_parameters,
 634                        usage_details=usage_details,
 635                        cost_details=cost_details,
 636                        prompt=prompt,
 637                    )
 638
 639        otel_span = self._otel_tracer.start_span(name=name)
 640
 641        return self._create_observation_from_otel_span(
 642            otel_span=otel_span,
 643            as_type=as_type,
 644            input=input,
 645            output=output,
 646            metadata=metadata,
 647            version=version,
 648            level=level,
 649            status_message=status_message,
 650            completion_start_time=completion_start_time,
 651            model=model,
 652            model_parameters=model_parameters,
 653            usage_details=usage_details,
 654            cost_details=cost_details,
 655            prompt=prompt,
 656        )
 657
 658    def _create_observation_from_otel_span(
 659        self,
 660        *,
 661        otel_span: otel_trace_api.Span,
 662        as_type: ObservationTypeLiteralNoEvent,
 663        input: Optional[Any] = None,
 664        output: Optional[Any] = None,
 665        metadata: Optional[Any] = None,
 666        version: Optional[str] = None,
 667        level: Optional[SpanLevel] = None,
 668        status_message: Optional[str] = None,
 669        completion_start_time: Optional[datetime] = None,
 670        model: Optional[str] = None,
 671        model_parameters: Optional[Dict[str, MapValue]] = None,
 672        usage_details: Optional[Dict[str, int]] = None,
 673        cost_details: Optional[Dict[str, float]] = None,
 674        prompt: Optional[PromptClient] = None,
 675    ) -> Union[
 676        LangfuseSpan,
 677        LangfuseGeneration,
 678        LangfuseAgent,
 679        LangfuseTool,
 680        LangfuseChain,
 681        LangfuseRetriever,
 682        LangfuseEvaluator,
 683        LangfuseEmbedding,
 684        LangfuseGuardrail,
 685    ]:
 686        """Create the appropriate observation type from an OTEL span."""
 687        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 688            observation_class = self._get_span_class(as_type)
 689            # Type ignore to prevent overloads of internal _get_span_class function,
 690            # issue is that LangfuseEvent could be returned and that classes have diff. args
 691            return observation_class(  # type: ignore[return-value,call-arg]
 692                otel_span=otel_span,
 693                langfuse_client=self,
 694                environment=self._environment,
 695                release=self._release,
 696                input=input,
 697                output=output,
 698                metadata=metadata,
 699                version=version,
 700                level=level,
 701                status_message=status_message,
 702                completion_start_time=completion_start_time,
 703                model=model,
 704                model_parameters=model_parameters,
 705                usage_details=usage_details,
 706                cost_details=cost_details,
 707                prompt=prompt,
 708            )
 709        else:
 710            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 711            observation_class = self._get_span_class(as_type)
 712            # Type ignore to prevent overloads of internal _get_span_class function,
 713            # issue is that LangfuseEvent could be returned and that classes have diff. args
 714            return observation_class(  # type: ignore[return-value,call-arg]
 715                otel_span=otel_span,
 716                langfuse_client=self,
 717                environment=self._environment,
 718                release=self._release,
 719                input=input,
 720                output=output,
 721                metadata=metadata,
 722                version=version,
 723                level=level,
 724                status_message=status_message,
 725            )
 726            # span._observation_type = as_type
 727            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 728            # return span
 729
    # Typing-only overloads for start_as_current_observation. Each one maps a
    # literal `as_type` value to the observation class the returned context
    # manager yields. Only the "generation" and "embedding" overloads accept the
    # model-related keyword arguments (completion_start_time, model,
    # model_parameters, usage_details, cost_details, prompt).

    # as_type="generation" -> yields LangfuseGeneration (model-related kwargs allowed).
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGeneration]: ...

    # as_type="span" -> yields LangfuseSpan. This is the only overload with a
    # default for `as_type`, so calls that omit `as_type` resolve to it.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseSpan]: ...

    # as_type="agent" -> yields LangfuseAgent.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseAgent]: ...

    # as_type="tool" -> yields LangfuseTool.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseTool]: ...

    # as_type="chain" -> yields LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    # as_type="retriever" -> yields LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    # as_type="evaluator" -> yields LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    # as_type="embedding" -> yields LangfuseEmbedding (model-related kwargs allowed).
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    # as_type="guardrail" -> yields LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 885
 886    def start_as_current_observation(
 887        self,
 888        *,
 889        trace_context: Optional[TraceContext] = None,
 890        name: str,
 891        as_type: ObservationTypeLiteralNoEvent = "span",
 892        input: Optional[Any] = None,
 893        output: Optional[Any] = None,
 894        metadata: Optional[Any] = None,
 895        version: Optional[str] = None,
 896        level: Optional[SpanLevel] = None,
 897        status_message: Optional[str] = None,
 898        completion_start_time: Optional[datetime] = None,
 899        model: Optional[str] = None,
 900        model_parameters: Optional[Dict[str, MapValue]] = None,
 901        usage_details: Optional[Dict[str, int]] = None,
 902        cost_details: Optional[Dict[str, float]] = None,
 903        prompt: Optional[PromptClient] = None,
 904        end_on_exit: Optional[bool] = None,
 905    ) -> Union[
 906        _AgnosticContextManager[LangfuseGeneration],
 907        _AgnosticContextManager[LangfuseSpan],
 908        _AgnosticContextManager[LangfuseAgent],
 909        _AgnosticContextManager[LangfuseTool],
 910        _AgnosticContextManager[LangfuseChain],
 911        _AgnosticContextManager[LangfuseRetriever],
 912        _AgnosticContextManager[LangfuseEvaluator],
 913        _AgnosticContextManager[LangfuseEmbedding],
 914        _AgnosticContextManager[LangfuseGuardrail],
 915    ]:
 916        """Create a new observation and set it as the current span in a context manager.
 917
 918        This method creates a new observation of the specified type and sets it as the
 919        current span within a context manager. Use this method with a 'with' statement to
 920        automatically handle the observation lifecycle within a code block.
 921
 922        The created observation will be the child of the current span in the context.
 923
 924        Args:
 925            trace_context: Optional context for connecting to an existing trace
 926            name: Name of the observation (e.g., function or operation name)
 927            as_type: Type of observation to create (defaults to "span")
 928            input: Input data for the operation (can be any JSON-serializable object)
 929            output: Output data from the operation (can be any JSON-serializable object)
 930            metadata: Additional metadata to associate with the observation
 931            version: Version identifier for the code or component
 932            level: Importance level of the observation (info, warning, error)
 933            status_message: Optional status message for the observation
 934            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 935
 936            The following parameters are available when as_type is: "generation" or "embedding".
 937            completion_start_time: When the model started generating the response
 938            model: Name/identifier of the AI model used (e.g., "gpt-4")
 939            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 940            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 941            cost_details: Cost information for the model call
 942            prompt: Associated prompt template from Langfuse prompt management
 943
 944        Returns:
 945            A context manager that yields the appropriate observation type based on as_type
 946
 947        Example:
 948            ```python
 949            # Create a span
 950            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 951                # Do work
 952                result = process_data()
 953                span.update(output=result)
 954
 955                # Create a child span automatically
 956                with span.start_as_current_observation(name="sub-operation") as child_span:
 957                    # Do sub-operation work
 958                    child_span.update(output="sub-result")
 959
 960            # Create a tool observation
 961            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 962                # Do tool work
 963                results = search_web(query)
 964                tool.update(output=results)
 965
 966            # Create a generation observation
 967            with langfuse.start_as_current_observation(
 968                name="answer-generation",
 969                as_type="generation",
 970                model="gpt-4"
 971            ) as generation:
 972                # Generate answer
 973                response = llm.generate(...)
 974                generation.update(output=response)
 975            ```
 976        """
 977        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 978            if trace_context:
 979                trace_id = trace_context.get("trace_id", None)
 980                parent_span_id = trace_context.get("parent_span_id", None)
 981
 982                if trace_id:
 983                    remote_parent_span = self._create_remote_parent_span(
 984                        trace_id=trace_id, parent_span_id=parent_span_id
 985                    )
 986
 987                    return cast(
 988                        Union[
 989                            _AgnosticContextManager[LangfuseGeneration],
 990                            _AgnosticContextManager[LangfuseEmbedding],
 991                        ],
 992                        self._create_span_with_parent_context(
 993                            as_type=as_type,
 994                            name=name,
 995                            remote_parent_span=remote_parent_span,
 996                            parent=None,
 997                            end_on_exit=end_on_exit,
 998                            input=input,
 999                            output=output,
1000                            metadata=metadata,
1001                            version=version,
1002                            level=level,
1003                            status_message=status_message,
1004                            completion_start_time=completion_start_time,
1005                            model=model,
1006                            model_parameters=model_parameters,
1007                            usage_details=usage_details,
1008                            cost_details=cost_details,
1009                            prompt=prompt,
1010                        ),
1011                    )
1012
1013            return cast(
1014                Union[
1015                    _AgnosticContextManager[LangfuseGeneration],
1016                    _AgnosticContextManager[LangfuseEmbedding],
1017                ],
1018                self._start_as_current_otel_span_with_processed_media(
1019                    as_type=as_type,
1020                    name=name,
1021                    end_on_exit=end_on_exit,
1022                    input=input,
1023                    output=output,
1024                    metadata=metadata,
1025                    version=version,
1026                    level=level,
1027                    status_message=status_message,
1028                    completion_start_time=completion_start_time,
1029                    model=model,
1030                    model_parameters=model_parameters,
1031                    usage_details=usage_details,
1032                    cost_details=cost_details,
1033                    prompt=prompt,
1034                ),
1035            )
1036
1037        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1038            if trace_context:
1039                trace_id = trace_context.get("trace_id", None)
1040                parent_span_id = trace_context.get("parent_span_id", None)
1041
1042                if trace_id:
1043                    remote_parent_span = self._create_remote_parent_span(
1044                        trace_id=trace_id, parent_span_id=parent_span_id
1045                    )
1046
1047                    return cast(
1048                        Union[
1049                            _AgnosticContextManager[LangfuseSpan],
1050                            _AgnosticContextManager[LangfuseAgent],
1051                            _AgnosticContextManager[LangfuseTool],
1052                            _AgnosticContextManager[LangfuseChain],
1053                            _AgnosticContextManager[LangfuseRetriever],
1054                            _AgnosticContextManager[LangfuseEvaluator],
1055                            _AgnosticContextManager[LangfuseGuardrail],
1056                        ],
1057                        self._create_span_with_parent_context(
1058                            as_type=as_type,
1059                            name=name,
1060                            remote_parent_span=remote_parent_span,
1061                            parent=None,
1062                            end_on_exit=end_on_exit,
1063                            input=input,
1064                            output=output,
1065                            metadata=metadata,
1066                            version=version,
1067                            level=level,
1068                            status_message=status_message,
1069                        ),
1070                    )
1071
1072            return cast(
1073                Union[
1074                    _AgnosticContextManager[LangfuseSpan],
1075                    _AgnosticContextManager[LangfuseAgent],
1076                    _AgnosticContextManager[LangfuseTool],
1077                    _AgnosticContextManager[LangfuseChain],
1078                    _AgnosticContextManager[LangfuseRetriever],
1079                    _AgnosticContextManager[LangfuseEvaluator],
1080                    _AgnosticContextManager[LangfuseGuardrail],
1081                ],
1082                self._start_as_current_otel_span_with_processed_media(
1083                    as_type=as_type,
1084                    name=name,
1085                    end_on_exit=end_on_exit,
1086                    input=input,
1087                    output=output,
1088                    metadata=metadata,
1089                    version=version,
1090                    level=level,
1091                    status_message=status_message,
1092                ),
1093            )
1094
1095        # This should never be reached since all valid types are handled above
1096        langfuse_logger.warning(
1097            f"Unknown observation type: {as_type}, falling back to span"
1098        )
1099        return self._start_as_current_otel_span_with_processed_media(
1100            as_type="span",
1101            name=name,
1102            end_on_exit=end_on_exit,
1103            input=input,
1104            output=output,
1105            metadata=metadata,
1106            version=version,
1107            level=level,
1108            status_message=status_message,
1109        )
1110
1111    def _get_span_class(
1112        self,
1113        as_type: str,
1114    ) -> Union[
1115        Type[LangfuseAgent],
1116        Type[LangfuseTool],
1117        Type[LangfuseChain],
1118        Type[LangfuseRetriever],
1119        Type[LangfuseEvaluator],
1120        Type[LangfuseEmbedding],
1121        Type[LangfuseGuardrail],
1122        Type[LangfuseGeneration],
1123        Type[LangfuseEvent],
1124        Type[LangfuseSpan],
1125    ]:
1126        """Get the appropriate span class based on as_type."""
1127        normalized_type = as_type.lower()
1128
1129        if normalized_type == "agent":
1130            return LangfuseAgent
1131        elif normalized_type == "tool":
1132            return LangfuseTool
1133        elif normalized_type == "chain":
1134            return LangfuseChain
1135        elif normalized_type == "retriever":
1136            return LangfuseRetriever
1137        elif normalized_type == "evaluator":
1138            return LangfuseEvaluator
1139        elif normalized_type == "embedding":
1140            return LangfuseEmbedding
1141        elif normalized_type == "guardrail":
1142            return LangfuseGuardrail
1143        elif normalized_type == "generation":
1144            return LangfuseGeneration
1145        elif normalized_type == "event":
1146            return LangfuseEvent
1147        elif normalized_type == "span":
1148            return LangfuseSpan
1149        else:
1150            return LangfuseSpan
1151
1152    @staticmethod
1153    def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str:
1154        if not otel_span.is_recording():
1155            return "span"
1156
1157        attributes = getattr(otel_span, "attributes", None)
1158        if attributes is None or not hasattr(attributes, "get"):
1159            return "span"
1160
1161        observation_type = attributes.get(
1162            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1163        )
1164
1165        return observation_type if isinstance(observation_type, str) else "span"
1166
1167    @_agnosticcontextmanager
1168    def _create_span_with_parent_context(
1169        self,
1170        *,
1171        name: str,
1172        parent: Optional[otel_trace_api.Span] = None,
1173        remote_parent_span: Optional[otel_trace_api.Span] = None,
1174        as_type: ObservationTypeLiteralNoEvent,
1175        end_on_exit: Optional[bool] = None,
1176        input: Optional[Any] = None,
1177        output: Optional[Any] = None,
1178        metadata: Optional[Any] = None,
1179        version: Optional[str] = None,
1180        level: Optional[SpanLevel] = None,
1181        status_message: Optional[str] = None,
1182        completion_start_time: Optional[datetime] = None,
1183        model: Optional[str] = None,
1184        model_parameters: Optional[Dict[str, MapValue]] = None,
1185        usage_details: Optional[Dict[str, int]] = None,
1186        cost_details: Optional[Dict[str, float]] = None,
1187        prompt: Optional[PromptClient] = None,
1188    ) -> Any:
1189        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1190
1191        with otel_trace_api.use_span(parent_span):
1192            with self._start_as_current_otel_span_with_processed_media(
1193                name=name,
1194                as_type=as_type,
1195                end_on_exit=end_on_exit,
1196                input=input,
1197                output=output,
1198                metadata=metadata,
1199                version=version,
1200                level=level,
1201                status_message=status_message,
1202                completion_start_time=completion_start_time,
1203                model=model,
1204                model_parameters=model_parameters,
1205                usage_details=usage_details,
1206                cost_details=cost_details,
1207                prompt=prompt,
1208            ) as langfuse_span:
1209                if remote_parent_span is not None:
1210                    langfuse_span._otel_span.set_attribute(
1211                        LangfuseOtelSpanAttributes.AS_ROOT, True
1212                    )
1213
1214                yield langfuse_span
1215
1216    @_agnosticcontextmanager
1217    def _start_as_current_otel_span_with_processed_media(
1218        self,
1219        *,
1220        name: str,
1221        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1222        end_on_exit: Optional[bool] = None,
1223        input: Optional[Any] = None,
1224        output: Optional[Any] = None,
1225        metadata: Optional[Any] = None,
1226        version: Optional[str] = None,
1227        level: Optional[SpanLevel] = None,
1228        status_message: Optional[str] = None,
1229        completion_start_time: Optional[datetime] = None,
1230        model: Optional[str] = None,
1231        model_parameters: Optional[Dict[str, MapValue]] = None,
1232        usage_details: Optional[Dict[str, int]] = None,
1233        cost_details: Optional[Dict[str, float]] = None,
1234        prompt: Optional[PromptClient] = None,
1235    ) -> Any:
1236        with self._otel_tracer.start_as_current_span(
1237            name=name,
1238            end_on_exit=end_on_exit if end_on_exit is not None else True,
1239        ) as otel_span:
1240            baggage_token = None
1241
1242            if otel_span.is_recording():
1243                context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
1244                    trace_id=self._get_otel_trace_id(otel_span),
1245                    context=otel_context_api.get_current(),
1246                )
1247                baggage_token = otel_context_api.attach(context_with_app_root_claim)
1248
1249            span_class = self._get_span_class(
1250                as_type or "generation"
1251            )  # default was "generation"
1252
1253            try:
1254                common_args = {
1255                    "otel_span": otel_span,
1256                    "langfuse_client": self,
1257                    "environment": self._environment,
1258                    "release": self._release,
1259                    "input": input,
1260                    "output": output,
1261                    "metadata": metadata,
1262                    "version": version,
1263                    "level": level,
1264                    "status_message": status_message,
1265                }
1266
1267                if span_class in [
1268                    LangfuseGeneration,
1269                    LangfuseEmbedding,
1270                ]:
1271                    common_args.update(
1272                        {
1273                            "completion_start_time": completion_start_time,
1274                            "model": model,
1275                            "model_parameters": model_parameters,
1276                            "usage_details": usage_details,
1277                            "cost_details": cost_details,
1278                            "prompt": prompt,
1279                        }
1280                    )
1281                # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1282
1283                yield span_class(**common_args)  # type: ignore[arg-type]
1284
1285            finally:
1286                if baggage_token is not None:
1287                    _detach_context_token_safely(baggage_token)
1288
1289    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1290        current_span = otel_trace_api.get_current_span()
1291
1292        if current_span is otel_trace_api.INVALID_SPAN:
1293            langfuse_logger.warning(
1294                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1295                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1296            )
1297            return None
1298
1299        return current_span
1300
1301    def update_current_generation(
1302        self,
1303        *,
1304        name: Optional[str] = None,
1305        input: Optional[Any] = None,
1306        output: Optional[Any] = None,
1307        metadata: Optional[Any] = None,
1308        version: Optional[str] = None,
1309        level: Optional[SpanLevel] = None,
1310        status_message: Optional[str] = None,
1311        completion_start_time: Optional[datetime] = None,
1312        model: Optional[str] = None,
1313        model_parameters: Optional[Dict[str, MapValue]] = None,
1314        usage_details: Optional[Dict[str, int]] = None,
1315        cost_details: Optional[Dict[str, float]] = None,
1316        prompt: Optional[PromptClient] = None,
1317    ) -> None:
1318        """Update the current active generation span with new information.
1319
1320        This method updates the current generation span in the active context with
1321        additional information. It's useful for adding output, usage stats, or other
1322        details that become available during or after model generation.
1323
1324        Args:
1325            name: The generation name
1326            input: Updated input data for the model
1327            output: Output from the model (e.g., completions)
1328            metadata: Additional metadata to associate with the generation
1329            version: Version identifier for the model or component
1330            level: Importance level of the generation (info, warning, error)
1331            status_message: Optional status message for the generation
1332            completion_start_time: When the model started generating the response
1333            model: Name/identifier of the AI model used (e.g., "gpt-4")
1334            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1335            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1336            cost_details: Cost information for the model call
1337            prompt: Associated prompt template from Langfuse prompt management
1338
1339        Example:
1340            ```python
1341            with langfuse.start_as_current_generation(name="answer-query") as generation:
1342                # Initial setup and API call
1343                response = llm.generate(...)
1344
1345                # Update with results that weren't available at creation time
1346                langfuse.update_current_generation(
1347                    output=response.text,
1348                    usage_details={
1349                        "prompt_tokens": response.usage.prompt_tokens,
1350                        "completion_tokens": response.usage.completion_tokens
1351                    }
1352                )
1353            ```
1354        """
1355        if not self._tracing_enabled:
1356            langfuse_logger.debug(
1357                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1358            )
1359            return
1360
1361        current_otel_span = self._get_current_otel_span()
1362
1363        if current_otel_span is not None:
1364            generation = LangfuseGeneration(
1365                otel_span=current_otel_span, langfuse_client=self
1366            )
1367
1368            if name:
1369                current_otel_span.update_name(name)
1370
1371            generation.update(
1372                input=input,
1373                output=output,
1374                metadata=metadata,
1375                version=version,
1376                level=level,
1377                status_message=status_message,
1378                completion_start_time=completion_start_time,
1379                model=model,
1380                model_parameters=model_parameters,
1381                usage_details=usage_details,
1382                cost_details=cost_details,
1383                prompt=prompt,
1384            )
1385
1386    def update_current_span(
1387        self,
1388        *,
1389        name: Optional[str] = None,
1390        input: Optional[Any] = None,
1391        output: Optional[Any] = None,
1392        metadata: Optional[Any] = None,
1393        version: Optional[str] = None,
1394        level: Optional[SpanLevel] = None,
1395        status_message: Optional[str] = None,
1396    ) -> None:
1397        """Update the current active span with new information.
1398
1399        This method updates the current span in the active context with
1400        additional information. It's useful for adding outputs or metadata
1401        that become available during execution.
1402
1403        Args:
1404            name: The span name
1405            input: Updated input data for the operation
1406            output: Output data from the operation
1407            metadata: Additional metadata to associate with the span
1408            version: Version identifier for the code or component
1409            level: Importance level of the span (info, warning, error)
1410            status_message: Optional status message for the span
1411
1412        Example:
1413            ```python
1414            with langfuse.start_as_current_observation(name="process-data") as span:
1415                # Initial processing
1416                result = process_first_part()
1417
1418                # Update with intermediate results
1419                langfuse.update_current_span(metadata={"intermediate_result": result})
1420
1421                # Continue processing
1422                final_result = process_second_part(result)
1423
1424                # Final update
1425                langfuse.update_current_span(output=final_result)
1426            ```
1427        """
1428        if not self._tracing_enabled:
1429            langfuse_logger.debug(
1430                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1431            )
1432            return
1433
1434        current_otel_span = self._get_current_otel_span()
1435
1436        if current_otel_span is not None:
1437            span_class = self._get_span_class(
1438                self._get_observation_type_from_otel_span(current_otel_span)
1439            )
1440            span = span_class(
1441                otel_span=current_otel_span,
1442                langfuse_client=self,
1443                environment=self._environment,
1444                release=self._release,
1445            )
1446
1447            if name:
1448                current_otel_span.update_name(name)
1449
1450            span.update(
1451                input=input,
1452                output=output,
1453                metadata=metadata,
1454                version=version,
1455                level=level,
1456                status_message=status_message,
1457            )
1458
1459    @deprecated(
1460        "Trace-level input/output is deprecated. "
1461        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1462        "This method will be removed in a future major version."
1463    )
1464    def set_current_trace_io(
1465        self,
1466        *,
1467        input: Optional[Any] = None,
1468        output: Optional[Any] = None,
1469    ) -> None:
1470        """Set trace-level input and output for the current span's trace.
1471
1472        .. deprecated::
1473            This is a legacy method for backward compatibility with Langfuse platform
1474            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1475            evaluators). It will be removed in a future major version.
1476
1477            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1478            use :meth:`propagate_attributes` instead.
1479
1480        Args:
1481            input: Input data to associate with the trace.
1482            output: Output data to associate with the trace.
1483        """
1484        if not self._tracing_enabled:
1485            langfuse_logger.debug(
1486                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1487            )
1488            return
1489
1490        current_otel_span = self._get_current_otel_span()
1491
1492        if current_otel_span is not None and current_otel_span.is_recording():
1493            span_class = self._get_span_class(
1494                self._get_observation_type_from_otel_span(current_otel_span)
1495            )
1496            span = span_class(
1497                otel_span=current_otel_span,
1498                langfuse_client=self,
1499                environment=self._environment,
1500                release=self._release,
1501            )
1502
1503            span.set_trace_io(
1504                input=input,
1505                output=output,
1506            )
1507
1508    def set_current_trace_as_public(self) -> None:
1509        """Make the current trace publicly accessible via its URL.
1510
1511        When a trace is published, anyone with the trace link can view the full trace
1512        without needing to be logged in to Langfuse. This action cannot be undone
1513        programmatically - once published, the entire trace becomes public.
1514
1515        This is a convenience method that publishes the trace from the currently
1516        active span context. Use this when you want to make a trace public from
1517        within a traced function without needing direct access to the span object.
1518        """
1519        if not self._tracing_enabled:
1520            langfuse_logger.debug(
1521                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1522            )
1523            return
1524
1525        current_otel_span = self._get_current_otel_span()
1526
1527        if current_otel_span is not None and current_otel_span.is_recording():
1528            span_class = self._get_span_class(
1529                self._get_observation_type_from_otel_span(current_otel_span)
1530            )
1531            span = span_class(
1532                otel_span=current_otel_span,
1533                langfuse_client=self,
1534                environment=self._environment,
1535            )
1536
1537            span.set_trace_as_public()
1538
1539    def create_event(
1540        self,
1541        *,
1542        trace_context: Optional[TraceContext] = None,
1543        name: str,
1544        input: Optional[Any] = None,
1545        output: Optional[Any] = None,
1546        metadata: Optional[Any] = None,
1547        version: Optional[str] = None,
1548        level: Optional[SpanLevel] = None,
1549        status_message: Optional[str] = None,
1550    ) -> LangfuseEvent:
1551        """Create a new Langfuse observation of type 'EVENT'.
1552
1553        The created Langfuse Event observation will be the child of the current span in the context.
1554
1555        Args:
1556            trace_context: Optional context for connecting to an existing trace
1557            name: Name of the span (e.g., function or operation name)
1558            input: Input data for the operation (can be any JSON-serializable object)
1559            output: Output data from the operation (can be any JSON-serializable object)
1560            metadata: Additional metadata to associate with the span
1561            version: Version identifier for the code or component
1562            level: Importance level of the span (info, warning, error)
1563            status_message: Optional status message for the span
1564
1565        Returns:
1566            The Langfuse Event object
1567
1568        Example:
1569            ```python
1570            event = langfuse.create_event(name="process-event")
1571            ```
1572        """
1573        timestamp = time_ns()
1574
1575        if trace_context:
1576            trace_id = trace_context.get("trace_id", None)
1577            parent_span_id = trace_context.get("parent_span_id", None)
1578
1579            if trace_id:
1580                remote_parent_span = self._create_remote_parent_span(
1581                    trace_id=trace_id, parent_span_id=parent_span_id
1582                )
1583
1584                with otel_trace_api.use_span(
1585                    cast(otel_trace_api.Span, remote_parent_span)
1586                ):
1587                    otel_span = self._otel_tracer.start_span(
1588                        name=name, start_time=timestamp
1589                    )
1590                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1591
1592                    return cast(
1593                        LangfuseEvent,
1594                        LangfuseEvent(
1595                            otel_span=otel_span,
1596                            langfuse_client=self,
1597                            environment=self._environment,
1598                            release=self._release,
1599                            input=input,
1600                            output=output,
1601                            metadata=metadata,
1602                            version=version,
1603                            level=level,
1604                            status_message=status_message,
1605                        ).end(end_time=timestamp),
1606                    )
1607
1608        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1609
1610        return cast(
1611            LangfuseEvent,
1612            LangfuseEvent(
1613                otel_span=otel_span,
1614                langfuse_client=self,
1615                environment=self._environment,
1616                release=self._release,
1617                input=input,
1618                output=output,
1619                metadata=metadata,
1620                version=version,
1621                level=level,
1622                status_message=status_message,
1623            ).end(end_time=timestamp),
1624        )
1625
1626    def _create_remote_parent_span(
1627        self, *, trace_id: str, parent_span_id: Optional[str]
1628    ) -> Any:
1629        if not self._is_valid_trace_id(trace_id):
1630            langfuse_logger.warning(
1631                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1632            )
1633
1634        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1635            langfuse_logger.warning(
1636                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1637            )
1638
1639        int_trace_id = int(trace_id, 16)
1640        int_parent_span_id = (
1641            int(parent_span_id, 16)
1642            if parent_span_id
1643            else RandomIdGenerator().generate_span_id()
1644        )
1645
1646        span_context = otel_trace_api.SpanContext(
1647            trace_id=int_trace_id,
1648            span_id=int_parent_span_id,
1649            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1650            is_remote=False,
1651        )
1652
1653        return otel_trace_api.NonRecordingSpan(span_context)
1654
1655    def _is_valid_trace_id(self, trace_id: str) -> bool:
1656        pattern = r"^[0-9a-f]{32}$"
1657
1658        return bool(re.match(pattern, trace_id))
1659
1660    def _is_valid_span_id(self, span_id: str) -> bool:
1661        pattern = r"^[0-9a-f]{16}$"
1662
1663        return bool(re.match(pattern, span_id))
1664
1665    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1666        """Create a unique observation ID for use with Langfuse.
1667
1668        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1669        for use with various Langfuse APIs. It can either generate a random ID or
1670        create a deterministic ID based on a seed string.
1671
1672        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1673        This method ensures the generated ID meets this requirement. If you need to
1674        correlate an external ID with a Langfuse observation ID, use the external ID as
1675        the seed to get a valid, deterministic observation ID.
1676
1677        Args:
1678            seed: Optional string to use as a seed for deterministic ID generation.
1679                 If provided, the same seed will always produce the same ID.
1680                 If not provided, a random ID will be generated.
1681
1682        Returns:
1683            A 16-character lowercase hexadecimal string representing the observation ID.
1684
1685        Example:
1686            ```python
1687            # Generate a random observation ID
1688            obs_id = langfuse.create_observation_id()
1689
1690            # Generate a deterministic ID based on a seed
1691            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1692
1693            # Correlate an external item ID with a Langfuse observation ID
1694            item_id = "item-789012"
1695            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1696
1697            # Use the ID with Langfuse APIs
1698            langfuse.create_score(
1699                name="relevance",
1700                value=0.95,
1701                trace_id=trace_id,
1702                observation_id=obs_id
1703            )
1704            ```
1705        """
1706        if not seed:
1707            span_id_int = RandomIdGenerator().generate_span_id()
1708
1709            return self._format_otel_span_id(span_id_int)
1710
1711        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1712
1713    @staticmethod
1714    def create_trace_id(*, seed: Optional[str] = None) -> str:
1715        """Create a unique trace ID for use with Langfuse.
1716
1717        This method generates a unique trace ID for use with various Langfuse APIs.
1718        It can either generate a random ID or create a deterministic ID based on
1719        a seed string.
1720
1721        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1722        This method ensures the generated ID meets this requirement. If you need to
1723        correlate an external ID with a Langfuse trace ID, use the external ID as the
1724        seed to get a valid, deterministic Langfuse trace ID.
1725
1726        Args:
1727            seed: Optional string to use as a seed for deterministic ID generation.
1728                 If provided, the same seed will always produce the same ID.
1729                 If not provided, a random ID will be generated.
1730
1731        Returns:
1732            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1733
1734        Example:
1735            ```python
1736            # Generate a random trace ID
1737            trace_id = langfuse.create_trace_id()
1738
1739            # Generate a deterministic ID based on a seed
1740            session_trace_id = langfuse.create_trace_id(seed="session-456")
1741
1742            # Correlate an external ID with a Langfuse trace ID
1743            external_id = "external-system-123456"
1744            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1745
1746            # Use the ID with trace context
1747            with langfuse.start_as_current_observation(
1748                name="process-request",
1749                trace_context={"trace_id": trace_id}
1750            ) as span:
1751                # Operation will be part of the specific trace
1752                pass
1753            ```
1754        """
1755        if not seed:
1756            trace_id_int = RandomIdGenerator().generate_trace_id()
1757
1758            return Langfuse._format_otel_trace_id(trace_id_int)
1759
1760        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1761
1762    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1763        span_context = otel_span.get_span_context()
1764
1765        return self._format_otel_trace_id(span_context.trace_id)
1766
1767    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1768        span_context = otel_span.get_span_context()
1769
1770        return self._format_otel_span_id(span_context.span_id)
1771
1772    @staticmethod
1773    def _format_otel_span_id(span_id_int: int) -> str:
1774        """Format an integer span ID to a 16-character lowercase hex string.
1775
1776        Internal method to convert an OpenTelemetry integer span ID to the standard
1777        W3C Trace Context format (16-character lowercase hex string).
1778
1779        Args:
1780            span_id_int: 64-bit integer representing a span ID
1781
1782        Returns:
1783            A 16-character lowercase hexadecimal string
1784        """
1785        return format(span_id_int, "016x")
1786
1787    @staticmethod
1788    def _format_otel_trace_id(trace_id_int: int) -> str:
1789        """Format an integer trace ID to a 32-character lowercase hex string.
1790
1791        Internal method to convert an OpenTelemetry integer trace ID to the standard
1792        W3C Trace Context format (32-character lowercase hex string).
1793
1794        Args:
1795            trace_id_int: 128-bit integer representing a trace ID
1796
1797        Returns:
1798            A 32-character lowercase hexadecimal string
1799        """
1800        return format(trace_id_int, "032x")
1801
1802    @overload
1803    def create_score(
1804        self,
1805        *,
1806        name: str,
1807        value: float,
1808        session_id: Optional[str] = None,
1809        dataset_run_id: Optional[str] = None,
1810        trace_id: Optional[str] = None,
1811        observation_id: Optional[str] = None,
1812        score_id: Optional[str] = None,
1813        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1814        comment: Optional[str] = None,
1815        config_id: Optional[str] = None,
1816        metadata: Optional[Any] = None,
1817        timestamp: Optional[datetime] = None,
1818    ) -> None: ...
1819
1820    @overload
1821    def create_score(
1822        self,
1823        *,
1824        name: str,
1825        value: str,
1826        session_id: Optional[str] = None,
1827        dataset_run_id: Optional[str] = None,
1828        trace_id: Optional[str] = None,
1829        score_id: Optional[str] = None,
1830        observation_id: Optional[str] = None,
1831        data_type: Optional[
1832            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
1833        ] = "CATEGORICAL",
1834        comment: Optional[str] = None,
1835        config_id: Optional[str] = None,
1836        metadata: Optional[Any] = None,
1837        timestamp: Optional[datetime] = None,
1838    ) -> None: ...
1839
1840    def create_score(
1841        self,
1842        *,
1843        name: str,
1844        value: Union[float, str],
1845        session_id: Optional[str] = None,
1846        dataset_run_id: Optional[str] = None,
1847        trace_id: Optional[str] = None,
1848        observation_id: Optional[str] = None,
1849        score_id: Optional[str] = None,
1850        data_type: Optional[ScoreDataType] = None,
1851        comment: Optional[str] = None,
1852        config_id: Optional[str] = None,
1853        metadata: Optional[Any] = None,
1854        timestamp: Optional[datetime] = None,
1855    ) -> None:
1856        """Create a score for a specific trace or observation.
1857
1858        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1859        used to track quality metrics, user feedback, or automated evaluations.
1860
1861        Args:
1862            name: Name of the score (e.g., "relevance", "accuracy")
1863            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1864            session_id: ID of the Langfuse session to associate the score with
1865            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1866            trace_id: ID of the Langfuse trace to associate the score with
1867            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1868            score_id: Optional custom ID for the score (auto-generated if not provided)
1869            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1870            comment: Optional comment or explanation for the score
1871            config_id: Optional ID of a score config defined in Langfuse
1872            metadata: Optional metadata to be attached to the score
1873            timestamp: Optional timestamp for the score (defaults to current UTC time)
1874
1875        Example:
1876            ```python
1877            # Create a numeric score for accuracy
1878            langfuse.create_score(
1879                name="accuracy",
1880                value=0.92,
1881                trace_id="abcdef1234567890abcdef1234567890",
1882                data_type="NUMERIC",
1883                comment="High accuracy with minor irrelevant details"
1884            )
1885
1886            # Create a categorical score for sentiment
1887            langfuse.create_score(
1888                name="sentiment",
1889                value="positive",
1890                trace_id="abcdef1234567890abcdef1234567890",
1891                observation_id="abcdef1234567890",
1892                data_type="CATEGORICAL"
1893            )
1894            ```
1895        """
1896        if not self._tracing_enabled:
1897            return
1898
1899        score_id = score_id or self._create_observation_id()
1900
1901        try:
1902            new_body = ScoreBody(
1903                id=score_id,
1904                sessionId=session_id,
1905                datasetRunId=dataset_run_id,
1906                traceId=trace_id,
1907                observationId=observation_id,
1908                name=name,
1909                value=value,
1910                dataType=data_type,  # type: ignore
1911                comment=comment,
1912                configId=config_id,
1913                environment=self._environment,
1914                metadata=metadata,
1915            )
1916
1917            event = {
1918                "id": self.create_trace_id(),
1919                "type": "score-create",
1920                "timestamp": timestamp or _get_timestamp(),
1921                "body": new_body,
1922            }
1923
1924            if self._resources is not None:
1925                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1926                force_sample = (
1927                    not self._is_valid_trace_id(trace_id) if trace_id else True
1928                )
1929
1930                self._resources.add_score_task(
1931                    event,
1932                    force_sample=force_sample,
1933                )
1934
1935        except Exception as e:
1936            langfuse_logger.exception(
1937                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1938            )
1939
1940    def _create_trace_tags_via_ingestion(
1941        self,
1942        *,
1943        trace_id: str,
1944        tags: List[str],
1945    ) -> None:
1946        """Private helper to enqueue trace tag updates via ingestion API events."""
1947        if not self._tracing_enabled:
1948            return
1949
1950        if len(tags) == 0:
1951            return
1952
1953        try:
1954            new_body = TraceBody(
1955                id=trace_id,
1956                tags=tags,
1957            )
1958
1959            event = {
1960                "id": self.create_trace_id(),
1961                "type": "trace-create",
1962                "timestamp": _get_timestamp(),
1963                "body": new_body,
1964            }
1965
1966            if self._resources is not None:
1967                self._resources.add_trace_task(event)
1968        except Exception as e:
1969            langfuse_logger.exception(
1970                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1971            )
1972
1973    @overload
1974    def score_current_span(
1975        self,
1976        *,
1977        name: str,
1978        value: float,
1979        score_id: Optional[str] = None,
1980        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1981        comment: Optional[str] = None,
1982        config_id: Optional[str] = None,
1983        metadata: Optional[Any] = None,
1984    ) -> None: ...
1985
1986    @overload
1987    def score_current_span(
1988        self,
1989        *,
1990        name: str,
1991        value: str,
1992        score_id: Optional[str] = None,
1993        data_type: Optional[
1994            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
1995        ] = "CATEGORICAL",
1996        comment: Optional[str] = None,
1997        config_id: Optional[str] = None,
1998        metadata: Optional[Any] = None,
1999    ) -> None: ...
2000
2001    def score_current_span(
2002        self,
2003        *,
2004        name: str,
2005        value: Union[float, str],
2006        score_id: Optional[str] = None,
2007        data_type: Optional[ScoreDataType] = None,
2008        comment: Optional[str] = None,
2009        config_id: Optional[str] = None,
2010        metadata: Optional[Any] = None,
2011    ) -> None:
2012        """Create a score for the current active span.
2013
2014        This method scores the currently active span in the context. It's a convenient
2015        way to score the current operation without needing to know its trace and span IDs.
2016
2017        Args:
2018            name: Name of the score (e.g., "relevance", "accuracy")
2019            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2020            score_id: Optional custom ID for the score (auto-generated if not provided)
2021            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2022            comment: Optional comment or explanation for the score
2023            config_id: Optional ID of a score config defined in Langfuse
2024            metadata: Optional metadata to be attached to the score
2025
2026        Example:
2027            ```python
2028            with langfuse.start_as_current_generation(name="answer-query") as generation:
2029                # Generate answer
2030                response = generate_answer(...)
2031                generation.update(output=response)
2032
2033                # Score the generation
2034                langfuse.score_current_span(
2035                    name="relevance",
2036                    value=0.85,
2037                    data_type="NUMERIC",
2038                    comment="Mostly relevant but contains some tangential information",
2039                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2040                )
2041            ```
2042        """
2043        current_span = self._get_current_otel_span()
2044
2045        if current_span is not None:
2046            trace_id = self._get_otel_trace_id(current_span)
2047            observation_id = self._get_otel_span_id(current_span)
2048
2049            langfuse_logger.info(
2050                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2051            )
2052
2053            self.create_score(
2054                trace_id=trace_id,
2055                observation_id=observation_id,
2056                name=name,
2057                value=cast(str, value),
2058                score_id=score_id,
2059                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2060                comment=comment,
2061                config_id=config_id,
2062                metadata=metadata,
2063            )
2064
2065    @overload
2066    def score_current_trace(
2067        self,
2068        *,
2069        name: str,
2070        value: float,
2071        score_id: Optional[str] = None,
2072        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2073        comment: Optional[str] = None,
2074        config_id: Optional[str] = None,
2075        metadata: Optional[Any] = None,
2076    ) -> None: ...
2077
2078    @overload
2079    def score_current_trace(
2080        self,
2081        *,
2082        name: str,
2083        value: str,
2084        score_id: Optional[str] = None,
2085        data_type: Optional[
2086            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
2087        ] = "CATEGORICAL",
2088        comment: Optional[str] = None,
2089        config_id: Optional[str] = None,
2090        metadata: Optional[Any] = None,
2091    ) -> None: ...
2092
2093    def score_current_trace(
2094        self,
2095        *,
2096        name: str,
2097        value: Union[float, str],
2098        score_id: Optional[str] = None,
2099        data_type: Optional[ScoreDataType] = None,
2100        comment: Optional[str] = None,
2101        config_id: Optional[str] = None,
2102        metadata: Optional[Any] = None,
2103    ) -> None:
2104        """Create a score for the current trace.
2105
2106        This method scores the trace of the currently active span. Unlike score_current_span,
2107        this method associates the score with the entire trace rather than a specific span.
2108        It's useful for scoring overall performance or quality of the entire operation.
2109
2110        Args:
2111            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2112            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2113            score_id: Optional custom ID for the score (auto-generated if not provided)
2114            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2115            comment: Optional comment or explanation for the score
2116            config_id: Optional ID of a score config defined in Langfuse
2117            metadata: Optional metadata to be attached to the score
2118
2119        Example:
2120            ```python
2121            with langfuse.start_as_current_observation(name="process-user-request") as span:
2122                # Process request
2123                result = process_complete_request()
2124                span.update(output=result)
2125
2126                # Score the overall trace
2127                langfuse.score_current_trace(
2128                    name="overall_quality",
2129                    value=0.95,
2130                    data_type="NUMERIC",
2131                    comment="High quality end-to-end response",
2132                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2133                )
2134            ```
2135        """
2136        current_span = self._get_current_otel_span()
2137
2138        if current_span is not None:
2139            trace_id = self._get_otel_trace_id(current_span)
2140
2141            langfuse_logger.info(
2142                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2143            )
2144
2145            self.create_score(
2146                trace_id=trace_id,
2147                name=name,
2148                value=cast(str, value),
2149                score_id=score_id,
2150                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2151                comment=comment,
2152                config_id=config_id,
2153                metadata=metadata,
2154            )
2155
2156    def flush(self) -> None:
2157        """Force flush all pending spans and events to the Langfuse API.
2158
2159        This method manually flushes any pending spans, scores, and other events to the
2160        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2161        before proceeding, without waiting for the automatic flush interval.
2162
2163        Example:
2164            ```python
2165            # Record some spans and scores
2166            with langfuse.start_as_current_observation(name="operation") as span:
2167                # Do work...
2168                pass
2169
2170            # Ensure all data is sent to Langfuse before proceeding
2171            langfuse.flush()
2172
2173            # Continue with other work
2174            ```
2175        """
2176        if self._resources is not None:
2177            self._resources.flush()
2178
2179    def shutdown(self) -> None:
2180        """Shut down the Langfuse client and flush all pending data.
2181
2182        This method cleanly shuts down the Langfuse client, ensuring all pending data
2183        is flushed to the API and all background threads are properly terminated.
2184
2185        It's important to call this method when your application is shutting down to
2186        prevent data loss and resource leaks. For most applications, using the client
2187        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2188
2189        Example:
2190            ```python
2191            # Initialize Langfuse
2192            langfuse = Langfuse(public_key="...", secret_key="...")
2193
2194            # Use Langfuse throughout your application
2195            # ...
2196
2197            # When application is shutting down
2198            langfuse.shutdown()
2199            ```
2200        """
2201        if self._resources is not None:
2202            self._resources.shutdown()
2203
2204    def get_current_trace_id(self) -> Optional[str]:
2205        """Get the trace ID of the current active span.
2206
2207        This method retrieves the trace ID from the currently active span in the context.
2208        It can be used to get the trace ID for referencing in logs, external systems,
2209        or for creating related operations.
2210
2211        Returns:
2212            The current trace ID as a 32-character lowercase hexadecimal string,
2213            or None if there is no active span.
2214
2215        Example:
2216            ```python
2217            with langfuse.start_as_current_observation(name="process-request") as span:
2218                # Get the current trace ID for reference
2219                trace_id = langfuse.get_current_trace_id()
2220
2221                # Use it for external correlation
2222                log.info(f"Processing request with trace_id: {trace_id}")
2223
2224                # Or pass to another system
2225                external_system.process(data, trace_id=trace_id)
2226            ```
2227        """
2228        if not self._tracing_enabled:
2229            langfuse_logger.debug(
2230                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2231            )
2232            return None
2233
2234        current_otel_span = self._get_current_otel_span()
2235
2236        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2237
2238    def get_current_observation_id(self) -> Optional[str]:
2239        """Get the observation ID (span ID) of the current active span.
2240
2241        This method retrieves the observation ID from the currently active span in the context.
2242        It can be used to get the observation ID for referencing in logs, external systems,
2243        or for creating scores or other related operations.
2244
2245        Returns:
2246            The current observation ID as a 16-character lowercase hexadecimal string,
2247            or None if there is no active span.
2248
2249        Example:
2250            ```python
2251            with langfuse.start_as_current_observation(name="process-user-query") as span:
2252                # Get the current observation ID
2253                observation_id = langfuse.get_current_observation_id()
2254
2255                # Store it for later reference
2256                cache.set(f"query_{query_id}_observation", observation_id)
2257
2258                # Process the query...
2259            ```
2260        """
2261        if not self._tracing_enabled:
2262            langfuse_logger.debug(
2263                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2264            )
2265            return None
2266
2267        current_otel_span = self._get_current_otel_span()
2268
2269        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2270
2271    def _get_project_id(self) -> Optional[str]:
2272        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2273        if not self._project_id:
2274            proj = self.api.projects.get()
2275            if not proj.data or not proj.data[0].id:
2276                return None
2277
2278            self._project_id = proj.data[0].id
2279
2280        return self._project_id
2281
2282    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2283        """Get the URL to view a trace in the Langfuse UI.
2284
2285        This method generates a URL that links directly to a trace in the Langfuse UI.
2286        It's useful for providing links in logs, notifications, or debugging tools.
2287
2288        Args:
2289            trace_id: Optional trace ID to generate a URL for. If not provided,
2290                     the trace ID of the current active span will be used.
2291
2292        Returns:
2293            A URL string pointing to the trace in the Langfuse UI,
2294            or None if the project ID couldn't be retrieved or no trace ID is available.
2295
2296        Example:
2297            ```python
2298            # Get URL for the current trace
2299            with langfuse.start_as_current_observation(name="process-request") as span:
2300                trace_url = langfuse.get_trace_url()
2301                log.info(f"Processing trace: {trace_url}")
2302
2303            # Get URL for a specific trace
2304            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2305            send_notification(f"Review needed for trace: {specific_trace_url}")
2306            ```
2307        """
2308        final_trace_id = trace_id or self.get_current_trace_id()
2309        if not final_trace_id:
2310            return None
2311
2312        project_id = self._get_project_id()
2313
2314        return (
2315            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2316            if project_id and final_trace_id
2317            else None
2318        )
2319
2320    def get_dataset(
2321        self,
2322        name: str,
2323        *,
2324        fetch_items_page_size: Optional[int] = 50,
2325        version: Optional[datetime] = None,
2326    ) -> "DatasetClient":
2327        """Fetch a dataset by its name.
2328
2329        Args:
2330            name (str): The name of the dataset to fetch.
2331            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2332            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2333                If provided, returns the state of items at the specified UTC timestamp.
2334                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2335
2336        Returns:
2337            DatasetClient: The dataset with the given name.
2338        """
2339        try:
2340            langfuse_logger.debug(f"Getting datasets {name}")
2341            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2342
2343            dataset_items = []
2344            page = 1
2345
2346            while True:
2347                new_items = self.api.dataset_items.list(
2348                    dataset_name=self._url_encode(name, is_url_param=True),
2349                    page=page,
2350                    limit=fetch_items_page_size,
2351                    version=version,
2352                )
2353                dataset_items.extend(new_items.data)
2354
2355                if new_items.meta.total_pages <= page:
2356                    break
2357
2358                page += 1
2359
2360            return DatasetClient(
2361                dataset=dataset,
2362                items=dataset_items,
2363                version=version,
2364                langfuse_client=self,
2365            )
2366
2367        except Error as e:
2368            handle_fern_exception(e)
2369            raise e
2370
2371    def get_dataset_run(
2372        self, *, dataset_name: str, run_name: str
2373    ) -> DatasetRunWithItems:
2374        """Fetch a dataset run by dataset name and run name.
2375
2376        Args:
2377            dataset_name (str): The name of the dataset.
2378            run_name (str): The name of the run.
2379
2380        Returns:
2381            DatasetRunWithItems: The dataset run with its items.
2382        """
2383        try:
2384            return cast(
2385                DatasetRunWithItems,
2386                self.api.datasets.get_run(
2387                    dataset_name=self._url_encode(dataset_name),
2388                    run_name=self._url_encode(run_name),
2389                    request_options=None,
2390                ),
2391            )
2392        except Error as e:
2393            handle_fern_exception(e)
2394            raise e
2395
2396    def get_dataset_runs(
2397        self,
2398        *,
2399        dataset_name: str,
2400        page: Optional[int] = None,
2401        limit: Optional[int] = None,
2402    ) -> PaginatedDatasetRuns:
2403        """Fetch all runs for a dataset.
2404
2405        Args:
2406            dataset_name (str): The name of the dataset.
2407            page (Optional[int]): Page number, starts at 1.
2408            limit (Optional[int]): Limit of items per page.
2409
2410        Returns:
2411            PaginatedDatasetRuns: Paginated list of dataset runs.
2412        """
2413        try:
2414            return cast(
2415                PaginatedDatasetRuns,
2416                self.api.datasets.get_runs(
2417                    dataset_name=self._url_encode(dataset_name),
2418                    page=page,
2419                    limit=limit,
2420                    request_options=None,
2421                ),
2422            )
2423        except Error as e:
2424            handle_fern_exception(e)
2425            raise e
2426
2427    def delete_dataset_run(
2428        self, *, dataset_name: str, run_name: str
2429    ) -> DeleteDatasetRunResponse:
2430        """Delete a dataset run and all its run items. This action is irreversible.
2431
2432        Args:
2433            dataset_name (str): The name of the dataset.
2434            run_name (str): The name of the run.
2435
2436        Returns:
2437            DeleteDatasetRunResponse: Confirmation of deletion.
2438        """
2439        try:
2440            return cast(
2441                DeleteDatasetRunResponse,
2442                self.api.datasets.delete_run(
2443                    dataset_name=self._url_encode(dataset_name),
2444                    run_name=self._url_encode(run_name),
2445                    request_options=None,
2446                ),
2447            )
2448        except Error as e:
2449            handle_fern_exception(e)
2450            raise e
2451
2452    def run_experiment(
2453        self,
2454        *,
2455        name: str,
2456        run_name: Optional[str] = None,
2457        description: Optional[str] = None,
2458        data: ExperimentData,
2459        task: TaskFunction,
2460        evaluators: List[EvaluatorFunction] = [],
2461        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2462        run_evaluators: List[RunEvaluatorFunction] = [],
2463        max_concurrency: int = 50,
2464        metadata: Optional[Dict[str, str]] = None,
2465        _dataset_version: Optional[datetime] = None,
2466    ) -> ExperimentResult:
2467        """Run an experiment on a dataset with automatic tracing and evaluation.
2468
2469        This method executes a task function on each item in the provided dataset,
2470        automatically traces all executions with Langfuse for observability, runs
2471        item-level and run-level evaluators on the outputs, and returns comprehensive
2472        results with evaluation metrics.
2473
2474        The experiment system provides:
2475        - Automatic tracing of all task executions
2476        - Concurrent processing with configurable limits
2477        - Comprehensive error handling that isolates failures
2478        - Integration with Langfuse datasets for experiment tracking
2479        - Flexible evaluation framework supporting both sync and async evaluators
2480
2481        Args:
2482            name: Human-readable name for the experiment. Used for identification
2483                in the Langfuse UI.
2484            run_name: Optional exact name for the experiment run. If provided, this will be
2485                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2486                If not provided, this will default to the experiment name appended with an ISO timestamp.
2487            description: Optional description explaining the experiment's purpose,
2488                methodology, or expected outcomes.
2489            data: Array of data items to process. Can be either:
2490                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2491                - List of Langfuse DatasetItem objects from dataset.items
2492            task: Function that processes each data item and returns output.
2493                Must accept 'item' as keyword argument and can return sync or async results.
2494                The task function signature should be: task(*, item, **kwargs) -> Any
2495            evaluators: List of functions to evaluate each item's output individually.
2496                Each evaluator receives input, output, expected_output, and metadata.
2497                Can return single Evaluation dict or list of Evaluation dicts.
2498            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2499                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2500                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2501                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2502            run_evaluators: List of functions to evaluate the entire experiment run.
2503                Each run evaluator receives all item_results and can compute aggregate metrics.
2504                Useful for calculating averages, distributions, or cross-item comparisons.
2505            max_concurrency: Maximum number of concurrent task executions (default: 50).
2506                Controls the number of items processed simultaneously. Adjust based on
2507                API rate limits and system resources.
2508            metadata: Optional metadata dictionary to attach to all experiment traces.
2509                This metadata will be included in every trace created during the experiment.
2510                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2511
2512        Returns:
2513            ExperimentResult containing:
2514            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2515            - item_results: List of results for each processed item with outputs and evaluations
2516            - run_evaluations: List of aggregate evaluation results for the entire run
2517            - experiment_id: Stable identifier for the experiment run across all items
2518            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2519            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2520
2521        Raises:
2522            ValueError: If required parameters are missing or invalid
2523            Exception: If experiment setup fails (individual item failures are handled gracefully)
2524
2525        Examples:
2526            Basic experiment with local data:
2527            ```python
2528            def summarize_text(*, item, **kwargs):
2529                return f"Summary: {item['input'][:50]}..."
2530
2531            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2532                return {
2533                    "name": "output_length",
2534                    "value": len(output),
2535                    "comment": f"Output contains {len(output)} characters"
2536                }
2537
2538            result = langfuse.run_experiment(
2539                name="Text Summarization Test",
2540                description="Evaluate summarization quality and length",
2541                data=[
2542                    {"input": "Long article text...", "expected_output": "Expected summary"},
2543                    {"input": "Another article...", "expected_output": "Another summary"}
2544                ],
2545                task=summarize_text,
2546                evaluators=[length_evaluator]
2547            )
2548
2549            print(f"Processed {len(result.item_results)} items")
2550            for item_result in result.item_results:
2551                print(f"Input: {item_result.item['input']}")
2552                print(f"Output: {item_result.output}")
2553                print(f"Evaluations: {item_result.evaluations}")
2554            ```
2555
2556            Advanced experiment with async task and multiple evaluators:
2557            ```python
2558            async def llm_task(*, item, **kwargs):
2559                # Simulate async LLM call
2560                response = await openai_client.chat.completions.create(
2561                    model="gpt-4",
2562                    messages=[{"role": "user", "content": item["input"]}]
2563                )
2564                return response.choices[0].message.content
2565
2566            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2567                if expected_output and expected_output.lower() in output.lower():
2568                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2569                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2570
2571            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2572                # Simulate toxicity check
2573                toxicity_score = check_toxicity(output)  # Your toxicity checker
2574                return {
2575                    "name": "toxicity",
2576                    "value": toxicity_score,
2577                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2578                }
2579
2580            def average_accuracy(*, item_results, **kwargs):
2581                accuracies = [
2582                    eval.value for result in item_results
2583                    for eval in result.evaluations
2584                    if eval.name == "accuracy"
2585                ]
2586                return {
2587                    "name": "average_accuracy",
2588                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2589                    "comment": f"Average accuracy across {len(accuracies)} items"
2590                }
2591
2592            result = langfuse.run_experiment(
2593                name="LLM Safety and Accuracy Test",
2594                description="Evaluate model accuracy and safety across diverse prompts",
2595                data=test_dataset,  # Your dataset items
2596                task=llm_task,
2597                evaluators=[accuracy_evaluator, toxicity_evaluator],
2598                run_evaluators=[average_accuracy],
2599                max_concurrency=5,  # Limit concurrent API calls
2600                metadata={"model": "gpt-4", "temperature": 0.7}
2601            )
2602            ```
2603
2604            Using with Langfuse datasets:
2605            ```python
2606            # Get dataset from Langfuse
2607            dataset = langfuse.get_dataset("my-eval-dataset")
2608
2609            result = dataset.run_experiment(
2610                name="Production Model Evaluation",
2611                description="Monthly evaluation of production model performance",
2612                task=my_production_task,
2613                evaluators=[accuracy_evaluator, latency_evaluator]
2614            )
2615
2616            # Results automatically linked to dataset in Langfuse UI
2617            print(f"View results: {result['dataset_run_url']}")
2618            ```
2619
2620        Note:
2621            - Task and evaluator functions can be either synchronous or asynchronous
2622            - Individual item failures are logged but don't stop the experiment
2623            - All executions are automatically traced and visible in Langfuse UI
2624            - When using Langfuse datasets, results are automatically linked for easy comparison
2625            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2626            - Async execution is handled automatically with smart event loop detection
2627        """
2628        return cast(
2629            ExperimentResult,
2630            run_async_safely(
2631                self._run_experiment_async(
2632                    name=name,
2633                    run_name=self._create_experiment_run_name(
2634                        name=name, run_name=run_name
2635                    ),
2636                    description=description,
2637                    data=data,
2638                    task=task,
2639                    evaluators=evaluators or [],
2640                    composite_evaluator=composite_evaluator,
2641                    run_evaluators=run_evaluators or [],
2642                    max_concurrency=max_concurrency,
2643                    metadata=metadata,
2644                    dataset_version=_dataset_version,
2645                ),
2646            ),
2647        )
2648
2649    async def _run_experiment_async(
2650        self,
2651        *,
2652        name: str,
2653        run_name: str,
2654        description: Optional[str],
2655        data: ExperimentData,
2656        task: TaskFunction,
2657        evaluators: List[EvaluatorFunction],
2658        composite_evaluator: Optional[CompositeEvaluatorFunction],
2659        run_evaluators: List[RunEvaluatorFunction],
2660        max_concurrency: int,
2661        metadata: Optional[Dict[str, Any]] = None,
2662        dataset_version: Optional[datetime] = None,
2663    ) -> ExperimentResult:
2664        langfuse_logger.debug(
2665            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2666        )
2667
2668        shared_fallback_experiment_id = self._create_observation_id()
2669
2670        # Set up concurrency control
2671        semaphore = asyncio.Semaphore(max_concurrency)
2672
2673        # Process all items
2674        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2675            async with semaphore:
2676                return await self._process_experiment_item(
2677                    item,
2678                    task,
2679                    evaluators,
2680                    composite_evaluator,
2681                    shared_fallback_experiment_id,
2682                    name,
2683                    run_name,
2684                    description,
2685                    metadata,
2686                    dataset_version,
2687                )
2688
2689        # Run all items concurrently
2690        tasks = [process_item(item) for item in data]
2691        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2692
2693        # Filter out any exceptions and log errors
2694        valid_results: List[ExperimentItemResult] = []
2695        for i, result in enumerate(item_results):
2696            if isinstance(result, Exception):
2697                langfuse_logger.error(f"Item {i} failed: {result}")
2698            elif isinstance(result, ExperimentItemResult):
2699                valid_results.append(result)  # type: ignore
2700
2701        # Run experiment-level evaluators
2702        run_evaluations: List[Evaluation] = []
2703        for run_evaluator in run_evaluators:
2704            try:
2705                evaluations = await _run_evaluator(
2706                    run_evaluator, item_results=valid_results
2707                )
2708                run_evaluations.extend(evaluations)
2709            except Exception as e:
2710                langfuse_logger.error(f"Run evaluator failed: {e}")
2711
2712        # Generate dataset run URL if applicable
2713        dataset_run_id = next(
2714            (
2715                result.dataset_run_id
2716                for result in valid_results
2717                if result.dataset_run_id
2718            ),
2719            None,
2720        )
2721        dataset_run_url = None
2722        if dataset_run_id and data:
2723            try:
2724                # Check if the first item has dataset_id (for DatasetItem objects)
2725                first_item = data[0]
2726                dataset_id = None
2727
2728                if hasattr(first_item, "dataset_id"):
2729                    dataset_id = getattr(first_item, "dataset_id", None)
2730
2731                if dataset_id:
2732                    project_id = self._get_project_id()
2733
2734                    if project_id:
2735                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2736
2737            except Exception:
2738                pass  # URL generation is optional
2739
2740        # Store run-level evaluations as scores
2741        for evaluation in run_evaluations:
2742            try:
2743                if dataset_run_id:
2744                    self.create_score(
2745                        dataset_run_id=dataset_run_id,
2746                        name=evaluation.name or "<unknown>",
2747                        value=evaluation.value,  # type: ignore
2748                        comment=evaluation.comment,
2749                        metadata=evaluation.metadata,
2750                        data_type=evaluation.data_type,  # type: ignore
2751                        config_id=evaluation.config_id,
2752                    )
2753
2754            except Exception as e:
2755                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2756
2757        # Flush scores and traces
2758        self.flush()
2759
2760        return ExperimentResult(
2761            name=name,
2762            run_name=run_name,
2763            description=description,
2764            item_results=valid_results,
2765            run_evaluations=run_evaluations,
2766            experiment_id=dataset_run_id or shared_fallback_experiment_id,
2767            dataset_run_id=dataset_run_id,
2768            dataset_run_url=dataset_run_url,
2769        )
2770
2771    async def _process_experiment_item(
2772        self,
2773        item: ExperimentItem,
2774        task: Callable,
2775        evaluators: List[Callable],
2776        composite_evaluator: Optional[CompositeEvaluatorFunction],
2777        fallback_experiment_id: str,
2778        experiment_name: str,
2779        experiment_run_name: str,
2780        experiment_description: Optional[str],
2781        experiment_metadata: Optional[Dict[str, Any]] = None,
2782        dataset_version: Optional[datetime] = None,
2783    ) -> ExperimentItemResult:
2784        span_name = "experiment-item-run"
2785
2786        with self.start_as_current_observation(name=span_name) as span:
2787            try:
2788                input_data = (
2789                    item.get("input")
2790                    if isinstance(item, dict)
2791                    else getattr(item, "input", None)
2792                )
2793
2794                if input_data is None:
2795                    raise ValueError("Experiment Item is missing input. Skipping item.")
2796
2797                expected_output = (
2798                    item.get("expected_output")
2799                    if isinstance(item, dict)
2800                    else getattr(item, "expected_output", None)
2801                )
2802
2803                item_metadata = (
2804                    item.get("metadata")
2805                    if isinstance(item, dict)
2806                    else getattr(item, "metadata", None)
2807                )
2808
2809                final_observation_metadata = {
2810                    "experiment_name": experiment_name,
2811                    "experiment_run_name": experiment_run_name,
2812                    **(experiment_metadata or {}),
2813                }
2814
2815                trace_id = span.trace_id
2816                dataset_id = None
2817                dataset_item_id = None
2818                dataset_run_id = None
2819
2820                # Link to dataset run if this is a dataset item
2821                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2822                    try:
2823                        # Use sync API to avoid event loop issues when run_async_safely
2824                        # creates multiple event loops across different threads
2825                        dataset_run_item = await asyncio.to_thread(
2826                            self.api.dataset_run_items.create,
2827                            run_name=experiment_run_name,
2828                            run_description=experiment_description,
2829                            metadata=experiment_metadata,
2830                            dataset_item_id=item.id,  # type: ignore
2831                            trace_id=trace_id,
2832                            observation_id=span.id,
2833                            dataset_version=dataset_version,
2834                        )
2835
2836                        dataset_run_id = dataset_run_item.dataset_run_id
2837
2838                    except Exception as e:
2839                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2840
2841                if (
2842                    not isinstance(item, dict)
2843                    and hasattr(item, "dataset_id")
2844                    and hasattr(item, "id")
2845                ):
2846                    dataset_id = item.dataset_id
2847                    dataset_item_id = item.id
2848
2849                    final_observation_metadata.update(
2850                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2851                    )
2852
2853                if isinstance(item_metadata, dict):
2854                    final_observation_metadata.update(item_metadata)
2855
2856                experiment_id = dataset_run_id or fallback_experiment_id
2857                experiment_item_id = (
2858                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2859                )
2860                span._otel_span.set_attributes(
2861                    {
2862                        k: v
2863                        for k, v in {
2864                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2865                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2866                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2867                                expected_output
2868                            ),
2869                        }.items()
2870                        if v is not None
2871                    }
2872                )
2873
2874                propagated_experiment_attributes = PropagatedExperimentAttributes(
2875                    experiment_id=experiment_id,
2876                    experiment_name=experiment_run_name,
2877                    experiment_metadata=_flatten_and_serialize_metadata_values(
2878                        experiment_metadata
2879                    ),
2880                    experiment_dataset_id=dataset_id,
2881                    experiment_item_id=experiment_item_id,
2882                    experiment_item_metadata=_flatten_and_serialize_metadata_values(
2883                        item_metadata if isinstance(item_metadata, dict) else None
2884                    ),
2885                    experiment_item_root_observation_id=span.id,
2886                )
2887
2888                with _propagate_attributes(experiment=propagated_experiment_attributes):
2889                    output = await _run_task(task, item)
2890
2891                span.update(
2892                    input=input_data,
2893                    output=output,
2894                    metadata=final_observation_metadata,
2895                )
2896
2897            except Exception as e:
2898                span.update(
2899                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2900                )
2901                raise e
2902
2903            # Run evaluators
2904            evaluations = []
2905
2906            for evaluator in evaluators:
2907                try:
2908                    eval_metadata: Optional[Dict[str, Any]] = None
2909
2910                    if isinstance(item, dict):
2911                        eval_metadata = item.get("metadata")
2912                    elif hasattr(item, "metadata"):
2913                        eval_metadata = item.metadata
2914
2915                    with _propagate_attributes(
2916                        experiment=propagated_experiment_attributes
2917                    ):
2918                        eval_results = await _run_evaluator(
2919                            evaluator,
2920                            input=input_data,
2921                            output=output,
2922                            expected_output=expected_output,
2923                            metadata=eval_metadata,
2924                        )
2925                        evaluations.extend(eval_results)
2926
2927                        # Store evaluations as scores
2928                        for evaluation in eval_results:
2929                            self.create_score(
2930                                trace_id=trace_id,
2931                                observation_id=span.id,
2932                                name=evaluation.name,
2933                                value=evaluation.value,  # type: ignore
2934                                comment=evaluation.comment,
2935                                metadata=evaluation.metadata,
2936                                config_id=evaluation.config_id,
2937                                data_type=evaluation.data_type,  # type: ignore
2938                            )
2939
2940                except Exception as e:
2941                    langfuse_logger.error(f"Evaluator failed: {e}")
2942
2943            # Run composite evaluator if provided and we have evaluations
2944            if composite_evaluator and evaluations:
2945                try:
2946                    composite_eval_metadata: Optional[Dict[str, Any]] = None
2947                    if isinstance(item, dict):
2948                        composite_eval_metadata = item.get("metadata")
2949                    elif hasattr(item, "metadata"):
2950                        composite_eval_metadata = item.metadata
2951
2952                    with _propagate_attributes(
2953                        experiment=propagated_experiment_attributes
2954                    ):
2955                        result = composite_evaluator(
2956                            input=input_data,
2957                            output=output,
2958                            expected_output=expected_output,
2959                            metadata=composite_eval_metadata,
2960                            evaluations=evaluations,
2961                        )
2962
2963                        # Handle async composite evaluators
2964                        if asyncio.iscoroutine(result):
2965                            result = await result
2966
2967                        # Normalize to list
2968                        composite_evals: List[Evaluation] = []
2969                        if isinstance(result, (dict, Evaluation)):
2970                            composite_evals = [result]  # type: ignore
2971                        elif isinstance(result, list):
2972                            composite_evals = result  # type: ignore
2973
2974                        # Store composite evaluations as scores and add to evaluations list
2975                        for composite_evaluation in composite_evals:
2976                            self.create_score(
2977                                trace_id=trace_id,
2978                                observation_id=span.id,
2979                                name=composite_evaluation.name,
2980                                value=composite_evaluation.value,  # type: ignore
2981                                comment=composite_evaluation.comment,
2982                                metadata=composite_evaluation.metadata,
2983                                config_id=composite_evaluation.config_id,
2984                                data_type=composite_evaluation.data_type,  # type: ignore
2985                            )
2986                            evaluations.append(composite_evaluation)
2987
2988                except Exception as e:
2989                    langfuse_logger.error(f"Composite evaluator failed: {e}")
2990
2991            return ExperimentItemResult(
2992                item=item,
2993                output=output,
2994                evaluations=evaluations,
2995                trace_id=trace_id,
2996                dataset_run_id=dataset_run_id,
2997            )
2998
2999    def _create_experiment_run_name(
3000        self, *, name: Optional[str] = None, run_name: Optional[str] = None
3001    ) -> str:
3002        if run_name:
3003            return run_name
3004
3005        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
3006
3007        return f"{name} - {iso_timestamp}"
3008
3009    def run_batched_evaluation(
3010        self,
3011        *,
3012        scope: Literal["traces", "observations"],
3013        mapper: MapperFunction,
3014        filter: Optional[str] = None,
3015        fetch_batch_size: int = 50,
3016        fetch_trace_fields: Optional[str] = None,
3017        max_items: Optional[int] = None,
3018        max_retries: int = 3,
3019        evaluators: List[EvaluatorFunction],
3020        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3021        max_concurrency: int = 5,
3022        metadata: Optional[Dict[str, Any]] = None,
3023        _add_observation_scores_to_trace: bool = False,
3024        _additional_trace_tags: Optional[List[str]] = None,
3025        resume_from: Optional[BatchEvaluationResumeToken] = None,
3026        verbose: bool = False,
3027    ) -> BatchEvaluationResult:
3028        """Fetch traces or observations and run evaluations on each item.
3029
3030        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3031        It fetches items based on filters, transforms them using a mapper function, runs
3032        evaluators on each item, and creates scores that are linked back to the original
3033        entities. This is ideal for:
3034
3035        - Running evaluations on production traces after deployment
3036        - Backtesting new evaluation metrics on historical data
3037        - Batch scoring of observations for quality monitoring
3038        - Periodic evaluation runs on recent data
3039
3040        The method uses a streaming/pipeline approach to process items in batches, making
3041        it memory-efficient for large datasets. It includes comprehensive error handling,
3042        retry logic, and resume capability for long-running evaluations.
3043
3044        Args:
3045            scope: The type of items to evaluate. Must be one of:
3046                - "traces": Evaluate complete traces with all their observations
3047                - "observations": Evaluate individual observations (spans, generations, events)
3048            mapper: Function that transforms API response objects into evaluator inputs.
3049                Receives a trace/observation object and returns an EvaluatorInputs
3050                instance with input, output, expected_output, and metadata fields.
3051                Can be sync or async.
3052            evaluators: List of evaluation functions to run on each item. Each evaluator
3053                receives the mapped inputs and returns Evaluation object(s). Evaluator
3054                failures are logged but don't stop the batch evaluation.
3055            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3056                - '{"tags": ["production"]}'
3057                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3058                Default: None (fetches all items).
3059            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3060                Larger values may be faster but use more memory. Default: 50.
3061            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3062            max_items: Maximum total number of items to process. If None, processes all
3063                items matching the filter. Useful for testing or limiting evaluation runs.
3064                Default: None (process all).
3065            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3066                parallelism and resource usage. Default: 5.
3067            composite_evaluator: Optional function that creates a composite score from
3068                item-level evaluations. Receives the original item and its evaluations,
3069                returns a single Evaluation. Useful for weighted averages or combined metrics.
3070                Default: None.
3071            metadata: Optional metadata dict to add to all created scores. Useful for
3072                tracking evaluation runs, versions, or other context. Default: None.
3073            max_retries: Maximum number of retry attempts for failed batch fetches.
3074                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3075            verbose: If True, logs progress information to console. Useful for monitoring
3076                long-running evaluations. Default: False.
3077            resume_from: Optional resume token from a previous incomplete run. Allows
3078                continuing evaluation after interruption or failure. Default: None.
3079
3080
3081        Returns:
3082            BatchEvaluationResult containing:
3083                - total_items_fetched: Number of items fetched from API
3084                - total_items_processed: Number of items successfully evaluated
3085                - total_items_failed: Number of items that failed evaluation
3086                - total_scores_created: Scores created by item-level evaluators
3087                - total_composite_scores_created: Scores created by composite evaluator
3088                - total_evaluations_failed: Individual evaluator failures
3089                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3090                - resume_token: Token for resuming if incomplete (None if completed)
3091                - completed: True if all items processed
3092                - duration_seconds: Total execution time
3093                - failed_item_ids: IDs of items that failed
3094                - error_summary: Error types and counts
3095                - has_more_items: True if max_items reached but more exist
3096
3097        Raises:
3098            ValueError: If invalid scope is provided.
3099
3100        Examples:
3101            Basic trace evaluation:
3102            ```python
3103            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3104
3105            client = Langfuse()
3106
3107            # Define mapper to extract fields from traces
3108            def trace_mapper(trace):
3109                return EvaluatorInputs(
3110                    input=trace.input,
3111                    output=trace.output,
3112                    expected_output=None,
3113                    metadata={"trace_id": trace.id}
3114                )
3115
3116            # Define evaluator
3117            def length_evaluator(*, input, output, expected_output, metadata):
3118                return Evaluation(
3119                    name="output_length",
3120                    value=len(output) if output else 0
3121                )
3122
3123            # Run batch evaluation
3124            result = client.run_batched_evaluation(
3125                scope="traces",
3126                mapper=trace_mapper,
3127                evaluators=[length_evaluator],
3128                filter='{"tags": ["production"]}',
3129                max_items=1000,
3130                verbose=True
3131            )
3132
3133            print(f"Processed {result.total_items_processed} traces")
3134            print(f"Created {result.total_scores_created} scores")
3135            ```
3136
3137            Evaluation with composite scorer:
3138            ```python
3139            def accuracy_evaluator(*, input, output, expected_output, metadata):
3140                # ... evaluation logic
3141                return Evaluation(name="accuracy", value=0.85)
3142
3143            def relevance_evaluator(*, input, output, expected_output, metadata):
3144                # ... evaluation logic
3145                return Evaluation(name="relevance", value=0.92)
3146
3147            def composite_evaluator(*, item, evaluations):
3148                # Weighted average of evaluations
3149                weights = {"accuracy": 0.6, "relevance": 0.4}
3150                total = sum(
3151                    e.value * weights.get(e.name, 0)
3152                    for e in evaluations
3153                    if isinstance(e.value, (int, float))
3154                )
3155                return Evaluation(
3156                    name="composite_score",
3157                    value=total,
3158                    comment=f"Weighted average of {len(evaluations)} metrics"
3159                )
3160
3161            result = client.run_batched_evaluation(
3162                scope="traces",
3163                mapper=trace_mapper,
3164                evaluators=[accuracy_evaluator, relevance_evaluator],
3165                composite_evaluator=composite_evaluator,
3166                filter='{"user_id": "important_user"}',
3167                verbose=True
3168            )
3169            ```
3170
3171            Handling incomplete runs with resume:
3172            ```python
3173            # Initial run that may fail or timeout
3174            result = client.run_batched_evaluation(
3175                scope="observations",
3176                mapper=obs_mapper,
3177                evaluators=[my_evaluator],
3178                max_items=10000,
3179                verbose=True
3180            )
3181
3182            # Check if incomplete
3183            if not result.completed and result.resume_token:
3184                print(f"Processed {result.resume_token.items_processed} items before interruption")
3185
3186                # Resume from where it left off
3187                result = client.run_batched_evaluation(
3188                    scope="observations",
3189                    mapper=obs_mapper,
3190                    evaluators=[my_evaluator],
3191                    resume_from=result.resume_token,
3192                    verbose=True
3193                )
3194
3195            print(f"Total items processed: {result.total_items_processed}")
3196            ```
3197
3198            Monitoring evaluator performance:
3199            ```python
3200            result = client.run_batched_evaluation(...)
3201
3202            for stats in result.evaluator_stats:
3203                success_rate = stats.successful_runs / stats.total_runs
3204                print(f"{stats.name}:")
3205                print(f"  Success rate: {success_rate:.1%}")
3206                print(f"  Scores created: {stats.total_scores_created}")
3207
3208                if stats.failed_runs > 0:
3209                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3210            ```
3211
3212        Note:
3213            - Evaluator failures are logged but don't stop the batch evaluation
3214            - Individual item failures are tracked but don't stop processing
3215            - Fetch failures are retried with exponential backoff
3216            - All scores are automatically flushed to Langfuse at the end
3217            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3218        """
3219        runner = BatchEvaluationRunner(self)
3220
3221        return cast(
3222            BatchEvaluationResult,
3223            run_async_safely(
3224                runner.run_async(
3225                    scope=scope,
3226                    mapper=mapper,
3227                    evaluators=evaluators,
3228                    filter=filter,
3229                    fetch_batch_size=fetch_batch_size,
3230                    fetch_trace_fields=fetch_trace_fields,
3231                    max_items=max_items,
3232                    max_concurrency=max_concurrency,
3233                    composite_evaluator=composite_evaluator,
3234                    metadata=metadata,
3235                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3236                    _additional_trace_tags=_additional_trace_tags,
3237                    max_retries=max_retries,
3238                    verbose=verbose,
3239                    resume_from=resume_from,
3240                )
3241            ),
3242        )
3243
3244    def auth_check(self) -> bool:
3245        """Check if the provided credentials (public and secret key) are valid.
3246
3247        Raises:
3248            Exception: If no projects were found for the provided credentials.
3249
3250        Note:
3251            This method is blocking. It is discouraged to use it in production code.
3252        """
3253        try:
3254            projects = self.api.projects.get()
3255            langfuse_logger.debug(
3256                f"Auth check successful, found {len(projects.data)} projects"
3257            )
3258            if len(projects.data) == 0:
3259                raise Exception(
3260                    "Auth check failed, no project found for the keys provided."
3261                )
3262            return True
3263
3264        except AttributeError as e:
3265            langfuse_logger.warning(
3266                f"Auth check failed: Client not properly initialized. Error: {e}"
3267            )
3268            return False
3269
3270        except Error as e:
3271            handle_fern_exception(e)
3272            raise e
3273
3274    def create_dataset(
3275        self,
3276        *,
3277        name: str,
3278        description: Optional[str] = None,
3279        metadata: Optional[Any] = None,
3280        input_schema: Optional[Any] = None,
3281        expected_output_schema: Optional[Any] = None,
3282    ) -> Dataset:
3283        """Create a dataset with the given name on Langfuse.
3284
3285        Args:
3286            name: Name of the dataset to create.
3287            description: Description of the dataset. Defaults to None.
3288            metadata: Additional metadata. Defaults to None.
3289            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3290            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3291
3292        Returns:
3293            Dataset: The created dataset as returned by the Langfuse API.
3294        """
3295        try:
3296            langfuse_logger.debug(f"Creating datasets {name}")
3297
3298            result = self.api.datasets.create(
3299                name=name,
3300                description=description,
3301                metadata=metadata,
3302                input_schema=input_schema,
3303                expected_output_schema=expected_output_schema,
3304            )
3305
3306            return cast(Dataset, result)
3307
3308        except Error as e:
3309            handle_fern_exception(e)
3310            raise e
3311
3312    def create_dataset_item(
3313        self,
3314        *,
3315        dataset_name: str,
3316        input: Optional[Any] = None,
3317        expected_output: Optional[Any] = None,
3318        metadata: Optional[Any] = None,
3319        source_trace_id: Optional[str] = None,
3320        source_observation_id: Optional[str] = None,
3321        status: Optional[DatasetStatus] = None,
3322        id: Optional[str] = None,
3323    ) -> DatasetItem:
3324        """Create a dataset item.
3325
3326        Upserts if an item with id already exists.
3327
3328        Args:
3329            dataset_name: Name of the dataset in which the dataset item should be created.
3330            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3331            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3332            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3333            source_trace_id: Id of the source trace. Defaults to None.
3334            source_observation_id: Id of the source observation. Defaults to None.
3335            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3336            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3337
3338        Returns:
3339            DatasetItem: The created dataset item as returned by the Langfuse API.
3340
3341        Example:
3342            ```python
3343            from langfuse import Langfuse
3344
3345            langfuse = Langfuse()
3346
3347            # Uploading items to the Langfuse dataset named "capital_cities"
3348            langfuse.create_dataset_item(
3349                dataset_name="capital_cities",
3350                input={"input": {"country": "Italy"}},
3351                expected_output={"expected_output": "Rome"},
3352                metadata={"foo": "bar"}
3353            )
3354            ```
3355        """
3356        try:
3357            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3358
3359            result = self.api.dataset_items.create(
3360                dataset_name=dataset_name,
3361                input=input,
3362                expected_output=expected_output,
3363                metadata=metadata,
3364                source_trace_id=source_trace_id,
3365                source_observation_id=source_observation_id,
3366                status=status,
3367                id=id,
3368            )
3369
3370            return cast(DatasetItem, result)
3371        except Error as e:
3372            handle_fern_exception(e)
3373            raise e
3374
3375    def resolve_media_references(
3376        self,
3377        *,
3378        obj: Any,
3379        resolve_with: Literal["base64_data_uri"],
3380        max_depth: int = 10,
3381        content_fetch_timeout_seconds: int = 5,
3382    ) -> Any:
3383        """Replace media reference strings in an object with base64 data URIs.
3384
3385        This method recursively traverses an object (up to max_depth) looking for media reference strings
3386        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3387        the provided Langfuse client and replaces the reference string with a base64 data URI.
3388
3389        If fetching media content fails for a reference string, a warning is logged and the reference
3390        string is left unchanged.
3391
3392        Args:
3393            obj: The object to process. Can be a primitive value, array, or nested object.
3394                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3395            resolve_with: The representation of the media content to replace the media reference string with.
3396                Currently only "base64_data_uri" is supported.
3397            max_depth: int: The maximum depth to traverse the object. Default is 10.
3398            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3399
3400        Returns:
3401            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3402            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3403
3404        Example:
3405            obj = {
3406                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3407                "nested": {
3408                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3409                }
3410            }
3411
3412            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3413
3414            # Result:
3415            # {
3416            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3417            #     "nested": {
3418            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3419            #     }
3420            # }
3421        """
3422        return LangfuseMedia.resolve_media_references(
3423            langfuse_client=self,
3424            obj=obj,
3425            resolve_with=resolve_with,
3426            max_depth=max_depth,
3427            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3428        )
3429
3430    @overload
3431    def get_prompt(
3432        self,
3433        name: str,
3434        *,
3435        version: Optional[int] = None,
3436        label: Optional[str] = None,
3437        type: Literal["chat"],
3438        cache_ttl_seconds: Optional[int] = None,
3439        fallback: Optional[List[ChatMessageDict]] = None,
3440        max_retries: Optional[int] = None,
3441        fetch_timeout_seconds: Optional[int] = None,
3442    ) -> ChatPromptClient: ...
3443
3444    @overload
3445    def get_prompt(
3446        self,
3447        name: str,
3448        *,
3449        version: Optional[int] = None,
3450        label: Optional[str] = None,
3451        type: Literal["text"] = "text",
3452        cache_ttl_seconds: Optional[int] = None,
3453        fallback: Optional[str] = None,
3454        max_retries: Optional[int] = None,
3455        fetch_timeout_seconds: Optional[int] = None,
3456    ) -> TextPromptClient: ...
3457
3458    def get_prompt(
3459        self,
3460        name: str,
3461        *,
3462        version: Optional[int] = None,
3463        label: Optional[str] = None,
3464        type: Literal["chat", "text"] = "text",
3465        cache_ttl_seconds: Optional[int] = None,
3466        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3467        max_retries: Optional[int] = None,
3468        fetch_timeout_seconds: Optional[int] = None,
3469    ) -> PromptClient:
3470        """Get a prompt.
3471
3472        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3473        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3474        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3475        return the expired prompt as a fallback.
3476
3477        Args:
3478            name (str): The name of the prompt to retrieve.
3479
3480        Keyword Args:
3481            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3482            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3483            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3484            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3485            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3486            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3487            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3488            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3489
3490        Returns:
3491            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3492            - TextPromptClient, if type argument is 'text'.
3493            - ChatPromptClient, if type argument is 'chat'.
3494
3495        Raises:
3496            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3497            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3498        """
3499        if self._resources is None:
3500            raise Error(
3501                "SDK is not correctly initialized. Check the init logs for more details."
3502            )
3503        if version is not None and label is not None:
3504            raise ValueError("Cannot specify both version and label at the same time.")
3505
3506        if not name:
3507            raise ValueError("Prompt name cannot be empty.")
3508
3509        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3510        bounded_max_retries = self._get_bounded_max_retries(
3511            max_retries, default_max_retries=2, max_retries_upper_bound=4
3512        )
3513
3514        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3515        cached_prompt = self._resources.prompt_cache.get(cache_key)
3516
3517        if cached_prompt is None or cache_ttl_seconds == 0:
3518            langfuse_logger.debug(
3519                f"Prompt '{cache_key}' not found in cache or caching disabled."
3520            )
3521            try:
3522                return self._fetch_prompt_and_update_cache(
3523                    name,
3524                    version=version,
3525                    label=label,
3526                    ttl_seconds=cache_ttl_seconds,
3527                    max_retries=bounded_max_retries,
3528                    fetch_timeout_seconds=fetch_timeout_seconds,
3529                )
3530            except Exception as e:
3531                if fallback:
3532                    langfuse_logger.warning(
3533                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3534                    )
3535
3536                    fallback_client_args: Dict[str, Any] = {
3537                        "name": name,
3538                        "prompt": fallback,
3539                        "type": type,
3540                        "version": version or 0,
3541                        "config": {},
3542                        "labels": [label] if label else [],
3543                        "tags": [],
3544                    }
3545
3546                    if type == "text":
3547                        return TextPromptClient(
3548                            prompt=Prompt_Text(**fallback_client_args),
3549                            is_fallback=True,
3550                        )
3551
3552                    if type == "chat":
3553                        return ChatPromptClient(
3554                            prompt=Prompt_Chat(**fallback_client_args),
3555                            is_fallback=True,
3556                        )
3557
3558                raise e
3559
3560        if cached_prompt.is_expired():
3561            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3562            try:
3563                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3564                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3565
3566                def refresh_task() -> None:
3567                    self._fetch_prompt_and_update_cache(
3568                        name,
3569                        version=version,
3570                        label=label,
3571                        ttl_seconds=cache_ttl_seconds,
3572                        max_retries=bounded_max_retries,
3573                        fetch_timeout_seconds=fetch_timeout_seconds,
3574                    )
3575
3576                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3577                    cache_key,
3578                    cached_prompt,
3579                    refresh_task,
3580                )
3581                langfuse_logger.debug(
3582                    f"Returning stale prompt '{cache_key}' from cache."
3583                )
3584                # return stale prompt
3585                return cached_prompt.value
3586
3587            except Exception as e:
3588                langfuse_logger.warning(
3589                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3590                )
3591                # creation of refresh prompt task failed, return stale prompt
3592                return cached_prompt.value
3593
3594        return cached_prompt.value
3595
3596    def _fetch_prompt_and_update_cache(
3597        self,
3598        name: str,
3599        *,
3600        version: Optional[int] = None,
3601        label: Optional[str] = None,
3602        ttl_seconds: Optional[int] = None,
3603        max_retries: int,
3604        fetch_timeout_seconds: Optional[int],
3605    ) -> PromptClient:
3606        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3607        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3608
3609        try:
3610
3611            @backoff.on_exception(
3612                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3613            )
3614            def fetch_prompts() -> Any:
3615                return self.api.prompts.get(
3616                    self._url_encode(name),
3617                    version=version,
3618                    label=label,
3619                    request_options={
3620                        "timeout_in_seconds": fetch_timeout_seconds,
3621                    }
3622                    if fetch_timeout_seconds is not None
3623                    else None,
3624                )
3625
3626            prompt_response = fetch_prompts()
3627
3628            prompt: PromptClient
3629            if prompt_response.type == "chat":
3630                prompt = ChatPromptClient(prompt_response)
3631            else:
3632                prompt = TextPromptClient(prompt_response)
3633
3634            if self._resources is not None:
3635                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3636
3637            return prompt
3638
3639        except NotFoundError as not_found_error:
3640            langfuse_logger.warning(
3641                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3642            )
3643            if self._resources is not None:
3644                self._resources.prompt_cache.delete(cache_key)
3645            raise not_found_error
3646
3647        except Exception as e:
3648            langfuse_logger.error(
3649                f"Error while fetching prompt '{cache_key}': {str(e)}"
3650            )
3651            raise e
3652
3653    def _get_bounded_max_retries(
3654        self,
3655        max_retries: Optional[int],
3656        *,
3657        default_max_retries: int = 2,
3658        max_retries_upper_bound: int = 4,
3659    ) -> int:
3660        if max_retries is None:
3661            return default_max_retries
3662
3663        bounded_max_retries = min(
3664            max(max_retries, 0),
3665            max_retries_upper_bound,
3666        )
3667
3668        return bounded_max_retries
3669
3670    @overload
3671    def create_prompt(
3672        self,
3673        *,
3674        name: str,
3675        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3676        labels: List[str] = [],
3677        tags: Optional[List[str]] = None,
3678        type: Optional[Literal["chat"]],
3679        config: Optional[Any] = None,
3680        commit_message: Optional[str] = None,
3681    ) -> ChatPromptClient: ...
3682
3683    @overload
3684    def create_prompt(
3685        self,
3686        *,
3687        name: str,
3688        prompt: str,
3689        labels: List[str] = [],
3690        tags: Optional[List[str]] = None,
3691        type: Optional[Literal["text"]] = "text",
3692        config: Optional[Any] = None,
3693        commit_message: Optional[str] = None,
3694    ) -> TextPromptClient: ...
3695
3696    def create_prompt(
3697        self,
3698        *,
3699        name: str,
3700        prompt: Union[
3701            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3702        ],
3703        labels: List[str] = [],
3704        tags: Optional[List[str]] = None,
3705        type: Optional[Literal["chat", "text"]] = "text",
3706        config: Optional[Any] = None,
3707        commit_message: Optional[str] = None,
3708    ) -> PromptClient:
3709        """Create a new prompt in Langfuse.
3710
3711        Keyword Args:
3712            name : The name of the prompt to be created.
3713            prompt : The content of the prompt to be created.
3714            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3715            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3716            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3717            config: Additional structured data to be saved with the prompt. Defaults to None.
3718            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3719            commit_message: Optional string describing the change.
3720
3721        Returns:
3722            TextPromptClient: The prompt if type argument is 'text'.
3723            ChatPromptClient: The prompt if type argument is 'chat'.
3724        """
3725        try:
3726            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3727
3728            if type == "chat":
3729                if not isinstance(prompt, list):
3730                    raise ValueError(
3731                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3732                    )
3733                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3734                    CreateChatPromptRequest(
3735                        name=name,
3736                        prompt=cast(Any, prompt),
3737                        labels=labels,
3738                        tags=tags,
3739                        config=config or {},
3740                        commit_message=commit_message,
3741                        type=CreateChatPromptType.CHAT,
3742                    )
3743                )
3744                server_prompt = self.api.prompts.create(request=request)
3745
3746                if self._resources is not None:
3747                    self._resources.prompt_cache.invalidate(name)
3748
3749                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3750
3751            if not isinstance(prompt, str):
3752                raise ValueError("For 'text' type, 'prompt' must be a string.")
3753
3754            request = CreateTextPromptRequest(
3755                name=name,
3756                prompt=prompt,
3757                labels=labels,
3758                tags=tags,
3759                config=config or {},
3760                commit_message=commit_message,
3761            )
3762
3763            server_prompt = self.api.prompts.create(request=request)
3764
3765            if self._resources is not None:
3766                self._resources.prompt_cache.invalidate(name)
3767
3768            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3769
3770        except Error as e:
3771            handle_fern_exception(e)
3772            raise e
3773
3774    def update_prompt(
3775        self,
3776        *,
3777        name: str,
3778        version: int,
3779        new_labels: List[str] = [],
3780    ) -> Any:
3781        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3782
3783        Args:
3784            name (str): The name of the prompt to update.
3785            version (int): The version number of the prompt to update.
3786            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3787
3788        Returns:
3789            Prompt: The updated prompt from the Langfuse API.
3790
3791        """
3792        updated_prompt = self.api.prompt_version.update(
3793            name=self._url_encode(name),
3794            version=version,
3795            new_labels=new_labels,
3796        )
3797
3798        if self._resources is not None:
3799            self._resources.prompt_cache.invalidate(name)
3800
3801        return updated_prompt
3802
3803    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3804        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
3805        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
3806        # double-encode, so we skip when the value is about to be sent straight
3807        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3808        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3809            return url
3810
3811        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
3812        # we need add safe="" to force escaping of slashes
3813        # This is necessary for prompts in prompt folders
3814        return urllib.parse.quote(url, safe="")
3815
3816    def clear_prompt_cache(self) -> None:
3817        """Clear the entire prompt cache, removing all cached prompts.
3818
3819        This method is useful when you want to force a complete refresh of all
3820        cached prompts, for example after major updates or when you need to
3821        ensure the latest versions are fetched from the server.
3822        """
3823        if self._resources is not None:
3824            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as start_observation(), update(), and set_trace_io().
  • mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

    The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

    Return None to leave the batch unchanged. Return MaskOtelSpansResult with OtelSpanPatch values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

    Example:

    from typing import Optional
    
    from langfuse import Langfuse
    from langfuse.types import (
        MaskOtelSpansParams,
        MaskOtelSpansResult,
        OtelSpanPatch,
    )
    
    def mask_otel_spans(
        *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]:
        patches = {}
    
        for identifier, span in params.spans.items():
            if "gen_ai.prompt.0.content" in span.attributes:
                patches[identifier] = OtelSpanPatch(
                    delete_attributes=("gen_ai.prompt.0.content",),
                    set_attributes={"masking.applied": True},
                )
    
        return MaskOtelSpansResult(span_patches=patches)
    
    langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
    
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
  • span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real time processing of exported spans.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, mask_otel_spans: Optional[MaskOtelSpansFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None, span_exporter: Optional[opentelemetry.sdk.trace.export.SpanExporter] = None)
273    def __init__(
274        self,
275        *,
276        public_key: Optional[str] = None,
277        secret_key: Optional[str] = None,
278        base_url: Optional[str] = None,
279        host: Optional[str] = None,
280        timeout: Optional[int] = None,
281        httpx_client: Optional[httpx.Client] = None,
282        debug: bool = False,
283        tracing_enabled: Optional[bool] = True,
284        flush_at: Optional[int] = None,
285        flush_interval: Optional[float] = None,
286        environment: Optional[str] = None,
287        release: Optional[str] = None,
288        media_upload_thread_count: Optional[int] = None,
289        sample_rate: Optional[float] = None,
290        mask: Optional[MaskFunction] = None,
291        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
292        blocked_instrumentation_scopes: Optional[List[str]] = None,
293        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
294        additional_headers: Optional[Dict[str, str]] = None,
295        tracer_provider: Optional[TracerProvider] = None,
296        span_exporter: Optional[SpanExporter] = None,
297    ):
298        self._base_url = (
299            base_url
300            or os.environ.get(LANGFUSE_BASE_URL)
301            or host
302            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
303        )
304        self._environment = environment or cast(
305            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
306        )
307        self._release = (
308            release
309            or os.environ.get(LANGFUSE_RELEASE, None)
310            or get_common_release_envs()
311        )
312        self._project_id: Optional[str] = None
313        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
314        if not 0.0 <= sample_rate <= 1.0:
315            raise ValueError(
316                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
317            )
318
319        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
320
321        self._tracing_enabled = (
322            tracing_enabled
323            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
324        )
325        if not self._tracing_enabled:
326            langfuse_logger.info(
327                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
328            )
329
330        debug = (
331            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
332        )
333        if debug:
334            logging.basicConfig(
335                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
336            )
337            langfuse_logger.setLevel(logging.DEBUG)
338
339        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
340        if public_key is None:
341            langfuse_logger.warning(
342                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
343                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
344            )
345            self._otel_tracer = otel_trace_api.NoOpTracer()
346            return
347
348        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
349        if secret_key is None:
350            langfuse_logger.warning(
351                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
352                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
353            )
354            self._otel_tracer = otel_trace_api.NoOpTracer()
355            return
356
357        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
358            langfuse_logger.warning(
359                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
360            )
361
362        if blocked_instrumentation_scopes is not None:
363            warnings.warn(
364                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
365                "Use `should_export_span` instead. Example: "
366                "from langfuse.span_filter import is_default_export_span; "
367                'blocked={"scope"}; should_export_span=lambda span: '
368                "is_default_export_span(span) and (span.instrumentation_scope is None or "
369                "span.instrumentation_scope.name not in blocked).",
370                DeprecationWarning,
371                stacklevel=2,
372            )
373
374        # Initialize api and tracer if requirements are met
375        self._resources = LangfuseResourceManager(
376            public_key=public_key,
377            secret_key=secret_key,
378            base_url=self._base_url,
379            timeout=timeout,
380            environment=self._environment,
381            release=release,
382            flush_at=flush_at,
383            flush_interval=flush_interval,
384            httpx_client=httpx_client,
385            media_upload_thread_count=media_upload_thread_count,
386            sample_rate=sample_rate,
387            mask=mask,
388            mask_otel_spans=mask_otel_spans,
389            tracing_enabled=self._tracing_enabled,
390            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
391            should_export_span=should_export_span,
392            additional_headers=additional_headers,
393            tracer_provider=tracer_provider,
394            span_exporter=span_exporter,
395        )
396        self._mask = self._resources.mask
397
398        self._otel_tracer = (
399            self._resources.tracer
400            if self._tracing_enabled and self._resources.tracer is not None
401            else otel_trace_api.NoOpTracer()
402        )
403        self.api = self._resources.api
404        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
553    def start_observation(
554        self,
555        *,
556        trace_context: Optional[TraceContext] = None,
557        name: str,
558        as_type: ObservationTypeLiteralNoEvent = "span",
559        input: Optional[Any] = None,
560        output: Optional[Any] = None,
561        metadata: Optional[Any] = None,
562        version: Optional[str] = None,
563        level: Optional[SpanLevel] = None,
564        status_message: Optional[str] = None,
565        completion_start_time: Optional[datetime] = None,
566        model: Optional[str] = None,
567        model_parameters: Optional[Dict[str, MapValue]] = None,
568        usage_details: Optional[Dict[str, int]] = None,
569        cost_details: Optional[Dict[str, float]] = None,
570        prompt: Optional[PromptClient] = None,
571    ) -> Union[
572        LangfuseSpan,
573        LangfuseGeneration,
574        LangfuseAgent,
575        LangfuseTool,
576        LangfuseChain,
577        LangfuseRetriever,
578        LangfuseEvaluator,
579        LangfuseEmbedding,
580        LangfuseGuardrail,
581    ]:
582        """Create a new observation of the specified type.
583
584        This method creates a new observation but does not set it as the current span in the
585        context. To create and use an observation within a context, use start_as_current_observation().
586
587        Args:
588            trace_context: Optional context for connecting to an existing trace
589            name: Name of the observation
590            as_type: Type of observation to create (defaults to "span")
591            input: Input data for the operation
592            output: Output data from the operation
593            metadata: Additional metadata to associate with the observation
594            version: Version identifier for the code or component
595            level: Importance level of the observation
596            status_message: Optional status message for the observation
597            completion_start_time: When the model started generating (for generation types)
598            model: Name/identifier of the AI model used (for generation types)
599            model_parameters: Parameters used for the model (for generation types)
600            usage_details: Token usage information (for generation types)
601            cost_details: Cost information (for generation types)
602            prompt: Associated prompt template (for generation types)
603
604        Returns:
605            An observation object of the appropriate type that must be ended with .end()
606        """
607        if trace_context:
608            trace_id = trace_context.get("trace_id", None)
609            parent_span_id = trace_context.get("parent_span_id", None)
610
611            if trace_id:
612                remote_parent_span = self._create_remote_parent_span(
613                    trace_id=trace_id, parent_span_id=parent_span_id
614                )
615
616                with otel_trace_api.use_span(
617                    cast(otel_trace_api.Span, remote_parent_span)
618                ):
619                    otel_span = self._otel_tracer.start_span(name=name)
620                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
621
622                    return self._create_observation_from_otel_span(
623                        otel_span=otel_span,
624                        as_type=as_type,
625                        input=input,
626                        output=output,
627                        metadata=metadata,
628                        version=version,
629                        level=level,
630                        status_message=status_message,
631                        completion_start_time=completion_start_time,
632                        model=model,
633                        model_parameters=model_parameters,
634                        usage_details=usage_details,
635                        cost_details=cost_details,
636                        prompt=prompt,
637                    )
638
639        otel_span = self._otel_tracer.start_span(name=name)
640
641        return self._create_observation_from_otel_span(
642            otel_span=otel_span,
643            as_type=as_type,
644            input=input,
645            output=output,
646            metadata=metadata,
647            version=version,
648            level=level,
649            status_message=status_message,
650            completion_start_time=completion_start_time,
651            model=model,
652            model_parameters=model_parameters,
653            usage_details=usage_details,
654            cost_details=cost_details,
655            prompt=prompt,
656        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 886    def start_as_current_observation(
 887        self,
 888        *,
 889        trace_context: Optional[TraceContext] = None,
 890        name: str,
 891        as_type: ObservationTypeLiteralNoEvent = "span",
 892        input: Optional[Any] = None,
 893        output: Optional[Any] = None,
 894        metadata: Optional[Any] = None,
 895        version: Optional[str] = None,
 896        level: Optional[SpanLevel] = None,
 897        status_message: Optional[str] = None,
 898        completion_start_time: Optional[datetime] = None,
 899        model: Optional[str] = None,
 900        model_parameters: Optional[Dict[str, MapValue]] = None,
 901        usage_details: Optional[Dict[str, int]] = None,
 902        cost_details: Optional[Dict[str, float]] = None,
 903        prompt: Optional[PromptClient] = None,
 904        end_on_exit: Optional[bool] = None,
 905    ) -> Union[
 906        _AgnosticContextManager[LangfuseGeneration],
 907        _AgnosticContextManager[LangfuseSpan],
 908        _AgnosticContextManager[LangfuseAgent],
 909        _AgnosticContextManager[LangfuseTool],
 910        _AgnosticContextManager[LangfuseChain],
 911        _AgnosticContextManager[LangfuseRetriever],
 912        _AgnosticContextManager[LangfuseEvaluator],
 913        _AgnosticContextManager[LangfuseEmbedding],
 914        _AgnosticContextManager[LangfuseGuardrail],
 915    ]:
 916        """Create a new observation and set it as the current span in a context manager.
 917
 918        This method creates a new observation of the specified type and sets it as the
 919        current span within a context manager. Use this method with a 'with' statement to
 920        automatically handle the observation lifecycle within a code block.
 921
 922        The created observation will be the child of the current span in the context.
 923
 924        Args:
 925            trace_context: Optional context for connecting to an existing trace
 926            name: Name of the observation (e.g., function or operation name)
 927            as_type: Type of observation to create (defaults to "span")
 928            input: Input data for the operation (can be any JSON-serializable object)
 929            output: Output data from the operation (can be any JSON-serializable object)
 930            metadata: Additional metadata to associate with the observation
 931            version: Version identifier for the code or component
 932            level: Importance level of the observation (info, warning, error)
 933            status_message: Optional status message for the observation
 934            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 935
 936            The following parameters are available when as_type is: "generation" or "embedding".
 937            completion_start_time: When the model started generating the response
 938            model: Name/identifier of the AI model used (e.g., "gpt-4")
 939            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 940            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 941            cost_details: Cost information for the model call
 942            prompt: Associated prompt template from Langfuse prompt management
 943
 944        Returns:
 945            A context manager that yields the appropriate observation type based on as_type
 946
 947        Example:
 948            ```python
 949            # Create a span
 950            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 951                # Do work
 952                result = process_data()
 953                span.update(output=result)
 954
 955                # Create a child span automatically
 956                with span.start_as_current_observation(name="sub-operation") as child_span:
 957                    # Do sub-operation work
 958                    child_span.update(output="sub-result")
 959
 960            # Create a tool observation
 961            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 962                # Do tool work
 963                results = search_web(query)
 964                tool.update(output=results)
 965
 966            # Create a generation observation
 967            with langfuse.start_as_current_observation(
 968                name="answer-generation",
 969                as_type="generation",
 970                model="gpt-4"
 971            ) as generation:
 972                # Generate answer
 973                response = llm.generate(...)
 974                generation.update(output=response)
 975            ```
 976        """
 977        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 978            if trace_context:
 979                trace_id = trace_context.get("trace_id", None)
 980                parent_span_id = trace_context.get("parent_span_id", None)
 981
 982                if trace_id:
 983                    remote_parent_span = self._create_remote_parent_span(
 984                        trace_id=trace_id, parent_span_id=parent_span_id
 985                    )
 986
 987                    return cast(
 988                        Union[
 989                            _AgnosticContextManager[LangfuseGeneration],
 990                            _AgnosticContextManager[LangfuseEmbedding],
 991                        ],
 992                        self._create_span_with_parent_context(
 993                            as_type=as_type,
 994                            name=name,
 995                            remote_parent_span=remote_parent_span,
 996                            parent=None,
 997                            end_on_exit=end_on_exit,
 998                            input=input,
 999                            output=output,
1000                            metadata=metadata,
1001                            version=version,
1002                            level=level,
1003                            status_message=status_message,
1004                            completion_start_time=completion_start_time,
1005                            model=model,
1006                            model_parameters=model_parameters,
1007                            usage_details=usage_details,
1008                            cost_details=cost_details,
1009                            prompt=prompt,
1010                        ),
1011                    )
1012
1013            return cast(
1014                Union[
1015                    _AgnosticContextManager[LangfuseGeneration],
1016                    _AgnosticContextManager[LangfuseEmbedding],
1017                ],
1018                self._start_as_current_otel_span_with_processed_media(
1019                    as_type=as_type,
1020                    name=name,
1021                    end_on_exit=end_on_exit,
1022                    input=input,
1023                    output=output,
1024                    metadata=metadata,
1025                    version=version,
1026                    level=level,
1027                    status_message=status_message,
1028                    completion_start_time=completion_start_time,
1029                    model=model,
1030                    model_parameters=model_parameters,
1031                    usage_details=usage_details,
1032                    cost_details=cost_details,
1033                    prompt=prompt,
1034                ),
1035            )
1036
1037        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1038            if trace_context:
1039                trace_id = trace_context.get("trace_id", None)
1040                parent_span_id = trace_context.get("parent_span_id", None)
1041
1042                if trace_id:
1043                    remote_parent_span = self._create_remote_parent_span(
1044                        trace_id=trace_id, parent_span_id=parent_span_id
1045                    )
1046
1047                    return cast(
1048                        Union[
1049                            _AgnosticContextManager[LangfuseSpan],
1050                            _AgnosticContextManager[LangfuseAgent],
1051                            _AgnosticContextManager[LangfuseTool],
1052                            _AgnosticContextManager[LangfuseChain],
1053                            _AgnosticContextManager[LangfuseRetriever],
1054                            _AgnosticContextManager[LangfuseEvaluator],
1055                            _AgnosticContextManager[LangfuseGuardrail],
1056                        ],
1057                        self._create_span_with_parent_context(
1058                            as_type=as_type,
1059                            name=name,
1060                            remote_parent_span=remote_parent_span,
1061                            parent=None,
1062                            end_on_exit=end_on_exit,
1063                            input=input,
1064                            output=output,
1065                            metadata=metadata,
1066                            version=version,
1067                            level=level,
1068                            status_message=status_message,
1069                        ),
1070                    )
1071
1072            return cast(
1073                Union[
1074                    _AgnosticContextManager[LangfuseSpan],
1075                    _AgnosticContextManager[LangfuseAgent],
1076                    _AgnosticContextManager[LangfuseTool],
1077                    _AgnosticContextManager[LangfuseChain],
1078                    _AgnosticContextManager[LangfuseRetriever],
1079                    _AgnosticContextManager[LangfuseEvaluator],
1080                    _AgnosticContextManager[LangfuseGuardrail],
1081                ],
1082                self._start_as_current_otel_span_with_processed_media(
1083                    as_type=as_type,
1084                    name=name,
1085                    end_on_exit=end_on_exit,
1086                    input=input,
1087                    output=output,
1088                    metadata=metadata,
1089                    version=version,
1090                    level=level,
1091                    status_message=status_message,
1092                ),
1093            )
1094
1095        # This should never be reached since all valid types are handled above
1096        langfuse_logger.warning(
1097            f"Unknown observation type: {as_type}, falling back to span"
1098        )
1099        return self._start_as_current_otel_span_with_processed_media(
1100            as_type="span",
1101            name=name,
1102            end_on_exit=end_on_exit,
1103            input=input,
1104            output=output,
1105            metadata=metadata,
1106            version=version,
1107            level=level,
1108            status_message=status_message,
1109        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation ("DEBUG", "DEFAULT", "WARNING" or "ERROR")
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
The following parameters are only used when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1301    def update_current_generation(
1302        self,
1303        *,
1304        name: Optional[str] = None,
1305        input: Optional[Any] = None,
1306        output: Optional[Any] = None,
1307        metadata: Optional[Any] = None,
1308        version: Optional[str] = None,
1309        level: Optional[SpanLevel] = None,
1310        status_message: Optional[str] = None,
1311        completion_start_time: Optional[datetime] = None,
1312        model: Optional[str] = None,
1313        model_parameters: Optional[Dict[str, MapValue]] = None,
1314        usage_details: Optional[Dict[str, int]] = None,
1315        cost_details: Optional[Dict[str, float]] = None,
1316        prompt: Optional[PromptClient] = None,
1317    ) -> None:
1318        """Update the current active generation span with new information.
1319
1320        This method updates the current generation span in the active context with
1321        additional information. It's useful for adding output, usage stats, or other
1322        details that become available during or after model generation.
1323
1324        Args:
1325            name: The generation name
1326            input: Updated input data for the model
1327            output: Output from the model (e.g., completions)
1328            metadata: Additional metadata to associate with the generation
1329            version: Version identifier for the model or component
1330            level: Importance level of the generation (info, warning, error)
1331            status_message: Optional status message for the generation
1332            completion_start_time: When the model started generating the response
1333            model: Name/identifier of the AI model used (e.g., "gpt-4")
1334            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1335            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1336            cost_details: Cost information for the model call
1337            prompt: Associated prompt template from Langfuse prompt management
1338
1339        Example:
1340            ```python
1341            with langfuse.start_as_current_generation(name="answer-query") as generation:
1342                # Initial setup and API call
1343                response = llm.generate(...)
1344
1345                # Update with results that weren't available at creation time
1346                langfuse.update_current_generation(
1347                    output=response.text,
1348                    usage_details={
1349                        "prompt_tokens": response.usage.prompt_tokens,
1350                        "completion_tokens": response.usage.completion_tokens
1351                    }
1352                )
1353            ```
1354        """
1355        if not self._tracing_enabled:
1356            langfuse_logger.debug(
1357                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1358            )
1359            return
1360
1361        current_otel_span = self._get_current_otel_span()
1362
1363        if current_otel_span is not None:
1364            generation = LangfuseGeneration(
1365                otel_span=current_otel_span, langfuse_client=self
1366            )
1367
1368            if name:
1369                current_otel_span.update_name(name)
1370
1371            generation.update(
1372                input=input,
1373                output=output,
1374                metadata=metadata,
1375                version=version,
1376                level=level,
1377                status_message=status_message,
1378                completion_start_time=completion_start_time,
1379                model=model,
1380                model_parameters=model_parameters,
1381                usage_details=usage_details,
1382                cost_details=cost_details,
1383                prompt=prompt,
1384            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation ("DEBUG", "DEFAULT", "WARNING" or "ERROR")
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1386    def update_current_span(
1387        self,
1388        *,
1389        name: Optional[str] = None,
1390        input: Optional[Any] = None,
1391        output: Optional[Any] = None,
1392        metadata: Optional[Any] = None,
1393        version: Optional[str] = None,
1394        level: Optional[SpanLevel] = None,
1395        status_message: Optional[str] = None,
1396    ) -> None:
1397        """Update the current active span with new information.
1398
1399        This method updates the current span in the active context with
1400        additional information. It's useful for adding outputs or metadata
1401        that become available during execution.
1402
1403        Args:
1404            name: The span name
1405            input: Updated input data for the operation
1406            output: Output data from the operation
1407            metadata: Additional metadata to associate with the span
1408            version: Version identifier for the code or component
1409            level: Importance level of the span (info, warning, error)
1410            status_message: Optional status message for the span
1411
1412        Example:
1413            ```python
1414            with langfuse.start_as_current_observation(name="process-data") as span:
1415                # Initial processing
1416                result = process_first_part()
1417
1418                # Update with intermediate results
1419                langfuse.update_current_span(metadata={"intermediate_result": result})
1420
1421                # Continue processing
1422                final_result = process_second_part(result)
1423
1424                # Final update
1425                langfuse.update_current_span(output=final_result)
1426            ```
1427        """
1428        if not self._tracing_enabled:
1429            langfuse_logger.debug(
1430                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1431            )
1432            return
1433
1434        current_otel_span = self._get_current_otel_span()
1435
1436        if current_otel_span is not None:
1437            span_class = self._get_span_class(
1438                self._get_observation_type_from_otel_span(current_otel_span)
1439            )
1440            span = span_class(
1441                otel_span=current_otel_span,
1442                langfuse_client=self,
1443                environment=self._environment,
1444                release=self._release,
1445            )
1446
1447            if name:
1448                current_otel_span.update_name(name)
1449
1450            span.update(
1451                input=input,
1452                output=output,
1453                metadata=metadata,
1454                version=version,
1455                level=level,
1456                status_message=status_message,
1457            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span ("DEBUG", "DEFAULT", "WARNING" or "ERROR")
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1459    @deprecated(
1460        "Trace-level input/output is deprecated. "
1461        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1462        "This method will be removed in a future major version."
1463    )
1464    def set_current_trace_io(
1465        self,
1466        *,
1467        input: Optional[Any] = None,
1468        output: Optional[Any] = None,
1469    ) -> None:
1470        """Set trace-level input and output for the current span's trace.
1471
1472        .. deprecated::
1473            This is a legacy method for backward compatibility with Langfuse platform
1474            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1475            evaluators). It will be removed in a future major version.
1476
1477            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1478            use :meth:`propagate_attributes` instead.
1479
1480        Args:
1481            input: Input data to associate with the trace.
1482            output: Output data to associate with the trace.
1483        """
1484        if not self._tracing_enabled:
1485            langfuse_logger.debug(
1486                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1487            )
1488            return
1489
1490        current_otel_span = self._get_current_otel_span()
1491
1492        if current_otel_span is not None and current_otel_span.is_recording():
1493            span_class = self._get_span_class(
1494                self._get_observation_type_from_otel_span(current_otel_span)
1495            )
1496            span = span_class(
1497                otel_span=current_otel_span,
1498                langfuse_client=self,
1499                environment=self._environment,
1500                release=self._release,
1501            )
1502
1503            span.set_trace_io(
1504                input=input,
1505                output=output,
1506            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1508    def set_current_trace_as_public(self) -> None:
1509        """Make the current trace publicly accessible via its URL.
1510
1511        When a trace is published, anyone with the trace link can view the full trace
1512        without needing to be logged in to Langfuse. This action cannot be undone
1513        programmatically - once published, the entire trace becomes public.
1514
1515        This is a convenience method that publishes the trace from the currently
1516        active span context. Use this when you want to make a trace public from
1517        within a traced function without needing direct access to the span object.
1518        """
1519        if not self._tracing_enabled:
1520            langfuse_logger.debug(
1521                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1522            )
1523            return
1524
1525        current_otel_span = self._get_current_otel_span()
1526
1527        if current_otel_span is not None and current_otel_span.is_recording():
1528            span_class = self._get_span_class(
1529                self._get_observation_type_from_otel_span(current_otel_span)
1530            )
1531            span = span_class(
1532                otel_span=current_otel_span,
1533                langfuse_client=self,
1534                environment=self._environment,
1535            )
1536
1537            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1539    def create_event(
1540        self,
1541        *,
1542        trace_context: Optional[TraceContext] = None,
1543        name: str,
1544        input: Optional[Any] = None,
1545        output: Optional[Any] = None,
1546        metadata: Optional[Any] = None,
1547        version: Optional[str] = None,
1548        level: Optional[SpanLevel] = None,
1549        status_message: Optional[str] = None,
1550    ) -> LangfuseEvent:
1551        """Create a new Langfuse observation of type 'EVENT'.
1552
1553        The created Langfuse Event observation will be the child of the current span in the context.
1554
1555        Args:
1556            trace_context: Optional context for connecting to an existing trace
1557            name: Name of the span (e.g., function or operation name)
1558            input: Input data for the operation (can be any JSON-serializable object)
1559            output: Output data from the operation (can be any JSON-serializable object)
1560            metadata: Additional metadata to associate with the span
1561            version: Version identifier for the code or component
1562            level: Importance level of the span (info, warning, error)
1563            status_message: Optional status message for the span
1564
1565        Returns:
1566            The Langfuse Event object
1567
1568        Example:
1569            ```python
1570            event = langfuse.create_event(name="process-event")
1571            ```
1572        """
1573        timestamp = time_ns()
1574
1575        if trace_context:
1576            trace_id = trace_context.get("trace_id", None)
1577            parent_span_id = trace_context.get("parent_span_id", None)
1578
1579            if trace_id:
1580                remote_parent_span = self._create_remote_parent_span(
1581                    trace_id=trace_id, parent_span_id=parent_span_id
1582                )
1583
1584                with otel_trace_api.use_span(
1585                    cast(otel_trace_api.Span, remote_parent_span)
1586                ):
1587                    otel_span = self._otel_tracer.start_span(
1588                        name=name, start_time=timestamp
1589                    )
1590                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1591
1592                    return cast(
1593                        LangfuseEvent,
1594                        LangfuseEvent(
1595                            otel_span=otel_span,
1596                            langfuse_client=self,
1597                            environment=self._environment,
1598                            release=self._release,
1599                            input=input,
1600                            output=output,
1601                            metadata=metadata,
1602                            version=version,
1603                            level=level,
1604                            status_message=status_message,
1605                        ).end(end_time=timestamp),
1606                    )
1607
1608        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1609
1610        return cast(
1611            LangfuseEvent,
1612            LangfuseEvent(
1613                otel_span=otel_span,
1614                langfuse_client=self,
1615                environment=self._environment,
1616                release=self._release,
1617                input=input,
1618                output=output,
1619                metadata=metadata,
1620                version=version,
1621                level=level,
1622                status_message=status_message,
1623            ).end(end_time=timestamp),
1624        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Severity level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1713    @staticmethod
1714    def create_trace_id(*, seed: Optional[str] = None) -> str:
1715        """Create a unique trace ID for use with Langfuse.
1716
1717        This method generates a unique trace ID for use with various Langfuse APIs.
1718        It can either generate a random ID or create a deterministic ID based on
1719        a seed string.
1720
1721        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1722        This method ensures the generated ID meets this requirement. If you need to
1723        correlate an external ID with a Langfuse trace ID, use the external ID as the
1724        seed to get a valid, deterministic Langfuse trace ID.
1725
1726        Args:
1727            seed: Optional string to use as a seed for deterministic ID generation.
1728                 If provided, the same seed will always produce the same ID.
1729                 If not provided, a random ID will be generated.
1730
1731        Returns:
1732            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1733
1734        Example:
1735            ```python
1736            # Generate a random trace ID
1737            trace_id = langfuse.create_trace_id()
1738
1739            # Generate a deterministic ID based on a seed
1740            session_trace_id = langfuse.create_trace_id(seed="session-456")
1741
1742            # Correlate an external ID with a Langfuse trace ID
1743            external_id = "external-system-123456"
1744            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1745
1746            # Use the ID with trace context
1747            with langfuse.start_as_current_observation(
1748                name="process-request",
1749                trace_context={"trace_id": trace_id}
1750            ) as span:
1751                # Operation will be part of the specific trace
1752                pass
1753            ```
1754        """
1755        if not seed:
1756            trace_id_int = RandomIdGenerator().generate_trace_id()
1757
1758            return Langfuse._format_otel_trace_id(trace_id_int)
1759
1760        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1840    def create_score(
1841        self,
1842        *,
1843        name: str,
1844        value: Union[float, str],
1845        session_id: Optional[str] = None,
1846        dataset_run_id: Optional[str] = None,
1847        trace_id: Optional[str] = None,
1848        observation_id: Optional[str] = None,
1849        score_id: Optional[str] = None,
1850        data_type: Optional[ScoreDataType] = None,
1851        comment: Optional[str] = None,
1852        config_id: Optional[str] = None,
1853        metadata: Optional[Any] = None,
1854        timestamp: Optional[datetime] = None,
1855    ) -> None:
1856        """Create a score for a specific trace or observation.
1857
1858        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1859        used to track quality metrics, user feedback, or automated evaluations.
1860
1861        Args:
1862            name: Name of the score (e.g., "relevance", "accuracy")
1863            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1864            session_id: ID of the Langfuse session to associate the score with
1865            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1866            trace_id: ID of the Langfuse trace to associate the score with
1867            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1868            score_id: Optional custom ID for the score (auto-generated if not provided)
1869            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1870            comment: Optional comment or explanation for the score
1871            config_id: Optional ID of a score config defined in Langfuse
1872            metadata: Optional metadata to be attached to the score
1873            timestamp: Optional timestamp for the score (defaults to current UTC time)
1874
1875        Example:
1876            ```python
1877            # Create a numeric score for accuracy
1878            langfuse.create_score(
1879                name="accuracy",
1880                value=0.92,
1881                trace_id="abcdef1234567890abcdef1234567890",
1882                data_type="NUMERIC",
1883                comment="High accuracy with minor irrelevant details"
1884            )
1885
1886            # Create a categorical score for sentiment
1887            langfuse.create_score(
1888                name="sentiment",
1889                value="positive",
1890                trace_id="abcdef1234567890abcdef1234567890",
1891                observation_id="abcdef1234567890",
1892                data_type="CATEGORICAL"
1893            )
1894            ```
1895        """
1896        if not self._tracing_enabled:
1897            return
1898
1899        score_id = score_id or self._create_observation_id()
1900
1901        try:
1902            new_body = ScoreBody(
1903                id=score_id,
1904                sessionId=session_id,
1905                datasetRunId=dataset_run_id,
1906                traceId=trace_id,
1907                observationId=observation_id,
1908                name=name,
1909                value=value,
1910                dataType=data_type,  # type: ignore
1911                comment=comment,
1912                configId=config_id,
1913                environment=self._environment,
1914                metadata=metadata,
1915            )
1916
1917            event = {
1918                "id": self.create_trace_id(),
1919                "type": "score-create",
1920                "timestamp": timestamp or _get_timestamp(),
1921                "body": new_body,
1922            }
1923
1924            if self._resources is not None:
1925                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1926                force_sample = (
1927                    not self._is_valid_trace_id(trace_id) if trace_id else True
1928                )
1929
1930                self._resources.add_score_task(
1931                    event,
1932                    force_sample=force_sample,
1933                )
1934
1935        except Exception as e:
1936            langfuse_logger.exception(
1937                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1938            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2001    def score_current_span(
2002        self,
2003        *,
2004        name: str,
2005        value: Union[float, str],
2006        score_id: Optional[str] = None,
2007        data_type: Optional[ScoreDataType] = None,
2008        comment: Optional[str] = None,
2009        config_id: Optional[str] = None,
2010        metadata: Optional[Any] = None,
2011    ) -> None:
2012        """Create a score for the current active span.
2013
2014        This method scores the currently active span in the context. It's a convenient
2015        way to score the current operation without needing to know its trace and span IDs.
2016
2017        Args:
2018            name: Name of the score (e.g., "relevance", "accuracy")
2019            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2020            score_id: Optional custom ID for the score (auto-generated if not provided)
2021            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2022            comment: Optional comment or explanation for the score
2023            config_id: Optional ID of a score config defined in Langfuse
2024            metadata: Optional metadata to be attached to the score
2025
2026        Example:
2027            ```python
2028            with langfuse.start_as_current_generation(name="answer-query") as generation:
2029                # Generate answer
2030                response = generate_answer(...)
2031                generation.update(output=response)
2032
2033                # Score the generation
2034                langfuse.score_current_span(
2035                    name="relevance",
2036                    value=0.85,
2037                    data_type="NUMERIC",
2038                    comment="Mostly relevant but contains some tangential information",
2039                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2040                )
2041            ```
2042        """
2043        current_span = self._get_current_otel_span()
2044
2045        if current_span is not None:
2046            trace_id = self._get_otel_trace_id(current_span)
2047            observation_id = self._get_otel_span_id(current_span)
2048
2049            langfuse_logger.info(
2050                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2051            )
2052
2053            self.create_score(
2054                trace_id=trace_id,
2055                observation_id=observation_id,
2056                name=name,
2057                value=cast(str, value),
2058                score_id=score_id,
2059                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2060                comment=comment,
2061                config_id=config_id,
2062                metadata=metadata,
2063            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2093    def score_current_trace(
2094        self,
2095        *,
2096        name: str,
2097        value: Union[float, str],
2098        score_id: Optional[str] = None,
2099        data_type: Optional[ScoreDataType] = None,
2100        comment: Optional[str] = None,
2101        config_id: Optional[str] = None,
2102        metadata: Optional[Any] = None,
2103    ) -> None:
2104        """Create a score for the current trace.
2105
2106        This method scores the trace of the currently active span. Unlike score_current_span,
2107        this method associates the score with the entire trace rather than a specific span.
2108        It's useful for scoring overall performance or quality of the entire operation.
2109
2110        Args:
2111            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2112            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2113            score_id: Optional custom ID for the score (auto-generated if not provided)
2114            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2115            comment: Optional comment or explanation for the score
2116            config_id: Optional ID of a score config defined in Langfuse
2117            metadata: Optional metadata to be attached to the score
2118
2119        Example:
2120            ```python
2121            with langfuse.start_as_current_observation(name="process-user-request") as span:
2122                # Process request
2123                result = process_complete_request()
2124                span.update(output=result)
2125
2126                # Score the overall trace
2127                langfuse.score_current_trace(
2128                    name="overall_quality",
2129                    value=0.95,
2130                    data_type="NUMERIC",
2131                    comment="High quality end-to-end response",
2132                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2133                )
2134            ```
2135        """
2136        current_span = self._get_current_otel_span()
2137
2138        if current_span is not None:
2139            trace_id = self._get_otel_trace_id(current_span)
2140
2141            langfuse_logger.info(
2142                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2143            )
2144
2145            self.create_score(
2146                trace_id=trace_id,
2147                name=name,
2148                value=cast(str, value),
2149                score_id=score_id,
2150                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2151                comment=comment,
2152                config_id=config_id,
2153                metadata=metadata,
2154            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2156    def flush(self) -> None:
2157        """Force flush all pending spans and events to the Langfuse API.
2158
2159        This method manually flushes any pending spans, scores, and other events to the
2160        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2161        before proceeding, without waiting for the automatic flush interval.
2162
2163        Example:
2164            ```python
2165            # Record some spans and scores
2166            with langfuse.start_as_current_observation(name="operation") as span:
2167                # Do work...
2168                pass
2169
2170            # Ensure all data is sent to Langfuse before proceeding
2171            langfuse.flush()
2172
2173            # Continue with other work
2174            ```
2175        """
2176        if self._resources is not None:
2177            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2179    def shutdown(self) -> None:
2180        """Shut down the Langfuse client and flush all pending data.
2181
2182        This method cleanly shuts down the Langfuse client, ensuring all pending data
2183        is flushed to the API and all background threads are properly terminated.
2184
2185        It's important to call this method when your application is shutting down to
2186        prevent data loss and resource leaks. For most applications, using the client
2187        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2188
2189        Example:
2190            ```python
2191            # Initialize Langfuse
2192            langfuse = Langfuse(public_key="...", secret_key="...")
2193
2194            # Use Langfuse throughout your application
2195            # ...
2196
2197            # When application is shutting down
2198            langfuse.shutdown()
2199            ```
2200        """
2201        if self._resources is not None:
2202            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2204    def get_current_trace_id(self) -> Optional[str]:
2205        """Get the trace ID of the current active span.
2206
2207        This method retrieves the trace ID from the currently active span in the context.
2208        It can be used to get the trace ID for referencing in logs, external systems,
2209        or for creating related operations.
2210
2211        Returns:
2212            The current trace ID as a 32-character lowercase hexadecimal string,
2213            or None if there is no active span.
2214
2215        Example:
2216            ```python
2217            with langfuse.start_as_current_observation(name="process-request") as span:
2218                # Get the current trace ID for reference
2219                trace_id = langfuse.get_current_trace_id()
2220
2221                # Use it for external correlation
2222                log.info(f"Processing request with trace_id: {trace_id}")
2223
2224                # Or pass to another system
2225                external_system.process(data, trace_id=trace_id)
2226            ```
2227        """
2228        if not self._tracing_enabled:
2229            langfuse_logger.debug(
2230                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2231            )
2232            return None
2233
2234        current_otel_span = self._get_current_otel_span()
2235
2236        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2238    def get_current_observation_id(self) -> Optional[str]:
2239        """Get the observation ID (span ID) of the current active span.
2240
2241        This method retrieves the observation ID from the currently active span in the context.
2242        It can be used to get the observation ID for referencing in logs, external systems,
2243        or for creating scores or other related operations.
2244
2245        Returns:
2246            The current observation ID as a 16-character lowercase hexadecimal string,
2247            or None if there is no active span.
2248
2249        Example:
2250            ```python
2251            with langfuse.start_as_current_observation(name="process-user-query") as span:
2252                # Get the current observation ID
2253                observation_id = langfuse.get_current_observation_id()
2254
2255                # Store it for later reference
2256                cache.set(f"query_{query_id}_observation", observation_id)
2257
2258                # Process the query...
2259            ```
2260        """
2261        if not self._tracing_enabled:
2262            langfuse_logger.debug(
2263                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2264            )
2265            return None
2266
2267        current_otel_span = self._get_current_otel_span()
2268
2269        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2282    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2283        """Get the URL to view a trace in the Langfuse UI.
2284
2285        This method generates a URL that links directly to a trace in the Langfuse UI.
2286        It's useful for providing links in logs, notifications, or debugging tools.
2287
2288        Args:
2289            trace_id: Optional trace ID to generate a URL for. If not provided,
2290                     the trace ID of the current active span will be used.
2291
2292        Returns:
2293            A URL string pointing to the trace in the Langfuse UI,
2294            or None if the project ID couldn't be retrieved or no trace ID is available.
2295
2296        Example:
2297            ```python
2298            # Get URL for the current trace
2299            with langfuse.start_as_current_observation(name="process-request") as span:
2300                trace_url = langfuse.get_trace_url()
2301                log.info(f"Processing trace: {trace_url}")
2302
2303            # Get URL for a specific trace
2304            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2305            send_notification(f"Review needed for trace: {specific_trace_url}")
2306            ```
2307        """
2308        final_trace_id = trace_id or self.get_current_trace_id()
2309        if not final_trace_id:
2310            return None
2311
2312        project_id = self._get_project_id()
2313
2314        return (
2315            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2316            if project_id and final_trace_id
2317            else None
2318        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2320    def get_dataset(
2321        self,
2322        name: str,
2323        *,
2324        fetch_items_page_size: Optional[int] = 50,
2325        version: Optional[datetime] = None,
2326    ) -> "DatasetClient":
2327        """Fetch a dataset by its name.
2328
2329        Args:
2330            name (str): The name of the dataset to fetch.
2331            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2332            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2333                If provided, returns the state of items at the specified UTC timestamp.
2334                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2335
2336        Returns:
2337            DatasetClient: The dataset with the given name.
2338        """
2339        try:
2340            langfuse_logger.debug(f"Getting datasets {name}")
2341            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2342
2343            dataset_items = []
2344            page = 1
2345
2346            while True:
2347                new_items = self.api.dataset_items.list(
2348                    dataset_name=self._url_encode(name, is_url_param=True),
2349                    page=page,
2350                    limit=fetch_items_page_size,
2351                    version=version,
2352                )
2353                dataset_items.extend(new_items.data)
2354
2355                if new_items.meta.total_pages <= page:
2356                    break
2357
2358                page += 1
2359
2360            return DatasetClient(
2361                dataset=dataset,
2362                items=dataset_items,
2363                version=version,
2364                langfuse_client=self,
2365            )
2366
2367        except Error as e:
2368            handle_fern_exception(e)
2369            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name.

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2371    def get_dataset_run(
2372        self, *, dataset_name: str, run_name: str
2373    ) -> DatasetRunWithItems:
2374        """Fetch a dataset run by dataset name and run name.
2375
2376        Args:
2377            dataset_name (str): The name of the dataset.
2378            run_name (str): The name of the run.
2379
2380        Returns:
2381            DatasetRunWithItems: The dataset run with its items.
2382        """
2383        try:
2384            return cast(
2385                DatasetRunWithItems,
2386                self.api.datasets.get_run(
2387                    dataset_name=self._url_encode(dataset_name),
2388                    run_name=self._url_encode(run_name),
2389                    request_options=None,
2390                ),
2391            )
2392        except Error as e:
2393            handle_fern_exception(e)
2394            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2396    def get_dataset_runs(
2397        self,
2398        *,
2399        dataset_name: str,
2400        page: Optional[int] = None,
2401        limit: Optional[int] = None,
2402    ) -> PaginatedDatasetRuns:
2403        """Fetch all runs for a dataset.
2404
2405        Args:
2406            dataset_name (str): The name of the dataset.
2407            page (Optional[int]): Page number, starts at 1.
2408            limit (Optional[int]): Limit of items per page.
2409
2410        Returns:
2411            PaginatedDatasetRuns: Paginated list of dataset runs.
2412        """
2413        try:
2414            return cast(
2415                PaginatedDatasetRuns,
2416                self.api.datasets.get_runs(
2417                    dataset_name=self._url_encode(dataset_name),
2418                    page=page,
2419                    limit=limit,
2420                    request_options=None,
2421                ),
2422            )
2423        except Error as e:
2424            handle_fern_exception(e)
2425            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2427    def delete_dataset_run(
2428        self, *, dataset_name: str, run_name: str
2429    ) -> DeleteDatasetRunResponse:
2430        """Delete a dataset run and all its run items. This action is irreversible.
2431
2432        Args:
2433            dataset_name (str): The name of the dataset.
2434            run_name (str): The name of the run.
2435
2436        Returns:
2437            DeleteDatasetRunResponse: Confirmation of deletion.
2438        """
2439        try:
2440            return cast(
2441                DeleteDatasetRunResponse,
2442                self.api.datasets.delete_run(
2443                    dataset_name=self._url_encode(dataset_name),
2444                    run_name=self._url_encode(run_name),
2445                    request_options=None,
2446                ),
2447            )
2448        except Error as e:
2449            handle_fern_exception(e)
2450            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2452    def run_experiment(
2453        self,
2454        *,
2455        name: str,
2456        run_name: Optional[str] = None,
2457        description: Optional[str] = None,
2458        data: ExperimentData,
2459        task: TaskFunction,
2460        evaluators: List[EvaluatorFunction] = [],
2461        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2462        run_evaluators: List[RunEvaluatorFunction] = [],
2463        max_concurrency: int = 50,
2464        metadata: Optional[Dict[str, str]] = None,
2465        _dataset_version: Optional[datetime] = None,
2466    ) -> ExperimentResult:
2467        """Run an experiment on a dataset with automatic tracing and evaluation.
2468
2469        This method executes a task function on each item in the provided dataset,
2470        automatically traces all executions with Langfuse for observability, runs
2471        item-level and run-level evaluators on the outputs, and returns comprehensive
2472        results with evaluation metrics.
2473
2474        The experiment system provides:
2475        - Automatic tracing of all task executions
2476        - Concurrent processing with configurable limits
2477        - Comprehensive error handling that isolates failures
2478        - Integration with Langfuse datasets for experiment tracking
2479        - Flexible evaluation framework supporting both sync and async evaluators
2480
2481        Args:
2482            name: Human-readable name for the experiment. Used for identification
2483                in the Langfuse UI.
2484            run_name: Optional exact name for the experiment run. If provided, this will be
2485                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2486                If not provided, this will default to the experiment name appended with an ISO timestamp.
2487            description: Optional description explaining the experiment's purpose,
2488                methodology, or expected outcomes.
2489            data: Array of data items to process. Can be either:
2490                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2491                - List of Langfuse DatasetItem objects from dataset.items
2492            task: Function that processes each data item and returns output.
2493                Must accept 'item' as keyword argument and can return sync or async results.
2494                The task function signature should be: task(*, item, **kwargs) -> Any
2495            evaluators: List of functions to evaluate each item's output individually.
2496                Each evaluator receives input, output, expected_output, and metadata.
2497                Can return single Evaluation dict or list of Evaluation dicts.
2498            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2499                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2500                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2501                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2502            run_evaluators: List of functions to evaluate the entire experiment run.
2503                Each run evaluator receives all item_results and can compute aggregate metrics.
2504                Useful for calculating averages, distributions, or cross-item comparisons.
2505            max_concurrency: Maximum number of concurrent task executions (default: 50).
2506                Controls the number of items processed simultaneously. Adjust based on
2507                API rate limits and system resources.
2508            metadata: Optional metadata dictionary to attach to all experiment traces.
2509                This metadata will be included in every trace created during the experiment.
2510                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2511
2512        Returns:
2513            ExperimentResult containing:
2514            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2515            - item_results: List of results for each processed item with outputs and evaluations
2516            - run_evaluations: List of aggregate evaluation results for the entire run
2517            - experiment_id: Stable identifier for the experiment run across all items
2518            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2519            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2520
2521        Raises:
2522            ValueError: If required parameters are missing or invalid
2523            Exception: If experiment setup fails (individual item failures are handled gracefully)
2524
2525        Examples:
2526            Basic experiment with local data:
2527            ```python
2528            def summarize_text(*, item, **kwargs):
2529                return f"Summary: {item['input'][:50]}..."
2530
2531            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2532                return {
2533                    "name": "output_length",
2534                    "value": len(output),
2535                    "comment": f"Output contains {len(output)} characters"
2536                }
2537
2538            result = langfuse.run_experiment(
2539                name="Text Summarization Test",
2540                description="Evaluate summarization quality and length",
2541                data=[
2542                    {"input": "Long article text...", "expected_output": "Expected summary"},
2543                    {"input": "Another article...", "expected_output": "Another summary"}
2544                ],
2545                task=summarize_text,
2546                evaluators=[length_evaluator]
2547            )
2548
2549            print(f"Processed {len(result.item_results)} items")
2550            for item_result in result.item_results:
2551                print(f"Input: {item_result.item['input']}")
2552                print(f"Output: {item_result.output}")
2553                print(f"Evaluations: {item_result.evaluations}")
2554            ```
2555
2556            Advanced experiment with async task and multiple evaluators:
2557            ```python
2558            async def llm_task(*, item, **kwargs):
2559                # Simulate async LLM call
2560                response = await openai_client.chat.completions.create(
2561                    model="gpt-4",
2562                    messages=[{"role": "user", "content": item["input"]}]
2563                )
2564                return response.choices[0].message.content
2565
2566            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2567                if expected_output and expected_output.lower() in output.lower():
2568                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2569                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2570
2571            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2572                # Simulate toxicity check
2573                toxicity_score = check_toxicity(output)  # Your toxicity checker
2574                return {
2575                    "name": "toxicity",
2576                    "value": toxicity_score,
2577                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2578                }
2579
2580            def average_accuracy(*, item_results, **kwargs):
2581                accuracies = [
2582                    eval.value for result in item_results
2583                    for eval in result.evaluations
2584                    if eval.name == "accuracy"
2585                ]
2586                return {
2587                    "name": "average_accuracy",
2588                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2589                    "comment": f"Average accuracy across {len(accuracies)} items"
2590                }
2591
2592            result = langfuse.run_experiment(
2593                name="LLM Safety and Accuracy Test",
2594                description="Evaluate model accuracy and safety across diverse prompts",
2595                data=test_dataset,  # Your dataset items
2596                task=llm_task,
2597                evaluators=[accuracy_evaluator, toxicity_evaluator],
2598                run_evaluators=[average_accuracy],
2599                max_concurrency=5,  # Limit concurrent API calls
2600                metadata={"model": "gpt-4", "temperature": 0.7}
2601            )
2602            ```
2603
2604            Using with Langfuse datasets:
2605            ```python
2606            # Get dataset from Langfuse
2607            dataset = langfuse.get_dataset("my-eval-dataset")
2608
2609            result = dataset.run_experiment(
2610                name="Production Model Evaluation",
2611                description="Monthly evaluation of production model performance",
2612                task=my_production_task,
2613                evaluators=[accuracy_evaluator, latency_evaluator]
2614            )
2615
2616            # Results automatically linked to dataset in Langfuse UI
2617            print(f"View results: {result['dataset_run_url']}")
2618            ```
2619
2620        Note:
2621            - Task and evaluator functions can be either synchronous or asynchronous
2622            - Individual item failures are logged but don't stop the experiment
2623            - All executions are automatically traced and visible in Langfuse UI
2624            - When using Langfuse datasets, results are automatically linked for easy comparison
2625            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2626            - Async execution is handled automatically with smart event loop detection
2627        """
2628        return cast(
2629            ExperimentResult,
2630            run_async_safely(
2631                self._run_experiment_async(
2632                    name=name,
2633                    run_name=self._create_experiment_run_name(
2634                        name=name, run_name=run_name
2635                    ),
2636                    description=description,
2637                    data=data,
2638                    task=task,
2639                    evaluators=evaluators or [],
2640                    composite_evaluator=composite_evaluator,
2641                    run_evaluators=run_evaluators or [],
2642                    max_concurrency=max_concurrency,
2643                    metadata=metadata,
2644                    dataset_version=_dataset_version,
2645                ),
2646            ),
2647        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • experiment_id: Stable identifier for the experiment run across all items
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
3009    def run_batched_evaluation(
3010        self,
3011        *,
3012        scope: Literal["traces", "observations"],
3013        mapper: MapperFunction,
3014        filter: Optional[str] = None,
3015        fetch_batch_size: int = 50,
3016        fetch_trace_fields: Optional[str] = None,
3017        max_items: Optional[int] = None,
3018        max_retries: int = 3,
3019        evaluators: List[EvaluatorFunction],
3020        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3021        max_concurrency: int = 5,
3022        metadata: Optional[Dict[str, Any]] = None,
3023        _add_observation_scores_to_trace: bool = False,
3024        _additional_trace_tags: Optional[List[str]] = None,
3025        resume_from: Optional[BatchEvaluationResumeToken] = None,
3026        verbose: bool = False,
3027    ) -> BatchEvaluationResult:
3028        """Fetch traces or observations and run evaluations on each item.
3029
3030        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3031        It fetches items based on filters, transforms them using a mapper function, runs
3032        evaluators on each item, and creates scores that are linked back to the original
3033        entities. This is ideal for:
3034
3035        - Running evaluations on production traces after deployment
3036        - Backtesting new evaluation metrics on historical data
3037        - Batch scoring of observations for quality monitoring
3038        - Periodic evaluation runs on recent data
3039
3040        The method uses a streaming/pipeline approach to process items in batches, making
3041        it memory-efficient for large datasets. It includes comprehensive error handling,
3042        retry logic, and resume capability for long-running evaluations.
3043
3044        Args:
3045            scope: The type of items to evaluate. Must be one of:
3046                - "traces": Evaluate complete traces with all their observations
3047                - "observations": Evaluate individual observations (spans, generations, events)
3048            mapper: Function that transforms API response objects into evaluator inputs.
3049                Receives a trace/observation object and returns an EvaluatorInputs
3050                instance with input, output, expected_output, and metadata fields.
3051                Can be sync or async.
3052            evaluators: List of evaluation functions to run on each item. Each evaluator
3053                receives the mapped inputs and returns Evaluation object(s). Evaluator
3054                failures are logged but don't stop the batch evaluation.
3055            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3056                - '{"tags": ["production"]}'
3057                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3058                Default: None (fetches all items).
3059            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3060                Larger values may be faster but use more memory. Default: 50.
3061            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3062            max_items: Maximum total number of items to process. If None, processes all
3063                items matching the filter. Useful for testing or limiting evaluation runs.
3064                Default: None (process all).
3065            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3066                parallelism and resource usage. Default: 5.
3067            composite_evaluator: Optional function that creates a composite score from
3068                item-level evaluations. Receives the original item and its evaluations,
3069                returns a single Evaluation. Useful for weighted averages or combined metrics.
3070                Default: None.
3071            metadata: Optional metadata dict to add to all created scores. Useful for
3072                tracking evaluation runs, versions, or other context. Default: None.
3073            max_retries: Maximum number of retry attempts for failed batch fetches.
3074                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3075            verbose: If True, logs progress information to console. Useful for monitoring
3076                long-running evaluations. Default: False.
3077            resume_from: Optional resume token from a previous incomplete run. Allows
3078                continuing evaluation after interruption or failure. Default: None.
3079
3080
3081        Returns:
3082            BatchEvaluationResult containing:
3083                - total_items_fetched: Number of items fetched from API
3084                - total_items_processed: Number of items successfully evaluated
3085                - total_items_failed: Number of items that failed evaluation
3086                - total_scores_created: Scores created by item-level evaluators
3087                - total_composite_scores_created: Scores created by composite evaluator
3088                - total_evaluations_failed: Individual evaluator failures
3089                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3090                - resume_token: Token for resuming if incomplete (None if completed)
3091                - completed: True if all items processed
3092                - duration_seconds: Total execution time
3093                - failed_item_ids: IDs of items that failed
3094                - error_summary: Error types and counts
3095                - has_more_items: True if max_items reached but more exist
3096
3097        Raises:
3098            ValueError: If invalid scope is provided.
3099
3100        Examples:
3101            Basic trace evaluation:
3102            ```python
3103            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3104
3105            client = Langfuse()
3106
3107            # Define mapper to extract fields from traces
3108            def trace_mapper(trace):
3109                return EvaluatorInputs(
3110                    input=trace.input,
3111                    output=trace.output,
3112                    expected_output=None,
3113                    metadata={"trace_id": trace.id}
3114                )
3115
3116            # Define evaluator
3117            def length_evaluator(*, input, output, expected_output, metadata):
3118                return Evaluation(
3119                    name="output_length",
3120                    value=len(output) if output else 0
3121                )
3122
3123            # Run batch evaluation
3124            result = client.run_batched_evaluation(
3125                scope="traces",
3126                mapper=trace_mapper,
3127                evaluators=[length_evaluator],
3128                filter='{"tags": ["production"]}',
3129                max_items=1000,
3130                verbose=True
3131            )
3132
3133            print(f"Processed {result.total_items_processed} traces")
3134            print(f"Created {result.total_scores_created} scores")
3135            ```
3136
3137            Evaluation with composite scorer:
3138            ```python
3139            def accuracy_evaluator(*, input, output, expected_output, metadata):
3140                # ... evaluation logic
3141                return Evaluation(name="accuracy", value=0.85)
3142
3143            def relevance_evaluator(*, input, output, expected_output, metadata):
3144                # ... evaluation logic
3145                return Evaluation(name="relevance", value=0.92)
3146
3147            def composite_evaluator(*, item, evaluations):
3148                # Weighted average of evaluations
3149                weights = {"accuracy": 0.6, "relevance": 0.4}
3150                total = sum(
3151                    e.value * weights.get(e.name, 0)
3152                    for e in evaluations
3153                    if isinstance(e.value, (int, float))
3154                )
3155                return Evaluation(
3156                    name="composite_score",
3157                    value=total,
3158                    comment=f"Weighted average of {len(evaluations)} metrics"
3159                )
3160
3161            result = client.run_batched_evaluation(
3162                scope="traces",
3163                mapper=trace_mapper,
3164                evaluators=[accuracy_evaluator, relevance_evaluator],
3165                composite_evaluator=composite_evaluator,
3166                filter='{"user_id": "important_user"}',
3167                verbose=True
3168            )
3169            ```
3170
3171            Handling incomplete runs with resume:
3172            ```python
3173            # Initial run that may fail or timeout
3174            result = client.run_batched_evaluation(
3175                scope="observations",
3176                mapper=obs_mapper,
3177                evaluators=[my_evaluator],
3178                max_items=10000,
3179                verbose=True
3180            )
3181
3182            # Check if incomplete
3183            if not result.completed and result.resume_token:
3184                print(f"Processed {result.resume_token.items_processed} items before interruption")
3185
3186                # Resume from where it left off
3187                result = client.run_batched_evaluation(
3188                    scope="observations",
3189                    mapper=obs_mapper,
3190                    evaluators=[my_evaluator],
3191                    resume_from=result.resume_token,
3192                    verbose=True
3193                )
3194
3195            print(f"Total items processed: {result.total_items_processed}")
3196            ```
3197
3198            Monitoring evaluator performance:
3199            ```python
3200            result = client.run_batched_evaluation(...)
3201
3202            for stats in result.evaluator_stats:
3203                success_rate = stats.successful_runs / stats.total_runs
3204                print(f"{stats.name}:")
3205                print(f"  Success rate: {success_rate:.1%}")
3206                print(f"  Scores created: {stats.total_scores_created}")
3207
3208                if stats.failed_runs > 0:
3209                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3210            ```
3211
3212        Note:
3213            - Evaluator failures are logged but don't stop the batch evaluation
3214            - Individual item failures are tracked but don't stop processing
3215            - Fetch failures are retried with exponential backoff
3216            - All scores are automatically flushed to Langfuse at the end
3217            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3218        """
3219        runner = BatchEvaluationRunner(self)
3220
3221        return cast(
3222            BatchEvaluationResult,
3223            run_async_safely(
3224                runner.run_async(
3225                    scope=scope,
3226                    mapper=mapper,
3227                    evaluators=evaluators,
3228                    filter=filter,
3229                    fetch_batch_size=fetch_batch_size,
3230                    fetch_trace_fields=fetch_trace_fields,
3231                    max_items=max_items,
3232                    max_concurrency=max_concurrency,
3233                    composite_evaluator=composite_evaluator,
3234                    metadata=metadata,
3235                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3236                    _additional_trace_tags=_additional_trace_tags,
3237                    max_retries=max_retries,
3238                    verbose=verbose,
3239                    resume_from=resume_from,
3240                )
3241            ),
3242        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
    Default: None (fetches all items).
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:
  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3244    def auth_check(self) -> bool:
3245        """Check if the provided credentials (public and secret key) are valid.
3246
3247        Raises:
3248            Exception: If no projects were found for the provided credentials.
3249
3250        Note:
3251            This method is blocking. It is discouraged to use it in production code.
3252        """
3253        try:
3254            projects = self.api.projects.get()
3255            langfuse_logger.debug(
3256                f"Auth check successful, found {len(projects.data)} projects"
3257            )
3258            if len(projects.data) == 0:
3259                raise Exception(
3260                    "Auth check failed, no project found for the keys provided."
3261                )
3262            return True
3263
3264        except AttributeError as e:
3265            langfuse_logger.warning(
3266                f"Auth check failed: Client not properly initialized. Error: {e}"
3267            )
3268            return False
3269
3270        except Error as e:
3271            handle_fern_exception(e)
3272            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3274    def create_dataset(
3275        self,
3276        *,
3277        name: str,
3278        description: Optional[str] = None,
3279        metadata: Optional[Any] = None,
3280        input_schema: Optional[Any] = None,
3281        expected_output_schema: Optional[Any] = None,
3282    ) -> Dataset:
3283        """Create a dataset with the given name on Langfuse.
3284
3285        Args:
3286            name: Name of the dataset to create.
3287            description: Description of the dataset. Defaults to None.
3288            metadata: Additional metadata. Defaults to None.
3289            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3290            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3291
3292        Returns:
3293            Dataset: The created dataset as returned by the Langfuse API.
3294        """
3295        try:
3296            langfuse_logger.debug(f"Creating datasets {name}")
3297
3298            result = self.api.datasets.create(
3299                name=name,
3300                description=description,
3301                metadata=metadata,
3302                input_schema=input_schema,
3303                expected_output_schema=expected_output_schema,
3304            )
3305
3306            return cast(Dataset, result)
3307
3308        except Error as e:
3309            handle_fern_exception(e)
3310            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3312    def create_dataset_item(
3313        self,
3314        *,
3315        dataset_name: str,
3316        input: Optional[Any] = None,
3317        expected_output: Optional[Any] = None,
3318        metadata: Optional[Any] = None,
3319        source_trace_id: Optional[str] = None,
3320        source_observation_id: Optional[str] = None,
3321        status: Optional[DatasetStatus] = None,
3322        id: Optional[str] = None,
3323    ) -> DatasetItem:
3324        """Create a dataset item.
3325
3326        Upserts if an item with id already exists.
3327
3328        Args:
3329            dataset_name: Name of the dataset in which the dataset item should be created.
3330            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3331            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3332            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3333            source_trace_id: Id of the source trace. Defaults to None.
3334            source_observation_id: Id of the source observation. Defaults to None.
3335            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3336            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3337
3338        Returns:
3339            DatasetItem: The created dataset item as returned by the Langfuse API.
3340
3341        Example:
3342            ```python
3343            from langfuse import Langfuse
3344
3345            langfuse = Langfuse()
3346
3347            # Uploading items to the Langfuse dataset named "capital_cities"
3348            langfuse.create_dataset_item(
3349                dataset_name="capital_cities",
3350                input={"input": {"country": "Italy"}},
3351                expected_output={"expected_output": "Rome"},
3352                metadata={"foo": "bar"}
3353            )
3354            ```
3355        """
3356        try:
3357            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3358
3359            result = self.api.dataset_items.create(
3360                dataset_name=dataset_name,
3361                input=input,
3362                expected_output=expected_output,
3363                metadata=metadata,
3364                source_trace_id=source_trace_id,
3365                source_observation_id=source_observation_id,
3366                status=status,
3367                id=id,
3368            )
3369
3370            return cast(DatasetItem, result)
3371        except Error as e:
3372            handle_fern_exception(e)
3373            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3375    def resolve_media_references(
3376        self,
3377        *,
3378        obj: Any,
3379        resolve_with: Literal["base64_data_uri"],
3380        max_depth: int = 10,
3381        content_fetch_timeout_seconds: int = 5,
3382    ) -> Any:
3383        """Replace media reference strings in an object with base64 data URIs.
3384
3385        This method recursively traverses an object (up to max_depth) looking for media reference strings
3386        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3387        the provided Langfuse client and replaces the reference string with a base64 data URI.
3388
3389        If fetching media content fails for a reference string, a warning is logged and the reference
3390        string is left unchanged.
3391
3392        Args:
3393            obj: The object to process. Can be a primitive value, array, or nested object.
3394                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3395            resolve_with: The representation of the media content to replace the media reference string with.
3396                Currently only "base64_data_uri" is supported.
3397            max_depth: int: The maximum depth to traverse the object. Default is 10.
3398            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3399
3400        Returns:
3401            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3402            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3403
3404        Example:
3405            obj = {
3406                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3407                "nested": {
3408                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3409                }
3410            }
3411
3412            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3413
3414            # Result:
3415            # {
3416            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3417            #     "nested": {
3418            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3419            #     }
3420            # }
3421        """
3422        return LangfuseMedia.resolve_media_references(
3423            langfuse_client=self,
3424            obj=obj,
3425            resolve_with=resolve_with,
3426            max_depth=max_depth,
3427            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3428        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

# Synchronous, keyword-only call on the Langfuse client
result = langfuse.resolve_media_references(
    obj=obj,
    resolve_with="base64_data_uri",
)

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3458    def get_prompt(
3459        self,
3460        name: str,
3461        *,
3462        version: Optional[int] = None,
3463        label: Optional[str] = None,
3464        type: Literal["chat", "text"] = "text",
3465        cache_ttl_seconds: Optional[int] = None,
3466        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3467        max_retries: Optional[int] = None,
3468        fetch_timeout_seconds: Optional[int] = None,
3469    ) -> PromptClient:
3470        """Get a prompt.
3471
3472        This method first looks up the requested prompt in the local cache. On a cache miss (or when
3473        `cache_ttl_seconds` is 0) the prompt is fetched from the server and the cache is updated; if that fetch fails
3474        and a `fallback` was given, a fallback prompt client is returned instead. An expired cache entry is returned
3475        immediately while a refresh is scheduled in the background; an unexpired entry is returned as is.
3476
3477        Args:
3478            name (str): The name of the prompt to retrieve.
3479
3480        Keyword Args:
3481            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3482            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3483            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3484                keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3485            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3486            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3487            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3488            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3489
3490        Returns:
3491            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3492            - TextPromptClient, if type argument is 'text'.
3493            - ChatPromptClient, if type argument is 'chat'.
3494
3495        Raises:
3496            Exception: Error if the SDK is not initialized; ValueError if `name` is empty or both `version` and `label`
3497                are given; otherwise the fetch error for an uncached prompt, unless a `fallback` is provided.
3498        """
3499        if self._resources is None:
3500            raise Error(
3501                "SDK is not correctly initialized. Check the init logs for more details."
3502            )
3503        if version is not None and label is not None:
3504            raise ValueError("Cannot specify both version and label at the same time.")
3505
3506        if not name:
3507            raise ValueError("Prompt name cannot be empty.")
3508
3509        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3510        bounded_max_retries = self._get_bounded_max_retries(
3511            max_retries, default_max_retries=2, max_retries_upper_bound=4
3512        )
3513
3514        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3515        cached_prompt = self._resources.prompt_cache.get(cache_key)
3516
3517        if cached_prompt is None or cache_ttl_seconds == 0:  # cache miss, or caching disabled via a TTL of 0
3518            langfuse_logger.debug(
3519                f"Prompt '{cache_key}' not found in cache or caching disabled."
3520            )
3521            try:
3522                return self._fetch_prompt_and_update_cache(
3523                    name,
3524                    version=version,
3525                    label=label,
3526                    ttl_seconds=cache_ttl_seconds,
3527                    max_retries=bounded_max_retries,
3528                    fetch_timeout_seconds=fetch_timeout_seconds,
3529                )
3530            except Exception as e:
3531                if fallback:  # a falsy fallback (None, "" or []) counts as no fallback
3532                    langfuse_logger.warning(
3533                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3534                    )
3535
3536                    fallback_client_args: Dict[str, Any] = {
3537                        "name": name,
3538                        "prompt": fallback,
3539                        "type": type,
3540                        "version": version or 0,  # fallback clients report version 0 when none was requested
3541                        "config": {},
3542                        "labels": [label] if label else [],
3543                        "tags": [],
3544                    }
3545
3546                    if type == "text":
3547                        return TextPromptClient(
3548                            prompt=Prompt_Text(**fallback_client_args),
3549                            is_fallback=True,
3550                        )
3551
3552                    if type == "chat":
3553                        return ChatPromptClient(
3554                            prompt=Prompt_Chat(**fallback_client_args),
3555                            is_fallback=True,
3556                        )
3557
3558                raise e  # no fallback given (or type is neither "text" nor "chat"): surface the fetch error
3559
3560        if cached_prompt.is_expired():
3561            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3562            try:
3563                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3564                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3565
3566                def refresh_task() -> None:
3567                    self._fetch_prompt_and_update_cache(
3568                        name,
3569                        version=version,
3570                        label=label,
3571                        ttl_seconds=cache_ttl_seconds,
3572                        max_retries=bounded_max_retries,
3573                        fetch_timeout_seconds=fetch_timeout_seconds,
3574                    )
3575
3576                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3577                    cache_key,
3578                    cached_prompt,
3579                    refresh_task,
3580                )
3581                langfuse_logger.debug(
3582                    f"Returning stale prompt '{cache_key}' from cache."
3583                )
3584                # return stale prompt
3585                return cached_prompt.value
3586
3587            except Exception as e:
3588                langfuse_logger.warning(
3589                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3590                )
3591                # creation of refresh prompt task failed, return stale prompt
3592                return cached_prompt.value
3593
3594        return cached_prompt.value  # cache hit that has not expired

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
  • fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3696    def create_prompt(
3697        self,
3698        *,
3699        name: str,
3700        prompt: Union[
3701            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3702        ],
3703        labels: List[str] = [],
3704        tags: Optional[List[str]] = None,
3705        type: Optional[Literal["chat", "text"]] = "text",
3706        config: Optional[Any] = None,
3707        commit_message: Optional[str] = None,
3708    ) -> PromptClient:
3709        """Create a new prompt in Langfuse.
3710
3711        Keyword Args:
3712            name : The name of the prompt to be created.
3713            prompt : The content of the prompt to be created. Must be a list for type "chat" and a string otherwise.
3714            is_active [DEPRECATED] : Not accepted by the signature above (there is no such keyword parameter). Please use the 'production' label instead.
3715            labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
3716            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3717            config: Additional structured data to be saved with the prompt. Defaults to None, which is sent as an empty dict.
3718            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3719            commit_message: Optional string describing the change.
3720
3721        Returns:
3722            TextPromptClient: The prompt if type argument is 'text'.
3723            ChatPromptClient: The prompt if type argument is 'chat'.
3724        """
3725        try:
3726            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3727
3728            if type == "chat":
3729                if not isinstance(prompt, list):
3730                    raise ValueError(
3731                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3732                    )
3733                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3734                    CreateChatPromptRequest(
3735                        name=name,
3736                        prompt=cast(Any, prompt),  # typing-only cast; the list is passed through unchanged
3737                        labels=labels,
3738                        tags=tags,
3739                        config=config or {},
3740                        commit_message=commit_message,
3741                        type=CreateChatPromptType.CHAT,
3742                    )
3743                )
3744                server_prompt = self.api.prompts.create(request=request)
3745
3746                if self._resources is not None:
3747                    self._resources.prompt_cache.invalidate(name)  # runs only after the create call returned
3748
3749                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3750
3751            if not isinstance(prompt, str):  # every type other than "chat" (including None) is handled as text
3752                raise ValueError("For 'text' type, 'prompt' must be a string.")
3753
3754            request = CreateTextPromptRequest(
3755                name=name,
3756                prompt=prompt,
3757                labels=labels,
3758                tags=tags,
3759                config=config or {},
3760                commit_message=commit_message,
3761            )
3762
3763            server_prompt = self.api.prompts.create(request=request)
3764
3765            if self._resources is not None:
3766                self._resources.prompt_cache.invalidate(name)  # runs only after the create call returned
3767
3768            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3769
3770        except Error as e:
3771            handle_fern_exception(e)
3772            raise e  # re-raised after handle_fern_exception(e) has run

Create a new prompt in Langfuse.

Keyword Args:
  • name : The name of the prompt to be created.
  • prompt : The content of the prompt to be created.
  • is_active [DEPRECATED] : Not accepted by the current signature (there is no such keyword parameter). Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.
Returns:

TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3774    def update_prompt(
3775        self,
3776        *,
3777        name: str,
3778        version: int,
3779        new_labels: List[str] = [],
3780    ) -> Any:
3781        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3782
3783        Args:
3784            name (str): The name of the prompt to update.
3785            version (int): The version number of the prompt to update.
3786            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3787
3788        Returns:
3789            Prompt: The updated prompt from the Langfuse API.
3790
3791        """
3792        updated_prompt = self.api.prompt_version.update(
3793            name=self._url_encode(name),  # the API call gets the URL-encoded name; the cache below uses the raw name
3794            version=version,
3795            new_labels=new_labels,
3796        )
3797
3798        if self._resources is not None:
3799            self._resources.prompt_cache.invalidate(name)
3800
3801        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
3816    def clear_prompt_cache(self) -> None:
3817        """Clear the entire prompt cache, removing all cached prompts.
3818
3819        This method is useful when you want to force a complete refresh of all
3820        cached prompts, for example after major updates or when you need to
3821        ensure the latest versions are fetched from the server.
3822        """
3823        if self._resources is not None:  # silently does nothing when no resources are attached to this client
3824            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 64def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 65    """Get or create a Langfuse client instance.
 66
 67    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 68    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 69
 70    Behavior:
 71    - Single project: Returns existing client or creates new one
 72    - Multi-project: Requires public_key to return specific client
 73    - No public_key in multi-project: Returns disabled client to prevent data leakage
 74
 75    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 76
 77    Args:
 78        public_key (Optional[str]): Project identifier
 79            - With key: Returns client for that project
 80            - Without key: Returns single client or disabled client if multiple exist
 81
 82    Returns:
 83        Langfuse: Client instance in one of three states:
 84            1. Client for specified public_key
 85            2. Default client for single-project setup
 86            3. Disabled client when multiple projects exist without key, or when no client was initialized for the given key
 87
 88    Security:
 89        Disables tracing when multiple projects exist without explicit key to prevent
 90        cross-project data leakage. Multi-project setups are experimental.
 91
 92    Example:
 93        ```python
 94        # Single project
 95        client = get_client()  # Default client
 96
 97        # In multi-project usage:
 98        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 99        client_b = get_client(public_key="project_b_key")  # Returns project B's client
100
101        # Without specific key in multi-project setup:
102        client = get_client()  # Returns disabled client for safety
103        ```
104    """
105    with LangfuseResourceManager._lock:
106        active_instances = LangfuseResourceManager._instances
107
108        # If no explicit public_key provided, check execution context
109        if not public_key:
110            public_key = _current_public_key.get(None)  # may still be None when nothing is set in the context
111
112        if not public_key:
113            if len(active_instances) == 0:
114                # No clients initialized yet, create default instance
115                return Langfuse()
116
117            if len(active_instances) == 1:
118                # Only one client exists, safe to use without specifying key
119                instance = list(active_instances.values())[0]
120
121                # Initialize with the credentials bound to the instance
122                # This is important if the original instance was instantiated
123                # via constructor arguments
124                return _create_client_from_instance(instance)
125
126            else:
127                # Multiple clients exist but no key specified - disable tracing
128                # to prevent cross-project data leakage
129                langfuse_logger.warning(
130                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
131                )
132                return Langfuse(
133                    tracing_enabled=False, public_key="fake", secret_key="fake"
134                )
135
136        else:
137            # Specific key provided, look up existing instance
138            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
139                public_key, None
140            )
141
142            if target_instance is None:
143                # No instance found with this key - client not initialized properly
144                langfuse_logger.warning(
145                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
146                )
147                return Langfuse(
148                    tracing_enabled=False, public_key="fake", secret_key="fake"
149                )
150
151            # target_instance is guaranteed to be not None at this point
152            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states: (1) the client for the specified public_key; (2) the default client for a single-project setup; (3) a disabled client when multiple projects exist without a key, or when no client was initialized for the given key.

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
111                    Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set. Any other non-None value logs a warning and falls back to "span".
112            capture_input (Optional[bool]): Input-capture flag forwarded to the wrapper. If None, the environment variable named by LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED decides (enabled unless it is "false" or "0", case-insensitively).
113            capture_output (Optional[bool]): Output-capture flag forwarded to the wrapper. If None, it is resolved from the same environment variable as capture_input.
114            transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded unchanged to the sync/async wrapper; how it is applied is not visible in this method.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
173        function_io_capture_enabled = os.environ.get(  # process-wide default, read each time observe() is called
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (  # an explicit capture_input argument wins over the environment default
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (  # an explicit capture_output argument wins over the environment default
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 95def propagate_attributes(
 96    *,
 97    user_id: Optional[str] = None,
 98    session_id: Optional[str] = None,
 99    metadata: Optional[Dict[str, Any]] = None,
100    version: Optional[str] = None,
101    tags: Optional[List[str]] = None,
102    trace_name: Optional[str] = None,
103    as_baggage: bool = False,
104) -> _AgnosticContextManager[Any]:
105    """Propagate trace-level attributes to all spans created within this context.
106
107    This context manager sets attributes on the currently active span AND automatically
108    propagates them to all new child spans created within the context. This is the
109    recommended way to set trace-level attributes like user_id, session_id, and metadata
110    dimensions that should be consistently applied across all observations in a trace.
111
112    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
113    currently active span and spans created after entering this context will have these
114    attributes. Pre-existing spans will NOT be retroactively updated.
115
116    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
117    filtering by session_id) only include observations that have the attribute set.
118    If you call `propagate_attributes` late in your workflow, earlier spans won't be
119    included in aggregations for that attribute.
120
121    Args:
122        user_id: User identifier to associate with all spans in this context.
123            Must be US-ASCII string, ≤200 characters. Use this to track which user
124            generated each trace and enable e.g. per-user cost/performance analysis.
125        session_id: Session identifier to associate with all spans in this context.
126            Must be US-ASCII string, ≤200 characters. Use this to group related traces
127            within a user session (e.g., a conversation thread, multi-turn interaction).
128        metadata: Additional key-value metadata to propagate to all spans.
129            - Keys must be US-ASCII strings
130            - Values are coerced to strings
131            - Coerced values must be ≤200 characters
132            - Use for dimensions like internal correlating identifiers
133            - AVOID: large payloads or sensitive data
134        version: Version identfier for parts of your application that are independently versioned, e.g. agents
135        tags: List of tags to categorize the group of observations
136        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
137            Use this to set a consistent trace name for all spans created within this context.
138        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
139            cross-process/service propagation. **Security warning**: When enabled,
140            attribute values are added to HTTP headers on ALL outbound requests.
141            Only enable if values are safe to transmit via HTTP headers and you need
142            cross-service tracing. Default: False.
143
144    Returns:
145        Context manager that propagates attributes to all child spans.
146
147    Example:
148        Basic usage with user and session tracking:
149
150        ```python
151        from langfuse import Langfuse
152
153        langfuse = Langfuse()
154
155        # Set attributes early in the trace
156        with langfuse.start_as_current_observation(name="user_workflow") as span:
157            with langfuse.propagate_attributes(
158                user_id="user_123",
159                session_id="session_abc",
160                metadata={"experiment": "variant_a", "environment": "production"}
161            ):
162                # All spans created here will have user_id, session_id, and metadata
163                with langfuse.start_observation(name="llm_call") as llm_span:
164                    # This span inherits: user_id, session_id, experiment, environment
165                    ...
166
167                with langfuse.start_generation(name="completion") as gen:
168                    # This span also inherits all attributes
169                    ...
170        ```
171
172        Late propagation (anti-pattern):
173
174        ```python
175        with langfuse.start_as_current_observation(name="workflow") as span:
176            # These spans WON'T have user_id
177            early_span = langfuse.start_observation(name="early_work")
178            early_span.end()
179
180            # Set attributes in the middle
181            with langfuse.propagate_attributes(user_id="user_123"):
182                # Only spans created AFTER this point will have user_id
183                late_span = langfuse.start_observation(name="late_work")
184                late_span.end()
185
186            # Result: Aggregations by user_id will miss "early_work" span
187        ```
188
189        Cross-service propagation with baggage (advanced):
190
191        ```python
192        # Service A - originating service
193        with langfuse.start_as_current_observation(name="api_request"):
194            with langfuse.propagate_attributes(
195                user_id="user_123",
196                session_id="session_abc",
197                as_baggage=True  # Propagate via HTTP headers
198            ):
199                # Make HTTP request to Service B
200                response = requests.get("https://service-b.example.com/api")
201                # user_id and session_id are now in HTTP headers
202
203        # Service B - downstream service
204        # OpenTelemetry will automatically extract baggage from HTTP headers
205        # and propagate to spans in Service B
206        ```
207
208    Note:
209        - **Validation**: Attribute values (user_id, session_id, version, tags,
210          trace_name) must be strings ≤200 characters. Metadata values are
211          coerced to strings before the 200 character limit is applied. Invalid
212          values will be dropped with a warning logged.
213        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
214          making it compatible with other OTel-instrumented libraries.
215
216    Raises:
217        No exceptions are raised. Invalid values are logged as warnings and dropped.
218    """
219    return _propagate_attributes(
220        user_id=user_id,
221        session_id=session_id,
222        metadata=metadata,
223        version=version,
224        tags=tags,
225        trace_name=trace_name,
226        as_baggage=as_baggage,
227    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys must be US-ASCII strings
    • Values are coerced to strings
    • Coerced values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads or sensitive data
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseSpan(LangfuseObservationWrapper):
    """General-purpose Langfuse observation of type "span".

    Use this wrapper to trace any operation that has no more specific
    observation type. It builds on LangfuseObservationWrapper and fixes the
    observation type to "span". If possible, use a more specific type for
    better observability and insights.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Wrap an OpenTelemetry span as a Langfuse span observation.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the span (any JSON-serializable object)
            output: Output data from the span (any JSON-serializable object)
            metadata: Additional metadata to associate with the span
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the span
                ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
            status_message: Optional status message for the span
        """
        # Everything is forwarded unchanged; only the observation type is fixed here.
        super().__init__(
            as_type="span",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
            environment=environment,
            release=release,
            version=version,
        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1261    def __init__(
1262        self,
1263        *,
1264        otel_span: otel_trace_api.Span,
1265        langfuse_client: "Langfuse",
1266        input: Optional[Any] = None,
1267        output: Optional[Any] = None,
1268        metadata: Optional[Any] = None,
1269        environment: Optional[str] = None,
1270        release: Optional[str] = None,
1271        version: Optional[str] = None,
1272        level: Optional[SpanLevel] = None,
1273        status_message: Optional[str] = None,
1274    ):
1275        """Initialize a new LangfuseSpan.
1276
1277        Args:
1278            otel_span: The OpenTelemetry span to wrap
1279            langfuse_client: Reference to the parent Langfuse client
1280            input: Input data for the span (any JSON-serializable object)
1281            output: Output data from the span (any JSON-serializable object)
1282            metadata: Additional metadata to associate with the span
1283            environment: The tracing environment
1284            release: Release identifier for the application
1285            version: Version identifier for the code or component
1286            level: Importance level of the span (info, warning, error)
1287            status_message: Optional status message for the span
1288        """
1289        super().__init__(
1290            otel_span=otel_span,
1291            as_type="span",
1292            langfuse_client=langfuse_client,
1293            input=input,
1294            output=output,
1295            metadata=metadata,
1296            environment=environment,
1297            release=release,
1298            version=version,
1299            level=level,
1300            status_message=status_message,
1301        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseGeneration(LangfuseObservationWrapper):
    """Langfuse observation of type "generation" for AI/LLM model calls.

    On top of the fields shared by all observations, a generation accepts
    model-specific details: the model name and parameters, token usage, cost
    information, the completion start time, and the associated prompt.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ):
        """Wrap an OpenTelemetry span as a Langfuse generation observation.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the generation (e.g., prompts)
            output: Output from the generation (e.g., completions)
            metadata: Additional metadata to associate with the generation
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the model or component
            level: Importance level of the generation
                ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
            status_message: Optional status message for the generation
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model
                (e.g., temperature, max_tokens)
            usage_details: Token usage information
                (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management
        """
        # Forward every field unchanged; only the observation type is fixed here.
        super().__init__(
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            as_type="generation",
            # Fields common to every observation type.
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
            environment=environment,
            release=release,
            version=version,
            # Generation-specific fields.
            model=model,
            model_parameters=model_parameters,
            prompt=prompt,
            completion_start_time=completion_start_time,
            usage_details=usage_details,
            cost_details=cost_details,
        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1312    def __init__(
1313        self,
1314        *,
1315        otel_span: otel_trace_api.Span,
1316        langfuse_client: "Langfuse",
1317        input: Optional[Any] = None,
1318        output: Optional[Any] = None,
1319        metadata: Optional[Any] = None,
1320        environment: Optional[str] = None,
1321        release: Optional[str] = None,
1322        version: Optional[str] = None,
1323        level: Optional[SpanLevel] = None,
1324        status_message: Optional[str] = None,
1325        completion_start_time: Optional[datetime] = None,
1326        model: Optional[str] = None,
1327        model_parameters: Optional[Dict[str, MapValue]] = None,
1328        usage_details: Optional[Dict[str, int]] = None,
1329        cost_details: Optional[Dict[str, float]] = None,
1330        prompt: Optional[PromptClient] = None,
1331    ):
1332        """Initialize a new LangfuseGeneration span.
1333
1334        Args:
1335            otel_span: The OpenTelemetry span to wrap
1336            langfuse_client: Reference to the parent Langfuse client
1337            input: Input data for the generation (e.g., prompts)
1338            output: Output from the generation (e.g., completions)
1339            metadata: Additional metadata to associate with the generation
1340            environment: The tracing environment
1341            release: Release identifier for the application
1342            version: Version identifier for the model or component
1343            level: Importance level of the generation (info, warning, error)
1344            status_message: Optional status message for the generation
1345            completion_start_time: When the model started generating the response
1346            model: Name/identifier of the AI model used (e.g., "gpt-4")
1347            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1348            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1349            cost_details: Cost information for the model call
1350            prompt: Associated prompt template from Langfuse prompt management
1351        """
1352        super().__init__(
1353            as_type="generation",
1354            otel_span=otel_span,
1355            langfuse_client=langfuse_client,
1356            input=input,
1357            output=output,
1358            metadata=metadata,
1359            environment=environment,
1360            release=release,
1361            version=version,
1362            level=level,
1363            status_message=status_message,
1364            completion_start_time=completion_start_time,
1365            model=model,
1366            model_parameters=model_parameters,
1367            usage_details=usage_details,
1368            cost_details=cost_details,
1369            prompt=prompt,
1370        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseEvent(LangfuseObservationWrapper):
    """Langfuse observation of type "event".

    Events cannot be changed after creation: `update` accepts the usual
    arguments but only logs a warning and returns the instance unchanged.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Wrap an OpenTelemetry span as a Langfuse event observation.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the event
            output: Output from the event
            metadata: Additional metadata to associate with the event
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the event
                ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
            status_message: Optional status message for the event
        """
        # Everything is forwarded unchanged; only the observation type is fixed here.
        super().__init__(
            as_type="event",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
            environment=environment,
            release=release,
            version=version,
        )

    def update(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        **kwargs: Any,
    ) -> "LangfuseEvent":
        """Reject the update: events cannot be modified after creation.

        Every argument is accepted and ignored. The method only logs a warning
        and leaves the event untouched.

        Returns:
            self: The unchanged LangfuseEvent instance
        """
        rejection_notice = (
            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
        )
        langfuse_logger.warning(rejection_notice)
        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1376    def __init__(
1377        self,
1378        *,
1379        otel_span: otel_trace_api.Span,
1380        langfuse_client: "Langfuse",
1381        input: Optional[Any] = None,
1382        output: Optional[Any] = None,
1383        metadata: Optional[Any] = None,
1384        environment: Optional[str] = None,
1385        release: Optional[str] = None,
1386        version: Optional[str] = None,
1387        level: Optional[SpanLevel] = None,
1388        status_message: Optional[str] = None,
1389    ):
1390        """Initialize a new LangfuseEvent span.
1391
1392        Args:
1393            otel_span: The OpenTelemetry span to wrap
1394            langfuse_client: Reference to the parent Langfuse client
1395            input: Input data for the event
1396            output: Output from the event
1397            metadata: Additional metadata to associate with the generation
1398            environment: The tracing environment
1399            release: Release identifier for the application
1400            version: Version identifier for the model or component
1401            level: Importance level of the generation (info, warning, error)
1402            status_message: Optional status message for the generation
1403        """
1404        super().__init__(
1405            otel_span=otel_span,
1406            as_type="event",
1407            langfuse_client=langfuse_client,
1408            input=input,
1409            output=output,
1410            metadata=metadata,
1411            environment=environment,
1412            release=release,
1413            version=version,
1414            level=level,
1415            status_message=status_message,
1416        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1418    def update(
1419        self,
1420        *,
1421        name: Optional[str] = None,
1422        input: Optional[Any] = None,
1423        output: Optional[Any] = None,
1424        metadata: Optional[Any] = None,
1425        version: Optional[str] = None,
1426        level: Optional[SpanLevel] = None,
1427        status_message: Optional[str] = None,
1428        completion_start_time: Optional[datetime] = None,
1429        model: Optional[str] = None,
1430        model_parameters: Optional[Dict[str, MapValue]] = None,
1431        usage_details: Optional[Dict[str, int]] = None,
1432        cost_details: Optional[Dict[str, float]] = None,
1433        prompt: Optional[PromptClient] = None,
1434        **kwargs: Any,
1435    ) -> "LangfuseEvent":
1436        """Update is not allowed for LangfuseEvent because events cannot be updated.
1437
1438        This method logs a warning and returns self without making changes.
1439
1440        Returns:
1441            self: Returns the unchanged LangfuseEvent instance
1442        """
1443        langfuse_logger.warning(
1444            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1445        )
1446        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Langfuse-Trace attributes
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Langfuse-observation attributes
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Langfuse-observation of type Generation attributes
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal
62    AS_ROOT = "langfuse.internal.as_root"
63    IS_APP_ROOT = "langfuse.internal.is_app_root"
64
65    # Experiments
66    EXPERIMENT_ID = "langfuse.experiment.id"
67    EXPERIMENT_NAME = "langfuse.experiment.name"
68    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
69    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
70    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
71    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
72    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
73    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
74    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
IS_APP_ROOT = 'langfuse.internal.is_app_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1449class LangfuseAgent(LangfuseObservationWrapper):
1450    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1451
1452    def __init__(self, **kwargs: Any) -> None:
1453        """Initialize a new LangfuseAgent span."""
1454        kwargs["as_type"] = "agent"
1455        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1452    def __init__(self, **kwargs: Any) -> None:
1453        """Initialize a new LangfuseAgent span."""
1454        kwargs["as_type"] = "agent"
1455        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1458class LangfuseTool(LangfuseObservationWrapper):
1459    """Tool observation representing external tool calls, e.g., calling a weather API."""
1460
1461    def __init__(self, **kwargs: Any) -> None:
1462        """Initialize a new LangfuseTool span."""
1463        kwargs["as_type"] = "tool"
1464        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1461    def __init__(self, **kwargs: Any) -> None:
1462        """Initialize a new LangfuseTool span."""
1463        kwargs["as_type"] = "tool"
1464        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1467class LangfuseChain(LangfuseObservationWrapper):
1468    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1469
1470    def __init__(self, **kwargs: Any) -> None:
1471        """Initialize a new LangfuseChain span."""
1472        kwargs["as_type"] = "chain"
1473        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1470    def __init__(self, **kwargs: Any) -> None:
1471        """Initialize a new LangfuseChain span."""
1472        kwargs["as_type"] = "chain"
1473        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1485class LangfuseEmbedding(LangfuseObservationWrapper):
1486    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1487
1488    def __init__(self, **kwargs: Any) -> None:
1489        """Initialize a new LangfuseEmbedding span."""
1490        kwargs["as_type"] = "embedding"
1491        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1488    def __init__(self, **kwargs: Any) -> None:
1489        """Initialize a new LangfuseEmbedding span."""
1490        kwargs["as_type"] = "embedding"
1491        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1494class LangfuseEvaluator(LangfuseObservationWrapper):
1495    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1496
1497    def __init__(self, **kwargs: Any) -> None:
1498        """Initialize a new LangfuseEvaluator span."""
1499        kwargs["as_type"] = "evaluator"
1500        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1497    def __init__(self, **kwargs: Any) -> None:
1498        """Initialize a new LangfuseEvaluator span."""
1499        kwargs["as_type"] = "evaluator"
1500        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1476class LangfuseRetriever(LangfuseObservationWrapper):
1477    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1478
1479    def __init__(self, **kwargs: Any) -> None:
1480        """Initialize a new LangfuseRetriever span."""
1481        kwargs["as_type"] = "retriever"
1482        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1479    def __init__(self, **kwargs: Any) -> None:
1480        """Initialize a new LangfuseRetriever span."""
1481        kwargs["as_type"] = "retriever"
1482        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1503class LangfuseGuardrail(LangfuseObservationWrapper):
1504    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1505
1506    def __init__(self, **kwargs: Any) -> None:
1507        """Initialize a new LangfuseGuardrail span."""
1508        kwargs["as_type"] = "guardrail"
1509        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1506    def __init__(self, **kwargs: Any) -> None:
1507        """Initialize a new LangfuseGuardrail span."""
1508        kwargs["as_type"] = "guardrail"
1509        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
101class Evaluation:
102    """Represents an evaluation result for an experiment item or an entire experiment run.
103
104    This class provides a strongly-typed way to create evaluation results in evaluator functions.
105    Users must use keyword arguments when instantiating this class.
106
107    Attributes:
108        name: Unique identifier for the evaluation metric. Should be descriptive
109            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
110            Used for aggregation and comparison across experiment runs.
111        value: The evaluation score or result. Can be:
112            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
113            - String: For categorical results like "positive", "negative", "neutral"
114            - Boolean: For binary assessments like "passes_safety_check"
115        comment: Optional human-readable explanation of the evaluation result.
116            Useful for providing context, explaining scoring rationale, or noting
117            special conditions. Displayed in Langfuse UI for interpretability.
118        metadata: Optional structured metadata about the evaluation process.
119            Can include confidence scores, intermediate calculations, model versions,
120            or any other relevant technical details.
121        data_type: Optional score data type. Required if value is not NUMERIC.
122            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
123        config_id: Optional Langfuse score config ID.
124
125    Examples:
126        Basic accuracy evaluation:
127        ```python
128        from langfuse import Evaluation
129
130        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
131            if not expected_output:
132                return Evaluation(name="accuracy", value=0, comment="No expected output")
133
134            is_correct = output.strip().lower() == expected_output.strip().lower()
135            return Evaluation(
136                name="accuracy",
137                value=1.0 if is_correct else 0.0,
138                comment="Correct answer" if is_correct else "Incorrect answer"
139            )
140        ```
141
142        Multi-metric evaluator:
143        ```python
144        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
145            return [
146                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
147                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
148                Evaluation(
149                    name="quality",
150                    value=0.85,
151                    comment="High quality response",
152                    metadata={"confidence": 0.92, "model": "gpt-4"}
153                )
154            ]
155        ```
156
157        Categorical evaluation:
158        ```python
159        def sentiment_evaluator(*, input, output, **kwargs):
160            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
161            return Evaluation(
162                name="sentiment",
163                value=sentiment,
164                comment=f"Response expresses {sentiment} sentiment",
165                data_type="CATEGORICAL"
166            )
167        ```
168
169        Failed evaluation with error handling:
170        ```python
171        def external_api_evaluator(*, input, output, **kwargs):
172            try:
173                score = external_api.evaluate(output)
174                return Evaluation(name="external_score", value=score)
175            except Exception as e:
176                return Evaluation(
177                    name="external_score",
178                    value=0,
179                    comment=f"API unavailable: {e}",
180                    metadata={"error": str(e), "retry_count": 3}
181                )
182        ```
183
184    Note:
185        All arguments must be passed as keywords. Positional arguments are not allowed
186        to ensure code clarity and prevent errors from argument reordering.
187    """
188
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions must:
132    - Accept a single item parameter (trace, observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept a single keyword argument, item (a trace or an observation)
  • Must return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items created after the initial run
545        but before the timestamp will be skipped. This is intentional to avoid
546        duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that fails partway through
import json

result = None
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

# Save the resume token (only available if the call returned a result)
if result is not None and result.resume_token:
    # Store resume token for later (e.g., in a file or database)
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. Because resumption filters by timestamp, an item that is added after the initial run but whose timestamp is earlier than last_processed_timestamp will be skipped when the run is resumed. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    This class encapsulates comprehensive statistics and metadata about a batch
    evaluation run, including counts, evaluator-specific metrics, timing information,
    error details, and resume capability.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        if result.total_items_fetched > 0:  # avoid dividing by zero on empty runs
            print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
        ```

        Detailed analysis with evaluator stats:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"\n📊 Batch Evaluation Results")
        print(f"{'='*50}")
        print(f"Items processed: {result.total_items_processed}")
        print(f"Items failed: {result.total_items_failed}")
        print(f"Scores created: {result.total_scores_created}")

        if result.total_composite_scores_created > 0:
            print(f"Composite scores: {result.total_composite_scores_created}")

        print(f"\n📈 Evaluator Performance:")
        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"\n  {stats.name}:")
            print(f"    Success rate: {success_rate:.1%}")
            print(f"    Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f"    ⚠️  Failures: {stats.failed_runs}")

        if result.error_summary:
            print(f"\n⚠️  Errors encountered:")
            for error_type, count in result.error_summary.items():
                print(f"    {error_type}: {count}")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print("⚠️  Evaluation incomplete!")

            if result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before failure")
                print(f"Use resume_from parameter to continue from:")
                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
                print(f"  Last ID: {result.resume_token.last_processed_id}")

        if result.has_more_items:
            print(f"ℹ️  More items available beyond max_items limit")
        ```

        Performance monitoring:
        ```python
        result = client.run_batched_evaluation(...)

        # Guard the divisions: both values can be zero for empty or instant runs.
        if result.total_items_processed > 0 and result.duration_seconds > 0:
            items_per_second = result.total_items_processed / result.duration_seconds
            avg_scores_per_item = result.total_scores_created / result.total_items_processed

            print(f"Performance metrics:")
            print(f"  Throughput: {items_per_second:.2f} items/second")
            print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
            print(f"  Total duration: {result.duration_seconds:.2f}s")

        attempted_evaluations = result.total_items_processed * len(result.evaluator_stats)
        if result.total_evaluations_failed > 0 and attempted_evaluations > 0:
            failure_rate = result.total_evaluations_failed / attempted_evaluations
            print(f"  Evaluation failure rate: {failure_rate:.1%}")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Optional sections (failed items, success rate, composite scores,
        per-evaluator details, throughput, errors, resume information) are
        emitted only when the corresponding data is present; every division
        is guarded by a positive-denominator check.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        divider = "=" * 60
        lines = [divider, "Batch Evaluation Results", divider]

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate (skipped when nothing was fetched)
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                # Evaluators that never ran only get their name line.
                if stats.total_runs > 0:
                    run_success_rate = stats.successful_runs / stats.total_runs * 100
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({run_success_rate:.1f}% success)"
                    )
                    lines.append(f"    Scores created: {stats.total_scores_created}")
                    if stats.failed_runs > 0:
                        lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            lines.extend(
                f"  {error_type}: {count}"
                for error_type, count in self.error_summary.items()
            )

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append(divider)
        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
class RunnerContext:
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional.
        A field left as ``None`` contributes no default: ``data`` must then be
        supplied on the :meth:`run_experiment` call, while ``dataset_version``
        and ``metadata`` simply stay unset unless passed there.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, applying context defaults.

        Resolution rules for the arguments that have a context default:

        - ``data``: the call-site value is used when given, otherwise the
          context's ``data``.
        - ``_dataset_version``: the call-site value is used when given,
          otherwise the context's ``dataset_version``.
        - ``metadata``: the context's metadata and the call-site metadata are
          merged, with call-site keys winning on collision; ``None`` is
          forwarded only when both are ``None``.

        All other arguments are forwarded unchanged to the client's
        ``run_experiment``.

        Args:
            name: Forwarded as ``name``.
            run_name: Forwarded as ``run_name``.
            description: Forwarded as ``description``.
            data: Items to run on; falls back to the context's ``data``.
            task: Forwarded as ``task``.
            evaluators: Forwarded as ``evaluators``. ``None`` (the default)
                is forwarded as a new empty list.
            composite_evaluator: Forwarded as ``composite_evaluator``.
            run_evaluators: Forwarded as ``run_evaluators``. ``None`` (the
                default) is forwarded as a new empty list.
            max_concurrency: Forwarded as ``max_concurrency``.
            metadata: Call-site metadata merged over the context's metadata.
            _dataset_version: Dataset version; falls back to the context's
                ``dataset_version``.

        Returns:
            The value returned by the client's ``run_experiment``.

        Raises:
            ValueError: If ``data`` is provided neither here nor on the context.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so call-site keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            # A fresh list per call: no list object is shared between calls.
            evaluators=[] if evaluators is None else evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=[] if run_evaluators is None else run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )

Wraps Langfuse.run_experiment() with CI-injected defaults.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action builds a RunnerContext before invoking the user's experiment(context) function. Defaults set here (dataset, metadata tags) are applied when the user omits them on the run_experiment() call; users can override any default by passing the corresponding argument explicitly.

RunnerContext( *, client: Langfuse, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, dataset_version: Optional[datetime.datetime] = None, metadata: Optional[Dict[str, str]] = None)
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata

Build a RunnerContext populated with defaults for run_experiment.

Typically called by the langfuse/experiment-action GitHub Action, not by end users directly. Every field except client is optional. A field left as None contributes no default: data must then be supplied on the run_experiment() call, while dataset_version and metadata simply stay unset unless passed there.

Arguments:
  • client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
  • data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
  • dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
  • metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
client
data
dataset_version
metadata
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )
class RegressionError(builtins.Exception):
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.

    The exception message is chosen in this order: an explicit ``message``;
    otherwise a structured message when both ``metric`` and ``value`` are
    given (the threshold is mentioned only when one is supplied); otherwise a
    generic fallback.

    Attributes:
        result: The experiment result passed to the constructor.
        metric: Metric name passed to the constructor, or ``None``.
        value: Metric value passed to the constructor, or ``None``.
        threshold: Threshold passed to the constructor, or ``None``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # The threshold is optional in the structured form; leave the
            # clause out instead of rendering "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

Raised by a user's experiment function to signal a CI gate failure.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action catches this exception and, when should_fail_on_error is enabled, fails the workflow run and renders a callout in the PR comment using metric/value/threshold if supplied, otherwise str(exc).

Callers choose one of three forms:

  • RegressionError(result=r) — minimal, generic message.
  • RegressionError(result=r, message="...") — free-form message.
  • RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
RegressionError( *, result: langfuse.experiment.ExperimentResult, metric: Optional[str] = None, value: Optional[float] = None, threshold: Optional[float] = None, message: Optional[str] = None)
1189    def __init__(
1190        self,
1191        *,
1192        result: ExperimentResult,
1193        metric: Optional[str] = None,
1194        value: Optional[float] = None,
1195        threshold: Optional[float] = None,
1196        message: Optional[str] = None,
1197    ):
1198        self.result = result
1199        self.metric = metric
1200        self.value = value
1201        self.threshold = threshold
1202        if message is not None:
1203            formatted = message
1204        elif metric is not None and value is not None:
1205            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
1206        else:
1207            formatted = "Experiment regression detected"
1208        super().__init__(formatted)
result
metric
value
threshold
__version__ = '4.9.1'
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
 98def is_default_export_span(span: ReadableSpan) -> bool:
 99    """Return whether a span should be exported by default."""
100    return (
101        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
102    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Report whether the span's instrumentation scope is the Langfuse tracer.

    A span without an instrumentation scope is never a Langfuse span.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_genai_span(span: ReadableSpan) -> bool:
    """Report whether any attribute key of the span starts with ``gen_ai``.

    Only string keys are considered; a span whose ``attributes`` is ``None``
    never matches.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for key in attributes.keys():
        if isinstance(key, str) and key.startswith("gen_ai"):
            return True
    return False

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Report whether the span's scope name matches a known LLM scope prefix.

    A span without an instrumentation scope never matches. Otherwise the
    scope name is tested against each entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` until one matches.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    name = scope.name
    for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(name, prefix):
            return True
    return False

Return whether the span comes from a known LLM instrumentation scope.

KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'opentelemetry.instrumentation.voyageai', 'opentelemetry.instrumentation.langchain', 'opentelemetry.instrumentation.bedrock', 'opentelemetry.instrumentation.writer', 'opentelemetry.instrumentation.anthropic', 'opentelemetry.instrumentation.ollama', 'opentelemetry.instrumentation.vertexai', 'opentelemetry.instrumentation.replicate', 'opentelemetry.instrumentation.openai_agents', 'opentelemetry.instrumentation.sagemaker', 'opentelemetry.instrumentation.haystack', 'opentelemetry.instrumentation.openai_v2', 'litellm', 'haystack', 'opentelemetry.instrumentation.transformers', 'opentelemetry.instrumentation.together', 'agent_framework', 'opentelemetry.instrumentation.mistralai', 'opentelemetry.instrumentation.groq', 'strands-agents', 'autogen-core', 'opentelemetry.instrumentation.google_generativeai', 'ai', 'opentelemetry.instrumentation.llamaindex', 'opentelemetry.instrumentation.cohere', 'pydantic-ai', 'opentelemetry.instrumentation.watsonx', 'langfuse-sdk', 'langsmith', 'opentelemetry.instrumentation.agno', 'vllm', 'openinference', 'opentelemetry.instrumentation.openai', 'opentelemetry.instrumentation.alephalpha', 'opentelemetry.instrumentation.crewai'})
class MaskOtelSpansFunction(typing.Protocol):
224class MaskOtelSpansFunction(Protocol):
225    """Function protocol for export-stage OpenTelemetry span masking.
226
227    `mask_otel_spans` runs after Langfuse decides which spans this client should
228    export and after export-stage media handling has converted supported media
229    payloads into Langfuse media references. It affects only the spans exported
230    by this Langfuse client. If the same OpenTelemetry spans are sent to another
231    exporter, that exporter receives its own unmodified copy.
232
233    The function is synchronous. It usually runs on the OpenTelemetry batch span
234    processor worker thread; during `flush()` and shutdown it may run on the
235    caller thread. Keep it deterministic and fast, and avoid relying on request
236    locals, the current active span, or async I/O.
237
238    Return `None` to leave the whole batch unchanged, or return
239    `MaskOtelSpansResult` with sparse patches for the spans that should change.
240
241    Example:
242        ```python
243        from typing import Optional
244
245        from langfuse import Langfuse
246        from langfuse.types import (
247            MaskOtelSpansParams,
248            MaskOtelSpansResult,
249            OtelSpanPatch,
250        )
251
252        def mask_otel_spans(
253            *, params: MaskOtelSpansParams
254        ) -> Optional[MaskOtelSpansResult]:
255            patches = {}
256
257            for identifier, span in params.spans.items():
258                if span.instrumentation_scope_name == "openai":
259                    patches[identifier] = OtelSpanPatch(
260                        delete_attributes=(
261                            "gen_ai.prompt.0.content",
262                            "gen_ai.completion.0.content",
263                        ),
264                        set_attributes={"masking.applied": True},
265                    )
266
267            return MaskOtelSpansResult(span_patches=patches)
268
269        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
270        ```
271    """
272
273    def __call__(  # the call signature a masking function must match
274        self, *, params: MaskOtelSpansParams  # keyword-only: invoked as fn(params=...)
275    ) -> Optional[MaskOtelSpansResult]: ...  # stub body; None means "leave the batch unchanged"

Function protocol for export-stage OpenTelemetry span masking.

mask_otel_spans runs after Langfuse decides which spans this client should export and after export-stage media handling has converted supported media payloads into Langfuse media references. It affects only the spans exported by this Langfuse client. If the same OpenTelemetry spans are sent to another exporter, that exporter receives its own unmodified copy.

The function is synchronous. It usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it deterministic and fast, and avoid relying on request locals, the current active span, or async I/O.

Return None to leave the whole batch unchanged, or return MaskOtelSpansResult with sparse patches for the spans that should change.

Example:
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
MaskOtelSpansFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):  # placeholder __init__: rejects protocol classes, otherwise installs and runs the first real __init__ found in the MRO
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)  # only an __init__ defined directly on `base` counts
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:  # loop ended without `break`: no class in the MRO defines its own __init__
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)  # run the initializer that was just installed
@dataclass(frozen=True)
class MaskOtelSpansParams:
123@dataclass(frozen=True)  # frozen blocks reassigning `spans`; it does not by itself make the mapping immutable
124class MaskOtelSpansParams:
125    """Input passed to an export-stage OpenTelemetry span masking function.
126
127    A single call receives one OpenTelemetry export batch, not necessarily a
128    complete trace, request, or Langfuse observation tree. Batch contents depend
129    on OpenTelemetry span processor settings such as `flush_at`,
130    `flush_interval`, explicit `flush()`, and shutdown.
131
132    Example:
133        ```python
134        from typing import Optional
135
136        from langfuse.types import (
137            MaskOtelSpansParams,
138            MaskOtelSpansResult,
139            OtelSpanPatch,
140        )
141
142        def mask_otel_spans(
143            *, params: MaskOtelSpansParams
144        ) -> Optional[MaskOtelSpansResult]:
145            patches = {}
146
147            for identifier, span in params.spans.items():
148                if "http.request.header.authorization" in span.attributes:
149                    patches[identifier] = OtelSpanPatch(
150                        delete_attributes=("http.request.header.authorization",),
151                        set_attributes={"security.redacted": True},
152                    )
153
154            return MaskOtelSpansResult(span_patches=patches)
155        ```
156
157    Attributes:
158        spans: Read-only mapping from stable span identifiers to span snapshots.
159            Return patches using keys from this mapping.
160    """
161
162    spans: Mapping[OtelSpanIdentifier, OtelSpanData]  # identifier -> snapshot; required, no default

Input passed to an export-stage OpenTelemetry span masking function.

A single call receives one OpenTelemetry export batch, not necessarily a complete trace, request, or Langfuse observation tree. Batch contents depend on OpenTelemetry span processor settings such as flush_at, flush_interval, explicit flush(), and shutdown.

Example:
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
Attributes:
  • spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
MaskOtelSpansParams( spans: Mapping[OtelSpanIdentifier, OtelSpanData])
spans: Mapping[OtelSpanIdentifier, OtelSpanData]
@dataclass(frozen=True)
class MaskOtelSpansResult:
200@dataclass(frozen=True)  # frozen: `span_patches` cannot be reassigned after construction
201class MaskOtelSpansResult:
202    """Patches returned by a `mask_otel_spans` function.
203
204    Omit spans that do not need changes. A mapping value of `None` also leaves
205    that span unchanged. Returning an invalid patch to drop a span is not a
206    supported API; use `should_export_span` when you need span-level export
207    filtering.
208
209    If `mask_otel_spans` raises or returns an object that is not a
210    `MaskOtelSpansResult`, Langfuse drops the whole export batch. If one
211    individual `OtelSpanPatch` is invalid, Langfuse drops only that span from
212    the export batch.
213
214    Attributes:
215        span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans` to
216            sparse attribute patches.
217    """
218
219    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
220        default_factory=lambda: MappingProxyType({})  # default: a new read-only empty mapping per instance, i.e. "no patches"
221    )

Patches returned by a mask_otel_spans function.

Omit spans that do not need changes. A mapping value of None also leaves that span unchanged. Returning an invalid patch to drop a span is not a supported API; use should_export_span when you need span-level export filtering.

If mask_otel_spans raises or returns an object that is not a MaskOtelSpansResult, Langfuse drops the whole export batch. If one individual OtelSpanPatch is invalid, Langfuse drops only that span from the export batch.

Attributes:
  • span_patches: Mapping from identifiers in MaskOtelSpansParams.spans to sparse attribute patches.
MaskOtelSpansResult( span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = <factory>)
span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]]
@dataclass(frozen=True)
class OtelSpanData:
 82@dataclass(frozen=True)  # frozen: fields cannot be reassigned on a snapshot
 83class OtelSpanData:
 84    """Read-only OpenTelemetry span snapshot passed to `mask_otel_spans`.
 85
 86    The snapshot contains the span data that Langfuse is about to export after
 87    the SDK has applied `should_export_span` filtering and export-stage media
 88    processing. The mappings are immutable views and mutating them is not
 89    supported; return an `OtelSpanPatch` to change exported attributes.
 90
 91    `mask_otel_spans` can only change span attributes. It cannot change the
 92    span name, IDs, parent relationship, resource attributes, events, links, or
 93    instrumentation scope.
 94
 95    Attributes:
 96        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
 97        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
 98        parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a
 99            root span or when the parent is not available.
100        name: OpenTelemetry span name.
101        instrumentation_scope_name: Name of the instrumentation scope that
102            emitted the span, for example `openai` or `langfuse`.
103        instrumentation_scope_version: Version of the instrumentation scope, if
104            the instrumentation library provided one.
105        attributes: Read-only attributes that will be exported unless patched.
106            Values use OpenTelemetry `AttributeValue` types: strings, booleans,
107            numbers, or homogeneous sequences of those scalar values.
108        resource_attributes: Read-only resource attributes from the span's
109            OpenTelemetry resource. These are available for decisions only and
110            cannot be patched through `mask_otel_spans`.
111    """
112
113    trace_id: str
114    span_id: str
115    parent_span_id: Optional[str]  # None for a root span or an unavailable parent
116    name: str
117    instrumentation_scope_name: Optional[str]
118    instrumentation_scope_version: Optional[str]
119    attributes: Mapping[str, AttributeValue]  # the only part a patch can change
120    resource_attributes: Mapping[str, AttributeValue]  # for decisions only; not patchable

Read-only OpenTelemetry span snapshot passed to mask_otel_spans.

The snapshot contains the span data that Langfuse is about to export after the SDK has applied should_export_span filtering and export-stage media processing. The mappings are immutable views and mutating them is not supported; return an OtelSpanPatch to change exported attributes.

mask_otel_spans can only change span attributes. It cannot change the span name, IDs, parent relationship, resource attributes, events, links, or instrumentation scope.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
  • parent_span_id: Lowercase hexadecimal parent span ID, or None for a root span or when the parent is not available.
  • name: OpenTelemetry span name.
  • instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example openai or langfuse.
  • instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
  • attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry AttributeValue types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
  • resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through mask_otel_spans.
OtelSpanData( trace_id: str, span_id: str, parent_span_id: Optional[str], name: str, instrumentation_scope_name: Optional[str], instrumentation_scope_version: Optional[str], attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]], resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]])
trace_id: str
span_id: str
parent_span_id: Optional[str]
name: str
instrumentation_scope_name: Optional[str]
instrumentation_scope_version: Optional[str]
attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
@dataclass(frozen=True)
class OtelSpanIdentifier:
65@dataclass(frozen=True)  # frozen plus the generated __eq__ gives a field-based __hash__, so instances work as mapping keys
66class OtelSpanIdentifier:
67    """Stable key for one OpenTelemetry span in a masking batch.
68
69    Use this object as the key when returning a patch for a span. It is a
70    frozen, hashable dataclass, so the safest pattern is to reuse the exact
71    identifier object from `MaskOtelSpansParams.spans` instead of rebuilding it.
72
73    Attributes:
74        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
75        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
76    """
77
78    trace_id: str
79    span_id: str

Stable key for one OpenTelemetry span in a masking batch.

Use this object as the key when returning a patch for a span. It is a frozen, hashable dataclass, so the safest pattern is to reuse the exact identifier object from MaskOtelSpansParams.spans instead of rebuilding it.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
OtelSpanIdentifier(trace_id: str, span_id: str)
trace_id: str
span_id: str
@dataclass(frozen=True)
class OtelSpanPatch:
165@dataclass(frozen=True)  # frozen: neither field can be reassigned after construction
166class OtelSpanPatch:
167    """Attribute changes to apply to one OpenTelemetry span before export.
168
169    Patches are sparse: include only the attributes that should change. Langfuse
170    deletes `delete_attributes` first and then applies `set_attributes`, so a key
171    present in both fields is exported with the value from `set_attributes`.
172
173    Attribute values must be valid OpenTelemetry attributes: strings, booleans,
174    integers, floats, or homogeneous sequences of those scalar types. If one
175    value is not valid for OpenTelemetry, Langfuse removes that attribute from
176    the export rather than sending an invalid span.
177
178    Example:
179        ```python
180        OtelSpanPatch(
181            delete_attributes=("gen_ai.prompt.0.content",),
182            set_attributes={
183                "gen_ai.prompt.redacted": True,
184                "app.masking.rule": "drop_prompt_text",
185            },
186        )
187        ```
188
189    Attributes:
190        set_attributes: Attribute values to add or replace on the exported span.
191        delete_attributes: Attribute keys to remove from the exported span.
192    """
193
194    set_attributes: Mapping[str, AttributeValue] = field(
195        default_factory=lambda: MappingProxyType({})  # default: a new read-only empty mapping per instance
196    )
197    delete_attributes: Sequence[str] = field(default_factory=tuple)  # default: empty tuple. NOTE(review): a bare str also satisfies Sequence[str]; confirm how a lone string is handled

Attribute changes to apply to one OpenTelemetry span before export.

Patches are sparse: include only the attributes that should change. Langfuse deletes delete_attributes first and then applies set_attributes, so a key present in both fields is exported with the value from set_attributes.

Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.

Example:
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)
Attributes:
  • set_attributes: Attribute values to add or replace on the exported span.
  • delete_attributes: Attribute keys to remove from the exported span.
OtelSpanPatch( set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]] = <factory>, delete_attributes: Sequence[str] = <factory>)
set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
delete_attributes: Sequence[str]