luml.experiments.tracker
ExperimentTracker Objects
class ExperimentTracker()
Local experiment tracking for ML experiments.
Tracks metrics, parameters, artifacts, and traces for machine learning experiments. Supports multiple backend storage options via connection strings.
Arguments:
connection_string- Backend connection string. Format: 'backend://config'. Default is 'sqlite://./experiments' for local SQLite storage.
Example:
tracker = ExperimentTracker("sqlite://./my_experiments")
exp_id = tracker.start_experiment(
name="my_experiment", group="my_group", tags=["baseline"]
)
tracker.log_static("learning_rate", 0.001, experiment_id=exp_id)
tracker.log_dynamic("loss", 0.5, step=1, experiment_id=exp_id)
tracker.end_experiment(exp_id)
start_experiment
def start_experiment(
name: str | None = None,
group: str = "default",
experiment_id: str | None = None,
tags: list[str] | None = None
) -> str
Starts a new experiment by initializing it with the backend and setting the experiment's metadata.
Arguments:
name (str | None) - The name of the experiment. If not provided, the experiment will be initialized without a specific name.
group (str) - The group to which the experiment belongs. Defaults to "default".
experiment_id (str | None) - A unique identifier for the experiment. If not provided, a new UUID will be generated as the experiment ID.
tags (list[str] | None) - A list of tags to associate with the experiment. Can be None if no tags are necessary.
Returns:
str- The experiment ID.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment(
group="image_classification",
name="baseline_model",
tags=["resnet", "baseline"]
)
end_experiment
def end_experiment(experiment_id: str | None = None) -> None
End an active experiment tracking session.
Arguments:
experiment_id- ID of experiment to end. Uses current experiment if not specified.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment(name="my_exp")
tracker.end_experiment(exp_id)
fail_experiment
def fail_experiment(experiment_id: str | None = None) -> None
Mark an experiment as failed due to an error or interruption.
log_static
def log_static(key: str, value: Any, experiment_id: str | None = None) -> None
Log static parameters or metadata (values that don't change during training).
Arguments:
key - Parameter name.
value - Parameter value (can be any serializable type).
experiment_id - Experiment ID. Uses current experiment if not specified.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_static("learning_rate", 0.001)
tracker.log_static("model_architecture", "resnet50")
tracker.log_static("batch_size", 32)
log_dynamic
def log_dynamic(
key: str,
value: int | float,
step: int | None = None,
experiment_id: str | None = None
) -> None
Log time-series metrics (values that change during training).
Arguments:
key - Metric name.
value - Metric value (numeric).
step - Training step/epoch number.
experiment_id - Experiment ID. Uses current experiment if not specified.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
for epoch in range(10):
loss = train_epoch()
tracker.log_dynamic("train_loss", loss, step=epoch)
log_span
def log_span(
trace_id: str,
span_id: str,
name: str,
start_time_unix_nano: int,
end_time_unix_nano: int,
parent_span_id: str | None = None,
kind: int = 0,
status_code: int = 0,
status_message: str | None = None,
attributes: dict[str, Any] | None = None,
events: list[dict[str, Any]] | None = None,
links: list[dict[str, Any]] | None = None,
trace_flags: int = 0,
experiment_id: str | None = None
) -> None
Log an OpenTelemetry-compatible span to the experiment.
Records a single span representing a unit of work within a trace.
Spans can be nested via parent_span_id to form a trace tree.
Arguments:
trace_id (str) - Unique identifier for the trace this span belongs to.
span_id (str) - Unique identifier for this span.
name (str) - Human-readable name describing the operation.
start_time_unix_nano (int) - Span start time in nanoseconds since Unix epoch.
end_time_unix_nano (int) - Span end time in nanoseconds since Unix epoch.
parent_span_id (str | None) - Span ID of the parent span, or None for root spans.
kind (int) - Span kind following the OpenTelemetry spec (0=INTERNAL, 1=SERVER, 2=CLIENT, 3=PRODUCER, 4=CONSUMER). Defaults to 0.
status_code (int) - Status code (0=UNSET, 1=OK, 2=ERROR). Defaults to 0.
status_message (str | None) - Optional status description, typically set for error spans.
attributes (dict[str, Any] | None) - Key-value pairs of span attributes.
events (list[dict[str, Any]] | None) - Timestamped event records attached to the span.
links (list[dict[str, Any]] | None) - Links to other spans, each containing at minimum trace_id and span_id keys.
trace_flags (int) - W3C trace flags. Defaults to 0.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
Example:
import time
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
start = time.time_ns()
# ... do work ...
end = time.time_ns()
tracker.log_span(
trace_id="abc123",
span_id="span_1",
name="data_preprocessing",
start_time_unix_nano=start,
end_time_unix_nano=end,
attributes={"input_rows": 1000},
)
log_eval_sample
def log_eval_sample(
eval_id: str,
dataset_id: str,
inputs: dict[str, Any],
outputs: dict[str, Any] | None = None,
references: dict[str, Any] | None = None,
scores: dict[str, Any] | None = None,
metadata: dict[str, Any] | None = None,
experiment_id: str | None = None
) -> None
Log a single evaluation sample to the experiment.
Records one data point from a model evaluation run, including its inputs, model outputs, ground-truth references, computed scores, and optional metadata.
Arguments:
eval_id (str) - Unique identifier for this evaluation sample.
dataset_id (str) - Identifier of the evaluation dataset this sample belongs to.
inputs (dict[str, Any]) - Input data fed to the model for this sample.
outputs (dict[str, Any] | None) - Model outputs/predictions for this sample.
references (dict[str, Any] | None) - Ground-truth or reference values to compare against.
scores (dict[str, Any] | None) - Computed evaluation scores (e.g. accuracy, F1, BLEU).
metadata (dict[str, Any] | None) - Additional metadata for the sample (e.g. latency, token counts).
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_eval_sample(
eval_id="sample_001",
dataset_id="test_set_v2",
inputs={"prompt": "Summarize this text..."},
outputs={"response": "The text discusses..."},
references={"expected": "A summary of..."},
scores={"bleu": 0.72, "rouge_l": 0.65},
)
link_eval_sample_to_trace
def link_eval_sample_to_trace(
eval_dataset_id: str,
eval_id: str,
trace_id: str,
experiment_id: str | None = None
) -> None
Link an evaluation sample to a trace.
Associates a previously logged evaluation sample with an execution trace, enabling correlation between evaluation results and the traced execution that produced them.
Arguments:
eval_dataset_id (str) - Identifier of the evaluation dataset the sample belongs to.
eval_id (str) - Identifier of the evaluation sample to link.
trace_id (str) - Identifier of the trace to link to.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_eval_sample(
eval_id="sample_001",
dataset_id="test_set_v2",
inputs={"prompt": "Hello"},
)
tracker.link_eval_sample_to_trace(
eval_dataset_id="test_set_v2",
eval_id="sample_001",
trace_id="trace_abc",
)
get_evals_annotation_summaries
def get_evals_annotation_summaries(
experiment_id: str,
eval_ids: list[str]
) -> dict[str, AnnotationSummary]
Retrieves annotation summaries for a batch of evaluations within an experiment.
Arguments:
experiment_id (str) - The unique identifier of the experiment.
eval_ids (list[str]) - A list of evaluation IDs to retrieve summaries for.
Returns:
dict[str, AnnotationSummary]: A dictionary mapping each eval ID to its annotation summary. Eval IDs with no annotations are excluded from the result.
Example:
tracker = ExperimentTracker()
tracker.get_evals_annotation_summaries("exp-001", ["eval-xyz", "eval-abc"])
result = {
"eval-xyz": AnnotationSummary(
feedback=[
FeedbackSummaryItem(
name="quality",
total=2,
counts={"true": 1, "false": 1}
)
],
expectations=[]
)
}
log_attachment
def log_attachment(
name: str,
data: Any,
binary: bool = False,
experiment_id: str | None = None
) -> None
Log files or artifacts to the experiment.
Arguments:
name - Attachment name/filename.
data - Data to attach (string, bytes, or file path).
binary - Whether data is binary. Default is False.
experiment_id - Experiment ID. Uses current experiment if not specified.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_attachment("model_config.json", config_json)
tracker.log_attachment("plot.png", image_bytes, binary=True)
get_experiment
def get_experiment(experiment_id: str) -> ExperimentData
Retrieve full experiment data by ID.
Returns the complete experiment record including metadata, static parameters, dynamic metrics, and attachment information.
Arguments:
experiment_id (str) - Unique identifier of the experiment to retrieve.
Returns:
ExperimentData - Complete experiment data containing experiment_id, metadata, static_params, dynamic_metrics, and attachments.
Example:
tracker = ExperimentTracker()
data = tracker.get_experiment("my-experiment-id")
print(data.metadata.name)
print(data.static_params)
print(data.dynamic_metrics)
get_attachment
def get_attachment(name: str, experiment_id: str | None = None) -> Any
Retrieve a previously logged attachment by name.
Arguments:
name (str) - Name of the attachment as specified during log_attachment.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Returns:
Any- The attachment data as bytes.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_attachment("config.json", '{"lr": 0.001}')
config = tracker.get_attachment("config.json")
list_attachments
def list_attachments(experiment_id: str | None = None) -> list[AttachmentRecord]
List all attachments logged for an experiment.
Arguments:
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Returns:
list[AttachmentRecord] - List of attachment records with name, file_path and created_at fields.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
list_experiments
def list_experiments() -> list[Experiment]
List all experiments in the backend.
Returns:
List of Experiment objects with metadata.
Example:
tracker = ExperimentTracker()
experiments = tracker.list_experiments()
for exp in experiments:
print(f"{exp.id}: {exp.name}")
delete_experiment
def delete_experiment(experiment_id: str) -> None
Delete an experiment and all its associated data.
Permanently removes the experiment record along with its static parameters, dynamic metrics, spans, evaluation samples, attachments, and models from the backend.
Arguments:
experiment_id (str) - Unique identifier of the experiment to delete.
Example:
tracker = ExperimentTracker()
tracker.delete_experiment("old-experiment-id")
create_group
def create_group(
name: str,
description: str | None = None,
tags: list[str] | None = None
) -> Group
Create a new experiment group.
Groups organize related experiments (e.g. by project, task, or hypothesis).
Experiments are assigned to a group via the group parameter of
start_experiment.
Arguments:
name (str) - Unique name for the group.
description (str | None) - Optional human-readable description of the group's purpose.
tags (list[str] | None) - Optional tags to associate with the group.
Returns:
Group - The created group with id, name, description, created_at, and tags fields.
Example:
tracker = ExperimentTracker()
group = tracker.create_group(
"hyperparameter_search",
description="LR sweep experiments",
tags=["production", "v2"],
)
exp_id = tracker.start_experiment(group=group.name)
list_groups
def list_groups() -> list[Group]
List all experiment groups in the backend.
Returns:
list[Group] - List of Group objects, each containing id, name, description, and created_at fields.
Example:
tracker = ExperimentTracker()
groups = tracker.list_groups()
for group in groups:
print(f"{group.name}: {group.description}")
log_model
def log_model(
model: ModelReference | Any,
*,
name: str | None = None,
tags: list[str] | None = None,
flavor: str | None = None,
inputs: Any = None,
experiment_id: str | None = None,
dependencies: Literal["default", "all"] | list[str] = "default",
extra_dependencies: list[str] | None = None,
extra_code_modules: list[str] | Literal["auto"] | None = None,
manifest_model_name: str | None = None,
manifest_model_version: str | None = None,
manifest_model_description: str | None = None,
manifest_extra_producer_tags: list[str] | None = None,
**save_kwargs: Any
) -> ModelReference
Log a model to the experiment.
Accepts either a pre-saved ModelReference or a raw model object. When a
raw model is provided, it is serialized automatically using the detected or
explicitly specified flavor. The serialized artifact is stored in the backend
and the temporary file is cleaned up.
Arguments:
model (ModelReference | Any) - A ModelReference pointing to an already-saved model, or a raw model object to be serialized.
name (str | None) - Optional display name for the model.
tags (list[str] | None) - Optional tags to associate with the model.
flavor (str | None) - Serialization flavor (e.g. "sklearn", "xgboost"). Auto-detected from the model's module if not provided. Supported flavors: sklearn, xgboost, lightgbm, catboost, langgraph.
inputs (Any) - Sample input data used for model signature inference. Required by sklearn, optional for xgboost and lightgbm, and not used by catboost or langgraph.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
dependencies (Literal["default", "all"] | list[str]) - Dependency capture strategy. "default" captures direct dependencies, "all" captures the full environment, or pass an explicit list of package names.
extra_dependencies (list[str] | None) - Additional package dependencies to include beyond those captured by dependencies.
extra_code_modules (list[str] | Literal["auto"] | None) - Extra Python modules to bundle with the model. "auto" attempts automatic detection.
manifest_model_name (str | None) - Model name for the artifact manifest.
manifest_model_version (str | None) - Model version for the artifact manifest.
manifest_model_description (str | None) - Model description for the artifact manifest.
manifest_extra_producer_tags (list[str] | None) - Additional producer tags for the artifact manifest.
**save_kwargs (Any) - Additional keyword arguments forwarded to the flavor-specific save function.
Returns:
ModelReference- Reference to the stored model in the backend.
Raises:
ValueError - If no experiment is active and experiment_id is not provided, or if the flavor cannot be auto-detected.
Example:
from sklearn.ensemble import RandomForestClassifier
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
model = RandomForestClassifier().fit(X_train, y_train)
model_ref = tracker.log_model(model, name="rf_v1", inputs=X_train)
get_models
def get_models(experiment_id: str | None = None) -> list[Model]
List all models logged to an experiment.
Arguments:
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Returns:
list[Model] - List of Model objects, each containing id, name, created_at, tags, path, and experiment_id fields.
Raises:
ValueError - If no experiment is active and experiment_id is not provided.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_model(model, name="rf_v1", inputs=X_train)
models = tracker.get_models()
for m in models:
print(f"{m.name} logged at {m.created_at}")
get_model
def get_model(model_id: str) -> Model
Retrieve a single model by its ID.
Arguments:
model_id (str) - Unique identifier of the model to retrieve.
Returns:
Model - The model record containing id, name, created_at, tags, path, and experiment_id fields.
Example:
tracker = ExperimentTracker()
model = tracker.get_model("model-uuid-123")
print(f"{model.name}: {model.path}")
link_to_model
def link_to_model(
model_reference: ModelReference,
experiment_id: str | None = None
) -> None
Link experiment data to a saved model.
Attaches experiment tracking data (metrics, parameters, artifacts) to a model for reproducibility and model versioning.
Arguments:
model_reference - ModelReference object to link to.
experiment_id - Experiment ID. Uses current experiment if not specified.
Example:
from luml.integrations.sklearn import save_sklearn
tracker = ExperimentTracker()
exp_id = tracker.start_experiment(name="sklearn_model")
tracker.log_static("model_type", "RandomForest")
model_ref = save_sklearn(model, X_train)
tracker.link_to_model(model_ref)
log_eval_annotation
def log_eval_annotation(
dataset_id: str,
eval_id: str,
name: str,
annotation_kind: str,
value_type: str,
value: int | bool | str,
user: str,
rationale: str | None = None,
experiment_id: str | None = None
) -> AnnotationRecord
Create an annotation on an eval sample.
Annotations categorize eval results with human feedback or expectations.
Feedback annotations must use value_type='bool'.
Arguments:
dataset_id (str) - The dataset the eval belongs to.
eval_id (str) - The eval sample to annotate.
name (str) - Annotation name used for grouping (e.g. "accuracy", "relevance").
annotation_kind (str) - Either 'feedback' or 'expectation'.
value_type (str) - Type of the value: 'bool', 'int', or 'string'.
value (int | bool | str) - The annotation value.
user (str) - The user who created the annotation.
rationale (str | None) - Optional free-text explanation for the annotation.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Returns:
AnnotationRecord- The created annotation record.
Raises:
ValueError - If no experiment is active, the experiment uses an older schema without annotation support, or a feedback annotation uses a non-bool value_type.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
tracker.log_eval_sample(
eval_id="eval-1", dataset_id="ds-1",
inputs={"prompt": "What is 2+2?"},
outputs={"response": "4"},
)
annotation = tracker.log_eval_annotation(
dataset_id="ds-1",
eval_id="eval-1",
name="accuracy",
annotation_kind="feedback",
value_type="bool",
value=True,
user="alice",
rationale="The answer is correct",
)
log_span_annotation
def log_span_annotation(
trace_id: str,
span_id: str,
name: str,
annotation_kind: str,
value_type: str,
value: int | bool | str,
user: str,
rationale: str | None = None,
experiment_id: str | None = None
) -> AnnotationRecord
Create an annotation on a span within a trace.
Annotations attach human feedback or expectations to individual spans.
Feedback annotations must use value_type='bool'.
Arguments:
trace_id (str) - The trace containing the span.
span_id (str) - The span to annotate.
name (str) - Annotation name used for grouping (e.g. "quality", "latency").
annotation_kind (str) - Either 'feedback' or 'expectation'.
value_type (str) - Type of the value: 'bool', 'int', or 'string'.
value (int | bool | str) - The annotation value.
user (str) - The user who created the annotation.
rationale (str | None) - Optional free-text explanation for the annotation.
experiment_id (str | None) - Experiment ID. Uses current experiment if not specified.
Returns:
AnnotationRecord- The created annotation record.
Raises:
ValueError - If no experiment is active, the experiment uses an older schema without annotation support, or a feedback annotation uses a non-bool value_type.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
annotation = tracker.log_span_annotation(
trace_id="trace-abc",
span_id="span-1",
name="quality",
annotation_kind="feedback",
value_type="bool",
value=True,
user="alice",
rationale="Output was relevant and well-structured",
)
get_experiment_record
def get_experiment_record(experiment_id: str) -> Experiment | None
Retrieve experiment metadata by ID.
Arguments:
experiment_id (str) - The experiment to look up.
Returns:
Experiment | None: The experiment metadata, or None if not found.
Example:
tracker = ExperimentTracker()
exp = tracker.get_experiment_record("my-experiment-id")
if exp:
print(exp.name, exp.tags)
get_trace
def get_trace(experiment_id: str, trace_id: str) -> TraceDetails | None
Retrieve full trace details including all spans.
Arguments:
experiment_id (str) - The experiment containing the trace.
trace_id (str) - The trace to retrieve.
Returns:
TraceDetails | None: Trace with its spans, or None if not found.
Example:
tracker = ExperimentTracker()
trace = tracker.get_trace("exp-1", "trace-abc")
if trace:
for span in trace.spans:
print(span.name, span.annotation_count)
get_experiment_traces
def get_experiment_traces(
experiment_id: str,
limit: int = 20,
cursor_str: str | None = None,
sort_by: str = "execution_time",
order: str = "desc",
search: str | None = None,
filters: list[str] | None = None,
states: list[TraceState] | None = None
) -> PaginatedResponse[TraceRecord]
Retrieve paginated traces for an experiment.
Arguments:
experiment_id (str) - The experiment to query.
limit (int) - Maximum number of traces per page (1–100). Defaults to 20.
cursor_str (str | None) - Pagination cursor from a previous response.
sort_by (str) - Sort field. Defaults to 'execution_time'.
order (str) - Sort order, 'asc' or 'desc'. Defaults to 'desc'.
search (str | None) - Optional substring filter on trace ID.
states (list[TraceState] | None) - Optional filter by trace state.
Returns:
PaginatedResponse[TraceRecord]- Page of traces with pagination cursor.
Example:
tracker = ExperimentTracker()
page = tracker.get_experiment_traces("exp-1", limit=10)
for trace in page.items:
print(trace.trace_id, trace.execution_time_ms)
get_experiment_metric_history
def get_experiment_metric_history(
experiment_id: str,
key: str
) -> list[dict[str, Any]]
Retrieve the full history of a dynamic metric.
Arguments:
experiment_id (str) - The experiment to query.
key (str) - The metric key (e.g. 'train_loss').
Returns:
list[dict[str, Any]]: List of {value, step, logged_at} entries
ordered by step.
Example:
tracker = ExperimentTracker()
history = tracker.get_experiment_metric_history("exp-1", "train_loss")
for point in history:
print(f"step={point['step']} loss={point['value']}")
get_experiment_evals
def get_experiment_evals(
experiment_id: str,
limit: int = 20,
cursor_str: str | None = None,
sort_by: str = "created_at",
order: str = "desc",
dataset_id: str | None = None,
json_sort_column: str | None = None,
search: str | None = None,
filters: list[str] | None = None
) -> PaginatedResponse[EvalRecord]
Retrieve paginated eval samples for an experiment.
Arguments:
experiment_id (str) - The experiment to query.
limit (int) - Maximum number of evals per page (1–100). Defaults to 20.
cursor_str (str | None) - Pagination cursor from a previous response.
sort_by (str) - Sort field. Defaults to 'created_at'.
order (str) - Sort order, 'asc' or 'desc'. Defaults to 'desc'.
dataset_id (str | None) - Optional filter by dataset.
json_sort_column (str | None) - Resolved JSON column for sorting by score / input / output keys.
search (str | None) - Optional substring filter on eval ID.
Returns:
PaginatedResponse[EvalRecord]- Page of eval records with pagination cursor.
Example:
tracker = ExperimentTracker()
page = tracker.get_experiment_evals("exp-1", dataset_id="ds-1")
for eval_rec in page.items:
print(eval_rec.eval_id, eval_rec.scores)
get_experiment_eval_columns
def get_experiment_eval_columns(
experiment_id: str,
dataset_id: str | None = None
) -> EvalColumns
Retrieve the set of available column keys across all evals in an experiment.
Returns the distinct keys found in scores, inputs, outputs, and refs fields, useful for building dynamic table headers.
Arguments:
experiment_id (str) - The experiment to query.
dataset_id (str | None, optional) - Dataset ID for filtering. If not provided, all datasets within the experiment are considered.
Returns:
EvalColumns- Object containing lists of available column keys.
Example:
tracker = ExperimentTracker()
columns = tracker.get_experiment_eval_columns("exp-1")
print("Score columns:", columns.scores)
print("Input columns:", columns.inputs)
get_experiment_eval_typed_columns
def get_experiment_eval_typed_columns(
experiment_id: str,
dataset_id: str | None = None
) -> EvalTypedColumns
Like get_experiment_eval_columns but also returns the type for each key.
get_experiment_trace_columns
def get_experiment_trace_columns(experiment_id: str) -> TraceColumns
Return distinct attribute keys from all spans in an experiment.
get_experiment_trace_typed_columns
def get_experiment_trace_typed_columns(experiment_id: str) -> TraceTypedColumns
Like get_experiment_trace_columns but also returns the type for each key.
get_experiment_evals_average_scores
def get_experiment_evals_average_scores(
experiment_id: str,
dataset_id: str | None = None
) -> dict[str, float]
Calculates the average scores for evaluations from a specified experiment and optionally filters them by a specific dataset.
Arguments:
experiment_id (str) - The unique identifier of the experiment from which to fetch evaluation data.
dataset_id (str | None, optional) - The unique identifier of the dataset to filter evaluations. If not provided, all datasets within the experiment will be considered.
Returns:
dict[str, float]: A dictionary where the keys are evaluation metric names and the values are their corresponding average scores.
get_experiment_eval_dataset_ids
def get_experiment_eval_dataset_ids(experiment_id: str) -> list[str]
Retrieve all unique dataset IDs from evals of an experiment.
Arguments:
experiment_id (str) - The experiment to query.
Returns:
list[str] - Sorted list of distinct dataset IDs.
resolve_evals_sort_column
def resolve_evals_sort_column(experiment_id: str, sort_by: str) -> str | None
Resolve a sort key to the underlying JSON column expression for eval queries.
Used to translate user-facing sort keys (e.g. 'scores.accuracy') into the SQL expression needed for sorting.
Arguments:
experiment_id (str) - The experiment to query.
sort_by (str) - The sort key to resolve.
Returns:
str | None: The resolved SQL column expression, or None if invalid.
Example:
tracker = ExperimentTracker()
col = tracker.resolve_evals_sort_column("exp-1", "scores.accuracy")
update_experiment
def update_experiment(
experiment_id: str,
name: str | None = None,
description: str | None = None,
tags: list[str] | None = None
) -> Experiment | None
Update experiment metadata.
Only the provided fields are updated; None values are ignored.
Arguments:
experiment_id (str) - The experiment to update.
name (str | None) - New experiment name.
description (str | None) - New experiment description.
tags (list[str] | None) - New list of tags.
Returns:
Experiment | None: The updated experiment, or None if not found.
Example:
tracker = ExperimentTracker()
tracker.update_experiment(
"exp-1",
name="renamed_experiment",
tags=["production", "v2"],
)
get_eval_annotations
def get_eval_annotations(
experiment_id: str,
dataset_id: str,
eval_id: str
) -> list[AnnotationRecord]
Retrieve all annotations for a specific eval sample.
Returns an empty list if the experiment DB uses an older schema without annotation support.
Arguments:
experiment_id (str) - The experiment containing the eval.
dataset_id (str) - The dataset the eval belongs to.
eval_id (str) - The eval sample to query.
Returns:
list[AnnotationRecord]- Annotations ordered by creation time.
Example:
tracker = ExperimentTracker()
annotations = tracker.get_eval_annotations("exp-1", "ds-1", "eval-1")
for ann in annotations:
print(f"{ann.name}: {ann.value} by {ann.user}")
get_span_annotations
def get_span_annotations(
experiment_id: str,
trace_id: str,
span_id: str
) -> list[AnnotationRecord]
Retrieve all annotations for a specific span.
Returns an empty list if the experiment DB uses an older schema without annotation support.
Arguments:
experiment_id (str) - The experiment containing the trace.
trace_id (str) - The trace containing the span.
span_id (str) - The span to query.
Returns:
list[AnnotationRecord]- Annotations ordered by creation time.
Example:
tracker = ExperimentTracker()
annotations = tracker.get_span_annotations("exp-1", "trace-abc", "span-1")
for ann in annotations:
print(f"{ann.name}: {ann.value} by {ann.user}")
update_annotation
def update_annotation(
experiment_id: str,
annotation_id: str,
target: Literal["eval", "span"],
value: int | bool | str | None = None,
rationale: str | None = None
) -> AnnotationRecord
Update an existing annotation's value and/or rationale.
At least one of value or rationale must be provided.
Arguments:
experiment_id (str) - The experiment containing the annotation.
annotation_id (str) - The annotation to update.
target (Literal["eval", "span"]) - Whether this is an eval or span annotation.
value (int | bool | str | None) - New annotation value. None to leave unchanged.
rationale (str | None) - New rationale text. None to leave unchanged.
Returns:
AnnotationRecord- The updated annotation record.
Raises:
ValueError- If no fields are provided to update, the annotation is not found, or the experiment uses an older schema.
Example:
tracker = ExperimentTracker()
updated = tracker.update_annotation(
"exp-1", "ann-uuid", "eval",
value=False,
rationale="Revised: answer was actually wrong",
)
delete_annotation
def delete_annotation(
experiment_id: str,
annotation_id: str,
target: Literal["eval", "span"]
) -> None
Delete an annotation by ID.
No-op if the experiment DB uses an older schema without annotation support.
Arguments:
experiment_id (str) - The experiment containing the annotation.
annotation_id (str) - The annotation to delete.
target (Literal["eval", "span"]) - Whether this is an eval or span annotation.
Example:
tracker = ExperimentTracker()
tracker.delete_annotation("exp-1", "ann-uuid", "eval")
get_eval_annotation_summary
def get_eval_annotation_summary(
experiment_id: str,
dataset_id: str
) -> AnnotationSummary
Get an aggregated summary of annotations across all evals in a dataset.
Returns feedback and expectation annotations grouped by annotation name.
Feedback items include a counts dict keyed by value (e.g. {"true": 3, "false": 1}). Returns empty lists if the experiment DB uses an older schema.
Arguments:
experiment_id (str) - The experiment to query.
dataset_id (str) - The dataset to summarize.
Returns:
AnnotationSummary - Summary with feedback and expectations lists.
Example:
tracker = ExperimentTracker()
summary = tracker.get_eval_annotation_summary("exp-1", "ds-1")
for fb in summary.feedback:
print(f"{fb.name}: {fb.total} total, counts={fb.counts}")
for exp in summary.expectations:
print(f"{exp.name}: {exp.total} total")
get_trace_annotation_summary
def get_trace_annotation_summary(
experiment_id: str,
trace_id: str
) -> AnnotationSummary
Get an aggregated summary of annotations across all spans in a trace.
Returns feedback and expectation annotations grouped by annotation name.
Feedback items include a counts dict keyed by value. Returns empty lists if the experiment DB uses an older schema.
Arguments:
experiment_id (str) - The experiment containing the trace.
trace_id (str) - The trace to summarize.
Returns:
AnnotationSummary - Summary with feedback and expectations lists.
Example:
tracker = ExperimentTracker()
summary = tracker.get_trace_annotation_summary("exp-1", "trace-abc")
for fb in summary.feedback:
print(f"{fb.name}: {fb.total} total, counts={fb.counts}")
get_all_traces_annotation_summary
def get_all_traces_annotation_summary(experiment_id: str) -> AnnotationSummary
Get an aggregated summary of span annotations across all traces.
Unlike get_trace_annotation_summary which scopes to a single trace, this method aggregates annotations from every span in the experiment.
Arguments:
experiment_id (str) - The experiment to query.
Returns:
AnnotationSummary - Summary with feedback and expectations lists.
Example:
tracker = ExperimentTracker()
summary = tracker.get_all_traces_annotation_summary("exp-1")
for fb in summary.feedback:
print(f"{fb.name}: {fb.total} total, counts={fb.counts}")
get_experiment_ddl_version
def get_experiment_ddl_version(experiment_id: str) -> int
Retrieve the schema version of the experiment database.
The version corresponds to PRAGMA user_version in the SQLite DB.
Version 0 indicates a legacy DB without annotation table support.
Arguments:
experiment_id (str) - The experiment to check.
Returns:
int- The schema version number.
Example:
tracker = ExperimentTracker()
version = tracker.get_experiment_ddl_version("exp-1")
if version < 1:
print("This experiment does not support annotations")
get_group
def get_group(group_id: str) -> Group | None
Retrieve a group by ID.
Arguments:
group_id (str) - The group to look up.
Returns:
Group | None: The group metadata, or None if not found.
Example:
tracker = ExperimentTracker()
group = tracker.get_group("group-uuid")
if group:
print(group.name)
update_group
def update_group(
group_id: str,
name: str | None = None,
description: str | None = None,
tags: list[str] | None = None
) -> Group | None
Update group metadata.
Only the provided fields are updated; None values are ignored.
Arguments:
group_id (str) - The group to update.
name (str | None) - New group name.
description (str | None) - New group description.
tags (list[str] | None) - New list of tags.
Returns:
Group | None: The updated group, or None if not found.
Example:
tracker = ExperimentTracker()
tracker.update_group("group-uuid", name="Production Models")
delete_group
def delete_group(group_id: str) -> None
Delete a group by ID.
Arguments:
group_id (str) - The group to delete.
Example:
tracker = ExperimentTracker()
tracker.delete_group("group-uuid")
list_groups_pagination
def list_groups_pagination(
limit: int = 20,
cursor_str: str | None = None,
sort_by: str = "created_at",
order: str = "desc",
search: str | None = None
) -> PaginatedResponse[Group]
Retrieve a paginated list of groups.
Arguments:
limit (int) - Maximum number of groups per page. Defaults to 20.
cursor_str (str | None) - Pagination cursor from a previous response.
sort_by (str) - Sort field. Defaults to 'created_at'.
order (str) - Sort order, 'asc' or 'desc'. Defaults to 'desc'.
search (str | None) - Optional substring filter on group name.
Returns:
PaginatedResponse[Group]- Page of groups with pagination cursor.
Example:
tracker = ExperimentTracker()
page = tracker.list_groups_pagination(limit=10, search="prod")
for group in page.items:
print(group.name)
list_group_experiments_pagination
def list_group_experiments_pagination(
group_id: str,
limit: int = 20,
cursor_str: str | None = None,
sort_by: str = "created_at",
order: str = "desc",
search: str | None = None,
json_sort_column: str | None = None
) -> PaginatedResponse[Experiment]
Retrieve a paginated list of experiments within a group.
Arguments:
group_id (str) - The group to query.
limit (int) - Maximum number of experiments per page. Defaults to 20.
cursor_str (str | None) - Pagination cursor from a previous response.
sort_by (str) - Sort field. Defaults to 'created_at'.
order (str) - Sort order, 'asc' or 'desc'. Defaults to 'desc'.
search (str | None) - Optional SQL-like query string for filtering experiments.
json_sort_column (str | None) - Resolved JSON column for sorting by static param or dynamic metric keys.
Returns:
PaginatedResponse[Experiment]- Page of experiments with pagination cursor.
Example:
tracker = ExperimentTracker()
page = tracker.list_group_experiments_pagination("group-uuid", limit=5)
for exp in page.items:
print(exp.name, exp.tags)
get_group_experiments_static_params_keys
def get_group_experiments_static_params_keys(group_id: str) -> list[str]
Retrieve all distinct static parameter keys across experiments in a group.
Useful for building comparison tables where each column is a parameter.
Arguments:
group_id (str) - The group to query.
Returns:
list[str]- Sorted list of distinct parameter key names.
Example:
tracker = ExperimentTracker()
keys = tracker.get_group_experiments_static_params_keys("group-uuid")
# e.g. ["batch_size", "learning_rate", "model_architecture"]
get_group_experiments_dynamic_metrics_keys
def get_group_experiments_dynamic_metrics_keys(group_id: str) -> list[str]
Retrieve all distinct dynamic metric keys across experiments in a group.
Useful for building comparison charts where each series is a metric.
Arguments:
group_id (str) - The group to query.
Returns:
list[str]- Sorted list of distinct metric key names.
Example:
tracker = ExperimentTracker()
keys = tracker.get_group_experiments_dynamic_metrics_keys("group-uuid")
# e.g. ["eval_accuracy", "train_loss", "val_loss"]
resolve_experiment_sort_column
def resolve_experiment_sort_column(group_id: str, sort_by: str) -> str | None
Resolve a sort key to the underlying column expression for experiment queries.
Used to translate user-facing sort keys (e.g. 'static.learning_rate') into the SQL expression needed for sorting.
Arguments:
group_id (str) - The group context for resolution.
sort_by (str) - The sort key to resolve.
Returns:
str | None: The resolved SQL column expression, or None if invalid.
Example:
tracker = ExperimentTracker()
col = tracker.resolve_experiment_sort_column("group-uuid", "static.lr")
update_model
def update_model(
model_id: str,
name: str | None = None,
tags: list[str] | None = None
) -> Model | None
Update model metadata.
Only the provided fields are updated; None values are ignored.
Arguments:
model_id (str) - The model to update.
name (str | None) - New model name.
tags (list[str] | None) - New list of tags.
Returns:
Model | None: The updated model, or None if not found.
Example:
tracker = ExperimentTracker()
tracker.update_model("model-uuid", name="v2-finetuned", tags=["prod"])
delete_model
def delete_model(model_id: str) -> None
Delete a model by ID.
Arguments:
model_id (str) - The model to delete.
Example:
tracker = ExperimentTracker()
tracker.delete_model("model-uuid")
list_experiment_models
def list_experiment_models(experiment_id: str) -> list[Model]
Retrieve all models associated with an experiment.
Arguments:
experiment_id (str) - The experiment to query.
Returns:
list[Model]- List of models linked to the experiment.
Example:
tracker = ExperimentTracker()
models = tracker.list_experiment_models("exp-1")
for model in models:
print(f"{model.name} ({model.size} bytes)")
enable_tracing
def enable_tracing() -> None
Enable OpenTelemetry tracing for the experiment.
Sets up automatic tracing of function calls and links traces to the experiment. Useful for tracking execution flow in ML pipelines.
Example:
tracker = ExperimentTracker()
tracker.enable_tracing()
exp_id = tracker.start_experiment()
# All traced functions will be logged to this experiment
export
def export(output_path: str,
experiment_id: str | None = None) -> "ExperimentReference"
Export the entire experiment tracking data and save as an artifact.
Arguments:
output_path (str) - Path to save the exported artifact.
experiment_id (str | None) - ID of the experiment to export. Optional; defaults to None.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
# Log data...
tracker.end_experiment()
tracker.export("experiment_data.tar", experiment_id=exp_id)
export
def export(
output_path: str,
experiment_id: str | None = None
) -> "ExperimentReference"
Export the entire experiment tracking data and save as an artifact.
Arguments:
output_path (str) - Path to save the exported artifact.
experiment_id (str | None) - ID of the experiment to export. Optional; defaults to None.
Example:
tracker = ExperimentTracker()
exp_id = tracker.start_experiment()
# Log data...
tracker.end_experiment()
tracker.export("experiment_data.tar", experiment_id=exp_id)