Skip to content

evalwire.cli

evalwire.cli

evalwire CLI — evalwire upload, evalwire run, evalwire validate, evalwire export, evalwire compare, and evalwire report commands.

main()

evalwire — evaluate any async callable with Arize Phoenix.

Source code in src/evalwire/cli.py
@click.group()
def main() -> None:
    """evalwire — evaluate any async callable with Arize Phoenix."""
    # Root command group. The body is intentionally empty: subcommands attach
    # themselves through ``@main.command(...)``, and the docstring above is
    # kept to a single line because click uses it as the group's help text.

upload_cmd(csv_path, on_exist, input_keys, output_keys, tag_column, delimiter, strict, config_path)

Upload a CSV testset to Arize Phoenix as one or more named datasets.

Source code in src/evalwire/cli.py
@main.command("upload")
@click.option("--csv", "csv_path", default=None, help="Path to the CSV file.")
@click.option(
    "--on-exist",
    type=click.Choice(["skip", "overwrite", "append"]),
    default=None,
    show_default=True,
    help="How to handle existing datasets.",
)
@click.option(
    "--input-keys",
    default=None,
    help="Comma-separated input column names.",
)
@click.option(
    "--output-keys",
    default=None,
    help="Comma-separated output column names.",
)
@click.option(
    "--tag-column",
    default=None,
    help="Column used for dataset splitting.",
)
@click.option(
    "--delimiter",
    default=None,
    help="Pipe-split delimiter.",
)
@click.option(
    "--strict",
    is_flag=True,
    default=False,
    help="Abort upload if validation issues are found.",
)
@click.option(
    "--config",
    "config_path",
    default=None,
    help="Path to evalwire.toml.",
)
def upload_cmd(
    csv_path: str | None,
    on_exist: str | None,
    input_keys: str | None,
    output_keys: str | None,
    tag_column: str | None,
    delimiter: str | None,
    strict: bool,
    config_path: str | None,
) -> None:
    """Upload a CSV testset to Arize Phoenix as one or more named datasets."""
    # NOTE: the docstring is click's help text for this command, so the
    # detailed contract lives in comments instead.
    #
    # Every option defaults to None so that "not given on the command line"
    # can be told apart from an explicit value; missing values are filled in
    # from the dataset section of the config file, then from built-in defaults.
    #
    # Exit behaviour:
    #   * click.UsageError propagates unchanged (click renders it).
    #   * --strict validation failures and any other exception print a message
    #     to stderr and exit with status 2.
    try:
        config = load_config(config_path)
        ds_cfg = get_dataset_config(config)

        # CLI flags take precedence over config file values.
        resolved_csv = csv_path or ds_cfg.get("csv_path")

        # Drop empty names produced by stray commas ("a,b," or "a,,b"); if
        # nothing usable remains, treat the flag as not supplied and fall back
        # to the config file / defaults.
        cli_input_keys = [
            key.strip() for key in (input_keys or "").split(",") if key.strip()
        ]
        resolved_input_keys = cli_input_keys or ds_cfg.get(
            "input_keys", ["user_query"]
        )
        cli_output_keys = [
            key.strip() for key in (output_keys or "").split(",") if key.strip()
        ]
        resolved_output_keys = cli_output_keys or ds_cfg.get(
            "output_keys", ["expected_output"]
        )
        resolved_tag_column = tag_column or ds_cfg.get("tag_column", "tags")
        resolved_delimiter = delimiter or ds_cfg.get("delimiter", "|")

        if not resolved_csv:
            raise click.UsageError(
                "No CSV path provided. Use --csv or set csv_path in evalwire.toml."
            )

        # click.Choice only validates the command-line flag; a value coming
        # from the config file must be checked here before it is cast.
        on_exist_value = on_exist or ds_cfg.get("on_exist", "skip")
        if on_exist_value not in ("skip", "overwrite", "append"):
            raise click.UsageError(
                f"Invalid on_exist value {on_exist_value!r}. "
                "Expected one of: skip, overwrite, append."
            )
        resolved_on_exist = cast(
            Literal["skip", "overwrite", "append"],
            on_exist_value,
        )

        if strict:
            # Imported lazily so the validator is only loaded when requested.
            from evalwire.validator import DatasetValidator

            validator = DatasetValidator()
            result = validator.validate(
                csv_path=resolved_csv,
                input_keys=resolved_input_keys,
                output_keys=resolved_output_keys,
                tag_column=resolved_tag_column,
            )
            if not result.is_valid:
                for issue in result.issues:
                    row_info = f"row {issue.row}: " if issue.row is not None else ""
                    click.echo(f"  {row_info}{issue.message}", err=True)
                click.echo(
                    f"Validation failed: {len(result.issues)} issue(s) found. Upload aborted.",
                    err=True,
                )
                # SystemExit is not an Exception subclass, so it bypasses the
                # generic handler below and terminates with status 2.
                sys.exit(2)

        from evalwire.uploader import DatasetUploader

        client = _make_client()
        uploader = DatasetUploader(
            csv_path=resolved_csv,
            phoenix_client=client,
            input_keys=resolved_input_keys,
            output_keys=resolved_output_keys,
            tag_column=resolved_tag_column,
            delimiter=resolved_delimiter,
        )
        datasets = uploader.upload(on_exist=resolved_on_exist)
        click.echo(f"Uploaded {len(datasets)} dataset(s): {', '.join(datasets)}")
    except click.UsageError:
        raise
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

run_cmd(experiments_path, names, dry_run, concurrency, prefix, config_path)

Discover and execute all registered experiments against their Phoenix datasets.

Source code in src/evalwire/cli.py
@main.command("run")
@click.option(
    "--experiments",
    "experiments_path",
    help="Path to the experiments directory.",
    default=None,
)
@click.option(
    "--name",
    "names",
    help="Run only the named experiment(s). Repeatable.",
    multiple=True,
)
@click.option(
    "--dry-run",
    "dry_run",
    type=int,
    default=None,
    is_flag=False,
    flag_value=1,
    help="Run without uploading results. Optional count of examples.",
)
@click.option(
    "--concurrency",
    type=int,
    default=None,
    help="Number of parallel experiments.",
)
@click.option(
    "--prefix",
    help="Experiment name prefix in Phoenix.",
    default=None,
)
@click.option(
    "--config",
    "config_path",
    help="Path to evalwire.toml.",
    default=None,
)
def run_cmd(
    experiments_path: str | None,
    names: tuple[str, ...],
    dry_run: int | None,
    concurrency: int | None,
    prefix: str | None,
    config_path: str | None,
) -> None:
    """Discover and execute all registered experiments against their Phoenix datasets."""
    # Settings are resolved as: command-line flag, then the experiments section
    # of the config file, then a built-in default.
    #
    # Exit behaviour: a SystemExit raised while running is re-issued with
    # status 1; click.UsageError propagates; anything else is reported on
    # stderr and exits with status 2.
    try:
        exp_cfg = get_experiments_config(load_config(config_path))

        if experiments_path:
            experiments_dir = experiments_path
        else:
            experiments_dir = exp_cfg.get("dir", "experiments")

        # Compared against None (not truthiness) so an explicit 0 is kept.
        if concurrency is None:
            workers = exp_cfg.get("concurrency", 1)
        else:
            workers = concurrency

        name_prefix = prefix if prefix else exp_cfg.get("prefix", "eval")

        # Absent flag -> False; otherwise the integer supplied to --dry-run.
        dry_run_setting: bool | int = False if dry_run is None else dry_run

        from evalwire.runner import ExperimentRunner

        phoenix = _make_client()
        runner = ExperimentRunner(
            experiments_dir=experiments_dir,
            phoenix_client=phoenix,
            concurrency=workers,
            dry_run=dry_run_setting,
        )
        # An empty --name selection is passed on as None.
        selected = list(names) or None
        outcomes = runner.run(
            names=selected,
            experiment_name_prefix=name_prefix,
        )
        click.echo(f"Completed {len(outcomes)} experiment(s).")
    except SystemExit:
        sys.exit(1)
    except click.UsageError:
        raise
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

export_cmd(experiment_id, fmt, output_path)

Export experiment results to a CSV or JSON file.

Source code in src/evalwire/cli.py
@main.command("export")
@click.option(
    "--experiment",
    "experiment_id",
    help="Phoenix experiment ID.",
    default=None,
)
@click.option(
    "--format",
    "fmt",
    default="csv",
    type=click.Choice(["csv", "json"]),
    show_default=True,
    help="Output format.",
)
@click.option(
    "--output",
    "output_path",
    help="Destination file path.",
    default=None,
)
def export_cmd(
    experiment_id: str | None,
    fmt: str,
    output_path: str | None,
) -> None:
    """Export experiment results to a CSV or JSON file."""
    # Guard clause: the experiment ID is mandatory even though the option
    # itself defaults to None.
    if not experiment_id:
        raise click.UsageError("No experiment ID provided. Use --experiment.")

    try:
        from pathlib import Path as _Path

        from evalwire.results import ResultCollector

        phoenix = _make_client()
        collector = ResultCollector(phoenix)

        # Without --output, write to "<experiment_id>.<format>" (a relative path).
        if output_path:
            destination = _Path(output_path)
        else:
            destination = _Path(f"{experiment_id}.{fmt}")

        collector.export(experiment_id, format=fmt, path=destination)  # ty: ignore[invalid-argument-type]
        click.echo(f"Exported results to {destination}")
    except click.UsageError:
        raise
    except Exception as exc:
        # Any failure is reported on stderr and mapped to exit status 2.
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

compare_cmd(experiment_id_a, experiment_id_b)

Compare two experiment runs by their mean evaluator scores.

Source code in src/evalwire/cli.py
@main.command("compare")
@click.argument("experiment_id_a")
@click.argument("experiment_id_b")
def compare_cmd(experiment_id_a: str, experiment_id_b: str) -> None:
    """Compare two experiment runs by their mean evaluator scores."""
    # Prints one line per evaluator, sorted by name, in the form
    #   <name>: <score_a> -> <score_b>  (<signed delta>)
    # Any failure is reported on stderr and exits with status 2.
    try:
        from evalwire.results import ResultCollector

        client = _make_client()
        rc = ResultCollector(client)
        comparison = rc.compare(experiment_id_a, experiment_id_b)

        click.echo(f"Comparing {experiment_id_a!r} vs {experiment_id_b!r}")
        click.echo("")
        for name, info in sorted(comparison.items()):
            delta = info["delta"]
            # Negative numbers already carry "-"; only non-negative deltas
            # need an explicit "+" prefix.
            sign = "+" if delta >= 0 else ""
            # The two scores are separated by " -> " so they no longer run
            # together into a single unreadable number.
            click.echo(
                f"  {name}: {info['score_a']:.4f} -> {info['score_b']:.4f}"
                f"  ({sign}{delta:.4f})"
            )
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

report_cmd(experiment_id)

Generate a markdown summary report for an experiment.

Source code in src/evalwire/cli.py
@main.command("report")
@click.option(
    "--experiment",
    "experiment_id",
    help="Phoenix experiment ID.",
    default=None,
)
def report_cmd(experiment_id: str | None) -> None:
    """Generate a markdown summary report for an experiment."""
    # The option defaults to None, so a missing ID is rejected here as a
    # usage error before any client is created.
    if not experiment_id:
        raise click.UsageError("No experiment ID provided. Use --experiment.")

    try:
        from evalwire.results import ResultCollector

        phoenix = _make_client()
        collector = ResultCollector(phoenix)
        # The report text is written to stdout as-is.
        click.echo(collector.report(experiment_id))
    except click.UsageError:
        raise
    except Exception as exc:
        # Any failure is reported on stderr and mapped to exit status 2.
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

validate_cmd(csv_path, input_keys, output_keys, tag_column)

Validate a CSV testset for structural and content correctness.

Source code in src/evalwire/cli.py
@main.command("validate")
@click.option("--csv", "csv_path", default=None, help="Path to the CSV file.")
@click.option(
    "--input-keys",
    default="user_query",
    show_default=True,
    help="Comma-separated input column names.",
)
@click.option(
    "--output-keys",
    default="expected_output",
    show_default=True,
    help="Comma-separated output column names.",
)
@click.option(
    "--tag-column",
    default="tags",
    show_default=True,
    help="Column used for dataset splitting.",
)
def validate_cmd(
    csv_path: str | None,
    input_keys: str,
    output_keys: str,
    tag_column: str,
) -> None:
    """Validate a CSV testset for structural and content correctness."""
    # Exit status: 0 when the testset is valid, 1 when validation issues are
    # found (each issue is printed to stdout), 2 for any other error.
    # click.UsageError propagates unchanged so click can render it.
    if not csv_path:
        raise click.UsageError("No CSV path provided. Use --csv.")
    try:
        from evalwire.validator import DatasetValidator

        # Drop empty names produced by stray commas ("a,b," or "a,,b") so the
        # validator is never asked about a column with an empty name.
        resolved_input_keys = [
            key.strip() for key in input_keys.split(",") if key.strip()
        ]
        resolved_output_keys = [
            key.strip() for key in output_keys.split(",") if key.strip()
        ]
        if not resolved_input_keys:
            raise click.UsageError("--input-keys must name at least one column.")
        if not resolved_output_keys:
            raise click.UsageError("--output-keys must name at least one column.")

        validator = DatasetValidator()
        result = validator.validate(
            csv_path=csv_path,
            input_keys=resolved_input_keys,
            output_keys=resolved_output_keys,
            tag_column=tag_column,
        )
        if result.is_valid:
            click.echo("Validation passed: testset is valid.")
        else:
            for issue in result.issues:
                row_info = f"row {issue.row}: " if issue.row is not None else ""
                click.echo(f"  {row_info}{issue.message}")
            click.echo(f"Validation failed: {len(result.issues)} issue(s) found.")
            # SystemExit is not an Exception subclass, so it bypasses the
            # generic handler below and terminates with status 1.
            sys.exit(1)
    except click.UsageError:
        raise
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)