Evals
Provider-neutral eval runner, metrics, and reporters.
Import from @anvia/core or @anvia/core/evals.
EvalCase
type EvalCase<Input, Expected = unknown> = {
id: string;
input: Input;
expected?: Expected;
metadata?: Record<string, JsonValue | undefined>;
};Purpose: one input and optional expected value for an eval suite.
EvalOutcome
type EvalOutcomeStatus = "pass" | "fail" | "invalid";
type EvalOutcome<Score = unknown> =
| { outcome: "pass"; score?: Score; comment?: string; metadata?: EvalMetadata }
| { outcome: "fail"; score?: Score; comment?: string; metadata?: EvalMetadata }
| { outcome: "invalid"; reason: string; score?: Score; comment?: string; metadata?: EvalMetadata };Purpose: normalized metric result. Use EvalOutcome.pass(...), EvalOutcome.fail(...), and EvalOutcome.invalid(...) to construct outcomes.
EvalMetric
type EvalMetric<Input, Output, Score = unknown, Expected = unknown> = {
name: string;
evaluate(args: EvalMetricArgs<Input, Output, Expected>): EvalOutcome<Score> | Promise<EvalOutcome<Score>>;
};
type EvalMetricArgs<Input, Output, Expected = unknown> = {
suiteName: string;
case: EvalCase<Input, Expected>;
output: Output;
};
type EvalMetricResult<Score = unknown> = {
metricName: string;
outcome: EvalOutcome<Score>;
reporterErrors: unknown[];
};
type EvalCaseResult<Input, Output, Expected = unknown> = {
case: EvalCase<Input, Expected>;
output?: Output;
targetError?: unknown;
metrics: EvalMetricResult[];
};Purpose: evaluates one case output and records normalized metric results.
EvalReporter
type EvalReportArgs<Input, Output, Score = unknown, Expected = unknown> = {
suiteName: string;
case: EvalCase<Input, Expected>;
output?: Output;
targetError?: unknown;
metric: EvalMetric<Input, Output, Score, Expected>;
outcome: EvalOutcome<Score>;
};
type EvalReporter<Input = unknown, Output = unknown, Expected = unknown> = {
report(args: EvalReportArgs<Input, Output, unknown, Expected>): void | Promise<void>;
};Purpose: receives each metric outcome for persistence or external reporting.
Return behavior: reporter errors are collected on metric results unless failOnReporterError is true.
runEvalSuite
type RunEvalSuiteOptions<Input, Output, Expected = unknown> = {
name: string;
cases: Array<EvalCase<Input, Expected>>;
target: EvalTarget<Input, Output, Expected>;
metrics: Array<EvalMetric<Input, Output, unknown, Expected>>;
concurrency?: number;
reporters?: Array<EvalReporter<Input, Output, Expected>>;
failOnReporterError?: boolean;
};
type EvalSuiteResult<Input, Output, Expected = unknown> = {
name: string;
results: Array<EvalCaseResult<Input, Output, Expected>>;
passed: number;
failed: number;
invalid: number;
durationMs: number;
};
function runEvalSuite<Input, Output, Expected>(
options: RunEvalSuiteOptions<Input, Output, Expected>,
): Promise<EvalSuiteResult<Input, Output, Expected>>;Purpose: runs cases through a target, evaluates each metric, calls optional reporters, and returns ordered results.
Return behavior: target errors become invalid metric outcomes. Reporter errors are collected unless failOnReporterError is true.
Built-in Metrics
type ValueSelector<Input, Output, Expected, Value> = (
args: EvalMetricArgs<Input, Output, Expected>,
) => Value | Promise<Value>;
type SelectorOrValue<Input, Output, Expected, Value> =
| Value
| ValueSelector<Input, Output, Expected, Value>;
type ExactMatchOptions<Input, Output, Expected = unknown> = {
name?: string;
actual?: ValueSelector<Input, Output, Expected, unknown>;
expected?: SelectorOrValue<Input, Output, Expected, unknown>;
};
type ContainsOptions<Input, Output, Expected = unknown> = {
name?: string;
actual?: ValueSelector<Input, Output, Expected, string>;
expected?: SelectorOrValue<Input, Output, Expected, string | RegExp>;
};
type SemanticSimilarityOptions<Input, Output, Expected = unknown> = {
name?: string;
model: EmbeddingModel;
threshold: number;
actual?: ValueSelector<Input, Output, Expected, string>;
expected?: SelectorOrValue<Input, Output, Expected, string>;
};
type LlmJudgeOptions<Input, Output, SchemaOutput, Expected = unknown> = {
name?: string;
model: CompletionModel;
schema: ZodSchema<SchemaOutput>;
passes(value: SchemaOutput): boolean;
instructions?: string;
retries?: number;
prompt?: ValueSelector<Input, Output, Expected, string>;
};
type LlmScoreMetricScore = {
score: number;
feedback: string;
};
type LlmScoreOptions<Input, Output, Expected = unknown> = {
name?: string;
model: CompletionModel;
threshold: number;
criteria: string | string[];
instructions?: string;
retries?: number;
prompt?: ValueSelector<Input, Output, Expected, string>;
};
exactMatch(options?: ExactMatchOptions);
contains(options?: ContainsOptions);
semanticSimilarity(options: SemanticSimilarityOptions);
llmJudge(options: LlmJudgeOptions);
llmScore(options: LlmScoreOptions);Purpose: common deterministic, embedding, and LLM-as-judge eval checks.
agentEvalTarget
type AgentEvalTargetOptions<Input, Output = PromptResponse> = {
prompt?: (input: Input, testCase: EvalCase<Input>) => string | Message;
output?: (response: PromptResponse, testCase: EvalCase<Input>) => Output;
};
function agentEvalTarget<Input>(
agent: Agent,
options?: AgentEvalTargetOptions<Input, PromptResponse>,
): EvalTarget<Input, PromptResponse>;
function agentEvalTarget<Input, Output>(
agent: Agent,
options: AgentEvalTargetOptions<Input, Output>,
): EvalTarget<Input, Output>;Purpose: adapts an Agent to an eval target by calling agent.prompt(input).send().
Return behavior: returns the full prompt response by default, or a selected value when options.output is provided.
For workflow guidance, see Evals.
