34 changes: 17 additions & 17 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -19,13 +19,13 @@ AgentV's core should remain minimal. Complex or domain-specific logic belongs in

**Extension points (prefer these over adding built-ins):**
- `code-grader` scripts for custom evaluation logic
- `llm-grader` evaluators with custom prompt files for domain-specific LLM grading
- `llm-grader` graders with custom prompt files for domain-specific LLM grading
- CLI wrappers that consume AgentV's JSON/JSONL output for post-processing (aggregation, comparison, reporting)

**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in. This includes adding config overrides to existing evaluators — if a niche provider needs custom tool-name matching, that's a code-grader, not a new config field.
**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in. This includes adding config overrides to existing graders — if a niche provider needs custom tool-name matching, that's a code-grader, not a new config field.

### 2. Built-ins for Primitives Only
Built-in evaluators provide **universal primitives** that users compose. A primitive is:
Built-in graders provide **universal primitives** that users compose. A primitive is:
- Stateless and deterministic
- Has a single, clear responsibility
- Cannot be trivially composed from other primitives
@@ -77,11 +77,11 @@ AI agents are the primary users of AgentV—not humans reading docs. Design for

## Project Structure
- `packages/core/` - Evaluation engine, providers, grading
- `src/evaluation/registry/` - Extensible evaluator registry (EvaluatorRegistry, assertion discovery)
- `src/evaluation/registry/` - Extensible grader registry (EvaluatorRegistry, assertion discovery)
- `src/evaluation/providers/provider-registry.ts` - Provider plugin registry
- `src/evaluation/evaluate.ts` - `evaluate()` programmatic API
- `src/evaluation/config.ts` - `defineConfig()` for typed agentv.config.ts
- `packages/eval/` - Lightweight assertion SDK (`defineAssertion`, `defineCodeJudge`)
- `packages/eval/` - Lightweight assertion SDK (`defineAssertion`, `defineCodeGrader`)
- `apps/cli/` - Command-line interface (published as `agentv`)
- `src/commands/create/` - Scaffold commands (`agentv create assertion/eval`)
- `examples/features/sdk-*` - SDK usage examples (custom assertion, programmatic API, config file)
@@ -261,9 +261,9 @@ Tests should be lean and focused on what matters. Follow these principles:
- **Regression tests > comprehensive tests.** A test that would have caught the bug is worth more than five tests that exercise happy paths.
- **Tests are executable contracts.** When a module's behavioral contract changes, the tests must reflect the new contract — not just the happy path. If you change what a function promises, update its tests to assert the new promise.

### Verifying Evaluator Changes
### Verifying Grader Changes

Unit tests alone are insufficient for evaluator changes. After implementing or modifying evaluators:
Unit tests alone are insufficient for grader changes. After implementing or modifying graders:

1. **Copy `.env` to the worktree** if running in a git worktree (e2e tests need environment variables):
```bash
@@ -272,21 +272,21 @@
```powershell
Copy-Item D:/path/to/main/.env .env
```
Do not claim e2e or evaluator verification results unless this preflight has passed.
Do not claim e2e or grader verification results unless this preflight has passed.

2. **Run an actual eval** with a real example file:
```bash
bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id <test-id>
```

3. **Inspect the results JSONL** to verify:
- The correct evaluator type is invoked (check `scores[].type`)
- The correct grader type is invoked (check `scores[].type`)
- Scores are calculated as expected
- Assertions array reflects the evaluation logic (each entry has `text`, `passed`, optional `evidence`)

4. **Update baseline files** if output format changes (e.g., type name renames). Baseline files live alongside eval YAML files as `*.baseline.jsonl` and contain expected `scores[].type` values. There are 30+ baseline files across `examples/`.

5. **Note:** `--dry-run` returns schema-valid mock responses (`{}` as output, zeroed `tokenUsage`). Built-in graders will not crash, but scores are meaningless. Use it for testing harness flow, not evaluator logic.
5. **Note:** `--dry-run` returns schema-valid mock responses (`{}` as output, zeroed `tokenUsage`). Built-in graders will not crash, but scores are meaningless. Use it for testing harness flow, not grader logic.

### Completing Work — E2E Checklist

@@ -307,11 +307,11 @@ Before marking any branch as ready for review, complete this checklist:
- **Green (with your changes):** Run the identical scenario with your branch. Confirm the fix or feature works correctly from the end user's perspective. Capture the output.
- **Document both** red and green results in the PR description or comments so reviewers can see the before/after evidence.

For evaluator changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.
For grader changes, this means running a real eval (not `--dry-run`) and inspecting the output JSONL. For CLI/UX changes, this means running the CLI command and verifying the console output.

4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).
4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed grader parsing, run an eval that exercises different grader types).

5. **Live eval verification**: For changes affecting scoring, thresholds, or evaluator behavior, run at least one real eval with a live provider (not `--dry-run`) and verify the output JSONL has correct scores, verdicts, and execution status.
5. **Live eval verification**: For changes affecting scoring, thresholds, or grader behavior, run at least one real eval with a live provider (not `--dry-run`) and verify the output JSONL has correct scores, verdicts, and execution status.

6. **Studio UX verification**: For changes affecting config, scoring display, or studio API, use `agent-browser` to verify the studio UI still renders and functions correctly (settings page loads, pass/fail indicators are correct, config saves work).

@@ -323,15 +323,15 @@ When making changes to functionality:

1. **Docs site** (`apps/web/src/content/docs/`): Update human-readable documentation on agentv.dev. This is the comprehensive reference.

2. **Skill files** (`plugins/agentv-dev/skills/agentv-eval-builder/`): Update the AI-focused reference card if the change affects YAML schema, evaluator types, or CLI commands. Keep concise — link to docs site for details.
2. **Skill files** (`plugins/agentv-dev/skills/agentv-eval-builder/`): Update the AI-focused reference card if the change affects YAML schema, grader types, or CLI commands. Keep concise — link to docs site for details.

3. **Examples** (`examples/`): Update any example code, scripts, or eval YAML files that exercise the changed functionality. Examples are both documentation and integration tests.

4. **README.md**: Keep minimal. Links point to agentv.dev.

## Evaluator Type System
## Grader Type System

Evaluator types use **kebab-case** everywhere (matching promptfoo convention):
Grader types use **kebab-case** everywhere (matching promptfoo convention):

- **YAML config:** `type: llm-grader`, `type: is-json`, `type: execution-metrics`
- **Internal TypeScript:** `EvaluatorKind = 'llm-grader' | 'is-json' | ...`
@@ -340,7 +340,7 @@ Evaluator types use **kebab-case** everywhere (matching promptfoo convention):

**Source of truth:** `EVALUATOR_KIND_VALUES` array in `packages/core/src/evaluation/types.ts`

**Backward compatibility:** Snake_case is accepted in YAML (`llm_judge` → `llm-grader`) via `normalizeEvaluatorType()` in `evaluator-parser.ts`. Single-word types (`contains`, `equals`, `regex`, `latency`, `cost`) have no separator and are unchanged.
**Backward compatibility:** Snake_case is accepted in YAML (`llm_judge` → `llm-grader`) via `normalizeGraderType()` in `grader-parser.ts`. Single-word types (`contains`, `equals`, `regex`, `latency`, `cost`) have no separator and are unchanged.

**Two type definitions exist:**
- `EvaluatorKind` in `packages/core/src/evaluation/types.ts` — internal, canonical
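The snake_case backward-compatibility rule in this AGENTS.md hunk can be sketched as follows. The alias table and function body are illustrative assumptions, not the actual `normalizeGraderType()` implementation in `grader-parser.ts`:

```typescript
// Hypothetical sketch of snake_case → kebab-case normalization; the real
// normalizeGraderType() in grader-parser.ts may differ.
const LEGACY_ALIASES: Record<string, string> = {
  // Renamed types need an explicit alias, not just a separator swap.
  llm_judge: 'llm-grader',
};

function normalizeGraderType(type: string): string {
  if (type in LEGACY_ALIASES) return LEGACY_ALIASES[type];
  // Single-word types (contains, equals, regex, latency, cost) contain no
  // underscore and pass through unchanged.
  return type.replace(/_/g, '-');
}
```

For example, `normalizeGraderType('is_json')` yields `'is-json'`, while `'contains'` comes back unchanged.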
2 changes: 1 addition & 1 deletion README.md
@@ -107,7 +107,7 @@ console.log(`${summary.passed}/${summary.total} passed`);
Full docs at [agentv.dev/docs](https://agentv.dev/docs/getting-started/introduction/).

- [Eval files](https://agentv.dev/docs/evaluation/eval-files/) — format and structure
- [Custom evaluators](https://agentv.dev/docs/evaluators/custom-evaluators/) — code graders in any language
- [Custom graders](https://agentv.dev/docs/graders/custom-graders/) — code graders in any language
- [Rubrics](https://agentv.dev/docs/evaluation/rubrics/) — structured criteria scoring
- [Targets](https://agentv.dev/docs/targets/configuration/) — configure agents and providers
- [Compare results](https://agentv.dev/docs/tools/compare/) — A/B testing and regression detection
8 changes: 3 additions & 5 deletions apps/cli/src/commands/eval/artifact-writer.ts
@@ -4,7 +4,7 @@ import path from 'node:path';
import {
DEFAULT_THRESHOLD,
type EvaluationResult,
type EvaluatorResult,
type GraderResult,
toTranscriptJsonLines,
} from '@agentv/core';
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
@@ -227,9 +227,9 @@ function buildAssertions(result: EvaluationResult): GradingArtifact['assertions'
// Build graders list
// ---------------------------------------------------------------------------

function buildEvaluators(
scores: readonly EvaluatorResult[] | undefined,
): GradingArtifact['graders'] {
function buildEvaluators(scores: readonly GraderResult[] | undefined): GradingArtifact['graders'] {
if (!scores || scores.length === 0) {
return undefined;
}
@@ -370,7 +368,7 @@ export function buildBenchmarkArtifact(
runSummary[target] = entry as (typeof runSummary)[string];
}

// Per-evaluator summary across all results
// Per-grader summary across all results
const evaluatorScores = new Map<string, number[]>();
for (const result of results) {
if (result.scores) {
4 changes: 2 additions & 2 deletions apps/cli/src/commands/eval/benchmark-writer.ts
@@ -32,10 +32,10 @@ function computeStats(values: readonly number[]): BenchmarkStats {
}

/**
* Compute per-test pass_rate from evaluator scores.
* Compute per-test pass_rate from grader scores.
*
* For each test, pass_rate = count(evaluator.score >= 0.8) / total_evaluators.
* If no per-evaluator scores exist, falls back to the top-level result score
* If no per-grader scores exist, falls back to the top-level result score
* with the same threshold (>= 0.8 → 1.0, else 0.0).
*/
function computePassRate(result: EvaluationResult): number {
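The pass-rate rule documented in the `benchmark-writer.ts` comment amounts to the following sketch. The type and fallback wiring are assumptions for illustration, not the shipped `computePassRate`:

```typescript
// Illustrative sketch of the documented rule: a grader "passes" when its
// score is >= 0.8; with no per-grader scores, fall back to the top-level score.
interface GraderScore {
  score: number;
}

const PASS_THRESHOLD = 0.8;

function computePassRate(
  scores: readonly GraderScore[],
  topLevelScore: number,
): number {
  if (scores.length === 0) {
    // No per-grader scores: apply the same threshold to the result score.
    return topLevelScore >= PASS_THRESHOLD ? 1.0 : 0.0;
  }
  const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
  return passed / scores.length;
}
```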
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/commands/assert.ts
@@ -62,7 +62,7 @@ export const evalAssertCommand = command({
process.exit(1);
}

// Build payload matching CodeEvaluator's expected format (snake_case).
// Build payload matching CodeGrader's expected format (snake_case).
// Include all fields that defineCodeGrader validates as required.
const payload = JSON.stringify(
{
6 changes: 3 additions & 3 deletions apps/cli/src/commands/eval/html-writer.ts
@@ -500,10 +500,10 @@ const SCRIPT = `
h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
h+="</div>";

/* evaluator results */
/* grader results */
if(r.scores&&r.scores.length>0){
h+="<h4>Evaluator Results</h4>";
h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
h+="<h4>Grader Results</h4>";
h+='<table class="eval-table"><thead><tr><th>Grader</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
for(var i=0;i<r.scores.length;i++){
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
var evAssertions=ev.assertions||[];
40 changes: 20 additions & 20 deletions apps/cli/src/commands/inspect/score.ts
@@ -2,9 +2,9 @@ import {
type EvalTest,
type EvaluationContext,
type EvaluationScore,
type Evaluator,
type EvaluatorConfig,
type EvaluatorDispatchContext,
type Grader,
type GraderConfig,
type GraderDispatchContext,
type Message,
type Provider,
type ProviderRequest,
@@ -24,7 +24,7 @@
} from './utils.js';

/**
* Evaluator types that work without an LLM provider.
* Grader types that work without an LLM provider.
*/
const SUPPORTED_TYPES = [
'contains',
@@ -52,7 +52,7 @@ function parseKeyValues(s: string): Record<string, string> {
}

/**
* Parse an inline evaluator spec string into an EvaluatorConfig.
* Parse an inline evaluator spec string into an GraderConfig.
*
* Supported formats:
* contains:value
@@ -64,7 +64,7 @@
* token-usage:max_total=N,max_input=N,max_output=N
* execution-metrics:max_tool_calls=N,max_tokens=N,max_llm_calls=N,...
*/
export function parseAssertSpec(spec: string): EvaluatorConfig {
export function parseAssertSpec(spec: string): GraderConfig {
const colonIdx = spec.indexOf(':');
// Normalize snake_case to kebab-case for backward compat
const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, '-');
@@ -73,31 +73,31 @@
switch (type) {
case 'contains':
if (!params) throw new Error('contains requires a value: contains:<value>');
return { name: 'contains', type: 'contains', value: params } as EvaluatorConfig;
return { name: 'contains', type: 'contains', value: params } as GraderConfig;

case 'regex':
if (!params) throw new Error('regex requires a pattern: regex:<pattern>');
return { name: 'regex', type: 'regex', value: params } as EvaluatorConfig;
return { name: 'regex', type: 'regex', value: params } as GraderConfig;

case 'is-json':
return { name: 'is-json', type: 'is-json' } as EvaluatorConfig;
return { name: 'is-json', type: 'is-json' } as GraderConfig;

case 'equals':
if (!params) throw new Error('equals requires a value: equals:<value>');
return { name: 'equals', type: 'equals', value: params } as EvaluatorConfig;
return { name: 'equals', type: 'equals', value: params } as GraderConfig;

case 'latency': {
const threshold = Number(params);
if (!params || Number.isNaN(threshold))
throw new Error('latency requires a threshold in ms: latency:<ms>');
return { name: 'latency', type: 'latency', threshold } as EvaluatorConfig;
return { name: 'latency', type: 'latency', threshold } as GraderConfig;
}

case 'cost': {
const budget = Number(params);
if (!params || Number.isNaN(budget))
throw new Error('cost requires a budget in USD: cost:<usd>');
return { name: 'cost', type: 'cost', budget } as EvaluatorConfig;
return { name: 'cost', type: 'cost', budget } as GraderConfig;
}

case 'token-usage': {
@@ -106,7 +106,7 @@
if (kv.max_total) config.max_total = Number(kv.max_total);
if (kv.max_input) config.max_input = Number(kv.max_input);
if (kv.max_output) config.max_output = Number(kv.max_output);
return config as EvaluatorConfig;
return config as GraderConfig;
}

case 'execution-metrics': {
@@ -120,12 +120,12 @@
if (kv.max_tokens) config.max_tokens = Number(kv.max_tokens);
if (kv.max_cost_usd) config.max_cost_usd = Number(kv.max_cost_usd);
if (kv.max_duration_ms) config.max_duration_ms = Number(kv.max_duration_ms);
return config as EvaluatorConfig;
return config as GraderConfig;
}

default:
throw new Error(
`Unsupported evaluator type: "${type}". Supported: ${SUPPORTED_TYPES.join(', ')}`,
`Unsupported grader type: "${type}". Supported: ${SUPPORTED_TYPES.join(', ')}`,
);
}
}
@@ -171,7 +171,7 @@ const stubProvider: Provider = {
/**
* A no-op evaluator stub used as the required llmGrader in the dispatch context.
*/
const stubLlmGrader: Evaluator = {
const stubLlmGrader: Grader = {
kind: 'llm-grader',
evaluate(): EvaluationScore {
throw new Error('trace score does not support LLM-based evaluators');
@@ -189,12 +189,12 @@

async function runScore(
results: RawResult[],
evaluatorConfig: EvaluatorConfig,
evaluatorConfig: GraderConfig,
testIdFilter?: string,
): Promise<ScoreResult[]> {
const registry = createBuiltinRegistry();

const dispatchContext: EvaluatorDispatchContext = {
const dispatchContext: GraderDispatchContext = {
llmGrader: stubLlmGrader,
registry,
};
@@ -308,7 +308,7 @@ export const traceScoreCommand = command({
long: 'assert',
short: 'a',
description:
'Evaluator spec: contains:<val>, regex:<pat>, is-json, equals:<val>, latency:<ms>, cost:<usd>, token-usage:<params>, execution-metrics:<params>',
'Grader spec: contains:<val>, regex:<pat>, is-json, equals:<val>, latency:<ms>, cost:<usd>, token-usage:<params>, execution-metrics:<params>',
}),
testId: option({
type: optional(string),
@@ -324,7 +324,7 @@
},
handler: async ({ file, assert: assertSpec, testId, format }) => {
// Parse the evaluator spec
let evaluatorConfig: EvaluatorConfig;
let evaluatorConfig: GraderConfig;
try {
evaluatorConfig = parseAssertSpec(assertSpec);
} catch (err) {
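The `token-usage` and `execution-metrics` spec strings in `score.ts` use a comma-separated `key=value` form; a minimal parser in the spirit of `parseKeyValues` might look like this (a sketch under that assumption, not the actual implementation):

```typescript
// Minimal sketch: split "max_total=1000,max_input=800" into a string map.
function parseKeyValues(s: string): Record<string, string> {
  const out: Record<string, string> = {};
  for (const pair of s.split(',')) {
    const eq = pair.indexOf('=');
    // Skip malformed pairs with no key or no '=' separator.
    if (eq > 0) {
      out[pair.slice(0, eq).trim()] = pair.slice(eq + 1).trim();
    }
  }
  return out;
}
```

Usage: `parseKeyValues('max_total=1000,max_input=800')` yields `{ max_total: '1000', max_input: '800' }`, which the switch cases then convert to numbers field by field.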
2 changes: 1 addition & 1 deletion apps/cli/src/commands/inspect/show.ts
@@ -46,7 +46,7 @@ function renderFlatTrace(result: RawResult): string {
}

/**
* Render per-evaluator scores inline.
* Render per-grader scores inline.
*/
function renderScores(scores: { name: string; score: number; type: string }[]): string {
return scores
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/bench.ts
@@ -28,7 +28,7 @@ interface EvaluatorScore {

export const evalBenchCommand = command({
name: 'bench',
description: 'Merge evaluator scores and produce benchmark artifacts',
description: 'Merge grader scores and produce benchmark artifacts',
args: {
exportDir: positional({
type: string,