End-to-End Evaluation Pipeline
Complete walkthrough — create a suite, add cases, run evaluations, save baselines, and detect regressions.
This guide builds a complete evaluation pipeline from scratch using PostgreSQL and the Sentinel engine.
1. Set up the engine
package main
import (
"context"
"database/sql"
"fmt"
"log"
"os"
"github.com/uptrace/bun"
"github.com/uptrace/bun/dialect/pgdialect"
"github.com/uptrace/bun/driver/pgdriver"
"github.com/xraph/sentinel"
"github.com/xraph/sentinel/engine"
"github.com/xraph/sentinel/suite"
"github.com/xraph/sentinel/testcase"
"github.com/xraph/sentinel/baseline"
pgstore "github.com/xraph/sentinel/store/postgres"
)
func main() {
ctx := context.Background()
// PostgreSQL store
sqldb := sql.OpenDB(pgdriver.NewConnector(
pgdriver.WithDSN(os.Getenv("DATABASE_URL")),
))
db := bun.NewDB(sqldb, pgdialect.New())
store := pgstore.New(db)
// Build engine
eng, err := engine.New(
engine.WithStore(store),
engine.WithConfig(sentinel.Config{
DefaultModel: "gpt-4o",
PassThreshold: 0.7,
Concurrency: 4,
}),
)
if err != nil {
log.Fatal(err)
}
// Run migrations
if err := store.Migrate(ctx); err != nil {
log.Fatal("migrate:", err)
}
// Set scope
ctx = sentinel.WithApp(ctx, "support-bot")
ctx = sentinel.WithTenant(ctx, "acme-corp")
run(ctx, eng)
}
2. Create a suite
func run(ctx context.Context, eng *engine.Engine) {
s := &suite.Suite{
Name: "support-agent-eval",
Description: "Evaluate the customer support agent",
SystemPrompt: "You are a helpful customer support assistant for Acme Corp.",
Model: "gpt-4o",
Temperature: 0,
}
if err := eng.CreateSuite(ctx, s); err != nil {
log.Fatal("create suite:", err)
}
fmt.Println("created suite:", s.ID)3. Add test cases
cases := []*testcase.Case{
{
SuiteID: s.ID,
Name: "greeting",
Input: "Hello, I need help with my order #12345",
Expected: "Professional greeting, acknowledges the order number",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "contains", Config: map[string]any{"substring": "12345"}},
{Name: "llm_judge", Config: map[string]any{
"criteria": "Professional greeting that acknowledges the specific order",
}},
},
Tags: []string{"greeting", "order"},
},
{
SuiteID: s.ID,
Name: "refund-policy",
Input: "What is your refund policy?",
Expected: "Accurate refund policy with timeframes",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "contains", Config: map[string]any{"substring": "refund"}},
{Name: "factual", Config: map[string]any{}},
},
Tags: []string{"policy", "refund"},
},
{
SuiteID: s.ID,
Name: "out-of-scope",
Input: "Can you help me write Python code?",
Expected: "Polite redirect to support topics",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "llm_judge", Config: map[string]any{
"criteria": "Politely redirects to customer support topics without writing code",
}},
},
Tags: []string{"boundary", "off-topic"},
},
}
if err := eng.CreateCaseBatch(ctx, cases); err != nil {
log.Fatal("create cases:", err)
}
fmt.Printf("created %d cases\n", len(cases))4. Run the evaluation
Evaluations are triggered via the HTTP API (POST /sentinel/suites/:id/run) or programmatically through the eval runner; a minimal HTTP trigger is sketched below.
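The sketch is illustrative only: it assumes the Sentinel server is reachable at a base URL you supply (read here from a hypothetical SENTINEL_BASE_URL variable) and that the run endpoint accepts an empty request body; adjust both to your deployment. It also needs "net/http" added to the imports from step 1.
// Hypothetical HTTP trigger; the base URL and the empty request body are assumptions.
baseURL := os.Getenv("SENTINEL_BASE_URL") // e.g. http://localhost:8080
runURL := baseURL + "/sentinel/suites/" + s.ID.String() + "/run"
req, err := http.NewRequestWithContext(ctx, http.MethodPost, runURL, nil)
if err != nil {
	log.Fatal("build run request:", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
	log.Fatal("trigger run:", err)
}
resp.Body.Close()
fmt.Println("triggered run, status:", resp.Status)
Runs may execute asynchronously, so the listing below can still come back empty right after triggering. Once a run has completed: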
// List runs for the suite
runs, err := eng.ListRunsBySuite(ctx, s.ID)
if err != nil {
log.Fatal("list runs:", err)
}
if len(runs) == 0 {
fmt.Println("no runs yet — trigger via API: POST /sentinel/suites/" + s.ID.String() + "/run")
return
}
run := runs[0]
fmt.Printf("run %s: %d/%d passed (%.1f%%)\n", run.ID, run.Passed, run.TotalCases, run.PassRate*100)5. Review results
results, _ := eng.ListResults(ctx, run.ID)
for _, r := range results {
	preview := r.Output
	if len(preview) > 50 { // guard: slicing a shorter output directly would panic
		preview = preview[:50]
	}
	fmt.Printf(" [%s] %s: %.2f (%s)\n", r.Status, r.CaseName, r.Score, preview)
}
stats, _ := eng.GetResultStats(ctx, run.ID)
fmt.Printf("stats: %d total, %d passed, avg score %.2f\n",
	stats.TotalCases, stats.Passed, stats.AvgScore)
6. Save a baseline
b := &baseline.Baseline{
SuiteID: s.ID,
RunID: run.ID,
Name: "v1.0-baseline",
PassRate: run.PassRate,
AvgScore: run.AvgScore,
IsCurrent: true,
}
if err := eng.SaveBaseline(ctx, b); err != nil {
log.Fatal("save baseline:", err)
}
fmt.Println("saved baseline:", b.ID)7. Detect regressions
On subsequent runs, compare against the baseline:
latest, _ := eng.GetLatestBaseline(ctx, s.ID)
fmt.Printf("baseline: %.1f%% pass rate, %.2f avg score\n",
latest.PassRate*100, latest.AvgScore)
// Compare: if new run's pass rate < baseline's, regression detected
if run.PassRate < latest.PassRate {
fmt.Printf("REGRESSION: pass rate dropped from %.1f%% to %.1f%%\n",
latest.PassRate*100, run.PassRate*100)
}
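// One might also treat a drop in average score as a regression, using the
// AvgScore fields already carried on the run and the baseline:
if run.AvgScore < latest.AvgScore {
	fmt.Printf("REGRESSION: avg score dropped from %.2f to %.2f\n",
		latest.AvgScore, run.AvgScore)
}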
}
8. Multi-model comparison
Compare the same suite across different models via the API:
POST /sentinel/suites/:id/compare
{
"models": ["gpt-4o", "claude-3-opus", "gpt-4o-mini"],
"temperature": 0
}
Results include per-model pass rates, scores, latency, cost, and dimension scores.
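As a rough sketch of issuing this request from Go: the helper below is hypothetical (it is not part of the Sentinel SDK), the base URL is an assumption, and the response is printed raw rather than decoded because its exact schema is not reproduced here. It needs "bytes", "io", and "net/http" in addition to the imports from step 1.
// compareModels is a hypothetical helper, not a Sentinel SDK call.
func compareModels(ctx context.Context, baseURL, suiteID string) error {
	body := []byte(`{"models": ["gpt-4o", "claude-3-opus", "gpt-4o-mini"], "temperature": 0}`)
	compareURL := baseURL + "/sentinel/suites/" + suiteID + "/compare"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, compareURL, bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	fmt.Println(string(raw)) // raw JSON with per-model pass rates, scores, latency, cost
	return nil
}
It would be called with the scoped context, your base URL, and the suite ID from step 2, for example compareModels(ctx, baseURL, s.ID.String()).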
Complete code
The full runnable example follows the pattern above. For production, wire everything through the Forge extension as shown in the Forge Extension guide.