End-to-End Evaluation Pipeline
Complete walkthrough — create a suite, add cases, run evaluations, save baselines, and detect regressions.
This guide builds a complete evaluation pipeline from scratch using PostgreSQL and the Sentinel engine.
1. Set up the engine
package main
import (
"context"
"database/sql"
"fmt"
"log"
"os"
"github.com/uptrace/bun"
"github.com/uptrace/bun/dialect/pgdialect"
"github.com/uptrace/bun/driver/pgdriver"
"github.com/xraph/sentinel"
"github.com/xraph/sentinel/engine"
"github.com/xraph/sentinel/suite"
"github.com/xraph/sentinel/testcase"
"github.com/xraph/sentinel/baseline"
pgstore "github.com/xraph/sentinel/store/postgres"
)
func main() {
ctx := context.Background()
// PostgreSQL store
sqldb := sql.OpenDB(pgdriver.NewConnector(
pgdriver.WithDSN(os.Getenv("DATABASE_URL")),
))
db := bun.NewDB(sqldb, pgdialect.New())
store := pgstore.New(db)
// Build engine
eng, err := engine.New(
engine.WithStore(store),
engine.WithConfig(sentinel.Config{
DefaultModel: "gpt-4o",
PassThreshold: 0.7,
Concurrency: 4,
}),
)
if err != nil {
log.Fatal(err)
}
// Run migrations
if err := store.Migrate(ctx); err != nil {
log.Fatal("migrate:", err)
}
// Set scope
ctx = sentinel.WithApp(ctx, "support-bot")
ctx = sentinel.WithTenant(ctx, "acme-corp")
run(ctx, eng)
}
2. Create a suite
func run(ctx context.Context, eng *engine.Engine) {
s := &suite.Suite{
Name: "support-agent-eval",
Description: "Evaluate the customer support agent",
SystemPrompt: "You are a helpful customer support assistant for Acme Corp.",
Model: "gpt-4o",
Temperature: 0,
}
if err := eng.CreateSuite(ctx, s); err != nil {
log.Fatal("create suite:", err)
}
fmt.Println("created suite:", s.ID)3. Add test cases
cases := []*testcase.Case{
{
SuiteID: s.ID,
Name: "greeting",
Input: "Hello, I need help with my order #12345",
Expected: "Professional greeting, acknowledges the order number",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "contains", Config: map[string]any{"substring": "12345"}},
{Name: "llm_judge", Config: map[string]any{
"criteria": "Professional greeting that acknowledges the specific order",
}},
},
Tags: []string{"greeting", "order"},
},
{
SuiteID: s.ID,
Name: "refund-policy",
Input: "What is your refund policy?",
Expected: "Accurate refund policy with timeframes",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "contains", Config: map[string]any{"substring": "refund"}},
{Name: "factual", Config: map[string]any{}},
},
Tags: []string{"policy", "refund"},
},
{
SuiteID: s.ID,
Name: "out-of-scope",
Input: "Can you help me write Python code?",
Expected: "Polite redirect to support topics",
ScenarioType: testcase.ScenarioStandard,
Scorers: []testcase.ScorerConfig{
{Name: "llm_judge", Config: map[string]any{
"criteria": "Politely redirects to customer support topics without writing code",
}},
},
Tags: []string{"boundary", "off-topic"},
},
}
if err := eng.CreateCaseBatch(ctx, cases); err != nil {
log.Fatal("create cases:", err)
}
fmt.Printf("created %d cases\n", len(cases))4. Run the evaluation
Evaluations are triggered via the HTTP API (POST /sentinel/suites/:id/run) or programmatically through the eval runner; a minimal HTTP trigger is sketched below.
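The sketch is illustrative only: it assumes the Sentinel server is reachable at a base URL you supply (read here from a hypothetical SENTINEL_BASE_URL variable) and that the run endpoint accepts an empty request body; adjust both to your deployment. It also needs "net/http" added to the imports from step 1.
// Hypothetical HTTP trigger; the base URL and the empty request body are assumptions.
baseURL := os.Getenv("SENTINEL_BASE_URL") // e.g. http://localhost:8080
runURL := baseURL + "/sentinel/suites/" + s.ID.String() + "/run"
req, err := http.NewRequestWithContext(ctx, http.MethodPost, runURL, nil)
if err != nil {
	log.Fatal("build run request:", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
	log.Fatal("trigger run:", err)
}
resp.Body.Close()
fmt.Println("triggered run, status:", resp.Status)
Runs may execute asynchronously, so the listing below can still come back empty right after triggering. Once a run has completed: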
// List runs for the suite
runs, err := eng.ListRunsBySuite(ctx, s.ID)
if err != nil {
log.Fatal("list runs:", err)
}
if len(runs) == 0 {
fmt.Println("no runs yet — trigger via API: POST /sentinel/suites/" + s.ID.String() + "/run")
return
}
run := runs[0]
fmt.Printf("run %s: %d/%d passed (%.1f%%)\n", run.ID, run.Passed, run.TotalCases, run.PassRate*100)5. Review results
results, _ := eng.ListResults(ctx, run.ID)
for _, r := range results {
	preview := r.Output
	if len(preview) > 50 { // guard: slicing a shorter output directly would panic
		preview = preview[:50]
	}
	fmt.Printf(" [%s] %s: %.2f (%s)\n", r.Status, r.CaseName, r.Score, preview)
}
stats, _ := eng.GetResultStats(ctx, run.ID)
fmt.Printf("stats: %d total, %d passed, avg score %.2f\n",
	stats.TotalCases, stats.Passed, stats.AvgScore)
6. Save a baseline
b := &baseline.Baseline{
SuiteID: s.ID,
RunID: run.ID,
Name: "v1.0-baseline",
PassRate: run.PassRate,
AvgScore: run.AvgScore,
IsCurrent: true,
}
if err := eng.SaveBaseline(ctx, b); err != nil {
log.Fatal("save baseline:", err)
}
fmt.Println("saved baseline:", b.ID)7. Detect regressions
On subsequent runs, compare against the baseline:
latest, _ := eng.GetLatestBaseline(ctx, s.ID)
fmt.Printf("baseline: %.1f%% pass rate, %.2f avg score\n",
latest.PassRate*100, latest.AvgScore)
// Compare: if new run's pass rate < baseline's, regression detected
if run.PassRate < latest.PassRate {
fmt.Printf("REGRESSION: pass rate dropped from %.1f%% to %.1f%%\n",
latest.PassRate*100, run.PassRate*100)
}
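// One might also treat a drop in average score as a regression, using the
// AvgScore fields already carried on the run and the baseline:
if run.AvgScore < latest.AvgScore {
	fmt.Printf("REGRESSION: avg score dropped from %.2f to %.2f\n",
		latest.AvgScore, run.AvgScore)
}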
}
8. Multi-model comparison
Compare the same suite across different models via the API:
POST /sentinel/suites/:id/compare
{
"models": ["gpt-4o", "claude-3-opus", "gpt-4o-mini"],
"temperature": 0
}
Results include per-model pass rates, scores, latency, cost, and dimension scores.
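As a rough sketch of issuing this request from Go: the helper below is hypothetical (it is not part of the Sentinel SDK), the base URL is an assumption, and the response is printed raw rather than decoded because its exact schema is not reproduced here. It needs "bytes", "io", and "net/http" in addition to the imports from step 1.
// compareModels is a hypothetical helper, not a Sentinel SDK call.
func compareModels(ctx context.Context, baseURL, suiteID string) error {
	body := []byte(`{"models": ["gpt-4o", "claude-3-opus", "gpt-4o-mini"], "temperature": 0}`)
	compareURL := baseURL + "/sentinel/suites/" + suiteID + "/compare"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, compareURL, bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	fmt.Println(string(raw)) // raw JSON with per-model pass rates, scores, latency, cost
	return nil
}
It would be called with the scoped context, your base URL, and the suite ID from step 2, for example compareModels(ctx, baseURL, s.ID.String()).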
Complete code
The full runnable example follows the pattern above. For production, wire everything through the Forge extension as shown in the Forge Extension guide.