diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f14bcd..e559713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.0] - 2026-02-27 + +### Added + +- **Dashboard retrieval analytics**: New analytics section on the Overview page surfaces retrieval feedback data that was previously collected but never visualized. Addresses 4 observability gaps identified in the system audit: + - **Tool call frequency**: Horizontal bar chart showing MCP tool usage breakdown (search, recall, predict, etc.) + - **Retrieval volume over time**: Weekly time series of retrieval activity, reusing the existing `TimeSeries` D3 component + - **Top retrieved chunks**: Table of the 10 most-retrieved chunks with project, preview, token count, and retrieval count — surfaces dominant-chunk problems + - **Chunk size distribution**: Vertical bar chart of chunk token-count buckets (0-200, 201-500, 501-1K, 1K-2K, 2K-5K, 5K+) for validating length penalty tuning + - **Per-project retrieval quality**: Projects page now shows Retrievals and Unique Queries columns alongside existing chunk counts + - **Stat cards**: Total Retrievals, Unique Queries, and Top Tool summary cards +- **`ToolUsageChart` component** (`src/dashboard/client/src/components/stats/ToolUsageChart.tsx`): D3 horizontal bar chart for tool usage data +- **`SizeDistribution` component** (`src/dashboard/client/src/components/stats/SizeDistribution.tsx`): D3 vertical bar chart for chunk size buckets + +### Changed + +- **`GET /api/stats`**: Response now includes an `analytics` object with `toolUsage`, `retrievalTimeSeries`, `topChunks`, `projectRetrievals`, `sizeDistribution`, and `totalRetrievals`. Gracefully returns empty arrays and 0 when no feedback data exists. 
+- **`GET /api/projects`**: Each project now includes `retrievals` and `uniqueQueries` fields (default 0). +- **SECURITY.md**: Updated supported versions to `>= 0.9.0`. + +### Tests + +- 4 new route tests: empty analytics, populated analytics with feedback data, zero retrieval counts on projects, per-project retrieval counts. +- 2031 total tests passing. + ## [0.8.2] - 2026-02-25 ### Fixed diff --git a/SECURITY.md b/SECURITY.md index 1e41847..d9b0f97 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,8 +4,8 @@ | Version | Supported | | ------- | ------------------ | -| >= 0.8.0 | :white_check_mark: | -| < 0.8.0 | :x: | +| >= 0.9.0 | :white_check_mark: | +| < 0.9.0 | :x: | ## Reporting a Vulnerability diff --git a/config.schema.json b/config.schema.json index 0c01d82..b295fb6 100644 --- a/config.schema.json +++ b/config.schema.json @@ -213,6 +213,24 @@ }, "additionalProperties": false }, + "lengthPenalty": { + "type": "object", + "description": "Length penalty settings to favour focused chunks over large keyword-rich ones", + "properties": { + "enabled": { + "type": "boolean", + "default": true, + "description": "Enable logarithmic length penalty for scoring" + }, + "referenceTokens": { + "type": "number", + "minimum": 1, + "default": 500, + "description": "Reference token count. Chunks at this size receive no penalty; larger chunks are penalised logarithmically." 
+ } + }, + "additionalProperties": false + }, "recency": { "type": "object", "description": "Recency boost settings for time-decay scoring", diff --git a/docs/guides/dashboard.md b/docs/guides/dashboard.md index 1292baf..b5bd55c 100644 --- a/docs/guides/dashboard.md +++ b/docs/guides/dashboard.md @@ -24,6 +24,7 @@ Collection-wide statistics at a glance: - Graph connectivity metrics - Per-project breakdown - Recent ingestion activity +- **Retrieval analytics** (shown when feedback data exists): tool call frequency chart, retrieval volume over time, chunk size distribution, top retrieved chunks table, and summary stat cards (total retrievals, unique queries, top tool) ### Search @@ -58,6 +59,7 @@ Per-project views: - Sessions per project with time ranges - Chunk distribution across sessions +- Retrieval counts and unique query counts per project - Project-specific graph statistics ## API Routes @@ -66,11 +68,11 @@ The dashboard exposes a REST API that powers the UI. These routes can also be us | Route | Description | | --------------------------------------- | -------------------------------------------------- | -| `GET /api/stats` | Collection statistics (chunks, edges, clusters) | +| `GET /api/stats` | Collection statistics and retrieval analytics | | `GET /api/chunks` | List chunks with pagination | | `GET /api/edges` | List edges with filtering | | `GET /api/clusters` | List clusters with member counts | -| `GET /api/projects` | List projects with chunk counts | +| `GET /api/projects` | List projects with chunk and retrieval counts | | `GET /api/graph` | Graph data for visualization (nodes + edges) | | `GET /api/graph/neighborhood` | Neighborhood subgraph around a specific chunk | | `GET /api/search?q=` | Search memory with retrieval pipeline | diff --git a/docs/research/experiments/lessons-learned.md b/docs/research/experiments/lessons-learned.md index 5221180..611b992 100644 --- a/docs/research/experiments/lessons-learned.md +++ 
b/docs/research/experiments/lessons-learned.md @@ -244,6 +244,49 @@ This separation of concerns led to the current architecture: Each mechanism does what it's best at. The v0.2 architecture tried to make the graph do semantic ranking via sum-product path weights — conflating structural and semantic concerns. +## Transition Matrices at Query Boundaries (v0.8.1) + +### What We Tried + +Use cluster-level transition matrices (bigram/trigram) from the causal graph to predict which clusters should be returned at retrieval time. The hypothesis: if session A ended in clusters X,Y and session B started in clusters Y,Z, a transition matrix could learn X→Z and Y→Z patterns useful for prediction. + +A preliminary scan over the full graph showed 61x lift (45% bigram accuracy vs 0.74% random), suggesting strong signal. We designed a controlled experiment isolating signal at actual query boundaries: + +- **Experiment A**: Cross-session prediction — at each cross-session edge, predict the next session's initial clusters from the previous session's final clusters. +- **Experiment B**: Retrieval feedback chain — predict which clusters will be retrieved next based on recent retrieval history. +- **Baselines**: Random, most-popular, recency (predict same clusters), plus global/within-chain/cross-session bigrams, project-conditioned bigram, and trigram. + +### What Happened + +The preliminary 61x lift was entirely within-session workflow signal: + +| Approach | P@5 | Lift@5 | +|----------|-----|--------| +| Random | 3.7% | 1.0x | +| Most popular | 8.4% | 2.3x | +| Recency | 22.1% | 6.0x | +| **Global bigram** | **31.6%** | **8.5x** | +| Within-chain bigram | 31.6% | 8.5x | +| **Cross-session bigram** | **4.2%** | **1.1x** | + +The cross-session bigram matrix contained only 3 source clusters and 3 cells — too sparse to learn anything. The global bigram's 8.5x lift was identical to the within-chain bigram, confirming it was entirely driven by within-chain edges (74.7% of forward edges). 
+ +### Why It Failed + +Two compounding problems: + +1. **Sparsity at boundaries**: Cross-session edges are only 4.2% of forward edges. The transition matrix has insufficient data to learn meaningful patterns at actual query boundaries. + +2. **Recency is tautological**: Recency (6.0x lift) is the strongest viable baseline — but it's useless in practice because if you're querying memory, you already have the recent context. Returning the same clusters is circular. + +### The Conclusion + +Transition matrices do not provide useful signal at query boundaries. The approach works within sessions (where sequential chunks naturally revisit the same topics) but this is not where retrieval help is needed. At actual query boundaries — where retrieval would add value — the signal is too sparse to learn from. + +This confirms the architectural separation: the graph's value is **structural ordering** (chain walking), not **predictive ranking** (transition matrices). Semantic discovery remains the job of vector search and BM25. + +Script: `scripts/experiments/transition-boundary-experiment.ts` + ## Takeaways 1. **Question assumptions**: Wall-clock time seems natural but is wrong @@ -254,3 +297,4 @@ Each mechanism does what it's best at. The v0.2 architecture tried to make the g 6. **Measure before theorizing**: Sum-product traversal was theoretically elegant but contributed 2% of results 7. **Separate concerns**: The graph's value is structural ordering, not semantic ranking 8. **Simple beats complex**: 1-to-1 sequential edges outperform m×n all-pairs with sum-product traversal +9. 
**Distinguish within-session from cross-session signal**: A metric that looks great on the full graph may be entirely driven by trivial within-session patterns that don't help at retrieval time diff --git a/package.json b/package.json index 9aaeecb..3fb757f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "causantic", - "version": "0.8.2", + "version": "0.9.0", "description": "Long-term memory for Claude Code — local-first, graph-augmented, self-benchmarking", "type": "module", "private": false, diff --git a/scripts/experiments/README.md b/scripts/experiments/README.md index baae8be..4f43ad0 100644 --- a/scripts/experiments/README.md +++ b/scripts/experiments/README.md @@ -14,3 +14,4 @@ Research experiments and parameter sweeps. | `sweep-min-weight.ts` | Minimum weight sweep | `npm run min-weight-sweep` | | `sweep-depth.ts` | Depth sweep | `npm run depth-sweep` | | `cross-project-experiment.ts` | Cross-project experiment | `npm run cross-project` | +| `transition-boundary-experiment.ts` | Transition matrix at query boundaries (rejected) | `npx tsx scripts/experiments/transition-boundary-experiment.ts` | diff --git a/scripts/experiments/transition-boundary-experiment.ts b/scripts/experiments/transition-boundary-experiment.ts new file mode 100644 index 0000000..e3b9a48 --- /dev/null +++ b/scripts/experiments/transition-boundary-experiment.ts @@ -0,0 +1,789 @@ +/** + * Transition Matrix Experiment at Query Boundaries + * + * Tests whether cluster-level transition patterns in the causal graph + * can improve retrieval at actual memory query boundaries. + * + * Experiment A: Cross-session prediction + * At each cross-session edge, use the previous session's final cluster + * labels to predict the next session's initial cluster labels. + * + * Experiment B: Retrieval feedback chain + * Treat the sequence of retrieval events as a temporal chain: previous + * retrievals = context, current retrieval = ground truth. 
+ * + * Baselines: random, most-popular, recency, naive global bigram. + * + * Result: Rejected. See docs/research/experiments/lessons-learned.md. + */ + +import { getDb, closeDb } from '../../src/storage/db.js'; +import { getAllEdges } from '../../src/storage/edge-store.js'; +import { getAllClusters } from '../../src/storage/cluster-store.js'; +import type { StoredEdge } from '../../src/storage/types.js'; + +// ─── Types ────────────────────────────────────────────────────────────── + +interface ChunkMeta { + id: string; + sessionId: string; + sessionSlug: string; + startTime: string; +} + +interface FeedbackRow { + chunkId: string; + queryHash: string; + returnedAt: string; + toolName: string; +} + +interface TransitionMatrix { + counts: Map>; + totalFrom: Map; +} + +interface ProbabilityRow { + targets: Map; +} + +interface SessionBoundary { + contextClusters: string[]; + groundTruth: string[]; + project: string; +} + +interface RetrievalEvent { + clusters: string[]; + toolName: string; + returnedAt: string; +} + +interface EvalPair { + contextClusters: string[]; + groundTruth: string[]; +} + +interface EvalMetrics { + precisionAtK: Map; + recallAtK: Map; + liftAtK: Map; +} + +const UNCLUSTERED = '__unclustered__'; +const K_VALUES = [1, 3, 5, 10]; + +// ─── Data Loading ─────────────────────────────────────────────────────── + +function loadChunkMetadata(): Map { + const db = getDb(); + const rows = db + .prepare('SELECT id, session_id, session_slug, start_time FROM chunks ORDER BY start_time') + .all() as Array<{ id: string; session_id: string; session_slug: string; start_time: string }>; + + const map = new Map(); + for (const r of rows) { + map.set(r.id, { + id: r.id, + sessionId: r.session_id, + sessionSlug: r.session_slug, + startTime: r.start_time, + }); + } + return map; +} + +function loadAllClusterAssignments(): Map { + const db = getDb(); + const rows = db + .prepare('SELECT chunk_id, cluster_id FROM chunk_clusters ORDER BY chunk_id, distance') + 
.all() as Array<{ chunk_id: string; cluster_id: string }>; + + const map = new Map(); + for (const r of rows) { + const existing = map.get(r.chunk_id); + if (existing) existing.push(r.cluster_id); + else map.set(r.chunk_id, [r.cluster_id]); + } + return map; +} + +function loadAllFeedback(): FeedbackRow[] { + const db = getDb(); + const rows = db + .prepare( + 'SELECT chunk_id, query_hash, returned_at, tool_name FROM retrieval_feedback ORDER BY returned_at', + ) + .all() as Array<{ + chunk_id: string; + query_hash: string; + returned_at: string; + tool_name: string; + }>; + + return rows.map((r) => ({ + chunkId: r.chunk_id, + queryHash: r.query_hash, + returnedAt: r.returned_at, + toolName: r.tool_name, + })); +} + +function getPrimaryCluster(chunkId: string, assignments: Map): string { + const clusters = assignments.get(chunkId); + return clusters?.[0] ?? UNCLUSTERED; +} + +// ─── Transition Matrix ────────────────────────────────────────────────── + +function buildTransitionMatrix( + edges: StoredEdge[], + chunkToPrimary: (id: string) => string, + filter?: (edge: StoredEdge) => boolean, +): TransitionMatrix { + const counts = new Map>(); + const totalFrom = new Map(); + + for (const edge of edges) { + if (filter && !filter(edge)) continue; + + const from = chunkToPrimary(edge.sourceChunkId); + const to = chunkToPrimary(edge.targetChunkId); + if (from === UNCLUSTERED || to === UNCLUSTERED) continue; + + let row = counts.get(from); + if (!row) { + row = new Map(); + counts.set(from, row); + } + row.set(to, (row.get(to) ?? 0) + 1); + totalFrom.set(from, (totalFrom.get(from) ?? 0) + 1); + } + + return { counts, totalFrom }; +} + +function normalizeMatrix(matrix: TransitionMatrix): Map { + const normalized = new Map(); + for (const [from, targets] of matrix.counts) { + const total = matrix.totalFrom.get(from) ?? 
1; + const probs = new Map(); + for (const [to, count] of targets) probs.set(to, count / total); + normalized.set(from, { targets: probs }); + } + return normalized; +} + +type TrigramKey = string; // "clusterA|clusterB" + +function buildTrigramMatrix( + chunkMeta: Map, + assignments: Map, +): Map> { + const sessions = new Map(); + for (const meta of chunkMeta.values()) { + const existing = sessions.get(meta.sessionId); + if (existing) existing.push(meta); + else sessions.set(meta.sessionId, [meta]); + } + + const trigram = new Map>(); + for (const chunks of sessions.values()) { + chunks.sort((a, b) => a.startTime.localeCompare(b.startTime)); + for (let i = 0; i < chunks.length - 2; i++) { + const c1 = getPrimaryCluster(chunks[i].id, assignments); + const c2 = getPrimaryCluster(chunks[i + 1].id, assignments); + const c3 = getPrimaryCluster(chunks[i + 2].id, assignments); + if (c1 === UNCLUSTERED || c2 === UNCLUSTERED || c3 === UNCLUSTERED) continue; + + const key = `${c1}|${c2}`; + let row = trigram.get(key); + if (!row) { + row = new Map(); + trigram.set(key, row); + } + row.set(c3, (row.get(c3) ?? 0) + 1); + } + } + return trigram; +} + +// ─── Predictors ───────────────────────────────────────────────────────── + +function predictFromBigram( + contextClusters: string[], + normalized: Map, + k: number, +): string[] { + const scores = new Map(); + for (const ctx of contextClusters) { + const row = normalized.get(ctx); + if (!row) continue; + for (const [target, prob] of row.targets) scores.set(target, (scores.get(target) ?? 
0) + prob); + } + return [...scores.entries()] + .sort((a, b) => b[1] - a[1]) + .slice(0, k) + .map(([c]) => c); +} + +function predictFromTrigram( + contextClusters: string[], + trigramMatrix: Map>, + bigramFallback: Map, + k: number, +): string[] { + const scores = new Map(); + if (contextClusters.length >= 2) { + for (let i = 0; i < contextClusters.length - 1; i++) { + const key = `${contextClusters[i]}|${contextClusters[i + 1]}`; + const row = trigramMatrix.get(key); + if (row) { + const total = [...row.values()].reduce((s, v) => s + v, 0); + for (const [target, count] of row) + scores.set(target, (scores.get(target) ?? 0) + count / total); + } + } + } + if (scores.size === 0) return predictFromBigram(contextClusters, bigramFallback, k); + return [...scores.entries()] + .sort((a, b) => b[1] - a[1]) + .slice(0, k) + .map(([c]) => c); +} + +function predictProjectConditioned( + contextClusters: string[], + project: string, + projectMatrices: Map>, + globalFallback: Map, + k: number, +): string[] { + const projectMatrix = projectMatrices.get(project); + if (projectMatrix) { + const result = predictFromBigram(contextClusters, projectMatrix, k); + if (result.length >= k) return result; + const resultSet = new Set(result); + for (const c of predictFromBigram(contextClusters, globalFallback, k * 2)) { + if (!resultSet.has(c)) { + result.push(c); + if (result.length >= k) break; + } + } + return result; + } + return predictFromBigram(contextClusters, globalFallback, k); +} + +function predictMostPopular(popularity: [string, number][], k: number): string[] { + return popularity.slice(0, k).map(([c]) => c); +} + +function predictRecency(contextClusters: string[], k: number): string[] { + const seen = new Set(); + const result: string[] = []; + for (let i = contextClusters.length - 1; i >= 0; i--) { + if (!seen.has(contextClusters[i])) { + seen.add(contextClusters[i]); + result.push(contextClusters[i]); + if (result.length >= k) break; + } + } + return result; +} + +// 
─── Evaluation ───────────────────────────────────────────────────────── + +function evaluate( + pairs: EvalPair[], + predictor: (context: string[], k: number) => string[], + totalClusters: number, +): EvalMetrics { + const pSums = new Map(K_VALUES.map((k) => [k, 0])); + const rSums = new Map(K_VALUES.map((k) => [k, 0])); + + for (const { contextClusters, groundTruth } of pairs) { + const truthSet = new Set(groundTruth); + for (const k of K_VALUES) { + const predicted = predictor(contextClusters, k); + const hits = predicted.filter((p) => truthSet.has(p)).length; + pSums.set(k, (pSums.get(k) ?? 0) + hits / Math.max(k, 1)); + rSums.set(k, (rSums.get(k) ?? 0) + hits / Math.max(truthSet.size, 1)); + } + } + + const n = Math.max(pairs.length, 1); // guard like the other denominators: empty input yields 0 metrics, not NaN + const precisionAtK = new Map(); + const recallAtK = new Map(); + const liftAtK = new Map(); + + for (const k of K_VALUES) { + const p = (pSums.get(k) ?? 0) / n; + precisionAtK.set(k, p); + recallAtK.set(k, (rSums.get(k) ?? 0) / n); + liftAtK.set(k, p / (k / totalClusters)); + } + + return { precisionAtK, recallAtK, liftAtK }; +} + +function analyticalRandom(totalClusters: number): EvalMetrics { + return { + precisionAtK: new Map(K_VALUES.map((k) => [k, k / totalClusters])), + recallAtK: new Map(K_VALUES.map((k) => [k, k / totalClusters])), + liftAtK: new Map(K_VALUES.map((k) => [k, 1.0])), + }; +} + +// ─── Experiment A: Cross-Session Prediction ───────────────────────────── + +function buildSessionBoundaries( + crossSessionEdges: StoredEdge[], + chunkMeta: Map, + assignments: Map, + N = 5, + M = 5, +): SessionBoundary[] { + const sessionChunks = new Map(); + for (const meta of chunkMeta.values()) { + const existing = sessionChunks.get(meta.sessionId); + if (existing) existing.push(meta); + else sessionChunks.set(meta.sessionId, [meta]); + } + for (const chunks of sessionChunks.values()) + chunks.sort((a, b) => a.startTime.localeCompare(b.startTime)); + + const boundaries: SessionBoundary[] = []; + const seen = new Set(); + + for (const
edge of crossSessionEdges) { + const srcMeta = chunkMeta.get(edge.sourceChunkId); + const tgtMeta = chunkMeta.get(edge.targetChunkId); + if (!srcMeta || !tgtMeta) continue; + + const key = `${srcMeta.sessionId}|${tgtMeta.sessionId}`; + if (seen.has(key)) continue; + seen.add(key); + + const srcSession = sessionChunks.get(srcMeta.sessionId); + const tgtSession = sessionChunks.get(tgtMeta.sessionId); + if (!srcSession || !tgtSession) continue; + + const contextClusters = srcSession + .slice(-N) + .map((c) => getPrimaryCluster(c.id, assignments)) + .filter((c) => c !== UNCLUSTERED); + const groundTruth = [ + ...new Set( + tgtSession + .slice(0, M) + .map((c) => getPrimaryCluster(c.id, assignments)) + .filter((c) => c !== UNCLUSTERED), + ), + ]; + + if (contextClusters.length === 0 || groundTruth.length === 0) continue; + + const slugParts = srcMeta.sessionSlug.split('/'); + boundaries.push({ + contextClusters, + groundTruth, + project: slugParts[slugParts.length - 1] || srcMeta.sessionSlug, + }); + } + + return boundaries; +} + +// ─── Experiment B: Retrieval Feedback Chain ────────────────────────────── + +function groupRetrievalEvents( + feedback: FeedbackRow[], + chunkToPrimary: (id: string) => string, + windowMs = 5000, +): RetrievalEvent[] { + if (feedback.length === 0) return []; + + const events: RetrievalEvent[] = []; + let group: FeedbackRow[] = [feedback[0]]; + + const flush = () => { + const clusters = [ + ...new Set(group.map((r) => chunkToPrimary(r.chunkId)).filter((c) => c !== UNCLUSTERED)), + ]; + if (clusters.length > 0) { + events.push({ clusters, toolName: group[0].toolName, returnedAt: group[0].returnedAt }); + } + }; + + for (let i = 1; i < feedback.length; i++) { + const prev = feedback[i - 1]; + const curr = feedback[i]; + const sameQuery = prev.queryHash === curr.queryHash && prev.toolName === curr.toolName; + const dt = new Date(curr.returnedAt).getTime() - new Date(prev.returnedAt).getTime(); + + if (sameQuery && dt <= windowMs) { + 
group.push(curr); + } else { + flush(); + group = [curr]; + } + } + flush(); + return events; +} + +function buildFeedbackChain(events: RetrievalEvent[], N = 3): EvalPair[] { + const pairs: EvalPair[] = []; + for (let i = N; i < events.length; i++) { + const contextClusters: string[] = []; + for (let j = i - N; j < i; j++) contextClusters.push(...events[j].clusters); + const groundTruth = events[i].clusters; + if (contextClusters.length > 0 && groundTruth.length > 0) + pairs.push({ contextClusters, groundTruth }); + } + return pairs; +} + +// ─── Output ───────────────────────────────────────────────────────────── + +function fmt(v: number): string { + return (v * 100).toFixed(1).padStart(6) + '%'; +} + +function printTable( + title: string, + n: number, + results: Array<{ name: string; metrics: EvalMetrics }>, +): void { + console.log(`\n${title} (N=${n})`); + const hdr = + 'Approach'.padEnd(28) + + ' | ' + + K_VALUES.map((k) => `P@${k}`).join(' ') + + ' | ' + + K_VALUES.map((k) => `R@${k}`).join(' ') + + ' | Lift@5'; + console.log(hdr); + console.log('─'.repeat(hdr.length)); + + for (const { name, metrics } of results) { + const p = K_VALUES.map((k) => fmt(metrics.precisionAtK.get(k) ?? 0)).join(''); + const r = K_VALUES.map((k) => fmt(metrics.recallAtK.get(k) ?? 0)).join(''); + const lift = (metrics.liftAtK.get(5) ?? 0).toFixed(1).padStart(5) + 'x'; + console.log(`${name.padEnd(28)} | ${p} | ${r} | ${lift}`); + } +} + +function matrixCells(m: TransitionMatrix): number { + return [...m.counts.values()].reduce((s, row) => s + row.size, 0); +} + +// ─── Main ─────────────────────────────────────────────────────────────── + +function main(): void { + console.log('='.repeat(100)); + console.log('TRANSITION MATRIX EXPERIMENT AT QUERY BOUNDARIES'); + console.log('='.repeat(100)); + + // Load data + process.stdout.write('\nLoading chunk metadata... 
'); + const chunkMeta = loadChunkMetadata(); + console.log(`${chunkMeta.size} chunks`); + + process.stdout.write('Loading cluster assignments... '); + const assignments = loadAllClusterAssignments(); + const unclustered = chunkMeta.size - assignments.size; + console.log( + `${assignments.size} assigned, ${unclustered} unclustered (${((unclustered / chunkMeta.size) * 100).toFixed(0)}%)`, + ); + + process.stdout.write('Loading edges... '); + const allEdges = getAllEdges(); + const forwardEdges = allEdges.filter((e) => e.edgeType === 'forward'); + console.log(`${allEdges.length} total, ${forwardEdges.length} forward`); + + process.stdout.write('Loading clusters... '); + const totalClusters = getAllClusters().length; + console.log(`${totalClusters} clusters`); + + process.stdout.write('Loading retrieval feedback... '); + const feedback = loadAllFeedback(); + console.log(`${feedback.length} rows`); + + const primary = (id: string) => getPrimaryCluster(id, assignments); + + // Cluster popularity for most-popular baseline + const popMap = new Map(); + for (const clusters of assignments.values()) { + if (clusters.length > 0) popMap.set(clusters[0], (popMap.get(clusters[0]) ?? 
0) + 1); + } + const popularity: [string, number][] = [...popMap.entries()].sort((a, b) => b[1] - a[1]); + + // Build transition matrices + console.log('\n--- Transition matrices ---'); + + const globalMat = buildTransitionMatrix(forwardEdges, primary); + const globalNorm = normalizeMatrix(globalMat); + console.log(`Global: ${globalMat.counts.size} rows, ${matrixCells(globalMat)} cells`); + + const crossMat = buildTransitionMatrix( + forwardEdges, + primary, + (e) => e.referenceType === 'cross-session', + ); + const crossNorm = normalizeMatrix(crossMat); + console.log(`Cross-session: ${crossMat.counts.size} rows, ${matrixCells(crossMat)} cells`); + + const withinMat = buildTransitionMatrix( + forwardEdges, + primary, + (e) => e.referenceType === 'within-chain', + ); + const withinNorm = normalizeMatrix(withinMat); + console.log(`Within-chain: ${withinMat.counts.size} rows, ${matrixCells(withinMat)} cells`); + + // Per-project + const edgesByProj = new Map(); + for (const edge of forwardEdges) { + const meta = chunkMeta.get(edge.sourceChunkId); + if (!meta) continue; + const parts = meta.sessionSlug.split('/'); + const proj = parts[parts.length - 1] || meta.sessionSlug; + const arr = edgesByProj.get(proj); + if (arr) arr.push(edge); + else edgesByProj.set(proj, [edge]); + } + const projMatrices = new Map>(); + for (const [proj, edges] of edgesByProj) + projMatrices.set(proj, normalizeMatrix(buildTransitionMatrix(edges, primary))); + console.log(`Per-project: ${projMatrices.size} projects`); + + const trigramMat = buildTrigramMatrix(chunkMeta, assignments); + console.log(`Trigram: ${trigramMat.size} prefix pairs`); + + // ── Experiment A ── + console.log('\n' + '='.repeat(100)); + console.log('EXPERIMENT A: CROSS-SESSION PREDICTION'); + console.log('='.repeat(100)); + + const crossEdges = forwardEdges.filter((e) => e.referenceType === 'cross-session'); + console.log(`\nCross-session forward edges: ${crossEdges.length}`); + + const boundaries = 
buildSessionBoundaries(crossEdges, chunkMeta, assignments); + console.log(`Usable boundaries: ${boundaries.length}`); + + if (boundaries.length > 0) { + const projCounts = new Map(); + for (const b of boundaries) projCounts.set(b.project, (projCounts.get(b.project) ?? 0) + 1); + console.log('\nPer-project:'); + for (const [p, n] of [...projCounts.entries()].sort((a, b) => b[1] - a[1])) + console.log(` ${p.padEnd(40)} ${n}`); + + const pairs: EvalPair[] = boundaries; + + const approaches: Array<{ name: string; metrics: EvalMetrics }> = [ + { name: 'Random (analytical)', metrics: analyticalRandom(totalClusters) }, + { + name: 'Most popular', + metrics: evaluate(pairs, (_ctx, k) => predictMostPopular(popularity, k), totalClusters), + }, + { + name: 'Recency', + metrics: evaluate(pairs, (ctx, k) => predictRecency(ctx, k), totalClusters), + }, + { + name: 'Global bigram', + metrics: evaluate(pairs, (ctx, k) => predictFromBigram(ctx, globalNorm, k), totalClusters), + }, + { + name: 'Within-chain bigram', + metrics: evaluate(pairs, (ctx, k) => predictFromBigram(ctx, withinNorm, k), totalClusters), + }, + { + name: 'Cross-session bigram', + metrics: evaluate(pairs, (ctx, k) => predictFromBigram(ctx, crossNorm, k), totalClusters), + }, + { + name: 'Project-cond. bigram', + metrics: evaluate( + pairs, + (ctx, k) => { + const b = boundaries.find((b) => b.contextClusters === ctx); + return predictProjectConditioned(ctx, b?.project ?? 
'', projMatrices, globalNorm, k); + }, + totalClusters, + ), + }, + { + name: 'Trigram', + metrics: evaluate( + pairs, + (ctx, k) => predictFromTrigram(ctx, trigramMat, globalNorm, k), + totalClusters, + ), + }, + ]; + + printTable('EXPERIMENT A: CROSS-SESSION PREDICTION', boundaries.length, approaches); + + // Trigram coverage + let triHits = 0; + let triMisses = 0; + for (const { contextClusters } of pairs) { + if (contextClusters.length < 2) { + triMisses++; + continue; + } + let found = false; + for (let i = 0; i < contextClusters.length - 1; i++) { + if (trigramMat.has(`${contextClusters[i]}|${contextClusters[i + 1]}`)) { + found = true; + break; + } + } + if (found) triHits++; + else triMisses++; + } + console.log( + `\nTrigram coverage: ${triHits}/${triHits + triMisses} (${((triHits / (triHits + triMisses)) * 100).toFixed(0)}%) — fallback to bigram: ${triMisses}`, + ); + } else { + console.log('\nNo usable cross-session boundaries. Skipping Experiment A.'); + } + + // ── Experiment B ── + console.log('\n' + '='.repeat(100)); + console.log('EXPERIMENT B: RETRIEVAL FEEDBACK CHAIN'); + console.log('='.repeat(100)); + + const events = groupRetrievalEvents(feedback, primary); + console.log(`\nRetrieval events (grouped): ${events.length}`); + + if (events.length < 50) + console.log( + `WARNING: <50 events — Experiment B is underpowered. 
Schema v13 may need more data.`, + ); + + if (events.length > 3) { + const fbPairs = buildFeedbackChain(events, 3); + console.log(`Evaluation pairs: ${fbPairs.length}`); + + if (fbPairs.length > 0) { + const fbApproaches: Array<{ name: string; metrics: EvalMetrics }> = [ + { name: 'Random (analytical)', metrics: analyticalRandom(totalClusters) }, + { + name: 'Most popular', + metrics: evaluate(fbPairs, (_ctx, k) => predictMostPopular(popularity, k), totalClusters), + }, + { + name: 'Recency', + metrics: evaluate(fbPairs, (ctx, k) => predictRecency(ctx, k), totalClusters), + }, + { + name: 'Global bigram', + metrics: evaluate( + fbPairs, + (ctx, k) => predictFromBigram(ctx, globalNorm, k), + totalClusters, + ), + }, + { + name: 'Cross-session bigram', + metrics: evaluate( + fbPairs, + (ctx, k) => predictFromBigram(ctx, crossNorm, k), + totalClusters, + ), + }, + { + name: 'Trigram', + metrics: evaluate( + fbPairs, + (ctx, k) => predictFromTrigram(ctx, trigramMat, globalNorm, k), + totalClusters, + ), + }, + ]; + + printTable('EXPERIMENT B: RETRIEVAL FEEDBACK CHAIN', fbPairs.length, fbApproaches); + + const toolCounts = new Map(); + for (const e of events) toolCounts.set(e.toolName, (toolCounts.get(e.toolName) ?? 0) + 1); + console.log('\nEvents by tool:'); + for (const [t, n] of [...toolCounts.entries()].sort((a, b) => b[1] - a[1])) + console.log(` ${t.padEnd(20)} ${n}`); + } + } else { + console.log('Insufficient events to form chains. 
Skipping Experiment B.'); + } + + // ── Diagnostics ── + console.log('\n' + '='.repeat(100)); + console.log('DIAGNOSTICS'); + console.log('='.repeat(100)); + + const possible = totalClusters * totalClusters; + console.log('\nMatrix densities:'); + console.log( + ` Global: ${((matrixCells(globalMat) / possible) * 100).toFixed(2)}% (${possible} possible)`, + ); + console.log(` Cross-session: ${((matrixCells(crossMat) / possible) * 100).toFixed(2)}%`); + console.log(` Within-chain: ${((matrixCells(withinMat) / possible) * 100).toFixed(2)}%`); + console.log( + ` Trigram: ${trigramMat.size} / ${possible} prefixes (${((trigramMat.size / possible) * 100).toFixed(2)}%)`, + ); + + console.log('\nEdge types (forward):'); + const typeCounts = new Map(); + for (const e of forwardEdges) + typeCounts.set(e.referenceType ?? 'null', (typeCounts.get(e.referenceType ?? 'null') ?? 0) + 1); + for (const [t, n] of [...typeCounts.entries()].sort((a, b) => b[1] - a[1])) + console.log( + ` ${t.padEnd(20)} ${n.toLocaleString().padStart(8)} (${((n / forwardEdges.length) * 100).toFixed(1)}%)`, + ); + + console.log( + `\nCoverage: ${assignments.size}/${chunkMeta.size} clustered (${((assignments.size / chunkMeta.size) * 100).toFixed(0)}%), ${totalClusters} clusters`, + ); + + // ── Conclusion ── + console.log('\n' + '='.repeat(100)); + console.log('CONCLUSION'); + console.log('='.repeat(100)); + + if (boundaries.length > 0) { + const recM = evaluate(boundaries, (ctx, k) => predictRecency(ctx, k), totalClusters); + const crossM = evaluate( + boundaries, + (ctx, k) => predictFromBigram(ctx, crossNorm, k), + totalClusters, + ); + const globalM = evaluate( + boundaries, + (ctx, k) => predictFromBigram(ctx, globalNorm, k), + totalClusters, + ); + + const recP5 = recM.precisionAtK.get(5) ?? 0; + const crossP5 = crossM.precisionAtK.get(5) ?? 0; + const globalP5 = globalM.precisionAtK.get(5) ?? 
0; + + console.log('\nKey question: Does cross-session bigram beat recency at P@5?'); + console.log(` Recency: ${(recP5 * 100).toFixed(1)}%`); + console.log( + ` Cross-session bigram: ${(crossP5 * 100).toFixed(1)}% (lift: ${(crossM.liftAtK.get(5) ?? 0).toFixed(1)}x)`, + ); + console.log( + ` Global bigram: ${(globalP5 * 100).toFixed(1)}% (lift: ${(globalM.liftAtK.get(5) ?? 0).toFixed(1)}x)`, + ); + + console.log(`\n→ ${crossP5 > recP5 ? 'YES' : 'NO'}. Cross-session bigram (${(crossM.liftAtK.get(5) ?? 0).toFixed(1)}x lift) ${crossP5 > recP5 ? 'beats' : 'does not beat'} recency (${(recM.liftAtK.get(5) ?? 0).toFixed(1)}x).`); // verdict derived from this run's metrics, not a past run's numbers + console.log(` Global bigram lift (${(globalM.liftAtK.get(5) ?? 0).toFixed(1)}x): compare with the within-chain bigram row in Experiment A.`); + if (crossP5 <= recP5) console.log(' Transition matrices do not provide useful signal at query boundaries.'); // only state the negative result when this run supports it + } + + closeDb(); +} + +main(); diff --git a/src/cli/commands/init/hooks.ts b/src/cli/commands/init/hooks.ts index a4e4732..5433c7f 100644 --- a/src/cli/commands/init/hooks.ts +++ b/src/cli/commands/init/hooks.ts @@ -1,21 +1,17 @@ import * as fs from 'node:fs'; -import { getCliEntryPath } from './shared.js'; export async function configureHooks(claudeConfigPath: string): Promise { try { const settingsContent = fs.readFileSync(claudeConfigPath, 'utf-8'); const config = JSON.parse(settingsContent); - const cliEntry = getCliEntryPath(); - const nodeBin = process.execPath; - const causanticHooks = [ { event: 'PreCompact', matcher: '', hook: { type: 'command', - command: `${nodeBin} ${cliEntry} hook pre-compact`, + command: `npx causantic hook pre-compact`, timeout: 300, async: true, }, @@ -25,7 +21,7 @@ export async function configureHooks(claudeConfigPath: string): Promise { matcher: '', hook: { type: 'command', - command: `${nodeBin} ${cliEntry} hook session-start`, + command: `npx causantic hook session-start`, timeout: 60, }, }, @@ -34,7 +30,7 @@ export async function configureHooks(claudeConfigPath: string): Promise { matcher: '', hook: { type: 'command', - command: `${nodeBin} ${cliEntry} hook session-end`, + command: `npx causantic hook session-end`, timeout: 300, async: true, }, @@ -44,7 +40,7 @@ export
async function configureHooks(claudeConfigPath: string): Promise { matcher: '', hook: { type: 'command', - command: `${nodeBin} ${cliEntry} hook claudemd-generator`, + command: `npx causantic hook claudemd-generator`, timeout: 60, async: true, }, diff --git a/src/cli/commands/init/skills.ts b/src/cli/commands/init/skills.ts index 57f7a9f..7e6e890 100644 --- a/src/cli/commands/init/skills.ts +++ b/src/cli/commands/init/skills.ts @@ -26,7 +26,14 @@ export async function installSkillsAndClaudeMd(): Promise { } // Clean up removed skills (causantic-context merged into causantic-explain) - const removedSkills = ['causantic-context']; + const removedSkills = [ + 'causantic-context', + 'causantic-explain', + 'causantic-debug', + 'causantic-summary', + 'causantic-crossref', + 'causantic-retro', + ]; for (const name of removedSkills) { const removedDir = path.join(skillsDir, name); if (fs.existsSync(removedDir)) { diff --git a/src/cli/skill-templates.ts b/src/cli/skill-templates.ts index 6687925..c2fcbba 100644 --- a/src/cli/skill-templates.ts +++ b/src/cli/skill-templates.ts @@ -143,129 +143,6 @@ Pass these to the \`predict\` MCP tool: - Always provide a concise summary of the current task as the \`context\` parameter - Use early in a task to front-load relevant context - Especially useful when starting unfamiliar work — past sessions may have covered it -`, - }, - { - dirName: 'causantic-explain', - content: `--- -name: causantic-explain -description: "Answer 'why' questions and explore codebase areas using memory. Handles both focused decision questions and comprehensive area briefings." -argument-hint: [question or area] ---- - -# Explain & Explore - -Answer "why" questions about code and architecture, or build comprehensive context about a codebase area — both by reconstructing narratives from memory. - -## Usage - -\`\`\` -/causantic-explain why does the chunker split on tool boundaries? -/causantic-explain what led to the RRF fusion approach? 
-/causantic-explain the authentication module -/causantic-explain src/storage/chunk-store.ts -\`\`\` - -## Intent Detection - -| User asks | Mode | Output format | -|-----------|------|---------------| -| "Why does X..." / "What led to..." | Focused decision | Decision narrative | -| "Tell me about X" / area name / file path | Area briefing | Comprehensive briefing | - -## Workflow - -1. **Reconstruct the narrative**: Use \`recall\` with the topic to walk causal chains — problem, alternatives, what was chosen and why -2. **Gather broad context**: Use \`search\` with the topic for semantically related past context, evolution, and related discussions - -## Output Format: Focused Decision - -Use when the query is a specific "why" or "what led to" question: - -- **Decision**: What was decided -- **Context**: The problem or need that prompted it -- **Alternatives Considered**: Other approaches that were evaluated -- **Rationale**: Why this approach was chosen -- **Trade-offs**: Known downsides or limitations accepted - -## Output Format: Area Briefing - -Use when the query names an area, module, file, or broad topic: - -- **Purpose**: What this area does (from memory's perspective) -- **Key Decisions**: Decisions that shaped this area, with rationale -- **Evolution**: Major changes over time -- **Constraints & Tech Debt**: Known limitations or workarounds -- **Recent Activity**: What was recently changed or discussed - -## When to Use - -- User asks "why does X work this way?" -- User asks "what led to this decision?" 
-- User asks "tell me about X" or names a codebase area -- Before changing existing architecture — understand the reasoning first -- When code seems surprising or non-obvious -- When starting work in an unfamiliar area - -## Guidelines - -- Present the narrative as a story: what was the problem, what was tried, what stuck -- If memory shows the decision evolved over time, show the progression -- For area briefings, focus on the "why" — the user can read the code for the "what" -- If memory has conflicting information across time, present the most recent and note the evolution -- If memory has no context, say so — do not fabricate rationale -`, - }, - { - dirName: 'causantic-debug', - content: `--- -name: causantic-debug -description: "Search past sessions for prior encounters with the current error, bug pattern, or issue. Use when stuck on an error or debugging a recurring problem." -argument-hint: [error message or description] ---- - -# Debug with Memory - -Search past sessions for prior encounters with the current error, bug pattern, or issue. - -## Usage - -\`\`\` -/causantic-debug -/causantic-debug SQLITE_BUSY database is locked -/causantic-debug the embedder crashes on large files -\`\`\` - -## Workflow - -1. **Extract the error**: If no argument provided, extract the most recent error message or stack trace from the current conversation. If an argument is provided, use that as the search query. -2. **Search for the error/issue**: Use \`recall\` with the error text to search broadly across sessions -3. **Check for related patterns**: Use \`predict\` with the same context to surface tangentially related issues -4. 
**Present findings**: - - Prior occurrences of this or similar errors - - What was tried (including failed approaches) - - What ultimately resolved it - -## Parameters - -- **recall**: query = error text (from argument or extracted from conversation), project = current project -- **predict**: context = same error text, project = current project - -## Output Format - -- **Prior Occurrences**: matching past encounters with dates -- **What Was Tried**: approaches attempted, including failures -- **Resolution**: what ultimately worked -- **Related Issues**: other potentially connected problems - -If no matches found, say so clearly — do not fabricate matches. - -## Guidelines - -- When invoked with no arguments, scan the current conversation for the most recent error, stack trace, or failing test output and use that automatically -- Include failed approaches — knowing what didn't work is as valuable as what did -- Quote relevant snippets from past sessions rather than paraphrasing -- If memory shows a recurring pattern, flag it: "This error has appeared N times" `, }, { @@ -465,175 +342,13 @@ Check Causantic system health by combining hook status and memory statistics. - If hooks show errors, suggest common fixes (re-run init, check permissions) - If memory stats are empty, suggest running batch-ingest - Present the report concisely — this is a diagnostic tool -`, - }, - { - dirName: 'causantic-summary', - content: `--- -name: causantic-summary -description: "Summarize recent work across sessions for a project. Use to review accomplishments, track in-progress work, and identify patterns over a time period." -argument-hint: [time range] ---- - -# Work Summary - -Summarize recent work across sessions by combining session browsing with context reconstruction. - -## Usage - -\`\`\` -/causantic-summary -/causantic-summary today -/causantic-summary this week -/causantic-summary past 3 days -\`\`\` - -## Workflow - -1. 
**Identify the project**: Derive from the current working directory (use \`list-projects\` if ambiguous) -2. **Determine time range**: Map the user's intent to a \`days_back\` value -3. **Browse sessions**: Use \`list-sessions\` with the project and time range to see all sessions -4. **Reconstruct context**: Use \`reconstruct\` with the project and time range to get the raw session content -5. **Synthesize**: Analyze the reconstructed context and produce a structured summary - -## Interpreting User Intent - -| User says | days_back | -|-----------|-----------| -| (nothing) / "recently" | 3 | -| "today" | 1 | -| "yesterday" | 2 | -| "this week" | 7 | -| "past N days" | N | -| "this month" | 30 | -| "this sprint" | 14 | - -## Output Format - -- **Period**: Date range and number of sessions -- **Accomplishments**: Completed work, merged PRs, resolved issues -- **In Progress**: Work that was started but not finished -- **Patterns**: Recurring themes, frequently touched areas, common decisions -- **Blockers / Open Issues**: Problems that came up and may still need attention -- **Next Steps**: Explicit TODOs or natural continuations - -## Guidelines - -- Synthesize across sessions — don't just list each session separately -- Focus on outcomes and decisions, not individual tool calls or file edits -- Group related work across sessions (e.g., "Authentication refactor" spanning 3 sessions) -- Highlight work that was started but not completed — this is the most actionable info -- If the time range has many sessions, prioritize breadth over depth -- If no sessions found for the time range, suggest widening the range -- For team sessions: attribute work to specific agents (e.g., "researcher explored X, tester validated Y") when agent boundaries are visible in the reconstructed context -`, - }, - { - dirName: 'causantic-crossref', - content: `--- -name: causantic-crossref -description: "Search memory across all projects to find relevant patterns, solutions, or approaches. 
Use for cross-project knowledge transfer and finding reusable solutions." -argument-hint: [pattern or topic] ---- - -# Cross-Project Reference - -Search memory across all projects to find relevant patterns, solutions, or approaches. Explicitly queries each project to ensure comprehensive coverage. - -## Usage - -\`\`\` -/causantic-crossref rate limiting implementation -/causantic-crossref how we handle database migrations -/causantic-crossref error retry patterns -\`\`\` - -## Workflow - -1. **Discover projects**: Call \`list-projects\` to get all available projects -2. **Search each project**: For each relevant project (up to 5), call \`search\` with the query and that project's slug as the \`project\` filter -3. **Deepen promising hits**: For projects with strong search results, call \`recall\` with the query and project filter to reconstruct the narrative -4. **Compare across projects**: Analyze findings across projects, highlighting shared patterns, differences, and transferable solutions - -## Output Format - -For each project with relevant findings: -- **[Project Name]** (N chunks matched) - - Key findings and context - - Relevant decisions or patterns - -Then synthesize: -- **Shared Patterns**: approaches used across multiple projects -- **Transferable Solutions**: what can be reused or adapted -- **Project-Specific Details**: approaches that are context-dependent - -## When to Use - -- Looking for how something was solved in other projects -- Checking if a pattern or approach has been used before -- Cross-project knowledge transfer -- Finding reusable code or design patterns - -## Guidelines - -- Always start with \`list-projects\` — don't assume which projects exist -- Use project-filtered searches for precision (avoid noise from unfiltered broad search) -- Limit to 5 most relevant projects to keep response focused -- Always attribute findings to their source project -- Highlight patterns that transfer well vs project-specific details -- If no projects 
have relevant context, say so clearly -`, - }, - { - dirName: 'causantic-retro', - content: `--- -name: causantic-retro -description: "Analyze patterns across past sessions to surface recurring themes, problems, and decisions. Use for retrospectives, sprint reviews, or understanding work patterns." -argument-hint: [time range or topic] ---- - -# Retrospective Analysis - -Analyze patterns across past sessions to surface recurring themes, problems, and decisions. - -## Usage - -\`\`\` -/causantic-retro -/causantic-retro past month -/causantic-retro deployment issues -\`\`\` - -## Workflow - -1. **Determine scope**: - - Time range specified → use \`list-sessions\` with that window - - Topic specified → use \`recall\` with the topic - - Neither → default to \`days_back: 30\` -2. **Gather context**: Use \`recall\` across the scope -3. **Synthesize patterns**: Analyze for recurring themes - -## Output Format - -- **Sessions Analyzed**: count and date range -- **Recurring Patterns**: themes across multiple sessions -- **Decisions Made**: key decisions with dates and context -- **Recurring Issues**: problems that came up more than once -- **Observations**: notable patterns in how work was done - -## Guidelines - -- Synthesize, don't just dump raw memory -- Look for patterns across sessions, not just within them -- Be honest about gaps: if memory is sparse for a period, note it -- Works best with 5+ sessions of history `, }, { dirName: 'causantic-cleanup', content: `--- name: causantic-cleanup -description: "Multi-agent codebase review and cleanup plan. Spawns specialist agents for infrastructure, design, documentation, and memory analysis, then synthesizes findings into a prioritised CLEANUP_PLAN.md." +description: "Multi-agent codebase review and cleanup plan. Spawns specialist agents for infrastructure, design, and documentation analysis, then synthesizes findings into a prioritised CLEANUP_PLAN.md." 
--- # Multi-Agent Codebase Cleanup & Architecture Review @@ -691,13 +406,55 @@ LOC Estimate: [approximate lines of code] --- +## Phase 1.5: Memory Gathering (Lead Agent) + +The lead agent has MCP access to Causantic tools — subagents do not. Gather all historical context now and pass it as text to the specialists. + +### 1.5.1 Query Memory + +Run these queries directly (do NOT delegate to subagents): + +- \`search\` with query: "architecture decisions", \`max_tokens: 8000\` +- \`search\` with query: "tech debt", \`max_tokens: 8000\` +- \`search\` with query: "past cleanup findings", \`max_tokens: 8000\` +- \`recall\` with query: "why was this designed this way", \`max_tokens: 8000\` + +### 1.5.2 Assemble Memory Context + +Combine the results into a single \`memoryContext\` text block, capped at ~15K tokens total. Structure it as: + +\`\`\` +## Memory Context (from Causantic) + +### Architecture Decisions +[results from search "architecture decisions"] + +### Known Tech Debt +[results from search "tech debt"] + +### Past Cleanup Findings +[results from search "past cleanup findings"] + +### Design Rationale +[results from recall "why was this designed this way"] +\`\`\` + +If Causantic MCP tools are unavailable, skip this phase and note the gap. + +\`\`\` +✓ CHECKPOINT: Phase 1.5 complete - Memory Gathered +\`\`\` + +--- + ## Phase 2: Spawn Specialist Agents -Spawn 4 specialist agents **in parallel** using the Task tool. Each agent is \`subagent_type: "general-purpose"\` (full tool access). Task subagents inherit the session's MCP connections, so the Memory specialist will have access to causantic MCP tools. +Spawn 3 specialist agents **in parallel** using the Task tool. Each agent is \`subagent_type: "general-purpose"\` (full tool access). Pass each agent: 1. The reconnaissance context from Phase 1 -2. Their domain-specific prompt (copied from the Specialist Prompts section below) +2. 
The memory context from Phase 1.5 (as text — agents do not have MCP access to Causantic) +3. Their domain-specific prompt (copied from the Specialist Prompts section below) \`\`\` ✓ CHECKPOINT: Phase 2 complete - Specialists Spawned @@ -707,13 +464,13 @@ Pass each agent: ## Phase 3: Synthesis (Lead Agent) -Collect the 4 specialist reports and synthesize into the final plan. +Collect the 3 specialist reports and synthesize into the final plan. ### 3.1 Map Specialist Outputs Map each specialist's output sections into the CLEANUP_PLAN.md template structure. ### 3.2 Memory Cross-Referencing -For each infrastructure/design/docs finding, check if the memory report provides historical context that modifies the recommendation: +For each infrastructure/design/docs finding, check if the memory context from Phase 1.5 provides historical context that modifies the recommendation: - Dependency pinned for a compatibility reason → note in Version Bumps table - Suppression added deliberately for an edge case → mark as "valid" in Suppression Audit - Architecture chosen for a specific reason → note in Architecture Assessment @@ -1186,123 +943,6 @@ New Documents Needed: **Cap each table at 30 items.** If more exist, note the total count and say "N additional items not shown." -### Specialist: Memory - -You are the Memory Specialist for a codebase cleanup review. Your job is to query Causantic long-term memory to surface historical context that informs the cleanup plan — decisions, tech debt, past attempts, and dependency history. - -**Project Context:** -[INSERT RECONNAISSANCE CONTEXT HERE] - -**Your Scope:** -Use the causantic MCP tools (\`recall\`, \`search\`, \`predict\`) to gather historical context. All queries should be scoped to the current project. - -#### 1. 
Decision History - -Query memory for architectural decisions and design choices: -- \`recall\` query: "architectural decisions" -- \`recall\` query: "design choices" -- \`recall\` query: "why did we choose" (for rationale behind key choices) -- \`search\` query: "decision" and "chose" and "alternative" - -For each decision found, note: -- What was decided -- When (approximate) -- Why (rationale) -- What alternatives were considered -- Whether circumstances have changed since - -#### 2. Known Tech Debt - -Query memory for acknowledged tech debt: -- \`recall\` query: "tech debt" -- \`search\` query: "TODO" -- \`search\` query: "workaround" -- \`search\` query: "hack" -- \`search\` query: "temporary" -- \`search\` query: "FIXME" - -For each item found, note: -- What the debt is -- When it was introduced -- Why (was it intentional? a time constraint?) -- Whether it's been addressed since - -#### 3. Past Cleanup Attempts - -Query memory for previous refactoring or cleanup work: -- \`recall\` query: "refactoring" -- \`recall\` query: "cleanup" -- \`recall\` query: "code review" -- \`search\` query: "refactor" - -For each attempt found, note: -- What was attempted -- What was completed vs abandoned -- Why it was abandoned (if applicable) -- Lessons learned - -#### 4. Dependency History - -Query memory for past dependency-related work: -- \`recall\` query: "dependency upgrade" -- \`recall\` query: "dependency pinned" -- \`recall\` query: "migration" -- \`search\` query: "compatibility issue" -- \`search\` query: "breaking change" - -For each item found, note: -- Which dependency -- What happened (upgrade attempt, pin, compatibility issue) -- Outcome (success, failure, workaround) -- Any ongoing constraints - -#### 5. 
Lint & Suppression History - -Query memory for deliberate lint decisions: -- \`recall\` query: "lint suppression" -- \`recall\` query: "eslint-disable" -- \`recall\` query: "ts-ignore" -- \`search\` query: "warning suppressed" -- \`search\` query: "lint exception" - -For each item found, note: -- What was suppressed and where -- Why it was suppressed -- Whether the underlying issue has been resolved - -#### Output Format - -Return your findings as a structured markdown report with these sections: - -**Decision History** - -| Decision | When | Rationale | Alternatives | Current Relevance | -|----------|------|-----------|-------------|-------------------| - -**Known Tech Debt** - -| Item | Introduced | Reason | Status | Source | -|------|-----------|--------|--------|--------| - -**Past Cleanup Attempts** - -| Attempt | Scope | Outcome | Lessons | -|---------|-------|---------|---------| - -**Dependency History** - -| Dependency | Event | Outcome | Constraints | -|-----------|-------|---------|-------------| - -**Lint/Suppression History** - -| Suppression | Location | Reason | Resolved? | -|------------|----------|--------|-----------| - -**Cap each table at 30 items.** If more exist, note the total count and say "N additional items not shown." - -If memory returns no results for a category, say "No memory found for [category]" — do not fabricate results. - --- ## Output Format @@ -1418,8 +1058,8 @@ Write the plan to \`CLEANUP_PLAN.md\` in the project root with: ## Synthesis Rules 1. Map each specialist's output sections into the CLEANUP_PLAN.md template -2. **Memory cross-referencing**: For each infrastructure/design/docs finding, check if the memory report provides historical context that modifies the recommendation (e.g., dependency pinned for compatibility, suppression added deliberately, architecture chosen for specific reason) -3. 
**Contradiction resolution**: When memory contradicts a specialist, include both perspectives with a "⚠️ Requires human decision" flag. Default to the safer option. +2. **Memory cross-referencing**: For each infrastructure/design/docs finding, check if the memory context from Phase 1.5 provides historical context that modifies the recommendation (e.g., dependency pinned for compatibility, suppression added deliberately, architecture chosen for specific reason) +3. **Contradiction resolution**: When memory context contradicts a specialist, include both perspectives with a "⚠️ Requires human decision" flag. Default to the safer option. 4. **Deduplication**: Dead code findings from infrastructure + unused code from design — merge into single Dead Code section. When the same item appears from multiple specialists with different severity assessments, take the highest severity and annotate with the contributing perspectives. 5. **Prioritised backlog**: Merge all findings into the 13-tier priority ordering defined in Phase 3. @@ -1428,7 +1068,7 @@ Write the plan to \`CLEANUP_PLAN.md\` in the project root with: ## Error Handling - If a specialist returns no findings or fails: note the gap in Executive Summary, proceed with available data -- If memory specialist fails: graceful degradation — omit Memory Context section, note gap +- If memory gathering (Phase 1.5) fails: graceful degradation — omit Memory Context section, note gap - If all specialists fail: fall back to single-agent analysis of highest-priority areas (security, lint errors) - If the Task tool is unavailable or spawning fails: fall back to single-agent sequential analysis (Phase 1 areas first, then most critical from each specialist domain) @@ -1535,17 +1175,20 @@ If \`ROADMAP.md\` exists in the project root (updating an existing roadmap): - Tag each with source: "existing-roadmap" ### 1.3 Query Causantic Memory + +Run all memory queries directly in the lead agent context. 
Do not delegate memory queries to subagents — they cannot access MCP tools. + Use the causantic MCP tools to surface deferred and aspirational work: -- \`search\` query: "deferred" -- \`search\` query: "aspirational" -- \`search\` query: "someday" -- \`search\` query: "future work" -- \`search\` query: "TODO" -- \`search\` query: "roadmap" -- \`search\` query: "milestone" -- \`search\` query: "release plan" -- \`recall\` query: "features we want to build" -- \`predict\` context: "project roadmap and future work" +- \`search\` query: "deferred", \`max_tokens: 8000\` +- \`search\` query: "aspirational", \`max_tokens: 8000\` +- \`search\` query: "someday", \`max_tokens: 8000\` +- \`search\` query: "future work", \`max_tokens: 8000\` +- \`search\` query: "TODO", \`max_tokens: 8000\` +- \`search\` query: "roadmap", \`max_tokens: 8000\` +- \`search\` query: "milestone", \`max_tokens: 8000\` +- \`search\` query: "release plan", \`max_tokens: 8000\` +- \`recall\` query: "features we want to build", \`max_tokens: 8000\` +- \`predict\` context: "project roadmap and future work", \`max_tokens: 8000\` - Tag each with source: "memory" If causantic MCP tools are unavailable or return nothing, note the gap and proceed with other sources. @@ -1835,27 +1478,20 @@ Long-term memory is available via the \`causantic\` MCP server. ### Skills -**Core retrieval:** +**Retrieval:** - \`/causantic-recall [query]\` — Reconstruct how something happened — walks backward through causal chains (how did we solve X?) - \`/causantic-search [query]\` — Broad discovery — find everything memory knows about a topic (what do I know about X?) - \`/causantic-predict \` — Surface what came after similar past situations — walks forward through causal chains (what's likely relevant next?) -**Understanding & analysis:** -- \`/causantic-explain [question]\` — Answer "why" questions using memory + codebase (why does X work this way?) 
-- \`/causantic-debug [error]\` — Search for prior encounters with an error (auto-extracts from conversation if no argument) - -**Session & project navigation:** +**Session navigation:** - \`/causantic-resume\` — Resume interrupted work — start-of-session briefing - \`/causantic-reconstruct [time range]\` — Replay a past session chronologically, or get recent history -- \`/causantic-summary [time range]\` — Factual recap of what was done across recent sessions - \`/causantic-list-projects\` — Discover available projects -- \`/causantic-status\` — Check system health and memory statistics -**Cross-cutting analysis:** -- \`/causantic-crossref [pattern]\` — Search across all projects for reusable patterns -- \`/causantic-retro [scope]\` — Surface recurring patterns, problems, and decisions across sessions +**Planning:** - \`/causantic-cleanup\` — Memory-informed codebase review and cleanup plan - \`/causantic-roadmap [goal]\` — Gather deferred work and goals into a phased roadmap +- \`/causantic-status\` — Check system health and memory statistics **Memory management:** - \`/causantic-forget [query or filters]\` — Delete memory by topic, time range, or session (always previews first) @@ -1866,15 +1502,12 @@ Long-term memory is available via the \`causantic\` MCP server. |-------------|-------| | "What do I know about X?" | \`search\` | | "How did we solve X?" / "What led to this decision?" | \`recall\` | -| "Why does X work this way?" | \`explain\` | +| "Why does X work this way?" | \`recall\` | | "What might be relevant?" | \`predict\` | | "What happened recently?" / "Show me recent work" | \`reconstruct\` | -| "Show me exactly what happened" / "Replay the session" | \`reconstruct\` | | "Where did I leave off?" / "Briefing to continue" | \`resume\` | -| "I keep hitting this error" | \`debug\` | -| "What did we accomplish this week?" | \`summary\` | -| "Is there a pattern across projects?" | \`crossref\` | -| "What patterns keep coming up?" 
| \`retro\` | +| "I keep hitting this error" | \`search\` | +| "What did we accomplish?" | \`reconstruct\` | | "Review the codebase" | \`cleanup\` | | "What should we work on next?" / "Build a roadmap" | \`roadmap\` | | "What projects are in memory?" | \`list-projects\` | @@ -1889,23 +1522,10 @@ Long-term memory is available via the \`causantic\` MCP server. ### Proactive Memory Usage -**Check memory automatically (no skill needed) when:** -- Before saying "I don't have context from previous sessions" — always try \`recall\` first -- User references past work ("remember when...", "like we did before", "that bug from last week") -- When stuck on an error after 2 failed attempts — use \`debug\` with the error text before trying a 3rd approach -- User asks "why" about existing code or architecture — use \`explain\` before guessing -- Starting work in an unfamiliar area — use \`search\` for broad discovery -- Before making significant architectural decisions — use \`recall\` to check for prior discussions -- Sprint summary or retrospective — use \`summary\` or \`retro\` -- Patterns across projects — use \`crossref\` -- When the user asks about recent work or session history — use \`reconstruct\` with just \`project\` for timeline mode - -**Skip memory (avoid latency) when:** -- The full context is already in the conversation -- Simple file operations where memory adds no value -- Git operations handled by /commit, /pr, /merge, /qa -- The user explicitly provides all needed context -- First attempt at resolving a new error (try solving it first, check memory if stuck) +**Check memory** before saying "I don't have context" or when the user references past work. +Use \`search\` for discovery, \`recall\` for specific decisions, \`reconstruct\` for recent history. + +**Skip memory** when context is already in the conversation, for simple operations, or for git workflows. 
### CLI Commands diff --git a/src/config/loader.ts b/src/config/loader.ts index 9f97fda..445ab98 100644 --- a/src/config/loader.ts +++ b/src/config/loader.ts @@ -76,6 +76,12 @@ export interface ExternalConfig { /** Half-life in hours for the decay function. Default: 48 */ halfLifeHours?: number; }; + lengthPenalty?: { + /** Enable length penalty. Default: true */ + enabled?: boolean; + /** Reference token count for penalty calculation. Must be > 0. Default: 500 */ + referenceTokens?: number; + }; } /** Default external config values */ @@ -126,6 +132,10 @@ const EXTERNAL_DEFAULTS: Required = { decayFactor: 0.3, halfLifeHours: 48, }, + lengthPenalty: { + enabled: true, + referenceTokens: 500, + }, }; /** @@ -375,6 +385,13 @@ export function validateExternalConfig(config: ExternalConfig): string[] { } } + // Length penalty validation + if (config.lengthPenalty?.referenceTokens !== undefined) { + if (config.lengthPenalty.referenceTokens <= 0) { + errors.push('lengthPenalty.referenceTokens must be greater than 0'); + } + } + // Retrieval validation if (config.retrieval?.mmrLambda !== undefined) { if (config.retrieval.mmrLambda < 0 || config.retrieval.mmrLambda > 1) { @@ -517,6 +534,13 @@ export function toRuntimeConfig(external: Required): MemoryConfi decayFactor: external.recency?.decayFactor ?? DEFAULT_CONFIG.recency.decayFactor, halfLifeHours: external.recency?.halfLifeHours ?? DEFAULT_CONFIG.recency.halfLifeHours, }, + + // Length penalty + lengthPenalty: { + enabled: external.lengthPenalty?.enabled ?? DEFAULT_CONFIG.lengthPenalty.enabled, + referenceTokens: + external.lengthPenalty?.referenceTokens ?? 
interface SizeDistributionProps {
  /** One entry per token-size bucket, in display order (left to right). */
  data: Array<{ bucket: string; count: number }>;
}

/**
 * Vertical bar chart of chunk counts per size bucket, drawn with D3.
 *
 * The chart is rendered imperatively into an `<svg>` element whose width is
 * taken from its parent element and whose height is fixed at 250px. It is
 * redrawn from scratch whenever `data` changes.
 *
 * Colours are read from the `--accent-color`, `--muted-fg` and
 * `--border-color` CSS custom properties on the document root, falling back
 * to hard-coded defaults when a property is unset.
 *
 * @param data - Buckets to plot; an empty array clears the chart.
 */
export function SizeDistribution({ data }: SizeDistributionProps) {
  const svgRef = useRef<SVGSVGElement>(null);

  useEffect(() => {
    const svgEl = svgRef.current;
    if (!svgEl) return;

    // Clear before the empty-data check so a previously drawn chart does not
    // linger when `data` transitions from populated to empty.
    const svg = d3.select(svgEl);
    svg.selectAll('*').remove();
    if (data.length === 0) return;

    const container = svgEl.parentElement;
    if (!container) return;

    const width = container.clientWidth;
    const height = 250;
    const margin = { top: 20, right: 20, bottom: 40, left: 50 };
    // Clamp so a very narrow (or not yet laid out) container cannot produce
    // negative band widths.
    const innerWidth = Math.max(0, width - margin.left - margin.right);
    const innerHeight = height - margin.top - margin.bottom;

    svg.attr('width', width).attr('height', height);

    const g = svg.append('g').attr('transform', `translate(${margin.left},${margin.top})`);

    const x = d3
      .scaleBand()
      .domain(data.map((d) => d.bucket))
      .range([0, innerWidth])
      .padding(0.2);

    // Keep the domain non-degenerate: with a [0, 0] domain d3 maps every
    // value to the middle of the range, which would draw half-height bars
    // for buckets whose count is 0.
    const maxCount = Math.max(1, d3.max(data, (d) => d.count) ?? 0);
    const y = d3.scaleLinear().domain([0, maxCount]).nice().range([innerHeight, 0]);

    const rootStyle = getComputedStyle(document.documentElement);
    const accentColor = rootStyle.getPropertyValue('--accent-color').trim() || '#10b981';
    const mutedFg = rootStyle.getPropertyValue('--muted-fg').trim() || '#94a3b8';
    const borderColor = rootStyle.getPropertyValue('--border-color').trim() || '#334155';

    // X axis
    g.append('g')
      .attr('transform', `translate(0,${innerHeight})`)
      .call(d3.axisBottom(x))
      .attr('color', mutedFg)
      .selectAll('line')
      .attr('stroke', borderColor);

    // Y axis
    g.append('g')
      .call(d3.axisLeft(y).ticks(5))
      .attr('color', mutedFg)
      .selectAll('line')
      .attr('stroke', borderColor);

    // Bars
    g.selectAll('rect')
      .data(data)
      .enter()
      .append('rect')
      .attr('x', (d) => x(d.bucket) ?? 0)
      .attr('y', (d) => y(d.count))
      .attr('width', x.bandwidth())
      .attr('height', (d) => innerHeight - y(d.count))
      .attr('fill', accentColor)
      .attr('rx', 3);

    // Count labels on top of bars (omitted for empty buckets)
    g.selectAll('.bar-label')
      .data(data)
      .enter()
      .append('text')
      .attr('class', 'bar-label')
      .attr('x', (d) => (x(d.bucket) ?? 0) + x.bandwidth() / 2)
      .attr('y', (d) => y(d.count) - 5)
      .attr('text-anchor', 'middle')
      .attr('fill', mutedFg)
      .attr('font-size', '11px')
      .text((d) => (d.count > 0 ? d.count.toLocaleString() : ''));
  }, [data]);

  // The effect above sets the element's width/height and draws into it; the
  // ref must be attached here or the effect never finds a target.
  return <svg ref={svgRef} />;
}
d.count.toLocaleString() : '')); + }, [data]); + + return ; +} diff --git a/src/dashboard/client/src/components/stats/ToolUsageChart.tsx b/src/dashboard/client/src/components/stats/ToolUsageChart.tsx new file mode 100644 index 0000000..926a62a --- /dev/null +++ b/src/dashboard/client/src/components/stats/ToolUsageChart.tsx @@ -0,0 +1,86 @@ +import { useRef, useEffect } from 'react'; +import * as d3 from 'd3'; + +interface ToolUsageChartProps { + data: Array<{ tool: string; count: number }>; +} + +export function ToolUsageChart({ data }: ToolUsageChartProps) { + const svgRef = useRef(null); + + useEffect(() => { + if (!svgRef.current || data.length === 0) return; + + const svg = d3.select(svgRef.current); + svg.selectAll('*').remove(); + + const container = svgRef.current.parentElement; + if (!container) return; + + const barHeight = 32; + const gap = 8; + const margin = { top: 10, right: 60, bottom: 10, left: 120 }; + const width = container.clientWidth; + const innerWidth = width - margin.left - margin.right; + const innerHeight = data.length * (barHeight + gap) - gap; + const height = innerHeight + margin.top + margin.bottom; + + svg.attr('width', width).attr('height', height); + + const g = svg.append('g').attr('transform', `translate(${margin.left},${margin.top})`); + + const x = d3 + .scaleLinear() + .domain([0, d3.max(data, (d) => d.count) ?? 1]) + .range([0, innerWidth]); + + const y = d3 + .scaleBand() + .domain(data.map((d) => d.tool)) + .range([0, innerHeight]) + .padding(gap / (barHeight + gap)); + + const style = getComputedStyle(document.documentElement); + const accentColor = style.getPropertyValue('--accent-color').trim() || '#10b981'; + const mutedFg = style.getPropertyValue('--muted-fg').trim() || '#94a3b8'; + + // Bars + g.selectAll('rect') + .data(data) + .enter() + .append('rect') + .attr('x', 0) + .attr('y', (d) => y(d.tool)!) 
+ .attr('width', (d) => x(d.count)) + .attr('height', y.bandwidth()) + .attr('fill', accentColor) + .attr('rx', 4); + + // Tool labels (left) + g.selectAll('.label') + .data(data) + .enter() + .append('text') + .attr('x', -8) + .attr('y', (d) => y(d.tool)! + y.bandwidth() / 2) + .attr('text-anchor', 'end') + .attr('dominant-baseline', 'central') + .attr('fill', mutedFg) + .attr('font-size', '12px') + .text((d) => d.tool); + + // Count labels (right of bar) + g.selectAll('.count') + .data(data) + .enter() + .append('text') + .attr('x', (d) => x(d.count) + 6) + .attr('y', (d) => y(d.tool)! + y.bandwidth() / 2) + .attr('dominant-baseline', 'central') + .attr('fill', mutedFg) + .attr('font-size', '12px') + .text((d) => d.count.toLocaleString()); + }, [data]); + + return ; +} diff --git a/src/dashboard/client/src/pages/Overview.tsx b/src/dashboard/client/src/pages/Overview.tsx index 4a2a1a4..b748c98 100644 --- a/src/dashboard/client/src/pages/Overview.tsx +++ b/src/dashboard/client/src/pages/Overview.tsx @@ -2,9 +2,26 @@ import { useApi } from '../hooks/use-api'; import { Spinner } from '../components/ui/spinner'; import { StatCard } from '../components/stats/StatCard'; import { TimeSeries } from '../components/stats/TimeSeries'; +import { ToolUsageChart } from '../components/stats/ToolUsageChart'; +import { SizeDistribution } from '../components/stats/SizeDistribution'; import { Card, CardContent, CardHeader, CardTitle } from '../components/ui/card'; import { Badge } from '../components/ui/badge'; -import { Boxes, GitBranch, Layers, Clock } from 'lucide-react'; +import { Boxes, GitBranch, Layers, Clock, Activity, Search, Wrench } from 'lucide-react'; + +interface AnalyticsData { + toolUsage: Array<{ tool: string; count: number }>; + retrievalTimeSeries: Array<{ week: string; count: number }>; + topChunks: Array<{ + chunkId: string; + count: number; + project: string; + tokens: number; + preview: string; + }>; + projectRetrievals: Array<{ project: string; retrievals: 
number; uniqueQueries: number }>; + sizeDistribution: Array<{ bucket: string; count: number }>; + totalRetrievals: number; +} interface StatsData { chunks: number; @@ -13,6 +30,7 @@ interface StatsData { sessions: number; projects: number; chunkTimeSeries: Array<{ week: string; count: number }>; + analytics: AnalyticsData; } interface ChunksData { @@ -86,6 +104,129 @@ export function Overview() { )} + + {/* Retrieval analytics — only shown when feedback data exists */} + {stats.analytics.totalRetrievals > 0 && ( + <> +

Retrieval Analytics

+ + {/* Analytics stat cards */} +
+ } + /> + sum + p.uniqueQueries, 0)} + icon={} + /> + } + /> +
+ + {/* Charts row */} +
+ {stats.analytics.toolUsage.length > 0 && ( + + + Tool Usage + + + + + + )} + + {stats.analytics.retrievalTimeSeries.length > 0 && ( + + + Retrievals Over Time + + + + + + )} +
+ + {/* Chunk size distribution */} + {stats.analytics.sizeDistribution.length > 0 && ( + + + Chunk Size Distribution + + + + + + )} + + {/* Top retrieved chunks table */} + {stats.analytics.topChunks.length > 0 && ( + + + Top Retrieved Chunks + + +
+ + + + + + + + + + + + {stats.analytics.topChunks.map((chunk, i) => ( + + + + + + + + ))} + +
+ # + + Project + + Preview + + Tokens + + Retrieved +
{i + 1} + {chunk.project} + + {chunk.preview} + + {chunk.tokens.toLocaleString()} + + {chunk.count} +
+
+
+
+ )} + + )} ); } diff --git a/src/dashboard/client/src/pages/Projects.tsx b/src/dashboard/client/src/pages/Projects.tsx index be70a15..900a662 100644 --- a/src/dashboard/client/src/pages/Projects.tsx +++ b/src/dashboard/client/src/pages/Projects.tsx @@ -9,6 +9,8 @@ interface ProjectInfo { firstSeen: string; lastSeen: string; path?: string; + retrievals: number; + uniqueQueries: number; } interface ProjectsResponse { @@ -45,6 +47,12 @@ export function Projects() { Last Seen + + Retrievals + + + Queries + @@ -72,6 +80,12 @@ export function Projects() { {new Date(project.lastSeen).toLocaleDateString()} + + {project.retrievals} + + + {project.uniqueQueries} + ))} diff --git a/src/dashboard/routes/projects.ts b/src/dashboard/routes/projects.ts index f511c71..5eb7aec 100644 --- a/src/dashboard/routes/projects.ts +++ b/src/dashboard/routes/projects.ts @@ -1,11 +1,37 @@ import { Router } from 'express'; import { getDistinctProjects } from '../../storage/chunk-store.js'; +import { getDb } from '../../storage/db.js'; const router = Router(); router.get('/', (_req, res) => { const projects = getDistinctProjects(); - res.json({ projects }); + + const db = getDb(); + const retrievalCounts = new Map( + ( + db + .prepare( + `SELECT c.session_slug as project, COUNT(*) as retrievals, + COUNT(DISTINCT rf.query_hash) as uniqueQueries + FROM retrieval_feedback rf + JOIN chunks c ON c.id = rf.chunk_id + GROUP BY c.session_slug`, + ) + .all() as Array<{ project: string; retrievals: number; uniqueQueries: number }> + ).map((r) => [r.project, r]), + ); + + const enriched = projects.map((p) => { + const counts = retrievalCounts.get(p.slug); + return { + ...p, + retrievals: counts?.retrievals ?? 0, + uniqueQueries: counts?.uniqueQueries ?? 
0, + }; + }); + + res.json({ projects: enriched }); }); export default router; diff --git a/src/dashboard/routes/stats.ts b/src/dashboard/routes/stats.ts index f171984..3e03313 100644 --- a/src/dashboard/routes/stats.ts +++ b/src/dashboard/routes/stats.ts @@ -3,6 +3,7 @@ import { getChunkCount, getSessionIds, getAllChunks } from '../../storage/chunk- import { getEdgeCount } from '../../storage/edge-store.js'; import { getClusterCount } from '../../storage/cluster-store.js'; import { getDistinctProjects } from '../../storage/chunk-store.js'; +import { getDb } from '../../storage/db.js'; const router = Router(); @@ -32,6 +33,69 @@ router.get('/', (_req, res) => { .sort(([a], [b]) => a.localeCompare(b)) .map(([week, count]) => ({ week, count })); + // --- Retrieval analytics from retrieval_feedback --- + const db = getDb(); + + const toolUsage = db + .prepare( + `SELECT tool_name as tool, COUNT(*) as count + FROM retrieval_feedback + GROUP BY tool_name ORDER BY count DESC`, + ) + .all() as Array<{ tool: string; count: number }>; + + const retrievalTimeSeries = db + .prepare( + `SELECT strftime('%Y-%m-%d', returned_at, 'weekday 1', '-6 days') as week, + COUNT(*) as count + FROM retrieval_feedback GROUP BY week ORDER BY week`, + ) + .all() as Array<{ week: string; count: number }>; + + const topChunks = db + .prepare( + `SELECT rf.chunk_id as chunkId, COUNT(*) as count, c.session_slug as project, + c.approx_tokens as tokens, SUBSTR(c.content, 1, 120) as preview + FROM retrieval_feedback rf + JOIN chunks c ON c.id = rf.chunk_id + GROUP BY rf.chunk_id ORDER BY count DESC LIMIT 10`, + ) + .all() as Array<{ + chunkId: string; + count: number; + project: string; + tokens: number; + preview: string; + }>; + + const projectRetrievals = db + .prepare( + `SELECT c.session_slug as project, COUNT(*) as retrievals, + COUNT(DISTINCT rf.query_hash) as uniqueQueries + FROM retrieval_feedback rf + JOIN chunks c ON c.id = rf.chunk_id + GROUP BY c.session_slug ORDER BY retrievals DESC`, 
+ ) + .all() as Array<{ project: string; retrievals: number; uniqueQueries: number }>; + + const sizeDistribution = db + .prepare( + `SELECT CASE + WHEN approx_tokens <= 200 THEN '0-200' + WHEN approx_tokens <= 500 THEN '201-500' + WHEN approx_tokens <= 1000 THEN '501-1K' + WHEN approx_tokens <= 2000 THEN '1K-2K' + WHEN approx_tokens <= 5000 THEN '2K-5K' + ELSE '5K+' + END as bucket, COUNT(*) as count + FROM chunks GROUP BY bucket ORDER BY MIN(approx_tokens)`, + ) + .all() as Array<{ bucket: string; count: number }>; + + const totalRetrievals = ( + db.prepare('SELECT COUNT(*) as count FROM retrieval_feedback').get() as { count: number } + ).count; + res.json({ chunks, edges, @@ -39,6 +103,14 @@ router.get('/', (_req, res) => { sessions, projects: projects.length, chunkTimeSeries, + analytics: { + toolUsage, + retrievalTimeSeries, + topChunks, + projectRetrievals, + sizeDistribution, + totalRetrievals, + }, }); }); diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts index f5c4c72..ec523d6 100644 --- a/src/hooks/session-start.ts +++ b/src/hooks/session-start.ts @@ -75,7 +75,7 @@ function internalHandleSessionStart( const { maxTokens = config.claudeMdBudgetTokens, includeRecent = 3, - includeCrossProject = 2, + includeCrossProject = 0, } = options; // Get clusters with descriptions (SQL-level filter) diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 59be51b..99d475c 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -91,6 +91,10 @@ export const searchTool: ToolDefinition = { type: 'string', description: 'Filter to a specific agent (e.g., "researcher"). Omit to include all agents.', }, + max_tokens: { + type: 'number', + description: 'Maximum tokens in response. 
Defaults to server config.', + }, }, required: ['query'], }, @@ -99,10 +103,11 @@ export const searchTool: ToolDefinition = { const project = args.project as string | undefined; const agent = args.agent as string | undefined; const config = getConfig(); + const maxTokens = (args.max_tokens as number | undefined) ?? config.mcpMaxResponseTokens; const response = await searchContext({ query, - maxTokens: config.mcpMaxResponseTokens, + maxTokens, projectFilter: project, agentFilter: agent, }); @@ -149,6 +154,10 @@ export const recallTool: ToolDefinition = { type: 'string', description: 'Filter to a specific agent (e.g., "researcher"). Omit to include all agents.', }, + max_tokens: { + type: 'number', + description: 'Maximum tokens in response. Defaults to server config.', + }, }, required: ['query'], }, @@ -157,9 +166,10 @@ export const recallTool: ToolDefinition = { const project = args.project as string | undefined; const agent = args.agent as string | undefined; const config = getConfig(); + const maxTokens = (args.max_tokens as number | undefined) ?? config.mcpMaxResponseTokens; const response = await recall(query, { - maxTokens: config.mcpMaxResponseTokens, + maxTokens, projectFilter: project, agentFilter: agent, }); @@ -205,6 +215,10 @@ export const predictTool: ToolDefinition = { type: 'string', description: 'Filter to a specific agent (e.g., "researcher"). Omit to include all agents.', }, + max_tokens: { + type: 'number', + description: 'Maximum tokens in response. Defaults to server config.', + }, }, required: ['context'], }, @@ -213,9 +227,10 @@ export const predictTool: ToolDefinition = { const project = args.project as string | undefined; const agent = args.agent as string | undefined; const config = getConfig(); + const maxTokens = (args.max_tokens as number | undefined) ?? 
config.mcpMaxResponseTokens; const response = await predict(context, { - maxTokens: config.mcpMaxResponseTokens, + maxTokens, projectFilter: project, agentFilter: agent, }); @@ -388,6 +403,10 @@ export const reconstructTool: ToolDefinition = { type: 'string', description: 'Filter to a specific agent (e.g., "researcher"). Omit to include all agents.', }, + max_tokens: { + type: 'number', + description: 'Maximum tokens in response. Defaults to server config.', + }, }, required: ['project'], }, @@ -395,6 +414,7 @@ export const reconstructTool: ToolDefinition = { const project = args.project as string; const agent = args.agent as string | undefined; const config = getConfig(); + const maxTokens = (args.max_tokens as number | undefined) ?? config.mcpMaxResponseTokens; try { const result = reconstructSession({ @@ -405,7 +425,7 @@ export const reconstructTool: ToolDefinition = { daysBack: args.days_back as number | undefined, previousSession: args.previous_session as boolean | undefined, currentSessionId: args.current_session_id as string | undefined, - maxTokens: config.mcpMaxResponseTokens, + maxTokens, keepNewest: (args.keep_newest as boolean | undefined) ?? 
true, agentFilter: agent, }); diff --git a/src/retrieval/formatting.ts b/src/retrieval/formatting.ts index 8b55a73..771c240 100644 --- a/src/retrieval/formatting.ts +++ b/src/retrieval/formatting.ts @@ -18,7 +18,7 @@ function formatChunkHeader(chunk: StoredChunk): { date: string; agentPart: strin */ export function formatSearchChunk(chunk: StoredChunk, content: string, weight: number): string { const { date, agentPart } = formatChunkHeader(chunk); - const relevance = (weight * 100).toFixed(0); + const relevance = (Math.min(weight, 1.0) * 100).toFixed(0); return `[Session: ${chunk.sessionSlug}${agentPart} | Date: ${date} | Relevance: ${relevance}%]\n${content}`; } diff --git a/src/retrieval/search-assembler.ts b/src/retrieval/search-assembler.ts index 98cfbc9..8f70e4d 100644 --- a/src/retrieval/search-assembler.ts +++ b/src/retrieval/search-assembler.ts @@ -249,13 +249,31 @@ export async function searchContext(request: SearchRequest): Promise b.score - a.score); // 7.5. MMR reranking (diversity-aware ordering) const reordered = await reorderWithMMR(deduped, queryResult.embedding, mmrReranking); + // 7.6. Normalize scores for display (top result = 1.0) + if (reordered.length > 0) { + const maxScore = reordered[0].score; + if (maxScore > 0) { + for (const item of reordered) { + item.score = item.score / maxScore; + } + } + } + // 8. 
Assemble within budget const assembled = assembleWithinBudget(reordered, maxTokens, sourceMap); diff --git a/test/cli/commands/init.test.ts b/test/cli/commands/init.test.ts index bebab71..76263a4 100644 --- a/test/cli/commands/init.test.ts +++ b/test/cli/commands/init.test.ts @@ -7,9 +7,6 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import * as fs from 'node:fs'; import * as os from 'node:os'; -import * as path from 'node:path'; -import { fileURLToPath } from 'node:url'; - // ── Mock dependencies before importing the command ────────────────────────── vi.mock('node:fs'); @@ -492,17 +489,6 @@ describe('initCommand', () => { }); it('skips when hooks already match the current install path', async () => { - // Build the exact command strings that init would generate - const cliEntry = path.resolve( - path.dirname(fileURLToPath(import.meta.url)), - '..', - '..', - '..', - 'src', - 'cli', - 'index.js', - ); - const nodeBin = process.execPath; const existingHooks = { PreCompact: [ { @@ -510,7 +496,7 @@ describe('initCommand', () => { hooks: [ { type: 'command', - command: `${nodeBin} ${cliEntry} hook pre-compact`, + command: `npx causantic hook pre-compact`, timeout: 300, async: true, }, @@ -523,7 +509,7 @@ describe('initCommand', () => { hooks: [ { type: 'command', - command: `${nodeBin} ${cliEntry} hook session-start`, + command: `npx causantic hook session-start`, timeout: 60, }, ], @@ -535,7 +521,7 @@ describe('initCommand', () => { hooks: [ { type: 'command', - command: `${nodeBin} ${cliEntry} hook session-end`, + command: `npx causantic hook session-end`, timeout: 300, async: true, }, @@ -546,7 +532,7 @@ describe('initCommand', () => { hooks: [ { type: 'command', - command: `${nodeBin} ${cliEntry} hook claudemd-generator`, + command: `npx causantic hook claudemd-generator`, timeout: 60, async: true, }, diff --git a/test/cli/skill-templates.test.ts b/test/cli/skill-templates.test.ts index bff31f1..369f01a 100644 --- 
a/test/cli/skill-templates.test.ts +++ b/test/cli/skill-templates.test.ts @@ -7,8 +7,8 @@ import { CAUSANTIC_SKILLS, getMinimalClaudeMdBlock } from '../../src/cli/skill-t describe('skill-templates', () => { describe('CAUSANTIC_SKILLS', () => { - it('has 15 skill templates', () => { - expect(CAUSANTIC_SKILLS.length).toBe(15); + it('has 10 skill templates', () => { + expect(CAUSANTIC_SKILLS.length).toBe(10); }); it('includes causantic-recall skill', () => { @@ -41,34 +41,22 @@ describe('skill-templates', () => { expect(skill).toBeDefined(); }); - it('includes causantic-debug skill', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-debug'); - expect(skill).toBeDefined(); - }); - - it('includes causantic-summary skill', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-summary'); - expect(skill).toBeDefined(); - }); - - it('includes causantic-crossref skill', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-crossref'); - expect(skill).toBeDefined(); - }); - - it('includes causantic-retro skill', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-retro'); - expect(skill).toBeDefined(); - }); - it('includes causantic-cleanup skill', () => { const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-cleanup'); expect(skill).toBeDefined(); }); - it('includes causantic-explain skill', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-explain'); - expect(skill).toBeDefined(); + it('does not include removed skills', () => { + const removed = [ + 'causantic-explain', + 'causantic-debug', + 'causantic-summary', + 'causantic-crossref', + 'causantic-retro', + ]; + for (const name of removed) { + expect(CAUSANTIC_SKILLS.find((s) => s.dirName === name)).toBeUndefined(); + } }); it('includes causantic-forget skill', () => { @@ -189,108 +177,18 @@ describe('skill-templates', () => { expect(skill.content).toContain('previous_session'); 
}); - it('causantic-debug has argument-hint', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-debug')!; - expect(skill.content).toContain('argument-hint:'); - }); - - it('causantic-debug references recall and predict tools', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-debug')!; - expect(skill.content).toContain('`recall`'); - expect(skill.content).toContain('`predict`'); - }); - - it('causantic-debug mentions auto-extraction from conversation', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-debug')!; - expect(skill.content).toContain('no argument'); - expect(skill.content).toContain('error'); - }); - - it('causantic-summary has argument-hint', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-summary')!; - expect(skill.content).toContain('argument-hint:'); - }); - - it('causantic-summary references list-sessions and reconstruct tools', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-summary')!; - expect(skill.content).toContain('`list-sessions`'); - expect(skill.content).toContain('`reconstruct`'); - }); - - it('causantic-summary has interpreting user intent table', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-summary')!; - expect(skill.content).toContain('Interpreting User Intent'); - expect(skill.content).toContain('days_back'); - }); - - it('causantic-summary mentions accomplishments and in progress', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-summary')!; - expect(skill.content).toContain('Accomplishments'); - expect(skill.content).toContain('In Progress'); - }); - - it('causantic-crossref has argument-hint', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-crossref')!; - expect(skill.content).toContain('argument-hint:'); - }); - - it('causantic-crossref mentions list-projects and per-project search', () => { - const 
skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-crossref')!; - expect(skill.content).toContain('`list-projects`'); - expect(skill.content).toContain('project filter'); - }); - - it('causantic-retro has argument-hint', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-retro')!; - expect(skill.content).toContain('argument-hint:'); - }); - - it('causantic-retro mentions synthesizing patterns', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-retro')!; - expect(skill.content).toContain('Recurring Patterns'); - expect(skill.content).toContain('Synthesize'); - }); - it('causantic-cleanup does not have argument-hint', () => { const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-cleanup')!; expect(skill.content).not.toContain('argument-hint:'); }); - it('causantic-explain has argument-hint', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-explain')!; - expect(skill.content).toContain('argument-hint:'); - }); - - it('causantic-explain references recall and search tools', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-explain')!; - expect(skill.content).toContain('`recall`'); - expect(skill.content).toContain('`search`'); - }); - - it('causantic-explain handles both why questions and area briefings', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-explain')!; - // Focused decision format - expect(skill.content).toContain('Decision'); - expect(skill.content).toContain('Rationale'); - // Area briefing format - expect(skill.content).toContain('Area Briefing'); - expect(skill.content).toContain('Evolution'); - // Intent detection table - expect(skill.content).toContain('Intent Detection'); - }); - it('causantic-predict documents context as required parameter', () => { const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-predict')!; expect(skill.content).toContain('**context** (required)'); 
expect(skill.content).not.toContain('**query** (optional)'); }); - it('causantic-crossref references search tool', () => { - const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-crossref')!; - expect(skill.content).toContain('`search`'); - }); - - it('causantic-cleanup has 4 phases with checkpoints', () => { + it('causantic-cleanup has phases with checkpoints', () => { const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-cleanup')!; expect(skill.content).toContain('Phase 1'); expect(skill.content).toContain('Phase 2'); @@ -304,11 +202,12 @@ describe('skill-templates', () => { expect(skill.content).toContain('planning mode'); }); - it('causantic-cleanup references memory tools', () => { + it('causantic-cleanup references memory tools in Phase 1.5', () => { const skill = CAUSANTIC_SKILLS.find((s) => s.dirName === 'causantic-cleanup')!; expect(skill.content).toContain('`recall`'); expect(skill.content).toContain('`search`'); - expect(skill.content).toContain('`predict`'); + expect(skill.content).toContain('Phase 1.5'); + expect(skill.content).toContain('max_tokens: 8000'); }); it('causantic-forget has argument-hint', () => { @@ -348,29 +247,31 @@ describe('skill-templates', () => { expect(block).toContain('causantic'); }); - it('references all 13 Causantic skills', () => { + it('references all 10 Causantic skills', () => { expect(block).toContain('/causantic-recall'); expect(block).toContain('/causantic-search'); expect(block).toContain('/causantic-predict'); - expect(block).toContain('/causantic-explain'); expect(block).toContain('/causantic-list-projects'); expect(block).toContain('/causantic-reconstruct'); expect(block).toContain('/causantic-resume'); - expect(block).toContain('/causantic-debug'); - expect(block).toContain('/causantic-crossref'); - expect(block).toContain('/causantic-retro'); - expect(block).toContain('/causantic-summary'); expect(block).toContain('/causantic-cleanup'); + expect(block).toContain('/causantic-roadmap'); 
+ expect(block).toContain('/causantic-status'); expect(block).toContain('/causantic-forget'); }); - it('does not reference removed causantic-context skill', () => { + it('does not reference removed skills', () => { expect(block).not.toContain('/causantic-context'); + expect(block).not.toContain('/causantic-explain'); + expect(block).not.toContain('/causantic-debug'); + expect(block).not.toContain('/causantic-summary'); + expect(block).not.toContain('/causantic-crossref'); + expect(block).not.toContain('/causantic-retro'); }); it('has proactive memory usage section', () => { expect(block).toContain('Proactive Memory Usage'); - expect(block).toContain('Check memory automatically'); + expect(block).toContain('Check memory'); expect(block).toContain('Skip memory'); }); @@ -390,16 +291,16 @@ describe('skill-templates', () => { expect(block).toContain('npx causantic uninstall'); }); - it('includes nuanced triggers', () => { - expect(block).toContain('after 2 failed attempts'); - expect(block).toContain('First attempt at resolving a new error'); + it('includes concise guidance', () => { + expect(block).toContain('search'); + expect(block).toContain('recall'); + expect(block).toContain('reconstruct'); }); it('groups skills by use case', () => { - expect(block).toContain('Core retrieval:'); - expect(block).toContain('Understanding & analysis:'); - expect(block).toContain('Session & project navigation:'); - expect(block).toContain('Cross-cutting analysis:'); + expect(block).toContain('Retrieval:'); + expect(block).toContain('Session navigation:'); + expect(block).toContain('Planning:'); expect(block).toContain('Memory management:'); }); @@ -407,7 +308,6 @@ describe('skill-templates', () => { expect(block).toContain('Quick Decision Guide'); expect(block).toContain('search'); expect(block).toContain('recall'); - expect(block).toContain('explain'); expect(block).toContain('predict'); }); }); @@ -420,15 +320,12 @@ describe('skill-templates', () => { 
expect(paths).toContain(`${skillsBase}/causantic-recall/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-search/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-predict/SKILL.md`); - expect(paths).toContain(`${skillsBase}/causantic-explain/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-list-projects/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-reconstruct/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-resume/SKILL.md`); - expect(paths).toContain(`${skillsBase}/causantic-debug/SKILL.md`); - expect(paths).toContain(`${skillsBase}/causantic-crossref/SKILL.md`); - expect(paths).toContain(`${skillsBase}/causantic-retro/SKILL.md`); - expect(paths).toContain(`${skillsBase}/causantic-summary/SKILL.md`); + expect(paths).toContain(`${skillsBase}/causantic-status/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-cleanup/SKILL.md`); + expect(paths).toContain(`${skillsBase}/causantic-roadmap/SKILL.md`); expect(paths).toContain(`${skillsBase}/causantic-forget/SKILL.md`); }); @@ -456,7 +353,7 @@ More instructions below.`; // Should contain new block expect(updated).toContain('/causantic-recall'); expect(updated).toContain('/causantic-resume'); - expect(updated).toContain('/causantic-debug'); + expect(updated).toContain('/causantic-cleanup'); // Should not contain old verbose text expect(updated).not.toContain('Old verbose instructions'); // Should preserve surrounding content diff --git a/test/cli/uninstall.test.ts b/test/cli/uninstall.test.ts index 20e8dc3..7e24787 100644 --- a/test/cli/uninstall.test.ts +++ b/test/cli/uninstall.test.ts @@ -439,7 +439,7 @@ Causantic content only it('includes skill directory artifacts', () => { const plan = buildRemovalPlan(false); const skills = plan.filter((a) => a.label.includes('skills/')); - expect(skills.length).toBe(15); // recall, search, explain, predict, list-projects, reconstruct, resume, debug, crossref, retro, cleanup, roadmap, status, summary, forget + 
expect(skills.length).toBe(10); // recall, search, predict, list-projects, reconstruct, resume, cleanup, roadmap, status, forget }); it('includes keychain artifacts', () => { diff --git a/test/dashboard/routes.test.ts b/test/dashboard/routes.test.ts index b14c92e..97d0c63 100644 --- a/test/dashboard/routes.test.ts +++ b/test/dashboard/routes.test.ts @@ -17,6 +17,7 @@ import { import { createApp } from '../../src/dashboard/server.js'; import { insertChunk } from '../../src/storage/chunk-store.js'; import { createEdge } from '../../src/storage/edge-store.js'; +import { getDb } from '../../src/storage/db.js'; let db: Database.Database; let server: Server; @@ -996,3 +997,129 @@ describe('GET /api/timeline — limit parameter', () => { expect(data2.chunks).toHaveLength(3); }); }); + +describe('GET /api/stats — analytics', () => { + it('returns empty analytics when no feedback exists', async () => { + const res = await get('/api/stats'); + const data = await res.json(); + + expect(res.status).toBe(200); + expect(data.analytics).toBeDefined(); + expect(data.analytics.totalRetrievals).toBe(0); + expect(data.analytics.toolUsage).toEqual([]); + expect(data.analytics.retrievalTimeSeries).toEqual([]); + expect(data.analytics.topChunks).toEqual([]); + expect(data.analytics.projectRetrievals).toEqual([]); + // sizeDistribution depends on chunks, not feedback — empty db means empty + expect(data.analytics.sizeDistribution).toEqual([]); + }); + + it('returns analytics with feedback data', async () => { + insertChunk( + makeChunk({ + id: 'af-1', + sessionSlug: 'project-a', + content: 'First chunk content for analytics test', + }), + ); + insertChunk( + makeChunk({ + id: 'af-2', + sessionId: 'sess-2', + sessionSlug: 'project-b', + content: 'Second chunk content for analytics', + startTime: '2024-02-01T00:00:00Z', + }), + ); + + const testDb = getDb(); + testDb + .prepare( + 'INSERT INTO retrieval_feedback (chunk_id, query_hash, tool_name, returned_at) VALUES (?, ?, ?, ?)', + ) + 
.run('af-1', 'hash-1', 'search', '2024-01-15T10:00:00Z'); + testDb + .prepare( + 'INSERT INTO retrieval_feedback (chunk_id, query_hash, tool_name, returned_at) VALUES (?, ?, ?, ?)', + ) + .run('af-1', 'hash-2', 'recall', '2024-01-20T10:00:00Z'); + testDb + .prepare( + 'INSERT INTO retrieval_feedback (chunk_id, query_hash, tool_name, returned_at) VALUES (?, ?, ?, ?)', + ) + .run('af-2', 'hash-1', 'search', '2024-02-01T10:00:00Z'); + + const res = await get('/api/stats'); + const data = await res.json(); + + expect(data.analytics.totalRetrievals).toBe(3); + + // Tool usage + expect(data.analytics.toolUsage).toHaveLength(2); + const searchTool = data.analytics.toolUsage.find((t: any) => t.tool === 'search'); + expect(searchTool.count).toBe(2); + + // Top chunks + expect(data.analytics.topChunks.length).toBeGreaterThanOrEqual(1); + expect(data.analytics.topChunks[0].chunkId).toBe('af-1'); + expect(data.analytics.topChunks[0].count).toBe(2); + expect(data.analytics.topChunks[0].project).toBe('project-a'); + + // Project retrievals + expect(data.analytics.projectRetrievals).toHaveLength(2); + + // Retrieval time series + expect(data.analytics.retrievalTimeSeries.length).toBeGreaterThanOrEqual(1); + + // Size distribution (2 chunks inserted, both have approxTokens=10 → bucket 0-200) + expect(data.analytics.sizeDistribution.length).toBeGreaterThanOrEqual(1); + expect(data.analytics.sizeDistribution[0].bucket).toBe('0-200'); + expect(data.analytics.sizeDistribution[0].count).toBe(2); + }); +}); + +describe('GET /api/projects — retrieval counts', () => { + it('returns zero retrieval counts when no feedback exists', async () => { + insertChunk(makeChunk({ id: 'pr-1', sessionSlug: 'project-a' })); + + const res = await get('/api/projects'); + const data = await res.json(); + + expect(res.status).toBe(200); + expect(data.projects).toHaveLength(1); + expect(data.projects[0].retrievals).toBe(0); + expect(data.projects[0].uniqueQueries).toBe(0); + }); + + it('includes retrieval 
counts per project', async () => { + insertChunk(makeChunk({ id: 'pr-a1', sessionSlug: 'project-a' })); + insertChunk( + makeChunk({ + id: 'pr-b1', + sessionId: 'sess-2', + sessionSlug: 'project-b', + startTime: '2024-02-01T00:00:00Z', + }), + ); + + const testDb = getDb(); + testDb + .prepare('INSERT INTO retrieval_feedback (chunk_id, query_hash, tool_name) VALUES (?, ?, ?)') + .run('pr-a1', 'q1', 'search'); + testDb + .prepare('INSERT INTO retrieval_feedback (chunk_id, query_hash, tool_name) VALUES (?, ?, ?)') + .run('pr-a1', 'q2', 'recall'); + + const res = await get('/api/projects'); + const data = await res.json(); + + expect(data.projects).toHaveLength(2); + const projA = data.projects.find((p: any) => p.slug === 'project-a'); + const projB = data.projects.find((p: any) => p.slug === 'project-b'); + + expect(projA.retrievals).toBe(2); + expect(projA.uniqueQueries).toBe(2); + expect(projB.retrievals).toBe(0); + expect(projB.uniqueQueries).toBe(0); + }); +}); diff --git a/test/hooks/session-start.test.ts b/test/hooks/session-start.test.ts index 4b91bfe..aee5575 100644 --- a/test/hooks/session-start.test.ts +++ b/test/hooks/session-start.test.ts @@ -12,7 +12,7 @@ describe('session-start', () => { const defaults: SessionStartOptions = { maxTokens: 2000, includeRecent: 3, - includeCrossProject: 2, + includeCrossProject: 0, enableRetry: true, maxRetries: 3, gracefulDegradation: true, @@ -20,7 +20,7 @@ describe('session-start', () => { expect(defaults.maxTokens).toBe(2000); expect(defaults.includeRecent).toBe(3); - expect(defaults.includeCrossProject).toBe(2); + expect(defaults.includeCrossProject).toBe(0); expect(defaults.enableRetry).toBe(true); }); diff --git a/test/mcp/tools-handlers.test.ts b/test/mcp/tools-handlers.test.ts index be98d18..ef7de27 100644 --- a/test/mcp/tools-handlers.test.ts +++ b/test/mcp/tools-handlers.test.ts @@ -209,6 +209,14 @@ describe('searchTool.handler', () => { expect(result).toBe('No relevant memory found.'); }); + + it('uses 
max_tokens override when provided', async () => { + mockSearchContext.mockResolvedValue(sampleSearchResponse); + + await searchTool.handler({ query: 'test', max_tokens: 500 }); + + expect(mockSearchContext).toHaveBeenCalledWith(expect.objectContaining({ maxTokens: 500 })); + }); }); // --------------------------------------------------------------------------- @@ -255,6 +263,14 @@ describe('recallTool.handler', () => { expect(result).toBe('No relevant memory found.'); }); + + it('uses max_tokens override when provided', async () => { + mockRecall.mockResolvedValue(sampleResponse); + + await recallTool.handler({ query: 'test', max_tokens: 800 }); + + expect(mockRecall).toHaveBeenCalledWith('test', expect.objectContaining({ maxTokens: 800 })); + }); }); // --------------------------------------------------------------------------- @@ -292,6 +308,14 @@ describe('predictTool.handler', () => { expect(result).toBe('No predictions available based on current context.'); }); + it('uses max_tokens override when provided', async () => { + mockPredict.mockResolvedValue(sampleResponse); + + await predictTool.handler({ context: 'test', max_tokens: 1000 }); + + expect(mockPredict).toHaveBeenCalledWith('test', expect.objectContaining({ maxTokens: 1000 })); + }); + it('returns "Potentially relevant context..." header for non-empty results', async () => { mockPredict.mockResolvedValue(sampleResponse); @@ -644,6 +668,17 @@ describe('reconstructTool.handler', () => { ); }); + it('uses max_tokens override when provided', async () => { + mockReconstructSession.mockReturnValue(sampleReconstructResult); + mockFormatReconstruction.mockReturnValue('output'); + + await reconstructTool.handler({ project: 'my-app', max_tokens: 1500 }); + + expect(mockReconstructSession).toHaveBeenCalledWith( + expect.objectContaining({ maxTokens: 1500 }), + ); + }); + it('catches errors and returns "Error: ..." 
string', async () => { mockReconstructSession.mockImplementation(() => { throw new Error('currentSessionId is required when previousSession is true'); diff --git a/test/mcp/tools.test.ts b/test/mcp/tools.test.ts index 25ffed4..fdf211f 100644 --- a/test/mcp/tools.test.ts +++ b/test/mcp/tools.test.ts @@ -11,6 +11,7 @@ import { recallTool, predictTool, listProjectsTool, + reconstructTool, statsTool, forgetTool, } from '../../src/mcp/tools.js'; @@ -57,6 +58,11 @@ describe('mcp-tools', () => { it('has query property with string type', () => { expect(searchTool.inputSchema.properties.query.type).toBe('string'); }); + + it('has optional max_tokens parameter with number type', () => { + expect(searchTool.inputSchema.properties.max_tokens.type).toBe('number'); + expect(searchTool.inputSchema.required).not.toContain('max_tokens'); + }); }); describe('recallTool', () => { @@ -76,6 +82,11 @@ describe('mcp-tools', () => { it('has query property with string type', () => { expect(recallTool.inputSchema.properties.query.type).toBe('string'); }); + + it('has optional max_tokens parameter with number type', () => { + expect(recallTool.inputSchema.properties.max_tokens.type).toBe('number'); + expect(recallTool.inputSchema.required).not.toContain('max_tokens'); + }); }); describe('predictTool', () => { @@ -95,6 +106,22 @@ describe('mcp-tools', () => { it('has context property with string type', () => { expect(predictTool.inputSchema.properties.context.type).toBe('string'); }); + + it('has optional max_tokens parameter with number type', () => { + expect(predictTool.inputSchema.properties.max_tokens.type).toBe('number'); + expect(predictTool.inputSchema.required).not.toContain('max_tokens'); + }); + }); + + describe('reconstructTool', () => { + it('has correct name', () => { + expect(reconstructTool.name).toBe('reconstruct'); + }); + + it('has optional max_tokens parameter with number type', () => { + expect(reconstructTool.inputSchema.properties.max_tokens.type).toBe('number'); + 
expect(reconstructTool.inputSchema.required).not.toContain('max_tokens'); + }); }); describe('listProjectsTool', () => { diff --git a/test/retrieval/search-assembler.test.ts b/test/retrieval/search-assembler.test.ts index 259023b..f5b7ca3 100644 --- a/test/retrieval/search-assembler.test.ts +++ b/test/retrieval/search-assembler.test.ts @@ -79,6 +79,10 @@ vi.mock('../../src/config/loader.js', () => ({ decayFactor: 0.3, halfLifeHours: 48, }, + lengthPenalty: { + enabled: true, + referenceTokens: 500, + }, }), })); @@ -383,6 +387,49 @@ describe('search-assembler', () => { expect(recent!.weight).toBeGreaterThan(ancient!.weight); }); + it('normalizes scores so top result has weight 1.0', async () => { + mockChunks.set('c1', makeChunk('c1', { approxTokens: 50 })); + mockChunks.set('c2', makeChunk('c2', { approxTokens: 50 })); + + mockVectorResults = [ + { id: 'c1', distance: 0.1 }, + { id: 'c2', distance: 0.3 }, + ]; + + const result = await searchContext({ query: 'test' }); + + expect(result.chunks.length).toBe(2); + // Top result should be normalized to 1.0 + expect(result.chunks[0].weight).toBeCloseTo(1.0, 1); + // Second result should be < 1.0 + expect(result.chunks[1].weight).toBeLessThan(1.0); + expect(result.chunks[1].weight).toBeGreaterThan(0); + }); + + it('applies length penalty — smaller chunks score higher than large ones', async () => { + const now = new Date(); + const sameTime = now.toISOString(); + + // Small focused chunk (50 tokens) vs large chunk (2000 tokens) + mockChunks.set('small', makeChunk('small', { approxTokens: 50, startTime: sameTime })); + mockChunks.set('large', makeChunk('large', { approxTokens: 2000, startTime: sameTime })); + + // Equal base scores + mockVectorResults = [ + { id: 'small', distance: 0.2 }, + { id: 'large', distance: 0.2 }, + ]; + + const result = await searchContext({ query: 'test' }); + + const smallChunk = result.chunks.find((c) => c.id === 'small'); + const largeChunk = result.chunks.find((c) => c.id === 'large'); + 
expect(smallChunk).toBeDefined(); + expect(largeChunk).toBeDefined(); + // Small chunk should rank higher due to length penalty on large chunk + expect(smallChunk!.weight).toBeGreaterThan(largeChunk!.weight); + }); + it('gracefully handles keyword search failure', async () => { mockChunks.set('c1', makeChunk('c1')); mockVectorResults = [{ id: 'c1', distance: 0.1 }];