From 69d6c06e5f05250500d8c67aacda24d9997c978a Mon Sep 17 00:00:00 2001 From: Lee-take <210963840+Lee-take@users.noreply.github.com> Date: Thu, 18 Jun 2026 22:52:35 +0800 Subject: [PATCH] Add source string index for code-like literals --- __tests__/source-strings.test.ts | 179 +++++++++++++++++++++++++++ src/bin/codegraph.ts | 9 +- src/context/index.ts | 25 ++++ src/db/migrations.ts | 49 +++++++- src/db/queries.ts | 203 +++++++++++++++++++++++++++++++ src/db/schema.sql | 44 +++++++ src/extraction/index.ts | 4 + src/index.ts | 8 ++ src/mcp/tools.ts | 9 +- src/source-strings/index.ts | 150 +++++++++++++++++++++++ src/types.ts | 36 ++++++ 11 files changed, 711 insertions(+), 5 deletions(-) create mode 100644 __tests__/source-strings.test.ts create mode 100644 src/source-strings/index.ts diff --git a/__tests__/source-strings.test.ts b/__tests__/source-strings.test.ts new file mode 100644 index 000000000..67021f856 --- /dev/null +++ b/__tests__/source-strings.test.ts @@ -0,0 +1,179 @@ +/** + * Source string index tests + * + * Exact code-like string literals should be queryable even when the string is + * not a symbol name in the caller repo. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import CodeGraph from '../src/index'; +import { ToolHandler } from '../src/mcp/tools'; + +describe('source string index', () => { + let testDir: string; + let cg: CodeGraph; + + beforeEach(async () => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-source-strings-')); + const srcDir = path.join(testDir, 'src'); + fs.mkdirSync(srcDir); + + fs.writeFileSync( + path.join(srcDir, 'client.ts'), + `export async function sendPayload(payload: unknown): Promise { + return fetch('/live-scoring/append-event', { + method: 'POST', + body: JSON.stringify(payload), + }); +} + +export async function saveRecord(createItem: (collection: string) => Promise): Promise { + return createItem('game_matches'); +} + +export function genericText(): string { + return 'plain sentence that should not be indexed'; +} + +export function dynamicRoute(id: string): string { + return \`/live-scoring/\${id}\`; +} +` + ); + + fs.writeFileSync( + path.join(srcDir, 'bridge.ts'), + `export function postBridgeEvent(postMessage: (event: string) => void): void { + postMessage('unity.score.updated'); +} +` + ); + + cg = CodeGraph.initSync(testDir, { + config: { + include: ['**/*.ts'], + exclude: [], + }, + }); + await cg.indexAll(); + }); + + afterEach(() => { + cg?.destroy(); + if (fs.existsSync(testDir)) { + fs.rmSync(testDir, { recursive: true, force: true }); + } + }); + + it('indexes exact code-like string literals with file line and enclosing symbol', () => { + const routeHits = cg.searchSourceStrings('/live-scoring/append-event'); + + expect(routeHits).toHaveLength(1); + expect(routeHits[0]).toMatchObject({ + literal: '/live-scoring/append-event', + filePath: 'src/client.ts', + line: 2, + nodeName: 'sendPayload', + nodeKind: 'function', + }); + + const collectionHits = cg.searchSourceStrings('game_matches'); + 
expect(collectionHits).toHaveLength(1); + expect(collectionHits[0]).toMatchObject({ + literal: 'game_matches', + filePath: 'src/client.ts', + line: 9, + nodeName: 'saveRecord', + nodeKind: 'function', + }); + }); + + it('does not index plain prose strings', () => { + expect(cg.searchSourceStrings('plain sentence that should not be indexed')).toHaveLength(0); + }); + + it('does not index dynamic template strings as exact literals', () => { + expect(cg.searchSourceStrings('/live-scoring/${id}')).toHaveLength(0); + }); + + it('supports FTS term lookup without weakening exact literal lookup', () => { + const ftsHits = cg.searchSourceStrings('live scoring append'); + expect(ftsHits[0]).toMatchObject({ + literal: '/live-scoring/append-event', + nodeName: 'sendPayload', + }); + + expect(cg.searchSourceStrings('/live-scoring/append')).toHaveLength(0); + }); + + it('uses source-string hits as search and context entry points', async () => { + const searchHits = cg.searchNodes('/live-scoring/append-event', { limit: 5 }); + expect(searchHits[0]?.node.name).toBe('sendPayload'); + expect(searchHits[0]?.sourceString).toMatchObject({ + literal: '/live-scoring/append-event', + line: 2, + }); + + const context = await cg.findRelevantContext('game_matches', { + searchLimit: 3, + traversalDepth: 0, + }); + const rootNames = context.roots.map((id) => context.nodes.get(id)?.name); + expect(rootNames).toContain('saveRecord'); + }); + + it('surfaces exact source-string sites through the MCP search and explore paths', async () => { + const handler = new ToolHandler(cg); + + const search = await handler.execute('codegraph_search', { + query: '/live-scoring/append-event', + limit: 5, + }); + expect(search.content[0]?.text).toContain('sendPayload'); + expect(search.content[0]?.text).toContain('source string `/live-scoring/append-event` at src/client.ts:2'); + + const explore = await handler.execute('codegraph_explore', { + query: 'unity.score.updated', + maxFiles: 3, + }); + 
expect(explore.content[0]?.text).toContain('postBridgeEvent'); + expect(explore.content[0]?.text).toContain("postMessage('unity.score.updated')"); + }); + + it('replaces source-string rows when files change during sync', async () => { + const clientPath = path.join(testDir, 'src', 'client.ts'); + fs.writeFileSync( + clientPath, + `export async function sendPayload(payload: unknown): Promise { + return fetch('/live-scoring/v2/append-event', { + method: 'POST', + body: JSON.stringify(payload), + }); +} +` + ); + + await cg.sync(); + + expect(cg.searchSourceStrings('/live-scoring/append-event')).toHaveLength(0); + const replacementHits = cg.searchSourceStrings('/live-scoring/v2/append-event'); + expect(replacementHits).toHaveLength(1); + expect(replacementHits[0]).toMatchObject({ + filePath: 'src/client.ts', + line: 2, + nodeName: 'sendPayload', + }); + }); + + it('clears source-string rows with the graph data', () => { + expect(cg.searchSourceStrings('/live-scoring/append-event')).toHaveLength(1); + + cg.clear(); + + expect(cg.searchSourceStrings('/live-scoring/append-event')).toHaveLength(0); + expect(cg.searchNodes('/live-scoring/append-event')).toHaveLength(0); + }); +}); diff --git a/src/bin/codegraph.ts b/src/bin/codegraph.ts index b0c2f4b48..097e873f7 100644 --- a/src/bin/codegraph.ts +++ b/src/bin/codegraph.ts @@ -909,7 +909,7 @@ program */ program .command('query ') - .description('Search for symbols in the codebase') + .description('Search for symbols or code-like source strings in the codebase') .option('-p, --path ', 'Project path') .option('-l, --limit ', 'Maximum results', '10') .option('-k, --kind ', 'Filter by node kind (function, class, etc.)') @@ -952,7 +952,9 @@ program for (const result of results) { const node = result.node; - const location = `${node.filePath}:${node.startLine}`; + const location = result.sourceString + ? 
`${result.sourceString.filePath}:${result.sourceString.line}` + : `${node.filePath}:${node.startLine}`; const score = chalk.dim(`(${(result.score * 100).toFixed(0)}%)`); console.log( @@ -961,6 +963,9 @@ program ' ' + score ); console.log(chalk.dim(` ${location}`)); + if (result.sourceString) { + console.log(chalk.dim(` source string: ${result.sourceString.literal}`)); + } if (node.signature) { console.log(chalk.dim(` ${node.signature}`)); } diff --git a/src/context/index.ts b/src/context/index.ts index 68123c284..baddf1ff8 100644 --- a/src/context/index.ts +++ b/src/context/index.ts @@ -585,6 +585,19 @@ export class ContextBuilder { logDebug('Text search failed', { query, error: String(error) }); } + let sourceStringResults: SearchResult[] = []; + try { + sourceStringResults = /[-_./:@]/.test(query) + ? this.queries.searchSourceStringNodes(query, { + limit: opts.searchLimit * 2, + kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined, + }) + : []; + logDebug('Source string search results', { count: sourceStringResults.length }); + } catch (error) { + logDebug('Source string search failed', { query, error: String(error) }); + } + // Step 4: Merge results, taking the max score when duplicates appear // across search channels. Exact matches may have lower scores than FTS // results for the same node — use the best score from any channel. @@ -607,6 +620,18 @@ export class ContextBuilder { const existing = resultById.get(result.node.id); if (existing) { existing.score = Math.max(existing.score, result.score); + existing.sourceString = existing.sourceString ?? result.sourceString; + } else { + resultById.set(result.node.id, result); + searchResults.push(result); + } + } + + for (const result of sourceStringResults) { + const existing = resultById.get(result.node.id); + if (existing) { + existing.score = Math.max(existing.score, result.score); + existing.sourceString = existing.sourceString ?? 
result.sourceString; } else { resultById.set(result.node.id, result); searchResults.push(result); diff --git a/src/db/migrations.ts b/src/db/migrations.ts index bfea9024d..089f3bcd7 100644 --- a/src/db/migrations.ts +++ b/src/db/migrations.ts @@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter'; /** * Current schema version */ -export const CURRENT_SCHEMA_VERSION = 5; +export const CURRENT_SCHEMA_VERSION = 6; /** * Migration definition @@ -75,6 +75,53 @@ const migrations: Migration[] = [ `); }, }, + { + version: 6, + description: 'Add source_strings side-table and FTS index for code-like string literals', + up: (db) => { + db.exec(` + CREATE TABLE IF NOT EXISTS source_strings ( + id TEXT PRIMARY KEY, + literal TEXT NOT NULL, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + col INTEGER NOT NULL, + language TEXT NOT NULL, + node_id TEXT, + node_name TEXT, + node_kind TEXT + ); + CREATE INDEX IF NOT EXISTS idx_source_strings_literal ON source_strings(literal); + CREATE INDEX IF NOT EXISTS idx_source_strings_file_path ON source_strings(file_path); + CREATE INDEX IF NOT EXISTS idx_source_strings_node_id ON source_strings(node_id); + + CREATE VIRTUAL TABLE IF NOT EXISTS source_strings_fts USING fts5( + literal, + file_path UNINDEXED, + node_name UNINDEXED, + content='source_strings', + content_rowid='rowid' + ); + + CREATE TRIGGER IF NOT EXISTS source_strings_ai AFTER INSERT ON source_strings BEGIN + INSERT INTO source_strings_fts(rowid, literal, file_path, node_name) + VALUES (NEW.rowid, NEW.literal, NEW.file_path, NEW.node_name); + END; + + CREATE TRIGGER IF NOT EXISTS source_strings_ad AFTER DELETE ON source_strings BEGIN + INSERT INTO source_strings_fts(source_strings_fts, rowid, literal, file_path, node_name) + VALUES ('delete', OLD.rowid, OLD.literal, OLD.file_path, OLD.node_name); + END; + + CREATE TRIGGER IF NOT EXISTS source_strings_au AFTER UPDATE ON source_strings BEGIN + INSERT INTO source_strings_fts(source_strings_fts, rowid, literal, 
file_path, node_name) + VALUES ('delete', OLD.rowid, OLD.literal, OLD.file_path, OLD.node_name); + INSERT INTO source_strings_fts(rowid, literal, file_path, node_name) + VALUES (NEW.rowid, NEW.literal, NEW.file_path, NEW.node_name); + END; + `); + }, + }, ]; /** diff --git a/src/db/queries.ts b/src/db/queries.ts index adf239268..0cbfea563 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -16,6 +16,7 @@ import { GraphStats, SearchOptions, SearchResult, + SourceStringRef, } from '../types'; import { safeJsonParse } from '../utils'; import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils'; @@ -48,6 +49,34 @@ function isLowValueFile(filePath: string): boolean { } const SQLITE_PARAM_CHUNK_SIZE = 500; +const SOURCE_STRING_SEARCH_SCORE = 1000; + +function sourceStringQueryVariants(query: string): string[] { + const trimmed = query.trim(); + if (!trimmed) return []; + + const variants = new Set([trimmed]); + if ( + (trimmed.startsWith('"') && trimmed.endsWith('"')) || + (trimmed.startsWith("'") && trimmed.endsWith("'")) || + (trimmed.startsWith('`') && trimmed.endsWith('`')) + ) { + variants.add(trimmed.slice(1, -1)); + } + return [...variants].filter((v) => v.length >= 3 && (/[-_./:@]/.test(v) || /\s/.test(v))); +} + +function sourceStringFtsQuery(query: string): string { + return query + .replace(/['"`*():^]/g, ' ') + .split(/[^A-Za-z0-9_./:@-]+/) + .flatMap((part) => part.split(/[-/.:@]+/)) + .map((term) => term.trim()) + .filter((term) => term.length >= 2) + .filter((term) => !/^(AND|OR|NOT|NEAR)$/i.test(term)) + .map((term) => `"${term}"*`) + .join(' OR '); +} /** * Database row types (snake_case from SQLite) @@ -110,6 +139,18 @@ interface UnresolvedRefRow { language: string; } +interface SourceStringRow { + id: string; + literal: string; + file_path: string; + line: number; + col: number; + language: string; + node_id: string | null; + node_name: string | null; + node_kind: string | null; +} + /** * Convert database row to Node 
object */ @@ -139,6 +180,20 @@ function rowToNode(row: NodeRow): Node { }; } +function rowToSourceString(row: SourceStringRow): SourceStringRef { + return { + id: row.id, + literal: row.literal, + filePath: row.file_path, + line: row.line, + column: row.col, + language: row.language as Language, + nodeId: row.node_id ?? undefined, + nodeName: row.node_name ?? undefined, + nodeKind: row.node_kind ? row.node_kind as NodeKind : undefined, + }; +} + /** * Convert database row to Edge object */ @@ -219,6 +274,8 @@ export class QueryBuilder { getDominantFile?: SqliteStatement; getTopRouteFile?: SqliteStatement; getRoutingManifest?: SqliteStatement; + insertSourceString?: SqliteStatement; + deleteSourceStringsByFile?: SqliteStatement; } = {}; constructor(db: SqliteDatabase) { @@ -820,6 +877,23 @@ export class QueryBuilder { results = this.searchNodesFuzzy(text, { kinds, languages, limit }); } + const sourceStringResults = text && /[-_./:@]/.test(text) + ? this.searchSourceStringNodes(text, { kinds, languages, limit: Math.max(limit, 20) }) + : []; + if (sourceStringResults.length > 0) { + const byId = new Map(results.map((r) => [r.node.id, r])); + for (const sourceResult of sourceStringResults) { + const existing = byId.get(sourceResult.node.id); + if (existing) { + existing.score = Math.max(existing.score, sourceResult.score); + existing.sourceString = existing.sourceString ?? sourceResult.sourceString; + } else { + results.push(sourceResult); + byId.set(sourceResult.node.id, sourceResult); + } + } + } + // Supplement: ensure exact name matches are always candidates. // BM25 can bury short exact-match names (e.g. "getBean") under hundreds of // compound names (e.g. 
"getBeanDescriptor") in large codebases, @@ -891,6 +965,90 @@ export class QueryBuilder { return results; } + searchSourceStrings(query: string, options: SearchOptions = {}): SourceStringRef[] { + const { limit = 50, languages, kinds } = options; + const variants = sourceStringQueryVariants(query); + if (variants.length === 0) return []; + + const byId = new Map(); + for (const variant of variants) { + let sql = 'SELECT * FROM source_strings WHERE literal = ?'; + const params: (string | number)[] = [variant]; + if (languages && languages.length > 0) { + sql += ` AND language IN (${languages.map(() => '?').join(',')})`; + params.push(...languages); + } + if (kinds && kinds.length > 0) { + sql += ` AND node_kind IN (${kinds.map(() => '?').join(',')})`; + params.push(...kinds); + } + sql += ' ORDER BY file_path, line, col LIMIT ?'; + params.push(limit); + + const rows = this.db.prepare(sql).all(...params) as SourceStringRow[]; + for (const row of rows) byId.set(row.id, rowToSourceString(row)); + } + + if (byId.size === 0 && /\s/.test(query.trim())) { + const fts = sourceStringFtsQuery(variants[0]!); + if (fts) { + let sql = ` + SELECT source_strings.* + FROM source_strings_fts + JOIN source_strings ON source_strings_fts.rowid = source_strings.rowid + WHERE source_strings_fts MATCH ? + `; + const params: (string | number)[] = [fts]; + if (languages && languages.length > 0) { + sql += ` AND source_strings.language IN (${languages.map(() => '?').join(',')})`; + params.push(...languages); + } + if (kinds && kinds.length > 0) { + sql += ` AND source_strings.node_kind IN (${kinds.map(() => '?').join(',')})`; + params.push(...kinds); + } + sql += ' ORDER BY bm25(source_strings_fts) LIMIT ?'; + params.push(limit); + + try { + const rows = this.db.prepare(sql).all(...params) as SourceStringRow[]; + for (const row of rows) byId.set(row.id, rowToSourceString(row)); + } catch { + // Ignore FTS syntax/runtime failures; exact lookup already ran. 
+ } + } + } + + return [...byId.values()].slice(0, limit); + } + + searchSourceStringNodes(query: string, options: SearchOptions = {}): SearchResult[] { + const refs = this.searchSourceStrings(query, options); + if (refs.length === 0) return []; + + const nodeIds = refs.map((ref) => ref.nodeId).filter((id): id is string => !!id); + const nodes = this.getNodesByIds(nodeIds); + const results: SearchResult[] = []; + const seen = new Set(); + + for (const ref of refs) { + if (!ref.nodeId || seen.has(ref.nodeId)) continue; + const node = nodes.get(ref.nodeId); + if (!node) continue; + seen.add(ref.nodeId); + results.push({ + node, + sourceString: ref, + score: + SOURCE_STRING_SEARCH_SCORE + + kindBonus(node.kind) + + scorePathRelevance(node.filePath, query, this.projectNameTokens), + }); + } + + return results.sort((a, b) => b.score - a.score); + } + /** * Match-everything path used when the user supplied only field * filters (`kind:function lang:typescript`) with no text. Returns @@ -1459,6 +1617,7 @@ export class QueryBuilder { */ deleteFile(filePath: string): void { this.db.transaction(() => { + this.deleteSourceStringsByFile(filePath); this.deleteNodesByFile(filePath); if (!this.stmts.deleteFile) { this.stmts.deleteFile = this.db.prepare('DELETE FROM files WHERE path = ?'); @@ -1489,6 +1648,49 @@ export class QueryBuilder { return rows.map(rowToFileRecord); } + replaceSourceStringsForFile(filePath: string, refs: SourceStringRef[]): void { + this.db.transaction(() => { + this.deleteSourceStringsByFile(filePath); + for (const ref of refs) { + this.insertSourceString(ref); + } + })(); + } + + deleteSourceStringsByFile(filePath: string): void { + if (!this.stmts.deleteSourceStringsByFile) { + this.stmts.deleteSourceStringsByFile = this.db.prepare( + 'DELETE FROM source_strings WHERE file_path = ?' 
+ ); + } + this.stmts.deleteSourceStringsByFile.run(filePath); + } + + private insertSourceString(ref: SourceStringRef): void { + if (!this.stmts.insertSourceString) { + this.stmts.insertSourceString = this.db.prepare(` + INSERT OR REPLACE INTO source_strings ( + id, literal, file_path, line, col, language, node_id, node_name, node_kind + ) + VALUES ( + @id, @literal, @filePath, @line, @column, @language, @nodeId, @nodeName, @nodeKind + ) + `); + } + + this.stmts.insertSourceString.run({ + id: ref.id, + literal: ref.literal, + filePath: ref.filePath, + line: ref.line, + column: ref.column, + language: ref.language, + nodeId: ref.nodeId ?? null, + nodeName: ref.nodeName ?? null, + nodeKind: ref.nodeKind ?? null, + }); + } + /** * Most recent index timestamp (ms since epoch) across all tracked files, or * null when nothing is indexed yet. One indexed aggregate, no per-row scan. (#329) @@ -1830,6 +2032,7 @@ export class QueryBuilder { clear(): void { this.nodeCache.clear(); this.db.transaction(() => { + this.db.exec('DELETE FROM source_strings'); this.db.exec('DELETE FROM unresolved_refs'); this.db.exec('DELETE FROM edges'); this.db.exec('DELETE FROM nodes'); diff --git a/src/db/schema.sql b/src/db/schema.sql index 292981c82..081390885 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -81,6 +81,22 @@ CREATE TABLE IF NOT EXISTS unresolved_refs ( FOREIGN KEY (from_node_id) REFERENCES nodes(id) ON DELETE CASCADE ); +-- Source Strings: code-like string literals such as route keys, event names, +-- collection names, and config keys. These are intentionally side-table data: +-- the graph still contains symbols, while this table gives exact string +-- occurrences a queryable file:line surface. 
+CREATE TABLE IF NOT EXISTS source_strings ( + id TEXT PRIMARY KEY, + literal TEXT NOT NULL, + file_path TEXT NOT NULL, + line INTEGER NOT NULL, + col INTEGER NOT NULL, + language TEXT NOT NULL, + node_id TEXT, + node_name TEXT, + node_kind TEXT +); + -- ============================================================================= -- Indexes for Query Performance -- ============================================================================= @@ -143,6 +159,34 @@ CREATE INDEX IF NOT EXISTS idx_unresolved_name ON unresolved_refs(reference_name CREATE INDEX IF NOT EXISTS idx_unresolved_file_path ON unresolved_refs(file_path); CREATE INDEX IF NOT EXISTS idx_unresolved_from_name ON unresolved_refs(from_node_id, reference_name); CREATE INDEX IF NOT EXISTS idx_edges_provenance ON edges(provenance); +CREATE INDEX IF NOT EXISTS idx_source_strings_literal ON source_strings(literal); +CREATE INDEX IF NOT EXISTS idx_source_strings_file_path ON source_strings(file_path); +CREATE INDEX IF NOT EXISTS idx_source_strings_node_id ON source_strings(node_id); + +CREATE VIRTUAL TABLE IF NOT EXISTS source_strings_fts USING fts5( + literal, + file_path UNINDEXED, + node_name UNINDEXED, + content='source_strings', + content_rowid='rowid' +); + +CREATE TRIGGER IF NOT EXISTS source_strings_ai AFTER INSERT ON source_strings BEGIN + INSERT INTO source_strings_fts(rowid, literal, file_path, node_name) + VALUES (NEW.rowid, NEW.literal, NEW.file_path, NEW.node_name); +END; + +CREATE TRIGGER IF NOT EXISTS source_strings_ad AFTER DELETE ON source_strings BEGIN + INSERT INTO source_strings_fts(source_strings_fts, rowid, literal, file_path, node_name) + VALUES ('delete', OLD.rowid, OLD.literal, OLD.file_path, OLD.node_name); +END; + +CREATE TRIGGER IF NOT EXISTS source_strings_au AFTER UPDATE ON source_strings BEGIN + INSERT INTO source_strings_fts(source_strings_fts, rowid, literal, file_path, node_name) + VALUES ('delete', OLD.rowid, OLD.literal, OLD.file_path, OLD.node_name); + INSERT INTO 
source_strings_fts(rowid, literal, file_path, node_name) + VALUES (NEW.rowid, NEW.literal, NEW.file_path, NEW.node_name); +END; -- Project metadata for version/provenance tracking CREATE TABLE IF NOT EXISTS project_metadata ( diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 643634d66..2a41a8d7d 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -24,6 +24,7 @@ import { validatePathWithinRoot, normalizePath } from '../utils'; import ignore, { Ignore } from 'ignore'; import { detectFrameworks } from '../resolution/frameworks'; import type { ResolutionContext } from '../resolution/types'; +import { extractSourceStrings } from '../source-strings'; /** * Number of files to read in parallel during indexing. @@ -1612,6 +1613,9 @@ export class ExtractionOrchestrator { this.queries.insertNodes(validNodes); } + const sourceStrings = extractSourceStrings(filePath, content, language, validNodes); + this.queries.replaceSourceStringsForFile(filePath, sourceStrings); + // Filter edges to only reference nodes that were actually inserted if (result.edges.length > 0) { const insertedIds = new Set(validNodes.map((n) => n.id)); diff --git a/src/index.ts b/src/index.ts index 91ea9c074..8f0f3608c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -15,6 +15,7 @@ import { TraversalOptions, SearchOptions, SearchResult, + SourceStringRef, Context, GraphStats, TaskInput, @@ -789,6 +790,13 @@ export class CodeGraph { return this.queries.searchNodes(query, options); } + /** + * Search exact code-like source string literals. + */ + searchSourceStrings(query: string, options?: SearchOptions): SourceStringRef[] { + return this.queries.searchSourceStrings(query, options); + } + /** * Normalized project-name tokens (go.mod / package.json / repo dir) used to * down-weight the non-discriminative project name in search ranking (#720). 
diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 2ca435fc9..5c923ff7d 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -415,13 +415,13 @@ const projectPathProperty: PropertySchema = { export const tools: ToolDefinition[] = [ { name: 'codegraph_search', - description: 'Quick symbol search by name. Returns locations only (no code). Use codegraph_explore instead to get the actual source / understand an area in one call.', + description: 'Quick symbol or code-like source-string search. Returns locations only (no code). Use codegraph_explore instead to get the actual source / understand an area in one call.', inputSchema: { type: 'object', properties: { query: { type: 'string', - description: 'Symbol name or partial name (e.g., "auth", "signIn", "UserService")', + description: 'Symbol name, partial name, or exact code-like source string (e.g., "auth", "signIn", "UserService", "/api/route", "game_matches")', }, kind: { type: 'string', @@ -3782,6 +3782,11 @@ export class ToolHandler { // Compact format: one line per result with key info lines.push(`### ${node.name} (${node.kind})`); lines.push(`${node.filePath}${location}`); + if (result.sourceString) { + lines.push( + `source string \`${result.sourceString.literal}\` at ${result.sourceString.filePath}:${result.sourceString.line}` + ); + } if (node.signature) lines.push(`\`${node.signature}\``); lines.push(''); } diff --git a/src/source-strings/index.ts b/src/source-strings/index.ts new file mode 100644 index 000000000..ace643541 --- /dev/null +++ b/src/source-strings/index.ts @@ -0,0 +1,150 @@ +/** + * Source-string extraction. + * + * This indexes only compact code-like string literals: route paths, config keys, + * event names, collection names, and similar cross-boundary contracts. Plain + * prose strings are intentionally ignored so the table remains a code lookup + * surface rather than a general text/secret enumeration surface. 
/* Scanner, classifier, and ID helpers for the source-string side index. */

import { createHash } from 'crypto';
import type { Language, Node, SourceStringRef } from '../types';

/** Longest literal (in UTF-16 code units) that is still indexed. */
const MAX_SOURCE_STRING_LENGTH = 160;

/** Single-character escapes and the text they stand for (JavaScript-style). */
const SIMPLE_ESCAPES: Readonly<Record<string, string>> = {
  n: '\n',
  r: '\r',
  t: '\t',
  b: '\b',
  f: '\f',
  v: '\v',
  '0': '\0',
};

/** Matches any ASCII control character (including NUL, tab, CR and LF). */
const CONTROL_CHARACTER = /[\u0000-\u001f\u007f]/;

const WHITESPACE = /\s/;
const ALPHANUMERIC = /[A-Za-z0-9]/;
const CODE_PUNCTUATION = /[-_./:@]/;

/**
 * Scan `source` for string literals and return the code-like ones.
 *
 * The scanner is deliberately lexical and language-agnostic: it skips `//` and
 * `/* ... *​/` comments, and reads literals delimited by `'`, `"` or a backtick.
 * It does not recognise regex literals, `#` comments, or nested template
 * literals inside `${...}`.
 *
 * A literal is emitted only when all of the following hold:
 *  - it is closed (a `'`/`"` literal must close on the line it was opened on,
 *    unless the line break is escaped with a backslash);
 *  - it is not a template literal containing an unescaped `${` interpolation;
 *  - after escape decoding it contains no control characters;
 *  - {@link isCodeLikeSourceString} accepts it.
 *
 * @param filePath  Path stored on every returned ref and mixed into its ID.
 * @param source    Full text of the file.
 * @param language  Language stored on every returned ref.
 * @param nodes     Symbols of the file; used to attach the smallest node whose
 *                  line range contains the literal's starting line, falling
 *                  back to the first `file` node, then to no node at all.
 * @returns One ref per accepted literal, in source order. `line` is 1-indexed
 *          and `column` is 0-indexed (counted in UTF-16 code units); both point
 *          at the opening quote.
 */
export function extractSourceStrings(
  filePath: string,
  source: string,
  language: Language,
  nodes: Node[]
): SourceStringRef[] {
  const refs: SourceStringRef[] = [];
  // Rank once per file instead of re-filtering and re-sorting per literal.
  const rankedNodes = rankNodesBySpan(nodes);
  const fileNode = nodes.find((node) => node.kind === 'file') ?? null;

  let i = 0;
  let line = 1;
  let column = 0;

  // Consume `count` characters, keeping the line/column cursor in sync.
  const advance = (count = 1): void => {
    for (let step = 0; step < count && i < source.length; step++) {
      if (source[i] === '\n') {
        line++;
        column = 0;
      } else {
        column++;
      }
      i++;
    }
  };

  while (i < source.length) {
    const ch = source[i]!;
    const next = source[i + 1];

    if (ch === '/' && next === '/') {
      // Line comment: stop in front of the newline so the main loop counts it.
      const eol = source.indexOf('\n', i);
      advance((eol === -1 ? source.length : eol) - i);
      continue;
    }

    if (ch === '/' && next === '*') {
      // Block comment: an unterminated one swallows the rest of the file.
      const close = source.indexOf('*/', i + 2);
      advance((close === -1 ? source.length : close + 2) - i);
      continue;
    }

    if (ch !== "'" && ch !== '"' && ch !== '`') {
      advance();
      continue;
    }

    const quote = ch;
    const startLine = line;
    const startColumn = column;
    advance();

    let literal = '';
    let closed = false;
    let dynamic = false;

    while (i < source.length) {
      const c = source[i]!;

      if (c === '\\') {
        const escape = decodeEscape(source, i + 1);
        literal += escape.text;
        advance(1 + escape.length);
        continue;
      }

      if (c === quote) {
        closed = true;
        advance();
        break;
      }

      // A raw newline cannot appear inside a '…' or "…" literal. Treat the
      // opening quote as a stray apostrophe/quote and resume scanning on the
      // next line; otherwise one unmatched quote would flip string/code state
      // for the remainder of the file.
      if (c === '\n' && quote !== '`') break;

      // Only an *unescaped* `${` makes a template literal dynamic; `\${` is
      // handled by the escape branch above and stays part of a static literal.
      if (quote === '`' && c === '$' && source[i + 1] === '{') dynamic = true;

      literal += c;
      advance();
    }

    if (!closed || dynamic) continue;
    if (CONTROL_CHARACTER.test(literal) || !isCodeLikeSourceString(literal)) continue;

    const node = pickEnclosingNode(rankedNodes, fileNode, startLine);
    refs.push({
      id: sourceStringId(filePath, startLine, startColumn, literal),
      literal,
      filePath,
      line: startLine,
      column: startColumn,
      language,
      nodeId: node?.id,
      nodeName: node?.name,
      nodeKind: node?.kind,
    });
  }

  return refs;
}

/**
 * Decide whether a literal looks like a code-level key (route, event name,
 * collection name, config key) rather than prose.
 *
 * Accepts values that are 3 to {@link MAX_SOURCE_STRING_LENGTH} code units
 * long, contain no whitespace at all, contain at least one ASCII letter or
 * digit, and contain at least one of `- _ . / : @`.
 */
export function isCodeLikeSourceString(value: string): boolean {
  if (value.length < 3 || value.length > MAX_SOURCE_STRING_LENGTH) return false;
  // Rejecting any whitespace also covers leading/trailing padding.
  if (WHITESPACE.test(value)) return false;
  if (!ALPHANUMERIC.test(value)) return false;
  return CODE_PUNCTUATION.test(value);
}

/**
 * Decode the escape sequence whose first character (the one after the
 * backslash) sits at `source[at]`.
 *
 * @returns `text`, the decoded characters, and `length`, the number of source
 *          characters consumed after the backslash. A backslash at end of
 *          input decodes to nothing; a backslash before a line break is a line
 *          continuation and also decodes to nothing; an unrecognised or
 *          malformed escape decodes to the escaped character itself.
 */
function decodeEscape(source: string, at: number): { text: string; length: number } {
  const ch = source[at];
  if (ch === undefined) return { text: '', length: 0 };
  if (ch === '\n') return { text: '', length: 1 };
  if (ch === '\r') return { text: '', length: source[at + 1] === '\n' ? 2 : 1 };

  const simple = SIMPLE_ESCAPES[ch];
  if (simple !== undefined) return { text: simple, length: 1 };

  if (ch === 'x') {
    const hex = source.slice(at + 1, at + 3);
    if (/^[0-9a-fA-F]{2}$/.test(hex)) {
      return { text: String.fromCharCode(parseInt(hex, 16)), length: 3 };
    }
  }

  if (ch === 'u') {
    // \u{1F600} form: up to six hex digits inside braces.
    const braced = /^\{([0-9a-fA-F]{1,6})\}/.exec(source.slice(at + 1, at + 9));
    if (braced) {
      const codePoint = parseInt(braced[1]!, 16);
      if (codePoint <= 0x10ffff) {
        return { text: String.fromCodePoint(codePoint), length: 1 + braced[0].length };
      }
    }
    // \uFFFF form: exactly four hex digits.
    const hex = source.slice(at + 1, at + 5);
    if (/^[0-9a-fA-F]{4}$/.test(hex)) {
      return { text: String.fromCharCode(parseInt(hex, 16)), length: 5 };
    }
  }

  return { text: ch, length: 1 };
}

/**
 * Return the non-`file` nodes ordered from the smallest line span to the
 * largest, breaking ties by earlier start line. The input array is not mutated.
 */
function rankNodesBySpan(nodes: readonly Node[]): Node[] {
  return nodes
    .filter((node) => node.kind !== 'file')
    .sort(
      (a, b) =>
        a.endLine - a.startLine - (b.endLine - b.startLine) || a.startLine - b.startLine
    );
}

/**
 * Pick the tightest node containing `line` from a list produced by
 * {@link rankNodesBySpan}, falling back to `fileNode` when nothing contains it.
 */
function pickEnclosingNode(
  rankedNodes: readonly Node[],
  fileNode: Node | null,
  line: number
): Node | null {
  return (
    rankedNodes.find((node) => node.startLine <= line && line <= node.endLine) ?? fileNode
  );
}

/**
 * Build the ID of one literal occurrence: the prefix `source-string:` followed
 * by the first 24 hex characters of the SHA-1 of path, line, column and
 * literal, joined with NUL separators.
 */
function sourceStringId(
  filePath: string,
  line: number,
  column: number,
  literal: string
): string {
  const digest = createHash('sha1')
    .update(filePath)
    .update('\0')
    .update(String(line))
    .update('\0')
    .update(String(column))
    .update('\0')
    .update(literal)
    .digest('hex');
  return `source-string:${digest.slice(0, 24)}`;
}
+ */ +export interface SourceStringRef { + /** Stable ID for this literal occurrence */ + id: string; + + /** The unquoted literal value */ + literal: string; + + /** File path relative to project root */ + filePath: string; + + /** 1-indexed line where the literal starts */ + line: number; + + /** 0-indexed column where the literal starts */ + column: number; + + /** Detected language of the containing file */ + language: Language; + + /** Enclosing node ID, when available */ + nodeId?: string; + + /** Enclosing node name, denormalized for query output */ + nodeName?: string; + + /** Enclosing node kind, denormalized for query output */ + nodeKind?: NodeKind; +} + // ============================================================================= // Query Types // ============================================================================= @@ -398,6 +431,9 @@ export interface SearchResult { /** Matching node */ node: Node; + /** Source-string occurrence that led to this node, if any */ + sourceString?: SourceStringRef; + /** Relevance score (0-1) */ score: number;