Coverage summary for src/utils/tfidf-index.ts

Statements: 95.18% (79/83)
Branches:   76.47% (39/51)
Functions:  100% (11/11)
Lines:      100% (71/71)

/**
 * Lightweight TF-IDF vector index for offline vector search.
 * No external dependencies; tokenizes ASCII/Latin text, lowercases,
 * strips punctuation, removes a small stopword set, and builds a sparse index.
 */
 
interface TfidfDocument {
	id: string;
	text: string;
}
 
interface TfidfResult {
	id: string;
	score: number; // cosine similarity (0..1)
}
 
const STOPWORDS = new Set([
	'a',
	'an',
	'the',
	'and',
	'or',
	'but',
	'if',
	'then',
	'else',
	'for',
	'of',
	'in',
	'on',
	'to',
	'from',
	'by',
	'with',
	'as',
	'at',
	'is',
	'are',
	'was',
	'were',
	'be',
	'been',
	'it',
	'this',
	'that',
	'these',
	'those',
	'not',
	'no',
	'can',
	'could',
	'should',
	'would',
	'may',
	'might',
	'do',
	'does',
	'did',
	'have',
	'has',
	'had',
	'you',
	'your',
]);
 
const tokenize = (text: string): string[] => {
	return text
		.toLowerCase()
		.replace(/[^a-z0-9_\s]/g, ' ')
		.split(/\s+/)
		.filter((t) => t && !STOPWORDS.has(t));
};
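// e.g. tokenize('The quick, brown fox!') -> ['quick', 'brown', 'fox']
// (punctuation becomes whitespace; 'the' is dropped as a stopword).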
 
type SparseVec = Map<number, number>; // termId -> weight
 
export class TfidfIndex {
	private vocab = new Map<string, number>();
	private idf: number[] = [];
	private docs: { id: string; vec: SparseVec; norm: number }[] = [];
 
	/**
	 * Build index from a corpus of documents
	 */
	build(corpus: TfidfDocument[]): void {
		// vocab + df
		const df = new Map<number, number>();
		const docsTokens: string[][] = corpus.map((d) => tokenize(d.text));
 
		// assign term ids
		for (const tokens of docsTokens) {
			for (const t of tokens) {
				if (!this.vocab.has(t)) this.vocab.set(t, this.vocab.size);
			}
		}
 
		// compute df
		for (const tokens of docsTokens) {
			const seen = new Set<number>();
			for (const t of tokens) {
				const id = this.vocab.get(t);
				if (id === undefined) continue;
				if (!seen.has(id)) {
					seen.add(id);
					df.set(id, (df.get(id) || 0) + 1);
				}
			}
		}
 
		// compute idf
		const N = corpus.length;
		this.idf = Array.from({ length: this.vocab.size }, (_, id) => {
			const dfi = df.get(id) || 0;
			// smoothed idf
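			// idf(t) = ln((N + 1) / (df(t) + 1)) + 1; the +1 smoothing keeps the
			// weight positive even when a term appears in every document.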
			return Math.log((N + 1) / (dfi + 1)) + 1;
		});
 
		// doc vectors
		this.docs = corpus.map((d, i) => {
			const docTokens = docsTokens[i] ?? [];
			const tf = new Map<number, number>();
			for (const t of docTokens) {
				const id = this.vocab.get(t);
				if (id === undefined) continue;
				tf.set(id, (tf.get(id) || 0) + 1);
			}
			// build weighted vector
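			// weight(t, d) = (count of t in d / total tokens in d) * idf(t)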
			const vec: SparseVec = new Map();
			let normSq = 0;
			tf.forEach((f, id) => {
				const idf = this.idf[id];
				if (idf === undefined || docTokens.length === 0) return;
				const w = (f / docTokens.length) * idf;
				if (w > 0) {
					vec.set(id, w);
					normSq += w * w;
				}
			});
			const norm = Math.sqrt(normSq) || 1;
			return { id: d.id, vec, norm };
		});
	}
 
	/**
	 * Search for documents similar to the query
	 * @param query - Search query
	 * @param k - Maximum number of results to return
	 * @returns Array of results sorted by score (descending)
	 */
	search(query: string, k = 10): TfidfResult[] {
		const tokens = tokenize(query);
		if (tokens.length === 0 || this.vocab.size === 0) return [];
 
		const tf = new Map<number, number>();
		for (const t of tokens) {
			const id = this.vocab.get(t);
			if (id !== undefined) tf.set(id, (tf.get(id) || 0) + 1);
		}
 
		if (tf.size === 0) return [];
 
		const qVec: SparseVec = new Map();
		let qNormSq = 0;
		const total = tokens.length;
		tf.forEach((f, id) => {
			const idf = this.idf[id];
			if (idf === undefined) return;
			const w = total === 0 ? 0 : (f / total) * idf;
			if (w > 0) {
				qVec.set(id, w);
				qNormSq += w * w;
			}
		});
		const qNorm = Math.sqrt(qNormSq) || 1;
 
		// cosine similarity with sparse vectors
		const scores: TfidfResult[] = [];
		for (const d of this.docs) {
			let dot = 0;
			// iterate over smaller map
			const [small, big] = qVec.size <= d.vec.size ? [qVec, d.vec] : [d.vec, qVec];
			small.forEach((w, id) => {
				const v = big.get(id);
				if (v !== undefined) dot += w * v;
			});
			const sim = dot / (qNorm * d.norm);
			if (sim > 0) scores.push({ id: d.id, score: Math.min(1, Math.max(0, sim)) });
		}
 
		scores.sort((a, b) => b.score - a.score);
		return scores.slice(0, k);
	}
}
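
// Illustrative usage sketch (kept as a comment so the module stays side-effect
// free; the relative import path is an assumption based on this file living
// under src/utils):
//
//   import { TfidfIndex } from './tfidf-index';
//
//   const index = new TfidfIndex();
//   index.build([
//   	{ id: 'a', text: 'sparse vector search with tf-idf' },
//   	{ id: 'b', text: 'cosine similarity over sparse vectors' },
//   ]);
//
//   // Results come back sorted by descending cosine similarity; here 'b'
//   // ranks first because it matches both query terms ('sparse', 'vectors').
//   const hits = index.search('sparse vectors', 5);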