Coverage summary for src/utils/tfidf-index.ts

Statements: 95.18% (79/83)
Branches:   76.47% (39/51)
Functions:  100% (11/11)
Lines:      100% (71/71)

/**
 * Lightweight TF-IDF vector index for offline vector search.
 * No external dependencies; tokenizes ASCII/Latin text, lowercases,
 * strips punctuation, removes a small stopword set, and builds a sparse index.
 */
 
interface TfidfDocument {
	id: string;
	text: string;
}
 
interface TfidfResult {
	id: string;
	score: number; // cosine similarity (0..1)
}
 
const STOPWORDS = new Set([
	'a',
	'an',
	'the',
	'and',
	'or',
	'but',
	'if',
	'then',
	'else',
	'for',
	'of',
	'in',
	'on',
	'to',
	'from',
	'by',
	'with',
	'as',
	'at',
	'is',
	'are',
	'was',
	'were',
	'be',
	'been',
	'it',
	'this',
	'that',
	'these',
	'those',
	'not',
	'no',
	'can',
	'could',
	'should',
	'would',
	'may',
	'might',
	'do',
	'does',
	'did',
	'have',
	'has',
	'had',
	'you',
	'your',
]);
 
const tokenize = (text: string): string[] => {
	return text
		.toLowerCase()
		.replace(/[^a-z0-9_\s]/g, ' ')
		.split(/\s+/)
		.filter((t) => t && !STOPWORDS.has(t));
};
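// e.g. tokenize('The quick, brown fox!') -> ['quick', 'brown', 'fox']
// (punctuation becomes whitespace; 'the' is dropped as a stopword).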
 
type SparseVec = Map<number, number>; // termId -> weight
 
export class TfidfIndex {
	private vocab = new Map<string, number>();
	private idf: number[] = [];
	private docs: { id: string; vec: SparseVec; norm: number }[] = [];
 
	/**
	 * Build index from a corpus of documents
	 */
	build(corpus: TfidfDocument[]): void {
		// vocab + df
		const df = new Map<number, number>();
		const docsTokens: string[][] = corpus.map((d) => tokenize(d.text));
 
		// assign term ids
		for (const tokens of docsTokens) {
			for (const t of tokens) {
				if (!this.vocab.has(t)) this.vocab.set(t, this.vocab.size);
			}
		}
 
		// compute df
		for (const tokens of docsTokens) {
			const seen = new Set<number>();
			for (const t of tokens) {
				const id = this.vocab.get(t);
				if (id === undefined) continue;
				if (!seen.has(id)) {
					seen.add(id);
					df.set(id, (df.get(id) || 0) + 1);
				}
			}
		}
 
		// compute idf
		const N = corpus.length;
		this.idf = Array.from({ length: this.vocab.size }, (_, id) => {
			const dfi = df.get(id) || 0;
			// smoothed idf
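			// idf(t) = ln((N + 1) / (df(t) + 1)) + 1; the +1 smoothing keeps the
			// weight positive even when a term appears in every document.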
			return Math.log((N + 1) / (dfi + 1)) + 1;
		});
 
		// doc vectors
		this.docs = corpus.map((d, i) => {
			const docTokens = docsTokens[i] ?? [];
			const tf = new Map<number, number>();
			for (const t of docTokens) {
				const id = this.vocab.get(t);
				if (id === undefined) continue;
				tf.set(id, (tf.get(id) || 0) + 1);
			}
			// build weighted vector
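			// weight(t, d) = (count of t in d / total tokens in d) * idf(t)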
			const vec: SparseVec = new Map();
			let normSq = 0;
			tf.forEach((f, id) => {
				const idf = this.idf[id];
				if (idf === undefined || docTokens.length === 0) return;
				const w = (f / docTokens.length) * idf;
				if (w > 0) {
					vec.set(id, w);
					normSq += w * w;
				}
			});
			const norm = Math.sqrt(normSq) || 1;
			return { id: d.id, vec, norm };
		});
	}
 
	/**
	 * Search for documents similar to the query
	 * @param query - Search query
	 * @param k - Maximum number of results to return
	 * @returns Array of results sorted by score (descending)
	 */
	search(query: string, k = 10): TfidfResult[] {
		const tokens = tokenize(query);
		if (tokens.length === 0 || this.vocab.size === 0) return [];
 
		const tf = new Map<number, number>();
		for (const t of tokens) {
			const id = this.vocab.get(t);
			if (id !== undefined) tf.set(id, (tf.get(id) || 0) + 1);
		}
 
		if (tf.size === 0) return [];
 
		const qVec: SparseVec = new Map();
		let qNormSq = 0;
		const total = tokens.length;
		tf.forEach((f, id) => {
			const idf = this.idf[id];
			if (idf === undefined) return;
			const w = total === 0 ? 0 : (f / total) * idf;
			if (w > 0) {
				qVec.set(id, w);
				qNormSq += w * w;
			}
		});
		const qNorm = Math.sqrt(qNormSq) || 1;
 
		// cosine similarity with sparse vectors
		const scores: TfidfResult[] = [];
		for (const d of this.docs) {
			let dot = 0;
			// iterate over smaller map
			const [small, big] = qVec.size <= d.vec.size ? [qVec, d.vec] : [d.vec, qVec];
			small.forEach((w, id) => {
				const v = big.get(id);
				if (v !== undefined) dot += w * v;
			});
			const sim = dot / (qNorm * d.norm);
			if (sim > 0) scores.push({ id: d.id, score: Math.min(1, Math.max(0, sim)) });
		}
 
		scores.sort((a, b) => b.score - a.score);
		return scores.slice(0, k);
	}
}
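
// Illustrative usage sketch (kept as a comment so the module stays side-effect
// free; the relative import path is an assumption based on this file living
// under src/utils):
//
//   import { TfidfIndex } from './tfidf-index';
//
//   const index = new TfidfIndex();
//   index.build([
//   	{ id: 'a', text: 'sparse vector search with tf-idf' },
//   	{ id: 'b', text: 'cosine similarity over sparse vectors' },
//   ]);
//
//   // Results come back sorted by descending cosine similarity; here 'b'
//   // ranks first because it matches both query terms ('sparse', 'vectors').
//   const hits = index.search('sparse vectors', 5);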