Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | 4x 4x 210x 2316x 39x 39x 39x 39x 178x 39x 178x 1883x 39x 178x 178x 1883x 1883x 1883x 915x 915x 39x 39x 611x 611x 39x 178x 178x 178x 1883x 1883x 1883x 178x 178x 178x 915x 915x 915x 915x 915x 915x 178x 178x 32x 32x 28x 28x 52x 52x 28x 26x 26x 26x 26x 41x 41x 41x 41x 41x 41x 26x 32x 32x 108x 108x 108x 184x 184x 108x 108x 46x 26x | /**
* Lightweight TF-IDF vector index for offline vector search.
* No external dependencies; tokenizes ASCII/latin text, lowercases,
* strips punctuation, removes a small stopword set, and builds a sparse index.
*/
/** A document supplied to the index builder: opaque caller id plus raw text. */
interface TfidfDocument {
id: string;
text: string;
}
/** A single search hit: the matching document's id and its similarity score. */
interface TfidfResult {
id: string;
score: number; // cosine similarity, clamped to [0, 1]
}
/**
 * Small set of common English function words excluded from both
 * indexed documents and queries (46 entries).
 */
const STOPWORDS = new Set(
  (
    'a an the and or but if then else for of in on to from by with as at ' +
    'is are was were be been it this that these those not no ' +
    'can could should would may might do does did have has had you your'
  ).split(' '),
);
/**
 * Split text into lowercase word tokens (maximal runs of [a-z0-9_]),
 * dropping stopwords. Every other character acts as a separator.
 */
const tokenize = (text: string): string[] => {
  const words = text.toLowerCase().match(/[a-z0-9_]+/g) ?? [];
  return words.filter((w) => !STOPWORDS.has(w));
};
type SparseVec = Map<number, number>; // sparse vector: dense term id -> tf-idf weight
export class TfidfIndex {
private vocab = new Map<string, number>();
private idf: number[] = [];
private docs: { id: string; vec: SparseVec; norm: number }[] = [];
/**
* Build index from a corpus of documents
*/
build(corpus: TfidfDocument[]): void {
// vocab + df
const df = new Map<number, number>();
const docsTokens: string[][] = corpus.map((d) => tokenize(d.text));
// assign term ids
for (const tokens of docsTokens) {
for (const t of tokens) {
if (!this.vocab.has(t)) this.vocab.set(t, this.vocab.size);
}
}
// compute df
for (const tokens of docsTokens) {
const seen = new Set<number>();
for (const t of tokens) {
const id = this.vocab.get(t);
Iif (id === undefined) continue;
if (!seen.has(id)) {
seen.add(id);
df.set(id, (df.get(id) || 0) + 1);
}
}
}
// compute idf
const N = corpus.length;
this.idf = Array.from({ length: this.vocab.size }, (_, id) => {
const dfi = df.get(id) || 0;
// smoothed idf
return Math.log((N + 1) / (dfi + 1)) + 1;
});
// doc vectors
this.docs = corpus.map((d, i) => {
const docTokens = docsTokens[i] ?? [];
const tf = new Map<number, number>();
for (const t of docTokens) {
const id = this.vocab.get(t);
Iif (id === undefined) continue;
tf.set(id, (tf.get(id) || 0) + 1);
}
// build weighted vector
const vec: SparseVec = new Map();
let normSq = 0;
tf.forEach((f, id) => {
const idf = this.idf[id];
Iif (idf === undefined || docTokens.length === 0) return;
const w = (f / docTokens.length) * idf;
Eif (w > 0) {
vec.set(id, w);
normSq += w * w;
}
});
const norm = Math.sqrt(normSq) || 1;
return { id: d.id, vec, norm };
});
}
/**
* Search for documents similar to the query
* @param query - Search query
* @param k - Maximum number of results to return
* @returns Array of results sorted by score (descending)
*/
search(query: string, k = 10): TfidfResult[] {
const tokens = tokenize(query);
if (tokens.length === 0 || this.vocab.size === 0) return [];
const tf = new Map<number, number>();
for (const t of tokens) {
const id = this.vocab.get(t);
if (id !== undefined) tf.set(id, (tf.get(id) || 0) + 1);
}
if (tf.size === 0) return [];
const qVec: SparseVec = new Map();
let qNormSq = 0;
const total = tokens.length;
tf.forEach((f, id) => {
const idf = this.idf[id];
Iif (idf === undefined) return;
const w = total === 0 ? 0 : (f / total) * idf;
Eif (w > 0) {
qVec.set(id, w);
qNormSq += w * w;
}
});
const qNorm = Math.sqrt(qNormSq) || 1;
// cosine similarity with sparse vectors
const scores: TfidfResult[] = [];
for (const d of this.docs) {
let dot = 0;
// iterate over smaller map
const [small, big] = qVec.size <= d.vec.size ? [qVec, d.vec] : [d.vec, qVec];
small.forEach((w, id) => {
const v = big.get(id);
if (v !== undefined) dot += w * v;
});
const sim = dot / (qNorm * d.norm);
if (sim > 0) scores.push({ id: d.id, score: Math.min(1, Math.max(0, sim)) });
}
scores.sort((a, b) => b.score - a.score);
return scores.slice(0, k);
}
}
|