mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-26 07:57:40 +00:00
Memory: add configurable FTS5 tokenizer for CJK text support (openclaw#56707)
Verified:
- pnpm build
- pnpm check
- pnpm test -- extensions/memory-core/src/memory/manager-search.test.ts packages/memory-host-sdk/src/host/query-expansion.test.ts
- pnpm test -- extensions/memory-core/src/memory/index.test.ts -t "reindexes when extraPaths change"
- pnpm test -- src/config/schema.base.generated.test.ts
- pnpm test -- src/media-understanding/image.test.ts
- pnpm test

Co-authored-by: Mitsuyuki Osabe <24588751+carrotRakko@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,7 @@ export function ensureMemoryIndexSchema(params: {
|
||||
cacheEnabled: boolean;
|
||||
ftsTable: string;
|
||||
ftsEnabled: boolean;
|
||||
ftsTokenizer?: "unicode61" | "trigram";
|
||||
}): { ftsAvailable: boolean; ftsError?: string } {
|
||||
params.db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS meta (
|
||||
@@ -58,6 +59,8 @@ export function ensureMemoryIndexSchema(params: {
|
||||
let ftsError: string | undefined;
|
||||
if (params.ftsEnabled) {
|
||||
try {
|
||||
const tokenizer = params.ftsTokenizer ?? "unicode61";
|
||||
const tokenizeClause = tokenizer === "trigram" ? `, tokenize='trigram case_sensitive 0'` : "";
|
||||
params.db.exec(
|
||||
`CREATE VIRTUAL TABLE IF NOT EXISTS ${params.ftsTable} USING fts5(\n` +
|
||||
` text,\n` +
|
||||
@@ -67,7 +70,7 @@ export function ensureMemoryIndexSchema(params: {
|
||||
` model UNINDEXED,\n` +
|
||||
` start_line UNINDEXED,\n` +
|
||||
` end_line UNINDEXED\n` +
|
||||
`);`,
|
||||
`${tokenizeClause});`,
|
||||
);
|
||||
ftsAvailable = true;
|
||||
} catch (err) {
|
||||
|
||||
@@ -174,6 +174,51 @@ describe("extractKeywords", () => {
|
||||
const testCount = keywords.filter((k) => k === "test").length;
|
||||
expect(testCount).toBe(1);
|
||||
});
|
||||
|
||||
describe("with trigram tokenizer", () => {
|
||||
const trigramOpts = { ftsTokenizer: "trigram" as const };
|
||||
|
||||
it("emits whole CJK block instead of unigrams in trigram mode", () => {
|
||||
const defaultKeywords = extractKeywords("之前讨论的那个方案");
|
||||
const trigramKeywords = extractKeywords("之前讨论的那个方案", trigramOpts);
|
||||
// Default mode produces bigrams
|
||||
expect(defaultKeywords).toContain("讨论");
|
||||
expect(defaultKeywords).toContain("方案");
|
||||
// Trigram mode emits the whole contiguous CJK block (FTS5 trigram
|
||||
// requires >= 3 chars per term; individual characters return no results)
|
||||
expect(trigramKeywords).toContain("之前讨论的那个方案");
|
||||
expect(trigramKeywords).not.toContain("讨论");
|
||||
expect(trigramKeywords).not.toContain("方案");
|
||||
});
|
||||
|
||||
it("skips Japanese kanji bigrams in trigram mode", () => {
|
||||
const defaultKeywords = extractKeywords("経済政策について");
|
||||
const trigramKeywords = extractKeywords("経済政策について", trigramOpts);
|
||||
// Default mode adds kanji bigrams: 経済, 済政, 政策
|
||||
expect(defaultKeywords).toContain("経済");
|
||||
expect(defaultKeywords).toContain("済政");
|
||||
expect(defaultKeywords).toContain("政策");
|
||||
// Trigram mode keeps the full kanji block but skips bigram splitting
|
||||
expect(trigramKeywords).toContain("経済政策");
|
||||
expect(trigramKeywords).not.toContain("済政");
|
||||
});
|
||||
|
||||
it("still filters stop words in trigram mode", () => {
|
||||
const keywords = extractKeywords("これ それ そして どう", trigramOpts);
|
||||
expect(keywords).not.toContain("これ");
|
||||
expect(keywords).not.toContain("それ");
|
||||
expect(keywords).not.toContain("そして");
|
||||
expect(keywords).not.toContain("どう");
|
||||
});
|
||||
|
||||
it("does not affect English keyword extraction", () => {
|
||||
const keywords = extractKeywords("that thing we discussed about the API", trigramOpts);
|
||||
expect(keywords).toContain("discussed");
|
||||
expect(keywords).toContain("api");
|
||||
expect(keywords).not.toContain("that");
|
||||
expect(keywords).not.toContain("the");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("expandQueryForFts", () => {
|
||||
|
||||
@@ -670,7 +670,8 @@ function isValidKeyword(token: string): boolean {
|
||||
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||
* For English, we split on whitespace and punctuation.
|
||||
*/
|
||||
function tokenize(text: string): string[] {
|
||||
function tokenize(text: string, opts?: { ftsTokenizer?: "unicode61" | "trigram" }): string[] {
|
||||
const useTrigram = opts?.ftsTokenizer === "trigram";
|
||||
const tokens: string[] = [];
|
||||
const normalized = text.toLowerCase().trim();
|
||||
|
||||
@@ -686,8 +687,10 @@ function tokenize(text: string): string[] {
|
||||
for (const part of jpParts) {
|
||||
if (/^[\u4e00-\u9fff]+$/.test(part)) {
|
||||
tokens.push(part);
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
if (!useTrigram) {
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tokens.push(part);
|
||||
@@ -695,13 +698,21 @@ function tokenize(text: string): string[] {
|
||||
}
|
||||
} else if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||
// Add individual characters
|
||||
tokens.push(...chars);
|
||||
// Add bigrams for better phrase matching
|
||||
for (let i = 0; i < chars.length - 1; i++) {
|
||||
tokens.push(chars[i] + chars[i + 1]);
|
||||
if (useTrigram) {
|
||||
// In trigram mode, push the whole contiguous CJK block (mirroring the
|
||||
// Japanese kanji path). SQLite's trigram FTS requires at least 3 characters
|
||||
// per query term — individual characters silently return no results.
|
||||
const block = chars.join("");
|
||||
if (block.length > 0) {
|
||||
tokens.push(block);
|
||||
}
|
||||
} else {
|
||||
// Default mode: unigrams + bigrams for phrase matching
|
||||
tokens.push(...chars);
|
||||
for (let i = 0; i < chars.length - 1; i++) {
|
||||
tokens.push(chars[i] + chars[i + 1]);
|
||||
}
|
||||
}
|
||||
} else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
|
||||
// For Korean (Hangul syllables and jamo), keep the word as-is unless it is
|
||||
@@ -732,8 +743,11 @@ function tokenize(text: string): string[] {
|
||||
* - "之前讨论的那个方案" → ["讨论", "方案"]
|
||||
* - "what was the solution for the bug" → ["solution", "bug"]
|
||||
*/
|
||||
export function extractKeywords(query: string): string[] {
|
||||
const tokens = tokenize(query);
|
||||
export function extractKeywords(
|
||||
query: string,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): string[] {
|
||||
const tokens = tokenize(query, opts);
|
||||
const keywords: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
@@ -764,13 +778,16 @@ export function extractKeywords(query: string): string[] {
|
||||
* @param query - User's original query
|
||||
* @returns Object with original query and extracted keywords
|
||||
*/
|
||||
export function expandQueryForFts(query: string): {
|
||||
export function expandQueryForFts(
|
||||
query: string,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): {
|
||||
original: string;
|
||||
keywords: string[];
|
||||
expanded: string;
|
||||
} {
|
||||
const original = query.trim();
|
||||
const keywords = extractKeywords(original);
|
||||
const keywords = extractKeywords(original, opts);
|
||||
|
||||
// Build expanded query: original terms OR extracted keywords
|
||||
// This ensures both exact matches and keyword matches are found
|
||||
@@ -792,6 +809,7 @@ export type LlmQueryExpander = (query: string) => Promise<string[]>;
|
||||
export async function expandQueryWithLlm(
|
||||
query: string,
|
||||
llmExpander?: LlmQueryExpander,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): Promise<string[]> {
|
||||
// If LLM expander is provided, try it first
|
||||
if (llmExpander) {
|
||||
@@ -806,5 +824,5 @@ export async function expandQueryWithLlm(
|
||||
}
|
||||
|
||||
// Fall back to local keyword extraction
|
||||
return extractKeywords(query);
|
||||
return extractKeywords(query, opts);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user