Memory/QMD: normalize Han-script BM25 search queries

This commit is contained in:
Vignesh Natarajan
2026-02-22 01:53:00 -08:00
parent 9f0b6a8c92
commit 99a2f5379e
3 changed files with 156 additions and 2 deletions

View File

@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
- Gateway/Config reload: compare array-valued config paths structurally during diffing so unchanged `memory.qmd.paths` and `memory.qmd.scope.rules` no longer trigger false restart-required reloads. (#23185) Thanks @rex05ai.
- Cron/Scheduling: validate runtime cron expressions before schedule/stagger evaluation so malformed persisted jobs report a clear `invalid cron schedule: expr is required` error instead of crashing with `undefined.trim` failures and auto-disable churn. (#23223) Thanks @asimons81.
- Memory/QMD: migrate legacy unscoped collection bindings (for example `memory-root`) to per-agent scoped names (for example `memory-root-main`) during startup when safe, so QMD-backed `memory_search` no longer fails with `Collection not found` after upgrades. (#23228, #20727) Thanks @JLDynamics and @AaronFaby.
- Memory/QMD: normalize Han-script BM25 search queries before invoking `qmd search` so mixed CJK+Latin prompts no longer return empty results due to tokenizer mismatch. (#23426) Thanks @LunaLee0130.
- TUI/Input: enable multiline-paste burst coalescing on macOS Terminal.app and iTerm so pasted blocks no longer submit line-by-line as separate messages. (#18809) Thanks @fwends.
- TUI/RTL: isolate right-to-left script lines (Arabic/Hebrew ranges) with Unicode bidi isolation marks in TUI text sanitization so RTL assistant output no longer renders in reversed visual order in terminal chat panes. (#21936) Thanks @Asm3r96.
- TUI/Status: request immediate renders after setting `sending`/`waiting` activity states so in-flight runs always show visible progress indicators instead of appearing idle until completion. (#21549) Thanks @13Guinness.

View File

@@ -729,6 +729,121 @@ describe("QmdMemoryManager", () => {
await manager.close();
});
it("normalizes mixed Han-script BM25 queries before qmd search", async () => {
  // Point qmd at a single workspace collection and force the BM25 ("search") mode.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Only the `search` subcommand gets a canned (empty) JSON reply; everything else
  // uses the default mock child.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const created = await createManager();
  const manager = created.manager;
  const maxResults = created.resolved.qmd?.limits.maxResults;
  if (!maxResults) {
    throw new Error("qmd maxResults missing");
  }

  const hits = await manager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  expect(hits).toEqual([]);

  // The query argument handed to `qmd search` must be the normalized keyword list,
  // not the raw mixed-script prompt.
  const expectedArgs = [
    "search",
    "記憶 憶系 系統 統升 升級 qmd",
    "--json",
    "-n",
    String(maxResults),
    "-c",
    "workspace-main",
  ];
  const searchCall = spawnMock.mock.calls.find((call: unknown[]) => {
    const argv = call[1] as string[] | undefined;
    return argv?.[0] === "search";
  });
  expect(searchCall?.[1]).toEqual(expectedArgs);

  await manager.close();
});
it("falls back to the original query when Han normalization yields no BM25 tokens", async () => {
  // Same BM25 ("search") setup as the normalization test: one workspace collection.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Reply with an empty JSON result set for `search`; default mock child otherwise.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const { manager } = await createManager();

  // A single Han character is expected to leave no usable keyword after
  // normalization, so the raw query should reach qmd unchanged.
  const hits = await manager.search("記", { sessionKey: "agent:main:slack:dm:u123" });
  expect(hits).toEqual([]);

  const searchCall = spawnMock.mock.calls.find((call: unknown[]) => {
    const argv = call[1] as string[] | undefined;
    return argv?.[0] === "search";
  });
  expect(searchCall?.[1]?.[1]).toBe("記");

  await manager.close();
});
it("keeps original Han queries in qmd query mode", async () => {
  // Switch qmd to "query" mode; Han normalization is applied only for the `search` command.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "query",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Reply with an empty JSON result set for `query`; default mock child otherwise.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "query") {
      return createMockChild();
    }
    const queryChild = createMockChild({ autoClose: false });
    emitAndClose(queryChild, "stdout", "[]");
    return queryChild;
  });

  const { manager } = await createManager();

  const hits = await manager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  expect(hits).toEqual([]);

  // The query text must be forwarded verbatim (original casing, no keyword splitting).
  const queryCall = spawnMock.mock.calls.find((call: unknown[]) => {
    const argv = call[1] as string[] | undefined;
    return argv?.[0] === "query";
  });
  expect(queryCall?.[1]?.[1]).toBe("記憶系統升級 QMD");

  await manager.close();
});
it("retries search with qmd query when configured mode rejects flags", async () => {
cfg = {
...cfg,

View File

@@ -31,6 +31,7 @@ import type {
ResolvedQmdMcporterConfig,
} from "./backend-config.js";
import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
import { extractKeywords } from "./query-expansion.js";
const log = createSubsystemLogger("memory");
@@ -40,9 +41,45 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
// Case-insensitive match for textual spellings of a NUL byte: "^@", the literal
// escape sequences "\0", "\x00", "\u0000", and the phrases "null byte" / "nul byte".
const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
// Embed backoff bounds in milliseconds: 60 seconds base, 1 hour maximum.
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
// Matches any single code point in U+3400–U+9FFF (used by hasHanScript).
// NOTE(review): this range also spans U+4DC0–U+4DFF (Yijing hexagram symbols) and
// excludes supplementary-plane ideographs (U+20000 and above) — confirm that is intended.
const HAN_SCRIPT_RE = /[\u3400-\u9fff]/u;
// Upper bound on the number of keywords normalizeHanBm25Query keeps for one query.
const QMD_BM25_HAN_KEYWORD_LIMIT = 12;
// Tail of a promise chain read by runWithQmdEmbedLock; starts out already resolved.
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
/** Reports whether `value` contains at least one character matched by `HAN_SCRIPT_RE`. */
function hasHanScript(value: string): boolean {
  // HAN_SCRIPT_RE carries no `g`/`y` flag, so a positional search is equivalent to `.test()`.
  return value.search(HAN_SCRIPT_RE) >= 0;
}
/**
 * Rewrites a query containing Han characters into a space-separated keyword list
 * for qmd's BM25 `search` command.
 *
 * Blank queries and queries without any Han character are returned trimmed and
 * otherwise untouched. For the rest, keywords are taken from `extractKeywords` in
 * the order it yields them; each one is trimmed, de-duplicated, and discarded when
 * it is shorter than two characters (counted in code points for tokens containing
 * Han characters, in UTF-16 units otherwise). Collection stops once
 * `QMD_BM25_HAN_KEYWORD_LIMIT` keywords have been kept.
 *
 * @param query Raw search query.
 * @returns The kept keywords joined by single spaces, or the trimmed query when
 *   no keyword survives filtering.
 */
function normalizeHanBm25Query(query: string): string {
  const cleaned = query.trim();
  if (cleaned === "" || !hasHanScript(cleaned)) {
    return cleaned;
  }

  // A Set preserves insertion order, so it serves as both the de-dup index and
  // the ordered output list.
  const kept = new Set<string>();
  for (const rawKeyword of extractKeywords(cleaned)) {
    const token = rawKeyword.trim();
    if (token === "" || kept.has(token)) {
      continue;
    }
    // Single-character tokens are dropped: Han unigrams are usually too broad for
    // BM25 and can drown out the signal; one-character non-Han tokens are skipped too.
    const tokenLength = hasHanScript(token) ? Array.from(token).length : token.length;
    if (tokenLength < 2) {
      continue;
    }
    kept.add(token);
    if (kept.size >= QMD_BM25_HAN_KEYWORD_LIMIT) {
      break;
    }
  }

  if (kept.size === 0) {
    // Nothing usable came out of keyword extraction; fall back to the trimmed input.
    return cleaned;
  }
  return Array.from(kept).join(" ");
}
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
const previous = qmdEmbedQueueTail;
let release: (() => void) | undefined;
@@ -1728,10 +1765,11 @@ export class QmdMemoryManager implements MemorySearchManager {
query: string,
limit: number,
): string[] {
const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
if (command === "query") {
return ["query", query, "--json", "-n", String(limit)];
return ["query", normalizedQuery, "--json", "-n", String(limit)];
}
return [command, query, "--json", "-n", String(limit)];
return [command, normalizedQuery, "--json", "-n", String(limit)];
}
}