mirror of
https://github.com/moltbot/moltbot.git
synced 2026-03-09 15:35:17 +00:00
Memory/QMD: normalize Han-script BM25 search queries
This commit is contained in:
@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/Config reload: compare array-valued config paths structurally during diffing so unchanged `memory.qmd.paths` and `memory.qmd.scope.rules` no longer trigger false restart-required reloads. (#23185) Thanks @rex05ai.
|
||||
- Cron/Scheduling: validate runtime cron expressions before schedule/stagger evaluation so malformed persisted jobs report a clear `invalid cron schedule: expr is required` error instead of crashing with `undefined.trim` failures and auto-disable churn. (#23223) Thanks @asimons81.
|
||||
- Memory/QMD: migrate legacy unscoped collection bindings (for example `memory-root`) to per-agent scoped names (for example `memory-root-main`) during startup when safe, so QMD-backed `memory_search` no longer fails with `Collection not found` after upgrades. (#23228, #20727) Thanks @JLDynamics and @AaronFaby.
|
||||
- Memory/QMD: normalize Han-script BM25 search queries before invoking `qmd search` so mixed CJK+Latin prompts no longer return empty results due to tokenizer mismatch. (#23426) Thanks @LunaLee0130.
|
||||
- TUI/Input: enable multiline-paste burst coalescing on macOS Terminal.app and iTerm so pasted blocks no longer submit line-by-line as separate messages. (#18809) Thanks @fwends.
|
||||
- TUI/RTL: isolate right-to-left script lines (Arabic/Hebrew ranges) with Unicode bidi isolation marks in TUI text sanitization so RTL assistant output no longer renders in reversed visual order in terminal chat panes. (#21936) Thanks @Asm3r96.
|
||||
- TUI/Status: request immediate renders after setting `sending`/`waiting` activity states so in-flight runs always show visible progress indicators instead of appearing idle until completion. (#21549) Thanks @13Guinness.
|
||||
|
||||
@@ -729,6 +729,121 @@ describe("QmdMemoryManager", () => {
|
||||
await manager.close();
|
||||
});
|
||||
|
||||
it("normalizes mixed Han-script BM25 queries before qmd search", async () => {
  // Run the qmd backend in `search` mode against a single named collection.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Only `search` invocations get a hand-driven child that prints an empty JSON array;
  // every other spawn uses the default mock child.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const { manager, resolved } = await createManager();
  const maxResults = resolved.qmd?.limits.maxResults;
  if (!maxResults) {
    throw new Error("qmd maxResults missing");
  }

  const hits = await manager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  expect(hits).toEqual([]);

  // The mixed Han + Latin query must reach qmd as Han bigrams plus the lowercased Latin term.
  const searchArgs = spawnMock.mock.calls
    .map((call: unknown[]) => call[1] as string[])
    .find((argv: string[]) => argv?.[0] === "search");
  expect(searchArgs).toEqual([
    "search",
    "記憶 憶系 系統 統升 升級 qmd",
    "--json",
    "-n",
    String(maxResults),
    "-c",
    "workspace-main",
  ]);

  await manager.close();
});
|
||||
|
||||
it("falls back to the original query when Han normalization yields no BM25 tokens", async () => {
  // Run the qmd backend in `search` mode against a single named collection.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Only `search` invocations get a hand-driven child that prints an empty JSON array.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const { manager } = await createManager();
  const hits = await manager.search("記", { sessionKey: "agent:main:slack:dm:u123" });
  expect(hits).toEqual([]);

  // A lone Han character must be forwarded to qmd exactly as the caller supplied it.
  const searchArgs = spawnMock.mock.calls
    .map((call: unknown[]) => call[1] as string[])
    .find((argv: string[]) => argv?.[0] === "search");
  expect(searchArgs?.[1]).toBe("記");

  await manager.close();
});
|
||||
|
||||
it("keeps original Han queries in qmd query mode", async () => {
  // Same collection setup as the BM25 cases, but with `searchMode` switched to `query`.
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "query",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
      },
    },
  } as OpenClawConfig;

  // Only `query` invocations get a hand-driven child that prints an empty JSON array.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "query") {
      return createMockChild();
    }
    const queryChild = createMockChild({ autoClose: false });
    emitAndClose(queryChild, "stdout", "[]");
    return queryChild;
  });

  const { manager } = await createManager();
  const hits = await manager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  expect(hits).toEqual([]);

  // In `query` mode the text must be passed through without Han normalization.
  const queryArgs = spawnMock.mock.calls
    .map((call: unknown[]) => call[1] as string[])
    .find((argv: string[]) => argv?.[0] === "query");
  expect(queryArgs?.[1]).toBe("記憶系統升級 QMD");

  await manager.close();
});
|
||||
|
||||
it("retries search with qmd query when configured mode rejects flags", async () => {
|
||||
cfg = {
|
||||
...cfg,
|
||||
|
||||
@@ -31,6 +31,7 @@ import type {
|
||||
ResolvedQmdMcporterConfig,
|
||||
} from "./backend-config.js";
|
||||
import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
|
||||
import { extractKeywords } from "./query-expansion.js";
|
||||
|
||||
const log = createSubsystemLogger("memory");
|
||||
|
||||
@@ -40,9 +41,45 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
|
||||
const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
|
||||
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
|
||||
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
|
||||
// Matches any code point whose Unicode Script property is Han. The previous literal range
// (U+3400–U+9FFF) missed compatibility ideographs and every supplementary-plane extension,
// so queries made only of those characters skipped BM25 normalization entirely.
const HAN_SCRIPT_RE = /\p{Script=Han}/u;
// Maximum number of keywords kept when a Han-script query is rewritten for BM25 search.
const QMD_BM25_HAN_KEYWORD_LIMIT = 12;
|
||||
|
||||
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
|
||||
|
||||
/** Reports whether `value` contains at least one character matched by `HAN_SCRIPT_RE`. */
function hasHanScript(value: string): boolean {
  return value.search(HAN_SCRIPT_RE) !== -1;
}
|
||||
|
||||
/**
 * Rewrites a Han-script query as a space-separated keyword list for BM25 search.
 *
 * A query that is empty after trimming, or that contains no Han characters, is returned
 * trimmed and otherwise unchanged. Otherwise the keywords from `extractKeywords` are trimmed,
 * de-duplicated in first-seen order, stripped of single-character tokens, and capped at
 * `QMD_BM25_HAN_KEYWORD_LIMIT` entries.
 *
 * @param query Raw search text.
 * @returns The joined keywords, or the trimmed query when no keyword survives filtering.
 */
function normalizeHanBm25Query(query: string): string {
  const cleaned = query.trim();
  if (cleaned.length === 0 || !hasHanScript(cleaned)) {
    return cleaned;
  }

  // A Set keeps insertion order, so it serves as both the de-dupe index and the result list.
  const kept = new Set<string>();
  for (const rawKeyword of extractKeywords(cleaned)) {
    const candidate = rawKeyword.trim();
    if (candidate.length === 0 || kept.has(candidate)) {
      continue;
    }
    // Han unigrams are usually too broad for BM25 and can drown signal. Han tokens are
    // measured in code points; other tokens in UTF-16 units.
    const size = hasHanScript(candidate) ? Array.from(candidate).length : candidate.length;
    if (size < 2) {
      continue;
    }
    kept.add(candidate);
    if (kept.size >= QMD_BM25_HAN_KEYWORD_LIMIT) {
      break;
    }
  }

  return kept.size > 0 ? Array.from(kept).join(" ") : cleaned;
}
|
||||
|
||||
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
|
||||
const previous = qmdEmbedQueueTail;
|
||||
let release: (() => void) | undefined;
|
||||
@@ -1728,10 +1765,11 @@ export class QmdMemoryManager implements MemorySearchManager {
|
||||
query: string,
|
||||
limit: number,
|
||||
): string[] {
|
||||
const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
|
||||
if (command === "query") {
|
||||
return ["query", query, "--json", "-n", String(limit)];
|
||||
return ["query", normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
return [command, query, "--json", "-n", String(limit)];
|
||||
return [command, normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user