mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-23 22:55:24 +00:00
fix(memory): account for CJK characters in QMD memory chunking
The QMD memory system uses a fixed 4:1 chars-to-tokens ratio for chunk sizing, which severely underestimates CJK (Chinese/Japanese/Korean) text where each character is roughly 1 token. This causes oversized chunks for CJK users, degrading vector search quality and wasting context window space. Changes: - Add shared src/utils/cjk-chars.ts module with CJK-aware character counting (estimateStringChars) and token estimation helpers - Update chunkMarkdown() in src/memory/internal.ts to use weighted character lengths for chunk boundary decisions and overlap calculation - Replace hardcoded estimateTokensFromChars in the context report command with the shared utility - Add 13 unit tests for the CJK estimation module and 5 new tests for CJK-aware memory chunking behavior Backward compatible: pure ASCII/Latin text behavior is unchanged. Closes #39965 Related: #40216
This commit is contained in:
committed by
Peter Steinberger
parent
7f46b03de0
commit
971ecabe80
@@ -5,14 +5,11 @@ import {
|
||||
} from "../../agents/pi-embedded-helpers.js";
|
||||
import { buildSystemPromptReport } from "../../agents/system-prompt-report.js";
|
||||
import type { SessionSystemPromptReport } from "../../config/sessions/types.js";
|
||||
import { estimateTokensFromChars } from "../../utils/cjk-chars.js";
|
||||
import type { ReplyPayload } from "../types.js";
|
||||
import { resolveCommandsSystemPromptBundle } from "./commands-system-prompt.js";
|
||||
import type { HandleCommandsParams } from "./commands-types.js";
|
||||
|
||||
function estimateTokensFromChars(chars: number): number {
|
||||
return Math.ceil(Math.max(0, chars) / 4);
|
||||
}
|
||||
|
||||
function formatInt(n: number): string {
|
||||
return new Intl.NumberFormat("en-US").format(n);
|
||||
}
|
||||
|
||||
105
src/utils/cjk-chars.test.ts
Normal file
105
src/utils/cjk-chars.test.ts
Normal file
@@ -0,0 +1,105 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
CHARS_PER_TOKEN_ESTIMATE,
|
||||
estimateStringChars,
|
||||
estimateTokensFromChars,
|
||||
} from "./cjk-chars.js";
|
||||
|
||||
describe("estimateStringChars", () => {
|
||||
it("returns plain string length for ASCII text", () => {
|
||||
expect(estimateStringChars("hello world")).toBe(11);
|
||||
});
|
||||
|
||||
it("returns 0 for empty string", () => {
|
||||
expect(estimateStringChars("")).toBe(0);
|
||||
});
|
||||
|
||||
it("counts Chinese characters with extra weight", () => {
|
||||
// "你好世" = 3 CJK chars
|
||||
// Each CJK char counted as CHARS_PER_TOKEN_ESTIMATE (4) chars
|
||||
// .length = 3, adjusted = 3 + 3 * (4 - 1) = 12
|
||||
expect(estimateStringChars("你好世")).toBe(12);
|
||||
});
|
||||
|
||||
it("handles mixed ASCII and CJK text", () => {
|
||||
// "hi你好" = 2 ASCII + 2 CJK
|
||||
// .length = 4, adjusted = 4 + 2 * 3 = 10
|
||||
expect(estimateStringChars("hi你好")).toBe(10);
|
||||
});
|
||||
|
||||
it("handles Japanese hiragana", () => {
|
||||
// "こんにちは" = 5 hiragana chars
|
||||
// .length = 5, adjusted = 5 + 5 * 3 = 20
|
||||
expect(estimateStringChars("こんにちは")).toBe(20);
|
||||
});
|
||||
|
||||
it("handles Japanese katakana", () => {
|
||||
// "カタカナ" = 4 katakana chars
|
||||
// .length = 4, adjusted = 4 + 4 * 3 = 16
|
||||
expect(estimateStringChars("カタカナ")).toBe(16);
|
||||
});
|
||||
|
||||
it("handles Korean hangul", () => {
|
||||
// "안녕하세요" = 5 hangul chars
|
||||
// .length = 5, adjusted = 5 + 5 * 3 = 20
|
||||
expect(estimateStringChars("안녕하세요")).toBe(20);
|
||||
});
|
||||
|
||||
it("handles CJK punctuation and symbols in the extended range", () => {
|
||||
// "⺀" (U+2E80) is in CJK Radicals Supplement range
|
||||
expect(estimateStringChars("⺀")).toBe(CHARS_PER_TOKEN_ESTIMATE);
|
||||
});
|
||||
|
||||
it("does not inflate standard Latin characters", () => {
|
||||
const latin = "The quick brown fox jumps over the lazy dog";
|
||||
expect(estimateStringChars(latin)).toBe(latin.length);
|
||||
});
|
||||
|
||||
it("does not inflate numbers and basic punctuation", () => {
|
||||
const text = "123.45, hello! @#$%";
|
||||
expect(estimateStringChars(text)).toBe(text.length);
|
||||
});
|
||||
|
||||
it("counts CJK Extension B characters as one code point", () => {
|
||||
// "𠀀" (U+20000) is represented as a surrogate pair in UTF-16.
|
||||
// Result = 1 + 1 * 3 = 4 (exactly CHARS_PER_TOKEN_ESTIMATE)
|
||||
expect(estimateStringChars("𠀀")).toBe(CHARS_PER_TOKEN_ESTIMATE);
|
||||
});
|
||||
|
||||
it("handles mixed BMP and Extension B CJK consistently", () => {
|
||||
// 3 CJK code points total: 你 + 𠀀 + 好 => 3 * 4 = 12
|
||||
expect(estimateStringChars("你𠀀好")).toBe(12);
|
||||
});
|
||||
|
||||
it("does not collapse non-CJK surrogate pairs like emoji", () => {
|
||||
// Emoji is a surrogate pair in UTF-16, but not matched by NON_LATIN_RE.
|
||||
// Its weighted length should remain the UTF-16 length (2).
|
||||
expect(estimateStringChars("😀")).toBe(2);
|
||||
});
|
||||
|
||||
it("keeps mixed CJK and emoji weighting consistent", () => {
|
||||
// "你" counts as 4, emoji remains 2 => total 6
|
||||
expect(estimateStringChars("你😀")).toBe(6);
|
||||
});
|
||||
|
||||
it("yields ~1 token per CJK char when divided by CHARS_PER_TOKEN_ESTIMATE", () => {
|
||||
// 10 CJK chars should estimate as ~10 tokens
|
||||
const cjk = "这是一个测试用的句子呢";
|
||||
const estimated = estimateStringChars(cjk);
|
||||
const tokens = Math.ceil(estimated / CHARS_PER_TOKEN_ESTIMATE);
|
||||
// Each CJK char ≈ 1 token, so tokens should be close to string length
|
||||
expect(tokens).toBe(cjk.length);
|
||||
});
|
||||
});
|
||||
|
||||
describe("estimateTokensFromChars", () => {
|
||||
it("divides by CHARS_PER_TOKEN_ESTIMATE and rounds up", () => {
|
||||
expect(estimateTokensFromChars(8)).toBe(2);
|
||||
expect(estimateTokensFromChars(9)).toBe(3);
|
||||
expect(estimateTokensFromChars(0)).toBe(0);
|
||||
});
|
||||
|
||||
it("clamps negative values to 0", () => {
|
||||
expect(estimateTokensFromChars(-10)).toBe(0);
|
||||
});
|
||||
});
|
||||
81
src/utils/cjk-chars.ts
Normal file
81
src/utils/cjk-chars.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
/**
|
||||
* CJK-aware character counting for accurate token estimation.
|
||||
*
|
||||
* Most LLM tokenizers encode CJK (Chinese, Japanese, Korean) characters as
|
||||
* roughly 1 token per character, whereas Latin/ASCII text averages ~1 token
|
||||
* per 4 characters. When the codebase estimates tokens as `chars / 4`, CJK
|
||||
* content is underestimated by 2–4×.
|
||||
*
|
||||
* This module provides a shared helper that inflates the character count of
|
||||
* CJK text so that the standard `chars / 4` formula yields an accurate
|
||||
* token estimate for any script.
|
||||
*/
|
||||
|
||||
/**
 * Default characters-per-token ratio used throughout the codebase.
 * Latin text ≈ 4 chars/token; CJK ≈ 1 char/token.
 */
export const CHARS_PER_TOKEN_ESTIMATE = 4;

/**
 * Matches CJK Unified Ideographs, CJK Extension A/B, CJK Compatibility
 * Ideographs, Hangul Syllables, Hiragana, Katakana, and other non-Latin
 * scripts that typically use ~1 token per character.
 *
 * The `u` flag makes the astral range U+20000–U+2FA1F match whole surrogate
 * pairs as single code points; the `g` flag lets `String#match` return every
 * occurrence so callers can count matches.
 */
const NON_LATIN_RE = /[\u2E80-\u9FFF\uA000-\uA4FF\uAC00-\uD7AF\uF900-\uFAFF\u{20000}-\u{2FA1F}]/gu;
|
||||
|
||||
/**
|
||||
* Return an adjusted character length that accounts for non-Latin (CJK, etc.)
|
||||
* characters. Each non-Latin character is counted as
|
||||
* {@link CHARS_PER_TOKEN_ESTIMATE} chars so that the downstream
|
||||
* `chars / CHARS_PER_TOKEN_ESTIMATE` token estimate remains accurate.
|
||||
*
|
||||
* For pure ASCII/Latin text the return value equals `text.length` (no change).
|
||||
*/
|
||||
export function estimateStringChars(text: string): number {
|
||||
if (text.length === 0) {
|
||||
return 0;
|
||||
}
|
||||
const nonLatinCount = (text.match(NON_LATIN_RE) ?? []).length;
|
||||
// Use code-point length instead of UTF-16 length so that surrogate pairs
|
||||
// (CJK Extension B+, U+20000–U+2FA1F) are counted as 1 character, not 2.
|
||||
const codePointLength = countCodePoints(text, nonLatinCount);
|
||||
// Non-Latin chars already contribute 1 to codePointLength, so add the extra weight.
|
||||
return codePointLength + nonLatinCount * (CHARS_PER_TOKEN_ESTIMATE - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches surrogate pairs whose code point falls in the CJK Extension B+
|
||||
* range (U+20000–U+2FA1F). Only these surrogates need adjustment because
|
||||
* they are matched by {@link NON_LATIN_RE} and already counted in
|
||||
* `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by
|
||||
* that regex, so collapsing them would create an inconsistency.
|
||||
*
|
||||
* High-surrogate range for U+20000–U+2FA1F is D840–D87E.
|
||||
*/
|
||||
const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87E][\uDC00-\uDFFF]/g;
|
||||
|
||||
/**
|
||||
* Return the code-point-aware length of the string, adjusting only for
|
||||
* CJK Extension B+ surrogate pairs. For text without such characters
|
||||
* (the vast majority of inputs) this returns `text.length` unchanged.
|
||||
*/
|
||||
function countCodePoints(text: string, nonLatinCount: number): number {
|
||||
if (nonLatinCount === 0) {
|
||||
return text.length;
|
||||
}
|
||||
// Count only CJK-range surrogate pairs — each occupies 2 UTF-16 units
|
||||
// but represents 1 code point (and 1 regex match in NON_LATIN_RE).
|
||||
const cjkSurrogates = (text.match(CJK_SURROGATE_HIGH_RE) ?? []).length;
|
||||
return text.length - cjkSurrogates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate the number of tokens from a raw character count.
|
||||
*
|
||||
* For a more accurate estimate when the source text is available, prefer
|
||||
* `estimateStringChars(text) / CHARS_PER_TOKEN_ESTIMATE` instead.
|
||||
*/
|
||||
export function estimateTokensFromChars(chars: number): number {
|
||||
return Math.ceil(Math.max(0, chars) / CHARS_PER_TOKEN_ESTIMATE);
|
||||
}
|
||||
Reference in New Issue
Block a user