mirror of
https://github.com/moltbot/moltbot.git
synced 2026-03-07 22:44:16 +00:00
feat: add Korean language support for memory search query expansion (#18899)
* feat: add Korean stop words and tokenization for memory search
* fix: address review comments on Korean query expansion
* fix: lint errors - curly brace and toSorted
* fix(memory): improve Korean stop words and deduplicate
* Memory: tighten Korean query expansion filtering
* Docs/Changelog: credit Korean memory query expansion

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
|
||||
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
|
||||
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
|
||||
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
|
||||
|
||||
### Breaking
|
||||
|
||||
@@ -38,6 +38,63 @@ describe("extractKeywords", () => {
|
||||
expect(keywords).toContain("bug");
|
||||
});
|
||||
|
||||
it("extracts keywords from Korean conversational query", () => {
  const extracted = extractKeywords("어제 논의한 배포 전략");

  // Content words must come through extraction.
  for (const expected of ["논의한", "배포", "전략"]) {
    expect(extracted).toContain(expected);
  }

  // "어제" is listed as a vague time word in the Korean stop-word set.
  expect(extracted).not.toContain("어제");
});
|
||||
|
||||
it("strips Korean particles to extract stems", () => {
  const extracted = extractKeywords("서버에서 발생한 에러를 확인");

  // "서버에서" and "에러를" end in the particles "에서" and "를"; the bare
  // stems must be present alongside the particle-free word "확인".
  for (const expected of ["서버", "에러", "확인"]) {
    expect(extracted).toContain(expected);
  }
});
|
||||
|
||||
it("filters Korean stop words including inflected forms", () => {
  const extracted = extractKeywords("나는 그리고 그래서");

  // Neither the listed stop words nor the bare pronoun stem may leak through.
  for (const rejected of ["나", "나는", "그리고", "그래서"]) {
    expect(extracted).not.toContain(rejected);
  }
});
|
||||
|
||||
it("filters inflected Korean stop words not explicitly listed", () => {
  const extracted = extractKeywords("그녀는 우리는");

  // The inflected forms are not in the stop-word set themselves; they must be
  // rejected because their particle-stripped stems are, and the stems must
  // not be emitted either.
  for (const rejected of ["그녀는", "우리는", "그녀", "우리"]) {
    expect(extracted).not.toContain(rejected);
  }
});
|
||||
|
||||
it("does not produce bogus single-char stems from particle stripping", () => {
  const extracted = extractKeywords("논의");

  // The full word is kept ...
  expect(extracted).toContain("논의");
  // ... but its final syllable "의" must not be treated as a particle that
  // leaves the one-syllable stem "논" behind.
  expect(extracted).not.toContain("논");
});
|
||||
|
||||
it("strips longest Korean trailing particles first", () => {
  const extracted = extractKeywords("기능으로 설명");

  // "으로" must win over the shorter "로", giving "기능" rather than "기능으".
  expect(extracted).toContain("기능");
  expect(extracted).not.toContain("기능으");
});
|
||||
|
||||
it("keeps stripped ASCII stems for mixed Korean tokens", () => {
  const extracted = extractKeywords("API를 배포했다");

  // The ASCII stem of "API를" is expected in lowercase form.
  expect(extracted).toContain("api");
  // A word with no recognised trailing particle is kept whole.
  expect(extracted).toContain("배포했다");
});
|
||||
|
||||
it("handles mixed Korean and English query", () => {
  const extracted = extractKeywords("API 배포에 대한 논의");

  // English and Korean terms from the same query must both be extracted,
  // with the particle "에" removed from "배포에".
  for (const expected of ["api", "배포", "논의"]) {
    expect(extracted).toContain(expected);
  }
});
|
||||
|
||||
it("handles empty query", () => {
|
||||
expect(extractKeywords("")).toEqual([]);
|
||||
expect(extractKeywords(" ")).toEqual([]);
|
||||
|
||||
@@ -118,6 +118,161 @@ const STOP_WORDS_EN = new Set([
|
||||
"give",
|
||||
]);
|
||||
|
||||
/**
 * Korean words that carry no search value on their own. Tokens (and
 * particle-stripped stems) found in this set are dropped during keyword
 * extraction.
 */
const STOP_WORDS_KO = new Set([
  // Particles (postpositions)
  "은", "는", "이", "가", "을", "를", "의", "에", "에서", "로", "으로", "와", "과",
  "도", "만", "까지", "부터", "한테", "에게", "께", "처럼", "같이", "보다", "마다",
  "밖에", "대로",
  // Pronouns, including a few pre-inflected first-person forms
  "나", "나는", "내가", "나를", "너", "우리", "저", "저희", "그", "그녀", "그들",
  "이것", "저것", "그것", "여기", "저기", "거기",
  // Common verbs / auxiliaries
  "있다", "없다", "하다", "되다", "이다", "아니다",
  "보다", // also listed with the particles above; the Set collapses the repeat
  "주다", "오다", "가다",
  // Dependent / vague nouns
  "것", "거", "등", "수", "때", "곳", "중", "분",
  // Adverbs
  "잘", "더", "또", "매우", "정말", "아주", "많이", "너무", "좀",
  // Conjunctions
  "그리고", "하지만", "그래서", "그런데", "그러나", "또는", "그러면",
  // Question words
  "왜", "어떻게", "뭐", "언제", "어디", "누구", "무엇", "어떤",
  // Vague time expressions
  "어제", "오늘", "내일", "최근", "지금", "아까", "나중", "전에",
  // Request words
  "제발", "부탁",
]);
|
||||
|
||||
/**
 * Particles that may be attached to the end of a Korean word and are stripped
 * to recover a stem. The list is sorted longest-first so that, for example,
 * "으로" is tried before "로".
 */
const KO_TRAILING_PARTICLES = [
  // Two-syllable particles
  "에서", "으로", "에게", "한테", "처럼", "같이", "보다", "까지", "부터", "마다", "밖에", "대로",
  // One-syllable particles
  "은", "는", "이", "가", "을", "를", "의", "에", "로", "와", "과", "도", "만",
].toSorted((left, right) => right.length - left.length);
|
||||
|
||||
/**
 * Removes one trailing particle from a token.
 *
 * Candidates are tried in the order of KO_TRAILING_PARTICLES and the first
 * match wins. A particle only matches when the token is strictly longer than
 * it, so the result is never an empty string.
 *
 * @param token - The word to inspect.
 * @returns The token without its trailing particle, or `null` when no
 *   particle matches.
 */
function stripKoreanTrailingParticle(token: string): string | null {
  const matched = KO_TRAILING_PARTICLES.find(
    (candidate) => token.length > candidate.length && token.endsWith(candidate),
  );
  return matched === undefined ? null : token.slice(0, -matched.length);
}
|
||||
|
||||
/**
 * Decides whether a particle-stripped stem is worth emitting as a keyword.
 *
 * @param stem - The remainder of a token after a trailing particle was removed.
 * @returns `true` for stems that contain a Hangul syllable and are at least
 *   two characters long, or that consist solely of ASCII letters, digits and
 *   underscores; `false` otherwise.
 */
function isUsefulKoreanStem(stem: string): boolean {
  const containsHangulSyllable = /[\uac00-\ud7af]/.test(stem);

  // Hangul stems: reject one-syllable leftovers such as "논의" -> "논".
  // Everything else: keep only plain ASCII stems from mixed tokens such as
  // "API를" -> "api".
  return containsHangulSyllable ? stem.length >= 2 : /^[a-z0-9_]+$/i.test(stem);
}
|
||||
|
||||
const STOP_WORDS_ZH = new Set([
|
||||
// Pronouns
|
||||
"我",
|
||||
@@ -240,7 +395,7 @@ function isValidKeyword(token: string): boolean {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple tokenizer that handles both English and Chinese text.
|
||||
* Simple tokenizer that handles English, Chinese, and Korean text.
|
||||
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||
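* For English, we split on whitespace and punctuation.
* For Korean, we keep each word unless it (or its particle-stripped stem) is a stop word,
* and additionally emit the particle-stripped stem when it is a useful keyword.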
|
||||
*/
|
||||
@@ -252,7 +407,7 @@ function tokenize(text: string): string[] {
|
||||
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
|
||||
|
||||
for (const segment of segments) {
|
||||
// Check if segment contains CJK characters
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||
@@ -262,6 +417,18 @@ function tokenize(text: string): string[] {
|
||||
for (let i = 0; i < chars.length - 1; i++) {
|
||||
tokens.push(chars[i] + chars[i + 1]);
|
||||
}
|
||||
} else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
|
||||
// For Korean (Hangul syllables and jamo), keep the word as-is unless it is
|
||||
// effectively a stop word once trailing particles are removed.
|
||||
const stem = stripKoreanTrailingParticle(segment);
|
||||
const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
|
||||
if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
|
||||
tokens.push(segment);
|
||||
}
|
||||
// Also emit particle-stripped stems when they are useful keywords.
|
||||
if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
|
||||
tokens.push(stem);
|
||||
}
|
||||
} else {
|
||||
// For non-CJK, keep as single token
|
||||
tokens.push(segment);
|
||||
@@ -286,7 +453,7 @@ export function extractKeywords(query: string): string[] {
|
||||
|
||||
for (const token of tokens) {
|
||||
// Skip stop words
|
||||
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token)) {
|
||||
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
|
||||
continue;
|
||||
}
|
||||
// Skip invalid keywords
|
||||
|
||||
Reference in New Issue
Block a user