feat: add Korean language support for memory search query expansion (#18899)

* feat: add Korean stop words and tokenization for memory search

* fix: address review comments on Korean query expansion

* fix: lint errors - curly brace and toSorted

* fix(memory): improve Korean stop words and deduplicate

* Memory: tighten Korean query expansion filtering

* Docs/Changelog: credit Korean memory query expansion

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
Andrew Jeon
2026-02-22 11:33:30 +09:00
committed by GitHub
parent 5b4409d5d0
commit 853ae626fa
3 changed files with 228 additions and 3 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
- Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
### Breaking

View File

@@ -38,6 +38,63 @@ describe("extractKeywords", () => {
expect(keywords).toContain("bug");
});
it("extracts keywords from Korean conversational query", () => {
const keywords = extractKeywords("어제 논의한 배포 전략");
expect(keywords).toContain("논의한");
expect(keywords).toContain("배포");
expect(keywords).toContain("전략");
// Should not include stop words
expect(keywords).not.toContain("어제");
});
it("strips Korean particles to extract stems", () => {
const keywords = extractKeywords("서버에서 발생한 에러를 확인");
expect(keywords).toContain("서버");
expect(keywords).toContain("에러");
expect(keywords).toContain("확인");
});
it("filters Korean stop words including inflected forms", () => {
const keywords = extractKeywords("나는 그리고 그래서");
expect(keywords).not.toContain("나");
expect(keywords).not.toContain("나는");
expect(keywords).not.toContain("그리고");
expect(keywords).not.toContain("그래서");
});
it("filters inflected Korean stop words not explicitly listed", () => {
const keywords = extractKeywords("그녀는 우리는");
expect(keywords).not.toContain("그녀는");
expect(keywords).not.toContain("우리는");
expect(keywords).not.toContain("그녀");
expect(keywords).not.toContain("우리");
});
it("does not produce bogus single-char stems from particle stripping", () => {
const keywords = extractKeywords("논의");
expect(keywords).toContain("논의");
expect(keywords).not.toContain("논");
});
it("strips longest Korean trailing particles first", () => {
const keywords = extractKeywords("기능으로 설명");
expect(keywords).toContain("기능");
expect(keywords).not.toContain("기능으");
});
it("keeps stripped ASCII stems for mixed Korean tokens", () => {
const keywords = extractKeywords("API를 배포했다");
expect(keywords).toContain("api");
expect(keywords).toContain("배포했다");
});
it("handles mixed Korean and English query", () => {
const keywords = extractKeywords("API 배포에 대한 논의");
expect(keywords).toContain("api");
expect(keywords).toContain("배포");
expect(keywords).toContain("논의");
});
it("handles empty query", () => {
expect(extractKeywords("")).toEqual([]);
expect(extractKeywords(" ")).toEqual([]);

View File

@@ -118,6 +118,161 @@ const STOP_WORDS_EN = new Set([
"give",
]);
// Korean stop words filtered out of FTS query expansion.
// Grouped by part of speech; entries are matched against whole tokens and
// against particle-stripped stems (see stripKoreanTrailingParticle).
const STOP_WORDS_KO = new Set([
  // Particles (조사)
  "은",
  "는",
  "이",
  "가",
  "을",
  "를",
  "의",
  "에",
  "에서",
  "로",
  "으로",
  "와",
  "과",
  "도",
  "만",
  "까지",
  "부터",
  "한테",
  "에게",
  "께",
  "처럼",
  "같이",
  "보다",
  "마다",
  "밖에",
  "대로",
  // Pronouns (대명사)
  "나",
  "나는",
  "내가",
  "나를",
  "너",
  "우리",
  "저",
  "저희",
  "그",
  "그녀",
  "그들",
  "이것",
  "저것",
  "그것",
  "여기",
  "저기",
  "거기",
  // Common verbs / auxiliaries (일반 동사/보조 동사)
  // NOTE: "보다" (to see) is already listed above as a particle (comparative
  // "than"); Set membership is identical either way, so it is not repeated.
  "있다",
  "없다",
  "하다",
  "되다",
  "이다",
  "아니다",
  "주다",
  "오다",
  "가다",
  // Nouns (의존 명사 / vague)
  "것",
  "거",
  "등",
  "수",
  "때",
  "곳",
  "중",
  "분",
  // Adverbs
  "잘",
  "더",
  "또",
  "매우",
  "정말",
  "아주",
  "많이",
  "너무",
  "좀",
  // Conjunctions
  "그리고",
  "하지만",
  "그래서",
  "그런데",
  "그러나",
  "또는",
  "그러면",
  // Question words
  "왜",
  "어떻게",
  "뭐",
  "언제",
  "어디",
  "누구",
  "무엇",
  "어떤",
  // Time (vague)
  "어제",
  "오늘",
  "내일",
  "최근",
  "지금",
  "아까",
  "나중",
  "전에",
  // Request words
  "제발",
  "부탁",
]);
// Common Korean trailing particles stripped from words during tokenization.
// Kept in descending-length order so that longest-match-first is guaranteed
// when scanning (e.g. "기능으로" strips "으로", not just "로").
const KO_TRAILING_PARTICLES = [
  // Two-syllable particles
  "에서",
  "으로",
  "에게",
  "한테",
  "처럼",
  "같이",
  "보다",
  "까지",
  "부터",
  "마다",
  "밖에",
  "대로",
  // Single-syllable particles
  "은",
  "는",
  "이",
  "가",
  "을",
  "를",
  "의",
  "에",
  "로",
  "와",
  "과",
  "도",
  "만",
].sort((a, b) => b.length - a.length);
/**
 * Strips the longest known trailing particle from a Korean token.
 *
 * Returns the remaining stem, or null when no particle matches or the token
 * is the particle itself (token must be strictly longer than the particle).
 */
function stripKoreanTrailingParticle(token: string): string | null {
  const matched = KO_TRAILING_PARTICLES.find(
    (particle) => token.endsWith(particle) && token.length > particle.length,
  );
  if (matched === undefined) {
    return null;
  }
  return token.slice(0, token.length - matched.length);
}
/**
 * Decides whether a particle-stripped stem is worth emitting as a keyword.
 *
 * Hangul stems must be at least two syllables, preventing bogus fragments
 * such as "논의" -> "논". Stems with no Hangul are kept only when they are
 * plain ASCII word characters, e.g. "API를" -> "api".
 */
function isUsefulKoreanStem(stem: string): boolean {
  const hasHangul = /[\uac00-\ud7af]/.test(stem);
  return hasHangul ? stem.length >= 2 : /^[a-z0-9_]+$/i.test(stem);
}
const STOP_WORDS_ZH = new Set([
// Pronouns
"我",
@@ -240,7 +395,7 @@ function isValidKeyword(token: string): boolean {
}
/**
* Simple tokenizer that handles both English and Chinese text.
* Simple tokenizer that handles English, Chinese, and Korean text.
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
* For English, we split on whitespace and punctuation.
*/
@@ -252,7 +407,7 @@ function tokenize(text: string): string[] {
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
for (const segment of segments) {
// Check if segment contains CJK characters
// Check if segment contains CJK characters (Chinese)
if (/[\u4e00-\u9fff]/.test(segment)) {
// For Chinese, extract character n-grams (unigrams and bigrams)
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
@@ -262,6 +417,18 @@ function tokenize(text: string): string[] {
for (let i = 0; i < chars.length - 1; i++) {
tokens.push(chars[i] + chars[i + 1]);
}
} else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
// For Korean (Hangul syllables and jamo), keep the word as-is unless it is
// effectively a stop word once trailing particles are removed.
const stem = stripKoreanTrailingParticle(segment);
const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
tokens.push(segment);
}
// Also emit particle-stripped stems when they are useful keywords.
if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
tokens.push(stem);
}
} else {
// For non-CJK, keep as single token
tokens.push(segment);
@@ -286,7 +453,7 @@ export function extractKeywords(query: string): string[] {
for (const token of tokens) {
// Skip stop words
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token)) {
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
continue;
}
// Skip invalid keywords