feat: add Korean language support for memory search query expansion (#18899)

* feat: add Korean stop words and tokenization for memory search
* fix: address review comments on Korean query expansion
* fix: lint errors - curly brace and toSorted
* fix(memory): improve Korean stop words and deduplicate
* Memory: tighten Korean query expansion filtering
* Docs/Changelog: credit Korean memory query expansion

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
2026-03-07 22:44:16 +00:00 · 2026-02-22 11:33:30 +09:00
parent 5b4409d5d0
commit 853ae626fa
3 changed files with 228 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai

 - Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
 - Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
+- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
 - iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.

 ### Breaking
--- a/src/memory/query-expansion.test.ts
+++ b/src/memory/query-expansion.test.ts
@@ -38,6 +38,63 @@ describe("extractKeywords", () => {
    expect(keywords).toContain("bug");
  });

+  it("extracts keywords from Korean conversational query", () => {
+    const keywords = extractKeywords("어제 논의한 배포 전략");
+    expect(keywords).toContain("논의한");
+    expect(keywords).toContain("배포");
+    expect(keywords).toContain("전략");
+    // "어제" is listed in STOP_WORDS_KO, so it must be filtered out.
+    expect(keywords).not.toContain("어제");
+  });
+
+  it("strips Korean particles to extract stems", () => {
+    const keywords = extractKeywords("서버에서 발생한 에러를 확인");
+    expect(keywords).toContain("서버"); // "서버에서" minus trailing particle "에서"
+    expect(keywords).toContain("에러"); // "에러를" minus trailing particle "를"
+    expect(keywords).toContain("확인"); // ends in no listed particle; kept whole
+  });
+
+  it("filters Korean stop words including inflected forms", () => {
+    const keywords = extractKeywords("나는 그리고 그래서");
+    expect(keywords).not.toContain("나"); // stem of "나는"; itself a listed stop word
+    expect(keywords).not.toContain("나는"); // listed explicitly in STOP_WORDS_KO
+    expect(keywords).not.toContain("그리고");
+    expect(keywords).not.toContain("그래서");
+  });
+
+  it("filters inflected Korean stop words not explicitly listed", () => {
+    const keywords = extractKeywords("그녀는 우리는"); // only the stems "그녀" / "우리" are listed
+    expect(keywords).not.toContain("그녀는"); // dropped because its stripped stem is a stop word
+    expect(keywords).not.toContain("우리는");
+    expect(keywords).not.toContain("그녀");
+    expect(keywords).not.toContain("우리");
+  });
+
+  it("does not produce bogus single-char stems from particle stripping", () => {
+    const keywords = extractKeywords("논의");
+    expect(keywords).toContain("논의");
+    expect(keywords).not.toContain("논"); // "논의" ends in "의", but a one-syllable stem is rejected
+  });
+
+  it("strips longest Korean trailing particles first", () => {
+    const keywords = extractKeywords("기능으로 설명");
+    expect(keywords).toContain("기능"); // "으로" must match before the shorter "로"
+    expect(keywords).not.toContain("기능으"); // would appear if "로" were stripped first
+  });
+
+  it("keeps stripped ASCII stems for mixed Korean tokens", () => {
+    const keywords = extractKeywords("API를 배포했다");
+    expect(keywords).toContain("api"); // lowercase expected; presumably normalized before tokenizing (confirm in tokenize)
+    expect(keywords).toContain("배포했다"); // ends in no listed particle; kept whole
+  });
+
+  it("handles mixed Korean and English query", () => {
+    const keywords = extractKeywords("API 배포에 대한 논의");
+    expect(keywords).toContain("api");
+    expect(keywords).toContain("배포"); // "배포에" minus trailing particle "에"
+    expect(keywords).toContain("논의");
+  });
+
  it("handles empty query", () => {
    expect(extractKeywords("")).toEqual([]);
    expect(extractKeywords("   ")).toEqual([]);
--- a/src/memory/query-expansion.ts
+++ b/src/memory/query-expansion.ts
@@ -118,6 +118,161 @@ const STOP_WORDS_EN = new Set([
  "give",
 ]);

+const STOP_WORDS_KO = new Set([ // Korean tokens excluded from keywords; looked up by exact match
+  // Particles (조사)
+  "은",
+  "는",
+  "이",
+  "가",
+  "을",
+  "를",
+  "의",
+  "에",
+  "에서",
+  "로",
+  "으로",
+  "와",
+  "과",
+  "도",
+  "만",
+  "까지",
+  "부터",
+  "한테",
+  "에게",
+  "께",
+  "처럼",
+  "같이",
+  "보다",
+  "마다",
+  "밖에",
+  "대로",
+  // Pronouns (대명사)
+  "나",
+  "나는",
+  "내가",
+  "나를",
+  "너",
+  "우리",
+  "저",
+  "저희",
+  "그",
+  "그녀",
+  "그들",
+  "이것",
+  "저것",
+  "그것",
+  "여기",
+  "저기",
+  "거기",
+  // Common verbs / auxiliaries (일반 동사/보조 동사); exact-match only, so conjugated forms are not filtered
+  "있다",
+  "없다",
+  "하다",
+  "되다",
+  "이다",
+  "아니다",
+  "보다", // duplicate of the particle entry earlier in this list; harmless because Set de-duplicates
+  "주다",
+  "오다",
+  "가다",
+  // Dependent / vague nouns (의존 명사)
+  "것",
+  "거",
+  "등",
+  "수",
+  "때",
+  "곳",
+  "중",
+  "분",
+  // Adverbs
+  "잘",
+  "더",
+  "또",
+  "매우",
+  "정말",
+  "아주",
+  "많이",
+  "너무",
+  "좀",
+  // Conjunctions
+  "그리고",
+  "하지만",
+  "그래서",
+  "그런데",
+  "그러나",
+  "또는",
+  "그러면",
+  // Question words
+  "왜",
+  "어떻게",
+  "뭐",
+  "언제",
+  "어디",
+  "누구",
+  "무엇",
+  "어떤",
+  // Time (vague)
+  "어제",
+  "오늘",
+  "내일",
+  "최근",
+  "지금",
+  "아까",
+  "나중",
+  "전에",
+  // Request words
+  "제발",
+  "부탁",
+]);
+
+// Korean particles (조사) that stripKoreanTrailingParticle removes from the end of a token.
+// toSorted() orders them longest-first at load time, so "으로" is always tried before "로".
+const KO_TRAILING_PARTICLES = [
+  "에서",
+  "으로",
+  "에게",
+  "한테",
+  "처럼",
+  "같이",
+  "보다",
+  "까지",
+  "부터",
+  "마다",
+  "밖에",
+  "대로",
+  "은",
+  "는",
+  "이",
+  "가",
+  "을",
+  "를",
+  "의",
+  "에",
+  "로",
+  "와",
+  "과",
+  "도",
+  "만",
+].toSorted((a, b) => b.length - a.length);
+
+function stripKoreanTrailingParticle(token: string): string | null {
+  for (const particle of KO_TRAILING_PARTICLES) { // list is ordered longest-first
+    if (token.length > particle.length && token.endsWith(particle)) { // strict ">": a bare particle is never reduced to ""
+      return token.slice(0, -particle.length); // first (longest) matching particle wins
+    }
+  }
+  return null; // no listed particle ends this token
+}
+
+function isUsefulKoreanStem(stem: string): boolean {
+  // Stems containing a Hangul syllable need length >= 2, rejecting bogus cuts like "논의" -> "논".
+  if (/[\uac00-\ud7af]/.test(stem)) {
+    return stem.length >= 2;
+  }
+  // Otherwise accept only pure ASCII word stems, as in mixed tokens like "API를" -> "api".
+  return /^[a-z0-9_]+$/i.test(stem);
+}
+
 const STOP_WORDS_ZH = new Set([
  // Pronouns
  "我",
@@ -240,7 +395,7 @@ function isValidKeyword(token: string): boolean {
 }

 /**
- * Simple tokenizer that handles both English and Chinese text.
+ * Simple tokenizer that handles English, Chinese, and Korean text.
 * For Chinese, we do character-based splitting since we don't have a proper segmenter.
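 * For English, we split on whitespace and punctuation.
+ * For Korean, we keep each word and also emit its particle-stripped stem when it is a useful keyword.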
 */
@@ -252,7 +407,7 @@ function tokenize(text: string): string[] {
  const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);

  for (const segment of segments) {
-    // Check if segment contains CJK characters
+    // Check if segment contains CJK characters (Chinese)
    if (/[\u4e00-\u9fff]/.test(segment)) {
      // For Chinese, extract character n-grams (unigrams and bigrams)
      const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
@@ -262,6 +417,18 @@ function tokenize(text: string): string[] {
      for (let i = 0; i < chars.length - 1; i++) {
        tokens.push(chars[i] + chars[i + 1]);
      }
+    } else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
+      // For Korean (Hangul syllables and jamo), keep the word as-is unless it is
+      // effectively a stop word once trailing particles are removed.
+      const stem = stripKoreanTrailingParticle(segment);
+      const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
+      if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
+        tokens.push(segment);
+      }
+      // Also emit particle-stripped stems when they are useful keywords.
+      if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
+        tokens.push(stem);
+      }
    } else {
      // For non-CJK, keep as single token
      tokens.push(segment);
@@ -286,7 +453,7 @@ export function extractKeywords(query: string): string[] {

  for (const token of tokens) {
    // Skip stop words
-    if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token)) {
+    if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
      continue;
    }
    // Skip invalid keywords