From 853ae626fad127d4bddc5ca5d91ef4b582a88598 Mon Sep 17 00:00:00 2001 From: Andrew Jeon <46941315+ruypang@users.noreply.github.com> Date: Sun, 22 Feb 2026 11:33:30 +0900 Subject: [PATCH] feat: add Korean language support for memory search query expansion (#18899) * feat: add Korean stop words and tokenization for memory search * fix: address review comments on Korean query expansion * fix: lint errors - curly brace and toSorted * fix(memory): improve Korean stop words and deduplicate * Memory: tighten Korean query expansion filtering * Docs/Changelog: credit Korean memory query expansion --------- Co-authored-by: Vincent Koc --- CHANGELOG.md | 1 + src/memory/query-expansion.test.ts | 57 ++++++++++ src/memory/query-expansion.ts | 173 ++++++++++++++++++++++++++++- 3 files changed, 228 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 126ec8a6e27..8d416f94d27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai - Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path. - Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior. +- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang. - iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman. ### Breaking diff --git a/src/memory/query-expansion.test.ts b/src/memory/query-expansion.test.ts index f51eac1b6df..955e74858a6 100644 --- a/src/memory/query-expansion.test.ts +++ b/src/memory/query-expansion.test.ts @@ -38,6 +38,63 @@ describe("extractKeywords", () => { expect(keywords).toContain("bug"); }); + it("extracts keywords from Korean conversational query", () => { + const keywords = extractKeywords("어제 논의한 배포 전략"); + expect(keywords).toContain("논의한"); + expect(keywords).toContain("배포"); + expect(keywords).toContain("전략"); + // Should not include stop words + expect(keywords).not.toContain("어제"); + }); + + it("strips Korean particles to extract stems", () => { + const keywords = extractKeywords("서버에서 발생한 에러를 확인"); + expect(keywords).toContain("서버"); + expect(keywords).toContain("에러"); + expect(keywords).toContain("확인"); + }); + + it("filters Korean stop words including inflected forms", () => { + const keywords = extractKeywords("나는 그리고 그래서"); + expect(keywords).not.toContain("나"); + expect(keywords).not.toContain("나는"); + expect(keywords).not.toContain("그리고"); + expect(keywords).not.toContain("그래서"); + }); + + it("filters inflected Korean stop words not explicitly listed", () => { + const keywords = extractKeywords("그녀는 우리는"); + expect(keywords).not.toContain("그녀는"); + expect(keywords).not.toContain("우리는"); + expect(keywords).not.toContain("그녀"); + expect(keywords).not.toContain("우리"); + }); + + it("does not produce bogus single-char stems from particle stripping", () => { + const keywords = extractKeywords("논의"); + expect(keywords).toContain("논의"); + expect(keywords).not.toContain("논"); + }); + + it("strips longest Korean trailing particles first", () => { + const keywords = extractKeywords("기능으로 설명"); + expect(keywords).toContain("기능"); + expect(keywords).not.toContain("기능으"); + }); + + it("keeps stripped ASCII stems for mixed Korean tokens", () => { + const keywords = extractKeywords("API를 배포했다"); + expect(keywords).toContain("api"); + expect(keywords).toContain("배포했다"); + }); + + it("handles mixed Korean and English query", () => { + const keywords = extractKeywords("API 배포에 대한 논의"); + expect(keywords).toContain("api"); + expect(keywords).toContain("배포"); + expect(keywords).toContain("논의"); + }); + it("handles empty query", () => { expect(extractKeywords("")).toEqual([]); expect(extractKeywords(" ")).toEqual([]); diff --git a/src/memory/query-expansion.ts b/src/memory/query-expansion.ts index 123fd23ecd7..efb940e04be 100644 --- a/src/memory/query-expansion.ts +++ b/src/memory/query-expansion.ts @@ -118,6 +118,161 @@ const STOP_WORDS_EN = new Set([ "give", ]); +const STOP_WORDS_KO = new Set([ + // Particles (조사) + "은", + "는", + "이", + "가", + "을", + "를", + "의", + "에", + "에서", + "로", + "으로", + "와", + "과", + "도", + "만", + "까지", + "부터", + "한테", + "에게", + "께", + "처럼", + "같이", + "보다", + "마다", + "밖에", + "대로", + // Pronouns (대명사) + "나", + "나는", + "내가", + "나를", + "너", + "우리", + "저", + "저희", + "그", + "그녀", + "그들", + "이것", + "저것", + "그것", + "여기", + "저기", + "거기", + // Common verbs / auxiliaries (일반 동사/보조 동사) + "있다", + "없다", + "하다", + "되다", + "이다", + "아니다", + "보다", + "주다", + "오다", + "가다", + // Nouns (의존 명사 / vague) + "것", + "거", + "등", + "수", + "때", + "곳", + "중", + "분", + // Adverbs + "잘", + "더", + "또", + "매우", + "정말", + "아주", + "많이", + "너무", + "좀", + // Conjunctions + "그리고", + "하지만", + "그래서", + "그런데", + "그러나", + "또는", + "그러면", + // Question words + "왜", + "어떻게", + "뭐", + "언제", + "어디", + "누구", + "무엇", + "어떤", + // Time (vague) + "어제", + "오늘", + "내일", + "최근", + "지금", + "아까", + "나중", + "전에", + // Request words + "제발", + "부탁", +]); + +// Common Korean trailing particles to strip from words for tokenization +// Sorted by descending length so longest-match-first is guaranteed. +const KO_TRAILING_PARTICLES = [ + "에서", + "으로", + "에게", + "한테", + "처럼", + "같이", + "보다", + "까지", + "부터", + "마다", + "밖에", + "대로", + "은", + "는", + "이", + "가", + "을", + "를", + "의", + "에", + "로", + "와", + "과", + "도", + "만", +].toSorted((a, b) => b.length - a.length); + +function stripKoreanTrailingParticle(token: string): string | null { + for (const particle of KO_TRAILING_PARTICLES) { + if (token.length > particle.length && token.endsWith(particle)) { + return token.slice(0, -particle.length); + } + } + return null; +} + +function isUsefulKoreanStem(stem: string): boolean { + // Prevent bogus one-syllable stems from words like "논의" -> "논". + if (/[\uac00-\ud7af]/.test(stem)) { + return stem.length >= 2; + } + // Keep stripped ASCII stems for mixed tokens like "API를" -> "api". + return /^[a-z0-9_]+$/i.test(stem); +} + const STOP_WORDS_ZH = new Set([ // Pronouns "我", @@ -240,7 +395,7 @@ function isValidKeyword(token: string): boolean { } /** - * Simple tokenizer that handles both English and Chinese text. + * Simple tokenizer that handles English, Chinese, and Korean text. * For Chinese, we do character-based splitting since we don't have a proper segmenter. * For English, we split on whitespace and punctuation. */ @@ -252,7 +407,7 @@ function tokenize(text: string): string[] { const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean); for (const segment of segments) { - // Check if segment contains CJK characters + // Check if segment contains CJK characters (Chinese) if (/[\u4e00-\u9fff]/.test(segment)) { // For Chinese, extract character n-grams (unigrams and bigrams) const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c)); @@ -262,6 +417,18 @@ function tokenize(text: string): string[] { for (let i = 0; i < chars.length - 1; i++) { tokens.push(chars[i] + chars[i + 1]); } + } else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) { + // For Korean (Hangul syllables and jamo), keep the word as-is unless it is + // effectively a stop word once trailing particles are removed. + const stem = stripKoreanTrailingParticle(segment); + const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem); + if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) { + tokens.push(segment); + } + // Also emit particle-stripped stems when they are useful keywords. + if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) { + tokens.push(stem); + } } else { // For non-CJK, keep as single token tokens.push(segment); @@ -286,7 +453,7 @@ export function extractKeywords(query: string): string[] { for (const token of tokens) { // Skip stop words - if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token)) { + if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) { continue; } // Skip invalid keywords