diff --git a/pyproject.toml b/pyproject.toml index e1d4140..59903a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whisperlivekit" -version = "0.2.5" +version = "0.2.5.post1" description = "Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization" readme = "README.md" authors = [ @@ -52,5 +52,5 @@ whisperlivekit-server = "whisperlivekit.basic_server:main" packages = ["whisperlivekit", "whisperlivekit.diarization", "whisperlivekit.simul_whisper", "whisperlivekit.simul_whisper.whisper", "whisperlivekit.simul_whisper.whisper.assets", "whisperlivekit.simul_whisper.whisper.normalizers", "whisperlivekit.web", "whisperlivekit.whisper_streaming_custom"] [tool.setuptools.package-data] -whisperlivekit = ["web/*.html"] +whisperlivekit = ["web/*.html", "web/*.css", "web/*.js", "web/src/*.svg"] "whisperlivekit.simul_whisper.whisper.assets" = ["*.tiktoken", "*.npz"] diff --git a/whisperlivekit/basic_server.py b/whisperlivekit/basic_server.py index 9ce0a1e..b49af59 100644 --- a/whisperlivekit/basic_server.py +++ b/whisperlivekit/basic_server.py @@ -5,6 +5,9 @@ from fastapi.middleware.cors import CORSMiddleware from whisperlivekit import TranscriptionEngine, AudioProcessor, get_web_interface_html, parse_args import asyncio import logging +from starlette.staticfiles import StaticFiles +import pathlib +import whisperlivekit.web as webpkg logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logging.getLogger().setLevel(logging.WARNING) @@ -30,6 +33,8 @@ app.add_middleware( allow_methods=["*"], allow_headers=["*"], ) +web_dir = pathlib.Path(webpkg.__file__).parent +app.mount("/web", StaticFiles(directory=str(web_dir)), name="web") @app.get("/") async def get(): diff --git a/whisperlivekit/web/live_transcription.css b/whisperlivekit/web/live_transcription.css new file mode 100644 index 0000000..be5e8b6 --- /dev/null +++ b/whisperlivekit/web/live_transcription.css @@ -0,0 +1,388 @@ +:root { + --bg: #ffffff; + --text: #111111; + --muted: #666666; + --border: #e5e5e5; + --chip-bg: rgba(0, 0, 0, 0.04); + --chip-text: #000000; + --spinner-border: #8d8d8d5c; + --spinner-top: #b0b0b0; + --silence-bg: #f3f3f3; + --loading-bg: rgba(255, 77, 77, 0.06); + --button-bg: #ffffff; + --button-border: #e9e9e9; + --wave-stroke: #000000; + --label-dia-text: #868686; + --label-trans-text: #111111; +} + +@media (prefers-color-scheme: dark) { + :root:not([data-theme="light"]) { + --bg: #0b0b0b; + --text: #e6e6e6; + --muted: #9aa0a6; + --border: #333333; + --chip-bg: rgba(255, 255, 255, 0.08); + --chip-text: #e6e6e6; + --spinner-border: #555555; + --spinner-top: #dddddd; + --silence-bg: #1a1a1a; + --loading-bg: rgba(255, 77, 77, 0.12); + --button-bg: #111111; + --button-border: #333333; + --wave-stroke: #e6e6e6; + --label-dia-text: #b3b3b3; + --label-trans-text: #ffffff; + } +} + +:root[data-theme="dark"] { + --bg: #0b0b0b; + --text: #e6e6e6; + --muted: #9aa0a6; + --border: #333333; + --chip-bg: rgba(255, 255, 255, 0.08); + --chip-text: #e6e6e6; + --spinner-border: #555555; + --spinner-top: #dddddd; + --silence-bg: #1a1a1a; + --loading-bg: rgba(255, 77, 77, 0.12); + --button-bg: #111111; + --button-border: #333333; + --wave-stroke: #e6e6e6; + --label-dia-text: #b3b3b3; + --label-trans-text: #ffffff; +} + +:root[data-theme="light"] { + --bg: #ffffff; + --text: #111111; + --muted: #666666; + --border: #e5e5e5; + --chip-bg: rgba(0, 0, 0, 0.04); + --chip-text: #000000; + --spinner-border: #8d8d8d5c; + --spinner-top: #b0b0b0; + --silence-bg: #f3f3f3; + --loading-bg: rgba(255, 77, 77, 0.06); + --button-bg: #ffffff; + --button-border: #e9e9e9; + --wave-stroke: #000000; + --label-dia-text: #868686; + --label-trans-text: #111111; +} + +body { + font-family: ui-sans-serif, system-ui, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji'; + margin: 20px; + text-align: center; + background-color: var(--bg); + color: var(--text); +} + +/* Record button */ +#recordButton { + width: 50px; + height: 50px; + border: none; + border-radius: 50%; + background-color: var(--button-bg); + cursor: pointer; + transition: all 0.3s ease; + border: 1px solid var(--button-border); + display: flex; + align-items: center; + justify-content: center; + position: relative; +} + +#recordButton.recording { + width: 180px; + border-radius: 40px; + justify-content: flex-start; + padding-left: 20px; +} + +#recordButton:active { + transform: scale(0.95); +} + +.shape-container { + width: 25px; + height: 25px; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; +} + +.shape { + width: 25px; + height: 25px; + background-color: rgb(209, 61, 53); + border-radius: 50%; + transition: all 0.3s ease; +} + +#recordButton:disabled .shape { + background-color: #6e6d6d; +} + +#recordButton.recording .shape { + border-radius: 5px; + width: 25px; + height: 25px; +} + +/* Recording elements */ +.recording-info { + display: none; + align-items: center; + margin-left: 15px; + flex-grow: 1; +} + +#recordButton.recording .recording-info { + display: flex; +} + +.wave-container { + width: 60px; + height: 30px; + position: relative; + display: flex; + align-items: center; + justify-content: center; +} + +#waveCanvas { + width: 100%; + height: 100%; +} + +.timer { + font-size: 14px; + font-weight: 500; + color: var(--text); + margin-left: 10px; +} + +#status { + margin-top: 20px; + font-size: 16px; + color: var(--text); +} + +/* Settings */ +.settings-container { + display: flex; + justify-content: center; + align-items: center; + gap: 15px; + margin-top: 20px; +} + +.settings { + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 12px; +} + +.field { + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 6px; +} + +#chunkSelector, +#websocketInput, +#themeSelector { + font-size: 16px; + padding: 5px 8px; + border-radius: 8px; + border: 1px solid var(--border); + background-color: var(--button-bg); + color: var(--text); + max-height: 34px; +} + +#websocketInput { + width: 220px; +} + +#chunkSelector:focus, +#websocketInput:focus, +#themeSelector:focus { + outline: none; + border-color: #007bff; + box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.15); +} + +label { + font-size: 13px; + color: var(--muted); +} + +.ws-default { + font-size: 12px; + color: var(--muted); +} + +/* Segmented pill control for Theme */ +.segmented { + display: inline-flex; + align-items: stretch; + border: 1px solid var(--button-border); + background-color: var(--button-bg); + border-radius: 999px; + overflow: hidden; +} + +.segmented input[type="radio"] { + position: absolute; + opacity: 0; + pointer-events: none; +} + +.segmented label { + display: inline-flex; + align-items: center; + gap: 6px; + padding: 6px 12px; + font-size: 14px; + color: var(--muted); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease, color 0.2s ease; +} + +.segmented label:hover { + background-color: var(--chip-bg); +} + +.segmented img { + width: 16px; + height: 16px; +} + +.segmented input[type="radio"]:checked + label { + background-color: var(--chip-bg); + color: var(--text); +} + +.segmented input[type="radio"]:focus-visible + label, +.segmented input[type="radio"]:focus + label { + outline: 2px solid #007bff; + outline-offset: 2px; + border-radius: 999px; +} + +/* Transcript area */ +#linesTranscript { + margin: 20px auto; + max-width: 700px; + text-align: left; + font-size: 16px; +} + +#linesTranscript p { + margin: 0px 0; +} + +#linesTranscript strong { + color: var(--text); +} + +#speaker { + border: 1px solid var(--border); + border-radius: 100px; + padding: 2px 10px; + font-size: 14px; + margin-bottom: 0px; +} + +.label_diarization { + background-color: var(--chip-bg); + border-radius: 8px 8px 8px 8px; + padding: 2px 10px; + margin-left: 10px; + display: inline-block; + white-space: nowrap; + font-size: 14px; + margin-bottom: 0px; + color: var(--label-dia-text); +} + +.label_transcription { + background-color: var(--chip-bg); + border-radius: 8px 8px 8px 8px; + padding: 2px 10px; + display: inline-block; + white-space: nowrap; + margin-left: 10px; + font-size: 14px; + margin-bottom: 0px; + color: var(--label-trans-text); +} + +#timeInfo { + color: var(--muted); + margin-left: 10px; +} + +.textcontent { + font-size: 16px; + padding-left: 10px; + margin-bottom: 10px; + margin-top: 1px; + padding-top: 5px; + border-radius: 0px 0px 0px 10px; +} + +.buffer_diarization { + color: var(--label-dia-text); + margin-left: 4px; +} + +.buffer_transcription { + color: #7474748c; + margin-left: 4px; +} + +.spinner { + display: inline-block; + width: 8px; + height: 8px; + border: 2px solid var(--spinner-border); + border-top: 2px solid var(--spinner-top); + border-radius: 50%; + animation: spin 0.7s linear infinite; + vertical-align: middle; + margin-bottom: 2px; + margin-right: 5px; +} + +@keyframes spin { + to { + transform: rotate(360deg); + } +} + +.silence { + color: var(--muted); + background-color: var(--silence-bg); + font-size: 13px; + border-radius: 30px; + padding: 2px 10px; +} + +.loading { + color: var(--muted); + background-color: var(--loading-bg); + border-radius: 8px 8px 8px 0px; + padding: 2px 10px; + font-size: 14px; + margin-bottom: 0px; +} diff --git a/whisperlivekit/web/live_transcription.html b/whisperlivekit/web/live_transcription.html index b851e8e..a95e0bf 100644 --- a/whisperlivekit/web/live_transcription.html +++ b/whisperlivekit/web/live_transcription.html @@ -1,861 +1,60 @@ -
- - -No audio detected...
"; + return; + } + + const showLoading = !isFinalizing && (lines || []).some((it) => it.speaker == 0); + const showTransLag = !isFinalizing && remaining_time_transcription > 0; + const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0; + const signature = JSON.stringify({ + lines: (lines || []).map((it) => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })), + buffer_transcription: buffer_transcription || "", + buffer_diarization: buffer_diarization || "", + status: current_status, + showLoading, + showTransLag, + showDiaLag, + isFinalizing: !!isFinalizing, + }); + if (lastSignature === signature) { + const t = document.querySelector(".lag-transcription-value"); + if (t) t.textContent = fmt1(remaining_time_transcription); + const d = document.querySelector(".lag-diarization-value"); + if (d) d.textContent = fmt1(remaining_time_diarization); + const ld = document.querySelector(".loading-diarization-value"); + if (ld) ld.textContent = fmt1(remaining_time_diarization); + return; + } + lastSignature = signature; + + const linesHtml = (lines || []) + .map((item, idx) => { + let timeInfo = ""; + if (item.beg !== undefined && item.end !== undefined) { + timeInfo = ` ${item.beg} - ${item.end}`; + } + + let speakerLabel = ""; + if (item.speaker === -2) { + speakerLabel = `Silence${timeInfo}`; + } else if (item.speaker == 0 && !isFinalizing) { + speakerLabel = `${fmt1( + remaining_time_diarization + )} second(s) of audio are undergoing diarization`; + } else if (item.speaker == -1) { + speakerLabel = `Speaker 1${timeInfo}`; + } else if (item.speaker !== -1 && item.speaker !== 0) { + speakerLabel = `Speaker ${item.speaker}${timeInfo}`; + } + + let currentLineText = item.text || ""; + + if (idx === lines.length - 1) { + if (!isFinalizing && item.speaker !== -2) { + if (remaining_time_transcription > 0) { + speakerLabel += `Transcription lag ${fmt1( + remaining_time_transcription + )}s`; + } + if (buffer_diarization && remaining_time_diarization > 0) { + speakerLabel += `Diarization lag${fmt1( + remaining_time_diarization + )}s`; + } + } + + if (buffer_diarization) { + if (isFinalizing) { + currentLineText += + (currentLineText.length > 0 && buffer_diarization.trim().length > 0 ? " " : "") + buffer_diarization.trim(); + } else { + currentLineText += `${buffer_diarization}`; + } + } + if (buffer_transcription) { + if (isFinalizing) { + currentLineText += + (currentLineText.length > 0 && buffer_transcription.trim().length > 0 ? " " : "") + + buffer_transcription.trim(); + } else { + currentLineText += `${buffer_transcription}`; + } + } + } + + return currentLineText.trim().length > 0 || speakerLabel.length > 0 + ? `${speakerLabel}
${speakerLabel}