mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-03-07 22:33:36 +00:00
60 lines
2.2 KiB
Python
60 lines
2.2 KiB
Python
from typing import Sequence, Callable, Any, Optional, Dict
|
|
|
|
def _detect_tail_repetition(
|
|
seq: Sequence[Any],
|
|
key: Callable[[Any], Any] = lambda x: x, # extract comparable value
|
|
min_block: int = 1, # set to 2 to ignore 1-token loops like "."
|
|
max_tail: int = 300, # search window from the end for speed
|
|
prefer: str = "longest", # "longest" coverage or "smallest" block
|
|
) -> Optional[Dict]:
|
|
vals = [key(x) for x in seq][-max_tail:]
|
|
n = len(vals)
|
|
best = None
|
|
|
|
# try every possible block length
|
|
for b in range(min_block, n // 2 + 1):
|
|
block = vals[-b:]
|
|
# count how many times this block repeats contiguously at the very end
|
|
count, i = 0, n
|
|
while i - b >= 0 and vals[i - b:i] == block:
|
|
count += 1
|
|
i -= b
|
|
|
|
if count >= 2:
|
|
cand = {
|
|
"block_size": b,
|
|
"count": count,
|
|
"start_index": len(seq) - count * b, # in original seq
|
|
"end_index": len(seq),
|
|
}
|
|
if (best is None or
|
|
(prefer == "longest" and count * b > best["count"] * best["block_size"]) or
|
|
(prefer == "smallest" and b < best["block_size"])):
|
|
best = cand
|
|
return best
|
|
|
|
def trim_tail_repetition(
|
|
seq: Sequence[Any],
|
|
key: Callable[[Any], Any] = lambda x: x,
|
|
min_block: int = 1,
|
|
max_tail: int = 300,
|
|
prefer: str = "longest",
|
|
keep: int = 1, # how many copies of the repeating block to keep at the end (0 or 1 are common)
|
|
):
|
|
"""
|
|
Returns a new sequence with repeated tail trimmed.
|
|
keep=1 -> keep a single copy of the repeated block.
|
|
keep=0 -> remove all copies of the repeated block.
|
|
"""
|
|
rep = _detect_tail_repetition(seq, key, min_block, max_tail, prefer)
|
|
if not rep:
|
|
return seq, False # nothing to trim
|
|
|
|
b, c = rep["block_size"], rep["count"]
|
|
if keep < 0:
|
|
keep = 0
|
|
if keep >= c:
|
|
return seq, False # nothing to trim (already <= keep copies)
|
|
# new length = total - (copies_to_remove * block_size)
|
|
new_len = len(seq) - (c - keep) * b
|
|
return seq[:new_len], True |