diff --git a/setup.py b/setup.py
index b5f9767..aa4aad7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
setup(
name="whisperlivekit",
- version="0.2.2",
+ version="0.2.4.dev0",
description="Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
diff --git a/whisperlivekit/simul_whisper/whisper/__init__.py b/whisperlivekit/simul_whisper/whisper/__init__.py
index f06dd6c..e210718 100644
--- a/whisperlivekit/simul_whisper/whisper/__init__.py
+++ b/whisperlivekit/simul_whisper/whisper/__init__.py
@@ -4,7 +4,6 @@ import os
import urllib
import warnings
from typing import List, Optional, Union
-import logging
import torch
from tqdm import tqdm
@@ -15,8 +14,6 @@ from .model import ModelDimensions, Whisper
from .transcribe import transcribe
from .version import __version__
-logger = logging.getLogger(__name__)
-
_MODELS = {
"tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
"tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
@@ -74,7 +71,6 @@ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
)
with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
- logger.info(f'Downloading model weights to {download_target}')
with tqdm(
total=int(source.info().get("Content-Length")),
ncols=80,
diff --git a/whisperlivekit/simul_whisper/whisper/decoding.py b/whisperlivekit/simul_whisper/whisper/decoding.py
index 15338a1..49485d0 100644
--- a/whisperlivekit/simul_whisper/whisper/decoding.py
+++ b/whisperlivekit/simul_whisper/whisper/decoding.py
@@ -32,7 +32,9 @@ def detect_language(
list of dictionaries containing the probability distribution over all languages.
"""
if tokenizer is None:
- tokenizer = get_tokenizer(model.is_multilingual)
+ tokenizer = get_tokenizer(
+ model.is_multilingual, num_languages=model.num_languages
+ )
if (
tokenizer.language is None
or tokenizer.language_token not in tokenizer.sot_sequence
@@ -111,9 +113,6 @@ class DecodingOptions:
# implementation details
fp16: bool = True # use fp16 for most of the calculation
- # streaming
- add_sot: Optional[bool] = True
-
@dataclass(frozen=True)
class DecodingResult:
@@ -513,19 +512,17 @@ class DecodingTask:
logit_filters: List[LogitFilter]
def __init__(self, model: "Whisper", options: DecodingOptions):
- self.options: DecodingOptions = self._verify_options(options)
- if self.options.fp16:
- self.model = model.half()
- else:
- self.model = model
+ self.model = model
language = options.language or "en"
tokenizer = get_tokenizer(
- model.is_multilingual, language=language, task=options.task
+ model.is_multilingual,
+ num_languages=model.num_languages,
+ language=language,
+ task=options.task,
)
self.tokenizer: Tokenizer = tokenizer
-
- # print(self.options)
+ self.options: DecodingOptions = self._verify_options(options)
self.n_group: int = options.beam_size or options.best_of or 1
self.n_ctx: int = model.dims.n_text_ctx
@@ -589,7 +586,7 @@ class DecodingTask:
def _get_initial_tokens(self) -> Tuple[int]:
tokens = list(self.sot_sequence)
- # print("prefix", prefix)
+
if prefix := self.options.prefix:
prefix_tokens = (
self.tokenizer.encode(" " + prefix.strip())
@@ -607,15 +604,12 @@ class DecodingTask:
if isinstance(prompt, str)
else prompt
)
- # if self.options.add_sot:
tokens = (
[self.tokenizer.sot_prev]
+ prompt_tokens[-(self.n_ctx // 2 - 1) :]
+ tokens
)
- #else:
- # tokens = ([self.tokenizer.sot_prev] + tokens + prompt_tokens[-(self.n_ctx // 2 - 1) :])
- # print("return", tokens)
+
return tuple(tokens)
def _get_suppress_tokens(self) -> Tuple[int]:
@@ -663,7 +657,7 @@ class DecodingTask:
if audio_features.dtype != (
torch.float16 if self.options.fp16 else torch.float32
):
- raise TypeError(
+ raise TypeError(
f"audio_features has an incorrect dtype: {audio_features.dtype}"
)
@@ -689,10 +683,9 @@ class DecodingTask:
no_speech_probs = [np.nan] * n_batch
try:
- for i in range(self.sample_len): # 最多循环448次
- # print("in decode main loop", i , tokens[0].tolist())
+ for i in range(self.sample_len):
logits = self.inference.logits(tokens, audio_features)
- # print(logits)
+
if (
i == 0 and self.tokenizer.no_speech is not None
): # save no_speech_probs
@@ -724,7 +717,7 @@ class DecodingTask:
audio_features: Tensor = self._get_audio_features(mel) # encoder forward pass
tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)
- # print("initial_tokens", self.initial_tokens)
+
# detect language if requested, overwriting the language token
languages, language_probs = self._detect_language(audio_features, tokens)
if self.options.task == "lang_id":
diff --git a/whisperlivekit/simul_whisper/whisper/normalizers/basic.py b/whisperlivekit/simul_whisper/whisper/normalizers/basic.py
index a824032..8690ae7 100644
--- a/whisperlivekit/simul_whisper/whisper/normalizers/basic.py
+++ b/whisperlivekit/simul_whisper/whisper/normalizers/basic.py
@@ -30,15 +30,19 @@ def remove_symbols_and_diacritics(s: str, keep=""):
and drop any diacritics (category 'Mn' and some manual mappings)
"""
return "".join(
- c
- if c in keep
- else ADDITIONAL_DIACRITICS[c]
- if c in ADDITIONAL_DIACRITICS
- else ""
- if unicodedata.category(c) == "Mn"
- else " "
- if unicodedata.category(c)[0] in "MSP"
- else c
+ (
+ c
+ if c in keep
+ else (
+ ADDITIONAL_DIACRITICS[c]
+ if c in ADDITIONAL_DIACRITICS
+ else (
+ ""
+ if unicodedata.category(c) == "Mn"
+ else " " if unicodedata.category(c)[0] in "MSP" else c
+ )
+ )
+ )
for c in unicodedata.normalize("NFKD", s)
)
diff --git a/whisperlivekit/simul_whisper/whisper/normalizers/english.json b/whisperlivekit/simul_whisper/whisper/normalizers/english.json
new file mode 100644
index 0000000..74a1c35
--- /dev/null
+++ b/whisperlivekit/simul_whisper/whisper/normalizers/english.json
@@ -0,0 +1,1741 @@
+{
+ "accessorise": "accessorize",
+ "accessorised": "accessorized",
+ "accessorises": "accessorizes",
+ "accessorising": "accessorizing",
+ "acclimatisation": "acclimatization",
+ "acclimatise": "acclimatize",
+ "acclimatised": "acclimatized",
+ "acclimatises": "acclimatizes",
+ "acclimatising": "acclimatizing",
+ "accoutrements": "accouterments",
+ "aeon": "eon",
+ "aeons": "eons",
+ "aerogramme": "aerogram",
+ "aerogrammes": "aerograms",
+ "aeroplane": "airplane",
+ "aeroplanes": "airplanes",
+ "aesthete": "esthete",
+ "aesthetes": "esthetes",
+ "aesthetic": "esthetic",
+ "aesthetically": "esthetically",
+ "aesthetics": "esthetics",
+ "aetiology": "etiology",
+ "ageing": "aging",
+ "aggrandisement": "aggrandizement",
+ "agonise": "agonize",
+ "agonised": "agonized",
+ "agonises": "agonizes",
+ "agonising": "agonizing",
+ "agonisingly": "agonizingly",
+ "almanack": "almanac",
+ "almanacks": "almanacs",
+ "aluminium": "aluminum",
+ "amortisable": "amortizable",
+ "amortisation": "amortization",
+ "amortisations": "amortizations",
+ "amortise": "amortize",
+ "amortised": "amortized",
+ "amortises": "amortizes",
+ "amortising": "amortizing",
+ "amphitheatre": "amphitheater",
+ "amphitheatres": "amphitheaters",
+ "anaemia": "anemia",
+ "anaemic": "anemic",
+ "anaesthesia": "anesthesia",
+ "anaesthetic": "anesthetic",
+ "anaesthetics": "anesthetics",
+ "anaesthetise": "anesthetize",
+ "anaesthetised": "anesthetized",
+ "anaesthetises": "anesthetizes",
+ "anaesthetising": "anesthetizing",
+ "anaesthetist": "anesthetist",
+ "anaesthetists": "anesthetists",
+ "anaesthetize": "anesthetize",
+ "anaesthetized": "anesthetized",
+ "anaesthetizes": "anesthetizes",
+ "anaesthetizing": "anesthetizing",
+ "analogue": "analog",
+ "analogues": "analogs",
+ "analyse": "analyze",
+ "analysed": "analyzed",
+ "analyses": "analyzes",
+ "analysing": "analyzing",
+ "anglicise": "anglicize",
+ "anglicised": "anglicized",
+ "anglicises": "anglicizes",
+ "anglicising": "anglicizing",
+ "annualised": "annualized",
+ "antagonise": "antagonize",
+ "antagonised": "antagonized",
+ "antagonises": "antagonizes",
+ "antagonising": "antagonizing",
+ "apologise": "apologize",
+ "apologised": "apologized",
+ "apologises": "apologizes",
+ "apologising": "apologizing",
+ "appal": "appall",
+ "appals": "appalls",
+ "appetiser": "appetizer",
+ "appetisers": "appetizers",
+ "appetising": "appetizing",
+ "appetisingly": "appetizingly",
+ "arbour": "arbor",
+ "arbours": "arbors",
+ "archaeological": "archeological",
+ "archaeologically": "archeologically",
+ "archaeologist": "archeologist",
+ "archaeologists": "archeologists",
+ "archaeology": "archeology",
+ "ardour": "ardor",
+ "armour": "armor",
+ "armoured": "armored",
+ "armourer": "armorer",
+ "armourers": "armorers",
+ "armouries": "armories",
+ "armoury": "armory",
+ "artefact": "artifact",
+ "artefacts": "artifacts",
+ "authorise": "authorize",
+ "authorised": "authorized",
+ "authorises": "authorizes",
+ "authorising": "authorizing",
+ "axe": "ax",
+ "backpedalled": "backpedaled",
+ "backpedalling": "backpedaling",
+ "bannister": "banister",
+ "bannisters": "banisters",
+ "baptise": "baptize",
+ "baptised": "baptized",
+ "baptises": "baptizes",
+ "baptising": "baptizing",
+ "bastardise": "bastardize",
+ "bastardised": "bastardized",
+ "bastardises": "bastardizes",
+ "bastardising": "bastardizing",
+ "battleaxe": "battleax",
+ "baulk": "balk",
+ "baulked": "balked",
+ "baulking": "balking",
+ "baulks": "balks",
+ "bedevilled": "bedeviled",
+ "bedevilling": "bedeviling",
+ "behaviour": "behavior",
+ "behavioural": "behavioral",
+ "behaviourism": "behaviorism",
+ "behaviourist": "behaviorist",
+ "behaviourists": "behaviorists",
+ "behaviours": "behaviors",
+ "behove": "behoove",
+ "behoved": "behooved",
+ "behoves": "behooves",
+ "bejewelled": "bejeweled",
+ "belabour": "belabor",
+ "belaboured": "belabored",
+ "belabouring": "belaboring",
+ "belabours": "belabors",
+ "bevelled": "beveled",
+ "bevvies": "bevies",
+ "bevvy": "bevy",
+ "biassed": "biased",
+ "biassing": "biasing",
+ "bingeing": "binging",
+ "bougainvillaea": "bougainvillea",
+ "bougainvillaeas": "bougainvilleas",
+ "bowdlerise": "bowdlerize",
+ "bowdlerised": "bowdlerized",
+ "bowdlerises": "bowdlerizes",
+ "bowdlerising": "bowdlerizing",
+ "breathalyse": "breathalyze",
+ "breathalysed": "breathalyzed",
+ "breathalyser": "breathalyzer",
+ "breathalysers": "breathalyzers",
+ "breathalyses": "breathalyzes",
+ "breathalysing": "breathalyzing",
+ "brutalise": "brutalize",
+ "brutalised": "brutalized",
+ "brutalises": "brutalizes",
+ "brutalising": "brutalizing",
+ "busses": "buses",
+ "bussing": "busing",
+ "caesarean": "cesarean",
+ "caesareans": "cesareans",
+ "calibre": "caliber",
+ "calibres": "calibers",
+ "calliper": "caliper",
+ "callipers": "calipers",
+ "callisthenics": "calisthenics",
+ "canalise": "canalize",
+ "canalised": "canalized",
+ "canalises": "canalizes",
+ "canalising": "canalizing",
+ "cancelation": "cancellation",
+ "cancelations": "cancellations",
+ "cancelled": "canceled",
+ "cancelling": "canceling",
+ "candour": "candor",
+ "cannibalise": "cannibalize",
+ "cannibalised": "cannibalized",
+ "cannibalises": "cannibalizes",
+ "cannibalising": "cannibalizing",
+ "canonise": "canonize",
+ "canonised": "canonized",
+ "canonises": "canonizes",
+ "canonising": "canonizing",
+ "capitalise": "capitalize",
+ "capitalised": "capitalized",
+ "capitalises": "capitalizes",
+ "capitalising": "capitalizing",
+ "caramelise": "caramelize",
+ "caramelised": "caramelized",
+ "caramelises": "caramelizes",
+ "caramelising": "caramelizing",
+ "carbonise": "carbonize",
+ "carbonised": "carbonized",
+ "carbonises": "carbonizes",
+ "carbonising": "carbonizing",
+ "carolled": "caroled",
+ "carolling": "caroling",
+ "catalogue": "catalog",
+ "catalogued": "cataloged",
+ "catalogues": "catalogs",
+ "cataloguing": "cataloging",
+ "catalyse": "catalyze",
+ "catalysed": "catalyzed",
+ "catalyses": "catalyzes",
+ "catalysing": "catalyzing",
+ "categorise": "categorize",
+ "categorised": "categorized",
+ "categorises": "categorizes",
+ "categorising": "categorizing",
+ "cauterise": "cauterize",
+ "cauterised": "cauterized",
+ "cauterises": "cauterizes",
+ "cauterising": "cauterizing",
+ "cavilled": "caviled",
+ "cavilling": "caviling",
+ "centigramme": "centigram",
+ "centigrammes": "centigrams",
+ "centilitre": "centiliter",
+ "centilitres": "centiliters",
+ "centimetre": "centimeter",
+ "centimetres": "centimeters",
+ "centralise": "centralize",
+ "centralised": "centralized",
+ "centralises": "centralizes",
+ "centralising": "centralizing",
+ "centre": "center",
+ "centred": "centered",
+ "centrefold": "centerfold",
+ "centrefolds": "centerfolds",
+ "centrepiece": "centerpiece",
+ "centrepieces": "centerpieces",
+ "centres": "centers",
+ "channelled": "channeled",
+ "channelling": "channeling",
+ "characterise": "characterize",
+ "characterised": "characterized",
+ "characterises": "characterizes",
+ "characterising": "characterizing",
+ "cheque": "check",
+ "chequebook": "checkbook",
+ "chequebooks": "checkbooks",
+ "chequered": "checkered",
+ "cheques": "checks",
+ "chilli": "chili",
+ "chimaera": "chimera",
+ "chimaeras": "chimeras",
+ "chiselled": "chiseled",
+ "chiselling": "chiseling",
+ "circularise": "circularize",
+ "circularised": "circularized",
+ "circularises": "circularizes",
+ "circularising": "circularizing",
+ "civilise": "civilize",
+ "civilised": "civilized",
+ "civilises": "civilizes",
+ "civilising": "civilizing",
+ "clamour": "clamor",
+ "clamoured": "clamored",
+ "clamouring": "clamoring",
+ "clamours": "clamors",
+ "clangour": "clangor",
+ "clarinettist": "clarinetist",
+ "clarinettists": "clarinetists",
+ "collectivise": "collectivize",
+ "collectivised": "collectivized",
+ "collectivises": "collectivizes",
+ "collectivising": "collectivizing",
+ "colonisation": "colonization",
+ "colonise": "colonize",
+ "colonised": "colonized",
+ "coloniser": "colonizer",
+ "colonisers": "colonizers",
+ "colonises": "colonizes",
+ "colonising": "colonizing",
+ "colour": "color",
+ "colourant": "colorant",
+ "colourants": "colorants",
+ "coloured": "colored",
+ "coloureds": "coloreds",
+ "colourful": "colorful",
+ "colourfully": "colorfully",
+ "colouring": "coloring",
+ "colourize": "colorize",
+ "colourized": "colorized",
+ "colourizes": "colorizes",
+ "colourizing": "colorizing",
+ "colourless": "colorless",
+ "colours": "colors",
+ "commercialise": "commercialize",
+ "commercialised": "commercialized",
+ "commercialises": "commercializes",
+ "commercialising": "commercializing",
+ "compartmentalise": "compartmentalize",
+ "compartmentalised": "compartmentalized",
+ "compartmentalises": "compartmentalizes",
+ "compartmentalising": "compartmentalizing",
+ "computerise": "computerize",
+ "computerised": "computerized",
+ "computerises": "computerizes",
+ "computerising": "computerizing",
+ "conceptualise": "conceptualize",
+ "conceptualised": "conceptualized",
+ "conceptualises": "conceptualizes",
+ "conceptualising": "conceptualizing",
+ "connexion": "connection",
+ "connexions": "connections",
+ "contextualise": "contextualize",
+ "contextualised": "contextualized",
+ "contextualises": "contextualizes",
+ "contextualising": "contextualizing",
+ "cosier": "cozier",
+ "cosies": "cozies",
+ "cosiest": "coziest",
+ "cosily": "cozily",
+ "cosiness": "coziness",
+ "cosy": "cozy",
+ "councillor": "councilor",
+ "councillors": "councilors",
+ "counselled": "counseled",
+ "counselling": "counseling",
+ "counsellor": "counselor",
+ "counsellors": "counselors",
+ "crenellated": "crenelated",
+ "criminalise": "criminalize",
+ "criminalised": "criminalized",
+ "criminalises": "criminalizes",
+ "criminalising": "criminalizing",
+ "criticise": "criticize",
+ "criticised": "criticized",
+ "criticises": "criticizes",
+ "criticising": "criticizing",
+ "crueller": "crueler",
+ "cruellest": "cruelest",
+ "crystallisation": "crystallization",
+ "crystallise": "crystallize",
+ "crystallised": "crystallized",
+ "crystallises": "crystallizes",
+ "crystallising": "crystallizing",
+ "cudgelled": "cudgeled",
+ "cudgelling": "cudgeling",
+ "customise": "customize",
+ "customised": "customized",
+ "customises": "customizes",
+ "customising": "customizing",
+ "cypher": "cipher",
+ "cyphers": "ciphers",
+ "decentralisation": "decentralization",
+ "decentralise": "decentralize",
+ "decentralised": "decentralized",
+ "decentralises": "decentralizes",
+ "decentralising": "decentralizing",
+ "decriminalisation": "decriminalization",
+ "decriminalise": "decriminalize",
+ "decriminalised": "decriminalized",
+ "decriminalises": "decriminalizes",
+ "decriminalising": "decriminalizing",
+ "defence": "defense",
+ "defenceless": "defenseless",
+ "defences": "defenses",
+ "dehumanisation": "dehumanization",
+ "dehumanise": "dehumanize",
+ "dehumanised": "dehumanized",
+ "dehumanises": "dehumanizes",
+ "dehumanising": "dehumanizing",
+ "demeanour": "demeanor",
+ "demilitarisation": "demilitarization",
+ "demilitarise": "demilitarize",
+ "demilitarised": "demilitarized",
+ "demilitarises": "demilitarizes",
+ "demilitarising": "demilitarizing",
+ "demobilisation": "demobilization",
+ "demobilise": "demobilize",
+ "demobilised": "demobilized",
+ "demobilises": "demobilizes",
+ "demobilising": "demobilizing",
+ "democratisation": "democratization",
+ "democratise": "democratize",
+ "democratised": "democratized",
+ "democratises": "democratizes",
+ "democratising": "democratizing",
+ "demonise": "demonize",
+ "demonised": "demonized",
+ "demonises": "demonizes",
+ "demonising": "demonizing",
+ "demoralisation": "demoralization",
+ "demoralise": "demoralize",
+ "demoralised": "demoralized",
+ "demoralises": "demoralizes",
+ "demoralising": "demoralizing",
+ "denationalisation": "denationalization",
+ "denationalise": "denationalize",
+ "denationalised": "denationalized",
+ "denationalises": "denationalizes",
+ "denationalising": "denationalizing",
+ "deodorise": "deodorize",
+ "deodorised": "deodorized",
+ "deodorises": "deodorizes",
+ "deodorising": "deodorizing",
+ "depersonalise": "depersonalize",
+ "depersonalised": "depersonalized",
+ "depersonalises": "depersonalizes",
+ "depersonalising": "depersonalizing",
+ "deputise": "deputize",
+ "deputised": "deputized",
+ "deputises": "deputizes",
+ "deputising": "deputizing",
+ "desensitisation": "desensitization",
+ "desensitise": "desensitize",
+ "desensitised": "desensitized",
+ "desensitises": "desensitizes",
+ "desensitising": "desensitizing",
+ "destabilisation": "destabilization",
+ "destabilise": "destabilize",
+ "destabilised": "destabilized",
+ "destabilises": "destabilizes",
+ "destabilising": "destabilizing",
+ "dialled": "dialed",
+ "dialling": "dialing",
+ "dialogue": "dialog",
+ "dialogues": "dialogs",
+ "diarrhoea": "diarrhea",
+ "digitise": "digitize",
+ "digitised": "digitized",
+ "digitises": "digitizes",
+ "digitising": "digitizing",
+ "disc": "disk",
+ "discolour": "discolor",
+ "discoloured": "discolored",
+ "discolouring": "discoloring",
+ "discolours": "discolors",
+ "discs": "disks",
+ "disembowelled": "disemboweled",
+ "disembowelling": "disemboweling",
+ "disfavour": "disfavor",
+ "dishevelled": "disheveled",
+ "dishonour": "dishonor",
+ "dishonourable": "dishonorable",
+ "dishonourably": "dishonorably",
+ "dishonoured": "dishonored",
+ "dishonouring": "dishonoring",
+ "dishonours": "dishonors",
+ "disorganisation": "disorganization",
+ "disorganised": "disorganized",
+ "distil": "distill",
+ "distils": "distills",
+ "dramatisation": "dramatization",
+ "dramatisations": "dramatizations",
+ "dramatise": "dramatize",
+ "dramatised": "dramatized",
+ "dramatises": "dramatizes",
+ "dramatising": "dramatizing",
+ "draught": "draft",
+ "draughtboard": "draftboard",
+ "draughtboards": "draftboards",
+ "draughtier": "draftier",
+ "draughtiest": "draftiest",
+ "draughts": "drafts",
+ "draughtsman": "draftsman",
+ "draughtsmanship": "draftsmanship",
+ "draughtsmen": "draftsmen",
+ "draughtswoman": "draftswoman",
+ "draughtswomen": "draftswomen",
+ "draughty": "drafty",
+ "drivelled": "driveled",
+ "drivelling": "driveling",
+ "duelled": "dueled",
+ "duelling": "dueling",
+ "economise": "economize",
+ "economised": "economized",
+ "economises": "economizes",
+ "economising": "economizing",
+ "edoema": "edema",
+ "editorialise": "editorialize",
+ "editorialised": "editorialized",
+ "editorialises": "editorializes",
+ "editorialising": "editorializing",
+ "empathise": "empathize",
+ "empathised": "empathized",
+ "empathises": "empathizes",
+ "empathising": "empathizing",
+ "emphasise": "emphasize",
+ "emphasised": "emphasized",
+ "emphasises": "emphasizes",
+ "emphasising": "emphasizing",
+ "enamelled": "enameled",
+ "enamelling": "enameling",
+ "enamoured": "enamored",
+ "encyclopaedia": "encyclopedia",
+ "encyclopaedias": "encyclopedias",
+ "encyclopaedic": "encyclopedic",
+ "endeavour": "endeavor",
+ "endeavoured": "endeavored",
+ "endeavouring": "endeavoring",
+ "endeavours": "endeavors",
+ "energise": "energize",
+ "energised": "energized",
+ "energises": "energizes",
+ "energising": "energizing",
+ "enrol": "enroll",
+ "enrols": "enrolls",
+ "enthral": "enthrall",
+ "enthrals": "enthralls",
+ "epaulette": "epaulet",
+ "epaulettes": "epaulets",
+ "epicentre": "epicenter",
+ "epicentres": "epicenters",
+ "epilogue": "epilog",
+ "epilogues": "epilogs",
+ "epitomise": "epitomize",
+ "epitomised": "epitomized",
+ "epitomises": "epitomizes",
+ "epitomising": "epitomizing",
+ "equalisation": "equalization",
+ "equalise": "equalize",
+ "equalised": "equalized",
+ "equaliser": "equalizer",
+ "equalisers": "equalizers",
+ "equalises": "equalizes",
+ "equalising": "equalizing",
+ "eulogise": "eulogize",
+ "eulogised": "eulogized",
+ "eulogises": "eulogizes",
+ "eulogising": "eulogizing",
+ "evangelise": "evangelize",
+ "evangelised": "evangelized",
+ "evangelises": "evangelizes",
+ "evangelising": "evangelizing",
+ "exorcise": "exorcize",
+ "exorcised": "exorcized",
+ "exorcises": "exorcizes",
+ "exorcising": "exorcizing",
+ "extemporisation": "extemporization",
+ "extemporise": "extemporize",
+ "extemporised": "extemporized",
+ "extemporises": "extemporizes",
+ "extemporising": "extemporizing",
+ "externalisation": "externalization",
+ "externalisations": "externalizations",
+ "externalise": "externalize",
+ "externalised": "externalized",
+ "externalises": "externalizes",
+ "externalising": "externalizing",
+ "factorise": "factorize",
+ "factorised": "factorized",
+ "factorises": "factorizes",
+ "factorising": "factorizing",
+ "faecal": "fecal",
+ "faeces": "feces",
+ "familiarisation": "familiarization",
+ "familiarise": "familiarize",
+ "familiarised": "familiarized",
+ "familiarises": "familiarizes",
+ "familiarising": "familiarizing",
+ "fantasise": "fantasize",
+ "fantasised": "fantasized",
+ "fantasises": "fantasizes",
+ "fantasising": "fantasizing",
+ "favour": "favor",
+ "favourable": "favorable",
+ "favourably": "favorably",
+ "favoured": "favored",
+ "favouring": "favoring",
+ "favourite": "favorite",
+ "favourites": "favorites",
+ "favouritism": "favoritism",
+ "favours": "favors",
+ "feminise": "feminize",
+ "feminised": "feminized",
+ "feminises": "feminizes",
+ "feminising": "feminizing",
+ "fertilisation": "fertilization",
+ "fertilise": "fertilize",
+ "fertilised": "fertilized",
+ "fertiliser": "fertilizer",
+ "fertilisers": "fertilizers",
+ "fertilises": "fertilizes",
+ "fertilising": "fertilizing",
+ "fervour": "fervor",
+ "fibre": "fiber",
+ "fibreglass": "fiberglass",
+ "fibres": "fibers",
+ "fictionalisation": "fictionalization",
+ "fictionalisations": "fictionalizations",
+ "fictionalise": "fictionalize",
+ "fictionalised": "fictionalized",
+ "fictionalises": "fictionalizes",
+ "fictionalising": "fictionalizing",
+ "fillet": "filet",
+ "filleted": "fileted",
+ "filleting": "fileting",
+ "fillets": "filets",
+ "finalisation": "finalization",
+ "finalise": "finalize",
+ "finalised": "finalized",
+ "finalises": "finalizes",
+ "finalising": "finalizing",
+ "flautist": "flutist",
+ "flautists": "flutists",
+ "flavour": "flavor",
+ "flavoured": "flavored",
+ "flavouring": "flavoring",
+ "flavourings": "flavorings",
+ "flavourless": "flavorless",
+ "flavours": "flavors",
+ "flavoursome": "flavorsome",
+ "flyer / flier": "flier / flyer",
+ "foetal": "fetal",
+ "foetid": "fetid",
+ "foetus": "fetus",
+ "foetuses": "fetuses",
+ "formalisation": "formalization",
+ "formalise": "formalize",
+ "formalised": "formalized",
+ "formalises": "formalizes",
+ "formalising": "formalizing",
+ "fossilisation": "fossilization",
+ "fossilise": "fossilize",
+ "fossilised": "fossilized",
+ "fossilises": "fossilizes",
+ "fossilising": "fossilizing",
+ "fraternisation": "fraternization",
+ "fraternise": "fraternize",
+ "fraternised": "fraternized",
+ "fraternises": "fraternizes",
+ "fraternising": "fraternizing",
+ "fulfil": "fulfill",
+ "fulfilment": "fulfillment",
+ "fulfils": "fulfills",
+ "funnelled": "funneled",
+ "funnelling": "funneling",
+ "galvanise": "galvanize",
+ "galvanised": "galvanized",
+ "galvanises": "galvanizes",
+ "galvanising": "galvanizing",
+ "gambolled": "gamboled",
+ "gambolling": "gamboling",
+ "gaol": "jail",
+ "gaolbird": "jailbird",
+ "gaolbirds": "jailbirds",
+ "gaolbreak": "jailbreak",
+ "gaolbreaks": "jailbreaks",
+ "gaoled": "jailed",
+ "gaoler": "jailer",
+ "gaolers": "jailers",
+ "gaoling": "jailing",
+ "gaols": "jails",
+ "gasses": "gases",
+ "gage": "gauge",
+ "gaged": "gauged",
+ "gages": "gauges",
+ "gaging": "gauging",
+ "generalisation": "generalization",
+ "generalisations": "generalizations",
+ "generalise": "generalize",
+ "generalised": "generalized",
+ "generalises": "generalizes",
+ "generalising": "generalizing",
+ "ghettoise": "ghettoize",
+ "ghettoised": "ghettoized",
+ "ghettoises": "ghettoizes",
+ "ghettoising": "ghettoizing",
+ "gipsies": "gypsies",
+ "glamorise": "glamorize",
+ "glamorised": "glamorized",
+ "glamorises": "glamorizes",
+ "glamorising": "glamorizing",
+ "glamour": "glamor",
+ "globalisation": "globalization",
+ "globalise": "globalize",
+ "globalised": "globalized",
+ "globalises": "globalizes",
+ "globalising": "globalizing",
+ "glueing": "gluing",
+ "goitre": "goiter",
+ "goitres": "goiters",
+ "gonorrhoea": "gonorrhea",
+ "gramme": "gram",
+ "grammes": "grams",
+ "gravelled": "graveled",
+ "grey": "gray",
+ "greyed": "grayed",
+ "greying": "graying",
+ "greyish": "grayish",
+ "greyness": "grayness",
+ "greys": "grays",
+ "grovelled": "groveled",
+ "grovelling": "groveling",
+ "groyne": "groin",
+ "groynes": "groins",
+ "gruelling": "grueling",
+ "gruellingly": "gruelingly",
+ "gryphon": "griffin",
+ "gryphons": "griffins",
+ "gynaecological": "gynecological",
+ "gynaecologist": "gynecologist",
+ "gynaecologists": "gynecologists",
+ "gynaecology": "gynecology",
+ "haematological": "hematological",
+ "haematologist": "hematologist",
+ "haematologists": "hematologists",
+ "haematology": "hematology",
+ "haemoglobin": "hemoglobin",
+ "haemophilia": "hemophilia",
+ "haemophiliac": "hemophiliac",
+ "haemophiliacs": "hemophiliacs",
+ "haemorrhage": "hemorrhage",
+ "haemorrhaged": "hemorrhaged",
+ "haemorrhages": "hemorrhages",
+ "haemorrhaging": "hemorrhaging",
+ "haemorrhoids": "hemorrhoids",
+ "harbour": "harbor",
+ "harboured": "harbored",
+ "harbouring": "harboring",
+ "harbours": "harbors",
+ "harmonisation": "harmonization",
+ "harmonise": "harmonize",
+ "harmonised": "harmonized",
+ "harmonises": "harmonizes",
+ "harmonising": "harmonizing",
+ "homoeopath": "homeopath",
+ "homoeopathic": "homeopathic",
+ "homoeopaths": "homeopaths",
+ "homoeopathy": "homeopathy",
+ "homogenise": "homogenize",
+ "homogenised": "homogenized",
+ "homogenises": "homogenizes",
+ "homogenising": "homogenizing",
+ "honour": "honor",
+ "honourable": "honorable",
+ "honourably": "honorably",
+ "honoured": "honored",
+ "honouring": "honoring",
+ "honours": "honors",
+ "hospitalisation": "hospitalization",
+ "hospitalise": "hospitalize",
+ "hospitalised": "hospitalized",
+ "hospitalises": "hospitalizes",
+ "hospitalising": "hospitalizing",
+ "humanise": "humanize",
+ "humanised": "humanized",
+ "humanises": "humanizes",
+ "humanising": "humanizing",
+ "humour": "humor",
+ "humoured": "humored",
+ "humouring": "humoring",
+ "humourless": "humorless",
+ "humours": "humors",
+ "hybridise": "hybridize",
+ "hybridised": "hybridized",
+ "hybridises": "hybridizes",
+ "hybridising": "hybridizing",
+ "hypnotise": "hypnotize",
+ "hypnotised": "hypnotized",
+ "hypnotises": "hypnotizes",
+ "hypnotising": "hypnotizing",
+ "hypothesise": "hypothesize",
+ "hypothesised": "hypothesized",
+ "hypothesises": "hypothesizes",
+ "hypothesising": "hypothesizing",
+ "idealisation": "idealization",
+ "idealise": "idealize",
+ "idealised": "idealized",
+ "idealises": "idealizes",
+ "idealising": "idealizing",
+ "idolise": "idolize",
+ "idolised": "idolized",
+ "idolises": "idolizes",
+ "idolising": "idolizing",
+ "immobilisation": "immobilization",
+ "immobilise": "immobilize",
+ "immobilised": "immobilized",
+ "immobiliser": "immobilizer",
+ "immobilisers": "immobilizers",
+ "immobilises": "immobilizes",
+ "immobilising": "immobilizing",
+ "immortalise": "immortalize",
+ "immortalised": "immortalized",
+ "immortalises": "immortalizes",
+ "immortalising": "immortalizing",
+ "immunisation": "immunization",
+ "immunise": "immunize",
+ "immunised": "immunized",
+ "immunises": "immunizes",
+ "immunising": "immunizing",
+ "impanelled": "impaneled",
+ "impanelling": "impaneling",
+ "imperilled": "imperiled",
+ "imperilling": "imperiling",
+ "individualise": "individualize",
+ "individualised": "individualized",
+ "individualises": "individualizes",
+ "individualising": "individualizing",
+ "industrialise": "industrialize",
+ "industrialised": "industrialized",
+ "industrialises": "industrializes",
+ "industrialising": "industrializing",
+ "inflexion": "inflection",
+ "inflexions": "inflections",
+ "initialise": "initialize",
+ "initialised": "initialized",
+ "initialises": "initializes",
+ "initialising": "initializing",
+ "initialled": "initialed",
+ "initialling": "initialing",
+ "instal": "install",
+ "instalment": "installment",
+ "instalments": "installments",
+ "instals": "installs",
+ "instil": "instill",
+ "instils": "instills",
+ "institutionalisation": "institutionalization",
+ "institutionalise": "institutionalize",
+ "institutionalised": "institutionalized",
+ "institutionalises": "institutionalizes",
+ "institutionalising": "institutionalizing",
+ "intellectualise": "intellectualize",
+ "intellectualised": "intellectualized",
+ "intellectualises": "intellectualizes",
+ "intellectualising": "intellectualizing",
+ "internalisation": "internalization",
+ "internalise": "internalize",
+ "internalised": "internalized",
+ "internalises": "internalizes",
+ "internalising": "internalizing",
+ "internationalisation": "internationalization",
+ "internationalise": "internationalize",
+ "internationalised": "internationalized",
+ "internationalises": "internationalizes",
+ "internationalising": "internationalizing",
+ "ionisation": "ionization",
+ "ionise": "ionize",
+ "ionised": "ionized",
+ "ioniser": "ionizer",
+ "ionisers": "ionizers",
+ "ionises": "ionizes",
+ "ionising": "ionizing",
+ "italicise": "italicize",
+ "italicised": "italicized",
+ "italicises": "italicizes",
+ "italicising": "italicizing",
+ "itemise": "itemize",
+ "itemised": "itemized",
+ "itemises": "itemizes",
+ "itemising": "itemizing",
+ "jeopardise": "jeopardize",
+ "jeopardised": "jeopardized",
+ "jeopardises": "jeopardizes",
+ "jeopardising": "jeopardizing",
+ "jewelled": "jeweled",
+ "jeweller": "jeweler",
+ "jewellers": "jewelers",
+ "jewellery": "jewelry",
+ "judgement": "judgment",
+ "kilogramme": "kilogram",
+ "kilogrammes": "kilograms",
+ "kilometre": "kilometer",
+ "kilometres": "kilometers",
+ "labelled": "labeled",
+ "labelling": "labeling",
+ "labour": "labor",
+ "laboured": "labored",
+ "labourer": "laborer",
+ "labourers": "laborers",
+ "labouring": "laboring",
+ "labours": "labors",
+ "lacklustre": "lackluster",
+ "legalisation": "legalization",
+ "legalise": "legalize",
+ "legalised": "legalized",
+ "legalises": "legalizes",
+ "legalising": "legalizing",
+ "legitimise": "legitimize",
+ "legitimised": "legitimized",
+ "legitimises": "legitimizes",
+ "legitimising": "legitimizing",
+ "leukaemia": "leukemia",
+ "levelled": "leveled",
+ "leveller": "leveler",
+ "levellers": "levelers",
+ "levelling": "leveling",
+ "libelled": "libeled",
+ "libelling": "libeling",
+ "libellous": "libelous",
+ "liberalisation": "liberalization",
+ "liberalise": "liberalize",
+ "liberalised": "liberalized",
+ "liberalises": "liberalizes",
+ "liberalising": "liberalizing",
+ "licence": "license",
+ "licenced": "licensed",
+ "licences": "licenses",
+ "licencing": "licensing",
+ "likeable": "likable",
+ "lionisation": "lionization",
+ "lionise": "lionize",
+ "lionised": "lionized",
+ "lionises": "lionizes",
+ "lionising": "lionizing",
+ "liquidise": "liquidize",
+ "liquidised": "liquidized",
+ "liquidiser": "liquidizer",
+ "liquidisers": "liquidizers",
+ "liquidises": "liquidizes",
+ "liquidising": "liquidizing",
+ "litre": "liter",
+ "litres": "liters",
+ "localise": "localize",
+ "localised": "localized",
+ "localises": "localizes",
+ "localising": "localizing",
+ "louvre": "louver",
+ "louvred": "louvered",
+ "louvres": "louvers",
+ "lustre": "luster",
+ "magnetise": "magnetize",
+ "magnetised": "magnetized",
+ "magnetises": "magnetizes",
+ "magnetising": "magnetizing",
+ "manoeuvrability": "maneuverability",
+ "manoeuvrable": "maneuverable",
+ "manoeuvre": "maneuver",
+ "manoeuvred": "maneuvered",
+ "manoeuvres": "maneuvers",
+ "manoeuvring": "maneuvering",
+ "manoeuvrings": "maneuverings",
+ "marginalisation": "marginalization",
+ "marginalise": "marginalize",
+ "marginalised": "marginalized",
+ "marginalises": "marginalizes",
+ "marginalising": "marginalizing",
+ "marshalled": "marshaled",
+ "marshalling": "marshaling",
+ "marvelled": "marveled",
+ "marvelling": "marveling",
+ "marvellous": "marvelous",
+ "marvellously": "marvelously",
+ "materialisation": "materialization",
+ "materialise": "materialize",
+ "materialised": "materialized",
+ "materialises": "materializes",
+ "materialising": "materializing",
+ "maximisation": "maximization",
+ "maximise": "maximize",
+ "maximised": "maximized",
+ "maximises": "maximizes",
+ "maximising": "maximizing",
+ "meagre": "meager",
+ "mechanisation": "mechanization",
+ "mechanise": "mechanize",
+ "mechanised": "mechanized",
+ "mechanises": "mechanizes",
+ "mechanising": "mechanizing",
+ "mediaeval": "medieval",
+ "memorialise": "memorialize",
+ "memorialised": "memorialized",
+ "memorialises": "memorializes",
+ "memorialising": "memorializing",
+ "memorise": "memorize",
+ "memorised": "memorized",
+ "memorises": "memorizes",
+ "memorising": "memorizing",
+ "mesmerise": "mesmerize",
+ "mesmerised": "mesmerized",
+ "mesmerises": "mesmerizes",
+ "mesmerising": "mesmerizing",
+ "metabolise": "metabolize",
+ "metabolised": "metabolized",
+ "metabolises": "metabolizes",
+ "metabolising": "metabolizing",
+ "metre": "meter",
+ "metres": "meters",
+ "micrometre": "micrometer",
+ "micrometres": "micrometers",
+ "militarise": "militarize",
+ "militarised": "militarized",
+ "militarises": "militarizes",
+ "militarising": "militarizing",
+ "milligramme": "milligram",
+ "milligrammes": "milligrams",
+ "millilitre": "milliliter",
+ "millilitres": "milliliters",
+ "millimetre": "millimeter",
+ "millimetres": "millimeters",
+ "miniaturisation": "miniaturization",
+ "miniaturise": "miniaturize",
+ "miniaturised": "miniaturized",
+ "miniaturises": "miniaturizes",
+ "miniaturising": "miniaturizing",
+ "minibusses": "minibuses",
+ "minimise": "minimize",
+ "minimised": "minimized",
+ "minimises": "minimizes",
+ "minimising": "minimizing",
+ "misbehaviour": "misbehavior",
+ "misdemeanour": "misdemeanor",
+ "misdemeanours": "misdemeanors",
+ "misspelt": "misspelled",
+ "mitre": "miter",
+ "mitres": "miters",
+ "mobilisation": "mobilization",
+ "mobilise": "mobilize",
+ "mobilised": "mobilized",
+ "mobilises": "mobilizes",
+ "mobilising": "mobilizing",
+ "modelled": "modeled",
+ "modeller": "modeler",
+ "modellers": "modelers",
+ "modelling": "modeling",
+ "modernise": "modernize",
+ "modernised": "modernized",
+ "modernises": "modernizes",
+ "modernising": "modernizing",
+ "moisturise": "moisturize",
+ "moisturised": "moisturized",
+ "moisturiser": "moisturizer",
+ "moisturisers": "moisturizers",
+ "moisturises": "moisturizes",
+ "moisturising": "moisturizing",
+ "monologue": "monolog",
+ "monologues": "monologs",
+ "monopolisation": "monopolization",
+ "monopolise": "monopolize",
+ "monopolised": "monopolized",
+ "monopolises": "monopolizes",
+ "monopolising": "monopolizing",
+ "moralise": "moralize",
+ "moralised": "moralized",
+ "moralises": "moralizes",
+ "moralising": "moralizing",
+ "motorised": "motorized",
+ "mould": "mold",
+ "moulded": "molded",
+ "moulder": "molder",
+ "mouldered": "moldered",
+ "mouldering": "moldering",
+ "moulders": "molders",
+ "mouldier": "moldier",
+ "mouldiest": "moldiest",
+ "moulding": "molding",
+ "mouldings": "moldings",
+ "moulds": "molds",
+ "mouldy": "moldy",
+ "moult": "molt",
+ "moulted": "molted",
+ "moulting": "molting",
+ "moults": "molts",
+ "moustache": "mustache",
+ "moustached": "mustached",
+ "moustaches": "mustaches",
+ "moustachioed": "mustachioed",
+ "multicoloured": "multicolored",
+ "nationalisation": "nationalization",
+ "nationalisations": "nationalizations",
+ "nationalise": "nationalize",
+ "nationalised": "nationalized",
+ "nationalises": "nationalizes",
+ "nationalising": "nationalizing",
+ "naturalisation": "naturalization",
+ "naturalise": "naturalize",
+ "naturalised": "naturalized",
+ "naturalises": "naturalizes",
+ "naturalising": "naturalizing",
+ "neighbour": "neighbor",
+ "neighbourhood": "neighborhood",
+ "neighbourhoods": "neighborhoods",
+ "neighbouring": "neighboring",
+ "neighbourliness": "neighborliness",
+ "neighbourly": "neighborly",
+ "neighbours": "neighbors",
+ "neutralisation": "neutralization",
+ "neutralise": "neutralize",
+ "neutralised": "neutralized",
+ "neutralises": "neutralizes",
+ "neutralising": "neutralizing",
+ "normalisation": "normalization",
+ "normalise": "normalize",
+ "normalised": "normalized",
+ "normalises": "normalizes",
+ "normalising": "normalizing",
+ "odour": "odor",
+ "odourless": "odorless",
+ "odours": "odors",
+ "oesophagus": "esophagus",
+ "oesophaguses": "esophaguses",
+ "oestrogen": "estrogen",
+ "offence": "offense",
+ "offences": "offenses",
+ "omelette": "omelet",
+ "omelettes": "omelets",
+ "optimise": "optimize",
+ "optimised": "optimized",
+ "optimises": "optimizes",
+ "optimising": "optimizing",
+ "organisation": "organization",
+ "organisational": "organizational",
+ "organisations": "organizations",
+ "organise": "organize",
+ "organised": "organized",
+ "organiser": "organizer",
+ "organisers": "organizers",
+ "organises": "organizes",
+ "organising": "organizing",
+ "orthopaedic": "orthopedic",
+ "orthopaedics": "orthopedics",
+ "ostracise": "ostracize",
+ "ostracised": "ostracized",
+ "ostracises": "ostracizes",
+ "ostracising": "ostracizing",
+ "outmanoeuvre": "outmaneuver",
+ "outmanoeuvred": "outmaneuvered",
+ "outmanoeuvres": "outmaneuvers",
+ "outmanoeuvring": "outmaneuvering",
+ "overemphasise": "overemphasize",
+ "overemphasised": "overemphasized",
+ "overemphasises": "overemphasizes",
+ "overemphasising": "overemphasizing",
+ "oxidisation": "oxidization",
+ "oxidise": "oxidize",
+ "oxidised": "oxidized",
+ "oxidises": "oxidizes",
+ "oxidising": "oxidizing",
+ "paederast": "pederast",
+ "paederasts": "pederasts",
+ "paediatric": "pediatric",
+ "paediatrician": "pediatrician",
+ "paediatricians": "pediatricians",
+ "paediatrics": "pediatrics",
+ "paedophile": "pedophile",
+ "paedophiles": "pedophiles",
+ "paedophilia": "pedophilia",
+ "palaeolithic": "paleolithic",
+ "palaeontologist": "paleontologist",
+ "palaeontologists": "paleontologists",
+ "palaeontology": "paleontology",
+ "panelled": "paneled",
+ "panelling": "paneling",
+ "panellist": "panelist",
+ "panellists": "panelists",
+ "paralyse": "paralyze",
+ "paralysed": "paralyzed",
+ "paralyses": "paralyzes",
+ "paralysing": "paralyzing",
+ "parcelled": "parceled",
+ "parcelling": "parceling",
+ "parlour": "parlor",
+ "parlours": "parlors",
+ "particularise": "particularize",
+ "particularised": "particularized",
+ "particularises": "particularizes",
+ "particularising": "particularizing",
+ "passivisation": "passivization",
+ "passivise": "passivize",
+ "passivised": "passivized",
+ "passivises": "passivizes",
+ "passivising": "passivizing",
+ "pasteurisation": "pasteurization",
+ "pasteurise": "pasteurize",
+ "pasteurised": "pasteurized",
+ "pasteurises": "pasteurizes",
+ "pasteurising": "pasteurizing",
+ "patronise": "patronize",
+ "patronised": "patronized",
+ "patronises": "patronizes",
+ "patronising": "patronizing",
+ "patronisingly": "patronizingly",
+ "pedalled": "pedaled",
+ "pedalling": "pedaling",
+ "pedestrianisation": "pedestrianization",
+ "pedestrianise": "pedestrianize",
+ "pedestrianised": "pedestrianized",
+ "pedestrianises": "pedestrianizes",
+ "pedestrianising": "pedestrianizing",
+ "penalise": "penalize",
+ "penalised": "penalized",
+ "penalises": "penalizes",
+ "penalising": "penalizing",
+ "pencilled": "penciled",
+ "pencilling": "penciling",
+ "personalise": "personalize",
+ "personalised": "personalized",
+ "personalises": "personalizes",
+ "personalising": "personalizing",
+ "pharmacopoeia": "pharmacopeia",
+ "pharmacopoeias": "pharmacopeias",
+ "philosophise": "philosophize",
+ "philosophised": "philosophized",
+ "philosophises": "philosophizes",
+ "philosophising": "philosophizing",
+ "philtre": "filter",
+ "philtres": "filters",
+ "phoney": "phony",
+ "plagiarise": "plagiarize",
+ "plagiarised": "plagiarized",
+ "plagiarises": "plagiarizes",
+ "plagiarising": "plagiarizing",
+ "plough": "plow",
+ "ploughed": "plowed",
+ "ploughing": "plowing",
+ "ploughman": "plowman",
+ "ploughmen": "plowmen",
+ "ploughs": "plows",
+ "ploughshare": "plowshare",
+ "ploughshares": "plowshares",
+ "polarisation": "polarization",
+ "polarise": "polarize",
+ "polarised": "polarized",
+ "polarises": "polarizes",
+ "polarising": "polarizing",
+ "politicisation": "politicization",
+ "politicise": "politicize",
+ "politicised": "politicized",
+ "politicises": "politicizes",
+ "politicising": "politicizing",
+ "popularisation": "popularization",
+ "popularise": "popularize",
+ "popularised": "popularized",
+ "popularises": "popularizes",
+ "popularising": "popularizing",
+ "pouffe": "pouf",
+ "pouffes": "poufs",
+ "practise": "practice",
+ "practised": "practiced",
+ "practises": "practices",
+ "practising": "practicing",
+ "praesidium": "presidium",
+ "praesidiums": "presidiums",
+ "pressurisation": "pressurization",
+ "pressurise": "pressurize",
+ "pressurised": "pressurized",
+ "pressurises": "pressurizes",
+ "pressurising": "pressurizing",
+ "pretence": "pretense",
+ "pretences": "pretenses",
+ "primaeval": "primeval",
+ "prioritisation": "prioritization",
+ "prioritise": "prioritize",
+ "prioritised": "prioritized",
+ "prioritises": "prioritizes",
+ "prioritising": "prioritizing",
+ "privatisation": "privatization",
+ "privatisations": "privatizations",
+ "privatise": "privatize",
+ "privatised": "privatized",
+ "privatises": "privatizes",
+ "privatising": "privatizing",
+ "professionalisation": "professionalization",
+ "professionalise": "professionalize",
+ "professionalised": "professionalized",
+ "professionalises": "professionalizes",
+ "professionalising": "professionalizing",
+ "programme": "program",
+ "programmes": "programs",
+ "prologue": "prolog",
+ "prologues": "prologs",
+ "propagandise": "propagandize",
+ "propagandised": "propagandized",
+ "propagandises": "propagandizes",
+ "propagandising": "propagandizing",
+ "proselytise": "proselytize",
+ "proselytised": "proselytized",
+ "proselytiser": "proselytizer",
+ "proselytisers": "proselytizers",
+ "proselytises": "proselytizes",
+ "proselytising": "proselytizing",
+ "psychoanalyse": "psychoanalyze",
+ "psychoanalysed": "psychoanalyzed",
+ "psychoanalyses": "psychoanalyzes",
+ "psychoanalysing": "psychoanalyzing",
+ "publicise": "publicize",
+ "publicised": "publicized",
+ "publicises": "publicizes",
+ "publicising": "publicizing",
+ "pulverisation": "pulverization",
+ "pulverise": "pulverize",
+ "pulverised": "pulverized",
+ "pulverises": "pulverizes",
+ "pulverising": "pulverizing",
+ "pummelled": "pummel",
+ "pummelling": "pummeled",
+ "pyjama": "pajama",
+ "pyjamas": "pajamas",
+ "pzazz": "pizzazz",
+ "quarrelled": "quarreled",
+ "quarrelling": "quarreling",
+ "radicalise": "radicalize",
+ "radicalised": "radicalized",
+ "radicalises": "radicalizes",
+ "radicalising": "radicalizing",
+ "rancour": "rancor",
+ "randomise": "randomize",
+ "randomised": "randomized",
+ "randomises": "randomizes",
+ "randomising": "randomizing",
+ "rationalisation": "rationalization",
+ "rationalisations": "rationalizations",
+ "rationalise": "rationalize",
+ "rationalised": "rationalized",
+ "rationalises": "rationalizes",
+ "rationalising": "rationalizing",
+ "ravelled": "raveled",
+ "ravelling": "raveling",
+ "realisable": "realizable",
+ "realisation": "realization",
+ "realisations": "realizations",
+ "realise": "realize",
+ "realised": "realized",
+ "realises": "realizes",
+ "realising": "realizing",
+ "recognisable": "recognizable",
+ "recognisably": "recognizably",
+ "recognisance": "recognizance",
+ "recognise": "recognize",
+ "recognised": "recognized",
+ "recognises": "recognizes",
+ "recognising": "recognizing",
+ "reconnoitre": "reconnoiter",
+ "reconnoitred": "reconnoitered",
+ "reconnoitres": "reconnoiters",
+ "reconnoitring": "reconnoitering",
+ "refuelled": "refueled",
+ "refuelling": "refueling",
+ "regularisation": "regularization",
+ "regularise": "regularize",
+ "regularised": "regularized",
+ "regularises": "regularizes",
+ "regularising": "regularizing",
+ "remodelled": "remodeled",
+ "remodelling": "remodeling",
+ "remould": "remold",
+ "remoulded": "remolded",
+ "remoulding": "remolding",
+ "remoulds": "remolds",
+ "reorganisation": "reorganization",
+ "reorganisations": "reorganizations",
+ "reorganise": "reorganize",
+ "reorganised": "reorganized",
+ "reorganises": "reorganizes",
+ "reorganising": "reorganizing",
+ "revelled": "reveled",
+ "reveller": "reveler",
+ "revellers": "revelers",
+ "revelling": "reveling",
+ "revitalise": "revitalize",
+ "revitalised": "revitalized",
+ "revitalises": "revitalizes",
+ "revitalising": "revitalizing",
+ "revolutionise": "revolutionize",
+ "revolutionised": "revolutionized",
+ "revolutionises": "revolutionizes",
+ "revolutionising": "revolutionizing",
+ "rhapsodise": "rhapsodize",
+ "rhapsodised": "rhapsodized",
+ "rhapsodises": "rhapsodizes",
+ "rhapsodising": "rhapsodizing",
+ "rigour": "rigor",
+ "rigours": "rigors",
+ "ritualised": "ritualized",
+ "rivalled": "rivaled",
+ "rivalling": "rivaling",
+ "romanticise": "romanticize",
+ "romanticised": "romanticized",
+ "romanticises": "romanticizes",
+ "romanticising": "romanticizing",
+ "rumour": "rumor",
+ "rumoured": "rumored",
+ "rumours": "rumors",
+ "sabre": "saber",
+ "sabres": "sabers",
+ "saltpetre": "saltpeter",
+ "sanitise": "sanitize",
+ "sanitised": "sanitized",
+ "sanitises": "sanitizes",
+ "sanitising": "sanitizing",
+ "satirise": "satirize",
+ "satirised": "satirized",
+ "satirises": "satirizes",
+ "satirising": "satirizing",
+ "saviour": "savior",
+ "saviours": "saviors",
+ "savour": "savor",
+ "savoured": "savored",
+ "savouries": "savories",
+ "savouring": "savoring",
+ "savours": "savors",
+ "savoury": "savory",
+ "scandalise": "scandalize",
+ "scandalised": "scandalized",
+ "scandalises": "scandalizes",
+ "scandalising": "scandalizing",
+ "sceptic": "skeptic",
+ "sceptical": "skeptical",
+ "sceptically": "skeptically",
+ "scepticism": "skepticism",
+ "sceptics": "skeptics",
+ "sceptre": "scepter",
+ "sceptres": "scepters",
+ "scrutinise": "scrutinize",
+ "scrutinised": "scrutinized",
+ "scrutinises": "scrutinizes",
+ "scrutinising": "scrutinizing",
+ "secularisation": "secularization",
+ "secularise": "secularize",
+ "secularised": "secularized",
+ "secularises": "secularizes",
+ "secularising": "secularizing",
+ "sensationalise": "sensationalize",
+ "sensationalised": "sensationalized",
+ "sensationalises": "sensationalizes",
+ "sensationalising": "sensationalizing",
+ "sensitise": "sensitize",
+ "sensitised": "sensitized",
+ "sensitises": "sensitizes",
+ "sensitising": "sensitizing",
+ "sentimentalise": "sentimentalize",
+ "sentimentalised": "sentimentalized",
+ "sentimentalises": "sentimentalizes",
+ "sentimentalising": "sentimentalizing",
+ "sepulchre": "sepulcher",
+ "sepulchres": "sepulchers",
+ "serialisation": "serialization",
+ "serialisations": "serializations",
+ "serialise": "serialize",
+ "serialised": "serialized",
+ "serialises": "serializes",
+ "serialising": "serializing",
+ "sermonise": "sermonize",
+ "sermonised": "sermonized",
+ "sermonises": "sermonizes",
+ "sermonising": "sermonizing",
+ "sheikh": "sheik",
+ "shovelled": "shoveled",
+ "shovelling": "shoveling",
+ "shrivelled": "shriveled",
+ "shrivelling": "shriveling",
+ "signalise": "signalize",
+ "signalised": "signalized",
+ "signalises": "signalizes",
+ "signalising": "signalizing",
+ "signalled": "signaled",
+ "signalling": "signaling",
+ "smoulder": "smolder",
+ "smouldered": "smoldered",
+ "smouldering": "smoldering",
+ "smoulders": "smolders",
+ "snivelled": "sniveled",
+ "snivelling": "sniveling",
+ "snorkelled": "snorkeled",
+ "snorkelling": "snorkeling",
+ "snowplough": "snowplow",
+ "snowploughs": "snowplow",
+ "socialisation": "socialization",
+ "socialise": "socialize",
+ "socialised": "socialized",
+ "socialises": "socializes",
+ "socialising": "socializing",
+ "sodomise": "sodomize",
+ "sodomised": "sodomized",
+ "sodomises": "sodomizes",
+ "sodomising": "sodomizing",
+ "solemnise": "solemnize",
+ "solemnised": "solemnized",
+ "solemnises": "solemnizes",
+ "solemnising": "solemnizing",
+ "sombre": "somber",
+ "specialisation": "specialization",
+ "specialisations": "specializations",
+ "specialise": "specialize",
+ "specialised": "specialized",
+ "specialises": "specializes",
+ "specialising": "specializing",
+ "spectre": "specter",
+ "spectres": "specters",
+ "spiralled": "spiraled",
+ "spiralling": "spiraling",
+ "splendour": "splendor",
+ "splendours": "splendors",
+ "squirrelled": "squirreled",
+ "squirrelling": "squirreling",
+ "stabilisation": "stabilization",
+ "stabilise": "stabilize",
+ "stabilised": "stabilized",
+ "stabiliser": "stabilizer",
+ "stabilisers": "stabilizers",
+ "stabilises": "stabilizes",
+ "stabilising": "stabilizing",
+ "standardisation": "standardization",
+ "standardise": "standardize",
+ "standardised": "standardized",
+ "standardises": "standardizes",
+ "standardising": "standardizing",
+ "stencilled": "stenciled",
+ "stencilling": "stenciling",
+ "sterilisation": "sterilization",
+ "sterilisations": "sterilizations",
+ "sterilise": "sterilize",
+ "sterilised": "sterilized",
+ "steriliser": "sterilizer",
+ "sterilisers": "sterilizers",
+ "sterilises": "sterilizes",
+ "sterilising": "sterilizing",
+ "stigmatisation": "stigmatization",
+ "stigmatise": "stigmatize",
+ "stigmatised": "stigmatized",
+ "stigmatises": "stigmatizes",
+ "stigmatising": "stigmatizing",
+ "storey": "story",
+ "storeys": "stories",
+ "subsidisation": "subsidization",
+ "subsidise": "subsidize",
+ "subsidised": "subsidized",
+ "subsidiser": "subsidizer",
+ "subsidisers": "subsidizers",
+ "subsidises": "subsidizes",
+ "subsidising": "subsidizing",
+ "succour": "succor",
+ "succoured": "succored",
+ "succouring": "succoring",
+ "succours": "succors",
+ "sulphate": "sulfate",
+ "sulphates": "sulfates",
+ "sulphide": "sulfide",
+ "sulphides": "sulfides",
+ "sulphur": "sulfur",
+ "sulphurous": "sulfurous",
+ "summarise": "summarize",
+ "summarised": "summarized",
+ "summarises": "summarizes",
+ "summarising": "summarizing",
+ "swivelled": "swiveled",
+ "swivelling": "swiveling",
+ "symbolise": "symbolize",
+ "symbolised": "symbolized",
+ "symbolises": "symbolizes",
+ "symbolising": "symbolizing",
+ "sympathise": "sympathize",
+ "sympathised": "sympathized",
+ "sympathiser": "sympathizer",
+ "sympathisers": "sympathizers",
+ "sympathises": "sympathizes",
+ "sympathising": "sympathizing",
+ "synchronisation": "synchronization",
+ "synchronise": "synchronize",
+ "synchronised": "synchronized",
+ "synchronises": "synchronizes",
+ "synchronising": "synchronizing",
+ "synthesise": "synthesize",
+ "synthesised": "synthesized",
+ "synthesiser": "synthesizer",
+ "synthesisers": "synthesizers",
+ "synthesises": "synthesizes",
+ "synthesising": "synthesizing",
+ "syphon": "siphon",
+ "syphoned": "siphoned",
+ "syphoning": "siphoning",
+ "syphons": "siphons",
+ "systematisation": "systematization",
+ "systematise": "systematize",
+ "systematised": "systematized",
+ "systematises": "systematizes",
+ "systematising": "systematizing",
+ "tantalise": "tantalize",
+ "tantalised": "tantalized",
+ "tantalises": "tantalizes",
+ "tantalising": "tantalizing",
+ "tantalisingly": "tantalizingly",
+ "tasselled": "tasseled",
+ "technicolour": "technicolor",
+ "temporise": "temporize",
+ "temporised": "temporized",
+ "temporises": "temporizes",
+ "temporising": "temporizing",
+ "tenderise": "tenderize",
+ "tenderised": "tenderized",
+ "tenderises": "tenderizes",
+ "tenderising": "tenderizing",
+ "terrorise": "terrorize",
+ "terrorised": "terrorized",
+ "terrorises": "terrorizes",
+ "terrorising": "terrorizing",
+ "theatre": "theater",
+ "theatregoer": "theatergoer",
+ "theatregoers": "theatergoers",
+ "theatres": "theaters",
+ "theorise": "theorize",
+ "theorised": "theorized",
+ "theorises": "theorizes",
+ "theorising": "theorizing",
+ "tonne": "ton",
+ "tonnes": "tons",
+ "towelled": "toweled",
+ "towelling": "toweling",
+ "toxaemia": "toxemia",
+ "tranquillise": "tranquilize",
+ "tranquillised": "tranquilized",
+ "tranquilliser": "tranquilizer",
+ "tranquillisers": "tranquilizers",
+ "tranquillises": "tranquilizes",
+ "tranquillising": "tranquilizing",
+ "tranquillity": "tranquility",
+ "tranquillize": "tranquilize",
+ "tranquillized": "tranquilized",
+ "tranquillizer": "tranquilizer",
+ "tranquillizers": "tranquilizers",
+ "tranquillizes": "tranquilizes",
+ "tranquillizing": "tranquilizing",
+ "tranquilly": "tranquility",
+ "transistorised": "transistorized",
+ "traumatise": "traumatize",
+ "traumatised": "traumatized",
+ "traumatises": "traumatizes",
+ "traumatising": "traumatizing",
+ "travelled": "traveled",
+ "traveller": "traveler",
+ "travellers": "travelers",
+ "travelling": "traveling",
+ "travelog": "travelogue",
+ "travelogs": "travelogues",
+ "trialled": "trialed",
+ "trialling": "trialing",
+ "tricolour": "tricolor",
+ "tricolours": "tricolors",
+ "trivialise": "trivialize",
+ "trivialised": "trivialized",
+ "trivialises": "trivializes",
+ "trivialising": "trivializing",
+ "tumour": "tumor",
+ "tumours": "tumors",
+ "tunnelled": "tunneled",
+ "tunnelling": "tunneling",
+ "tyrannise": "tyrannize",
+ "tyrannised": "tyrannized",
+ "tyrannises": "tyrannizes",
+ "tyrannising": "tyrannizing",
+ "tyre": "tire",
+ "tyres": "tires",
+ "unauthorised": "unauthorized",
+ "uncivilised": "uncivilized",
+ "underutilised": "underutilized",
+ "unequalled": "unequaled",
+ "unfavourable": "unfavorable",
+ "unfavourably": "unfavorably",
+ "unionisation": "unionization",
+ "unionise": "unionize",
+ "unionised": "unionized",
+ "unionises": "unionizes",
+ "unionising": "unionizing",
+ "unorganised": "unorganized",
+ "unravelled": "unraveled",
+ "unravelling": "unraveling",
+ "unrecognisable": "unrecognizable",
+ "unrecognised": "unrecognized",
+ "unrivalled": "unrivaled",
+ "unsavoury": "unsavory",
+ "untrammelled": "untrammeled",
+ "urbanisation": "urbanization",
+ "urbanise": "urbanize",
+ "urbanised": "urbanized",
+ "urbanises": "urbanizes",
+ "urbanising": "urbanizing",
+ "utilisable": "utilizable",
+ "utilisation": "utilization",
+ "utilise": "utilize",
+ "utilised": "utilized",
+ "utilises": "utilizes",
+ "utilising": "utilizing",
+ "valour": "valor",
+ "vandalise": "vandalize",
+ "vandalised": "vandalized",
+ "vandalises": "vandalizes",
+ "vandalising": "vandalizing",
+ "vaporisation": "vaporization",
+ "vaporise": "vaporize",
+ "vaporised": "vaporized",
+ "vaporises": "vaporizes",
+ "vaporising": "vaporizing",
+ "vapour": "vapor",
+ "vapours": "vapors",
+ "verbalise": "verbalize",
+ "verbalised": "verbalized",
+ "verbalises": "verbalizes",
+ "verbalising": "verbalizing",
+ "victimisation": "victimization",
+ "victimise": "victimize",
+ "victimised": "victimized",
+ "victimises": "victimizes",
+ "victimising": "victimizing",
+ "videodisc": "videodisk",
+ "videodiscs": "videodisks",
+ "vigour": "vigor",
+ "visualisation": "visualization",
+ "visualisations": "visualizations",
+ "visualise": "visualize",
+ "visualised": "visualized",
+ "visualises": "visualizes",
+ "visualising": "visualizing",
+ "vocalisation": "vocalization",
+ "vocalisations": "vocalizations",
+ "vocalise": "vocalize",
+ "vocalised": "vocalized",
+ "vocalises": "vocalizes",
+ "vocalising": "vocalizing",
+ "vulcanised": "vulcanized",
+ "vulgarisation": "vulgarization",
+ "vulgarise": "vulgarize",
+ "vulgarised": "vulgarized",
+ "vulgarises": "vulgarizes",
+ "vulgarising": "vulgarizing",
+ "waggon": "wagon",
+ "waggons": "wagons",
+ "watercolour": "watercolor",
+ "watercolours": "watercolors",
+ "weaselled": "weaseled",
+ "weaselling": "weaseling",
+ "westernisation": "westernization",
+ "westernise": "westernize",
+ "westernised": "westernized",
+ "westernises": "westernizes",
+ "westernising": "westernizing",
+ "womanise": "womanize",
+ "womanised": "womanized",
+ "womaniser": "womanizer",
+ "womanisers": "womanizers",
+ "womanises": "womanizes",
+ "womanising": "womanizing",
+ "woollen": "woolen",
+ "woollens": "woolens",
+ "woollies": "woolies",
+ "woolly": "wooly",
+ "worshipped": "worshiped",
+ "worshipping": "worshiping",
+ "worshipper": "worshiper",
+ "yodelled": "yodeled",
+ "yodelling": "yodeling",
+ "yoghourt": "yogurt",
+ "yoghourts": "yogurts",
+ "yoghurt": "yogurt",
+ "yoghurts": "yogurts",
+ "mhm": "hmm",
+ "mmm": "hmm"
+}
\ No newline at end of file
diff --git a/whisperlivekit/simul_whisper/whisper/timing.py b/whisperlivekit/simul_whisper/whisper/timing.py
index 823c33b..2340000 100644
--- a/whisperlivekit/simul_whisper/whisper/timing.py
+++ b/whisperlivekit/simul_whisper/whisper/timing.py
@@ -56,9 +56,8 @@ def median_filter(x: torch.Tensor, filter_width: int):
@numba.jit(nopython=True)
def backtrace(trace: np.ndarray):
- i = trace.shape[0] - 1 # trace: (N+1, M+1), i=N
- j = trace.shape[1] - 1 # j=M
- # 边界点其实无意义?
+ i = trace.shape[0] - 1
+ j = trace.shape[1] - 1
trace[0, :] = 2
trace[:, 0] = 1
@@ -83,8 +82,8 @@ def backtrace(trace: np.ndarray):
@numba.jit(nopython=True, parallel=True)
def dtw_cpu(x: np.ndarray):
N, M = x.shape
- cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf # cost: x[0, 0]到x[i-1, j-1]的最小代价
- trace = -np.ones((N + 1, M + 1), dtype=np.float32) # trace:
+ cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
+ trace = -np.ones((N + 1, M + 1), dtype=np.float32)
cost[0, 0] = 0
for j in range(1, M + 1):
@@ -118,7 +117,7 @@ def dtw_cuda(x, BLOCK_SIZE=1024):
x_skew = x_skew.T.contiguous()
cost = torch.ones(N + M + 2, M + 2) * np.inf
cost[0, 0] = 0
- cost = cost.cuda()
+ cost = cost.to(x.device)
trace = torch.zeros_like(cost, dtype=torch.int32)
dtw_kernel[(1,)](
@@ -192,21 +191,19 @@ def find_alignment(
for i, block in enumerate(model.decoder.blocks)
]
- # 进行前传,获得token概率
- with torch.no_grad():
+ from .model import disable_sdpa
+
+ with torch.no_grad(), disable_sdpa():
logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0]
sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot]
token_probs = sampled_logits.softmax(dim=-1)
text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens]
text_token_probs = text_token_probs.tolist()
- # 移除钩子
for hook in hooks:
hook.remove()
# heads * tokens * frames
- # print(model.alignment_heads)
- # exit(0)
weights = torch.stack([QKs[_l][_h] for _l, _h in model.alignment_heads.indices().T])
weights = weights[:, :, : num_frames // 2]
weights = (weights * qk_scale).softmax(dim=-1)
@@ -215,18 +212,9 @@ def find_alignment(
weights = median_filter(weights, medfilt_width)
matrix = weights.mean(axis=0)
- print("attention", matrix.shape, matrix[:5, :5])
matrix = matrix[len(tokenizer.sot_sequence) : -1]
- print("attention", matrix.shape, matrix[:5, :5])
text_indices, time_indices = dtw(-matrix)
- print("num_frames", num_frames)
- print("attention", matrix.shape, matrix[:5, :5])
- print("text_indices", text_indices)
- print("time", time_indices)
- print("text_tokens", text_tokens, tokenizer.decode(text_tokens), len(text_tokens))
- print("eot", tokenizer.eot)
-
words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
if len(word_tokens) <= 1:
# return on eot only
@@ -238,9 +226,7 @@ def find_alignment(
word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
- # print("jumps", jumps, jumps.shape)
jump_times = time_indices[jumps] / TOKENS_PER_SECOND
- # print("jump_times", jump_times)
start_times = jump_times[word_boundaries[:-1]]
end_times = jump_times[word_boundaries[1:]]
word_probabilities = [
@@ -315,6 +301,7 @@ def add_word_timestamps(
word_durations = np.array([t.end - t.start for t in alignment])
word_durations = word_durations[word_durations.nonzero()]
median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
+ median_duration = min(0.7, float(median_duration))
max_duration = median_duration * 2
# hack: truncate long words at sentence boundaries.
diff --git a/whisperlivekit/simul_whisper/whisper/trans_nopad.py b/whisperlivekit/simul_whisper/whisper/trans_nopad.py
deleted file mode 100644
index d6eb69f..0000000
--- a/whisperlivekit/simul_whisper/whisper/trans_nopad.py
+++ /dev/null
@@ -1,501 +0,0 @@
-import argparse
-import os
-import warnings
-from typing import TYPE_CHECKING, Optional, Tuple, Union
-
-import numpy as np
-import torch
-import tqdm
-
-from whisper.audio import (
- FRAMES_PER_SECOND,
- HOP_LENGTH,
- N_FRAMES,
- N_SAMPLES,
- SAMPLE_RATE,
- log_mel_spectrogram,
- pad_or_trim,
-)
-from whisper.decoding import DecodingOptions, DecodingResult
-from whisper.timing import add_word_timestamps
-from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
-from whisper.utils import (
- exact_div,
- format_timestamp,
- get_writer,
- make_safe,
- optional_float,
- optional_int,
- str2bool,
-)
-
-if TYPE_CHECKING:
- from whisper.model import Whisper
-
-
-def transcribe(
- model: "Whisper",
- audio: Union[str, np.ndarray, torch.Tensor],
- *,
- verbose: Optional[bool] = None,
- temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
- compression_ratio_threshold: Optional[float] = 2.4,
- logprob_threshold: Optional[float] = -1.0,
- no_speech_threshold: Optional[float] = 0.6,
- condition_on_previous_text: bool = True,
- initial_prompt: Optional[str] = None,
- word_timestamps: bool = False,
- prepend_punctuations: str = "\"'“¿([{-",
- append_punctuations: str = "\"'.。,,!!??::”)]}、",
- **decode_options,
-):
- """
- Transcribe an audio file using Whisper
-
- Parameters
- ----------
- model: Whisper
- The Whisper model instance
-
- audio: Union[str, np.ndarray, torch.Tensor]
- The path to the audio file to open, or the audio waveform
-
- verbose: bool
- Whether to display the text being decoded to the console. If True, displays all the details,
- If False, displays minimal details. If None, does not display anything
-
- temperature: Union[float, Tuple[float, ...]]
- Temperature for sampling. It can be a tuple of temperatures, which will be successively used
- upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
-
- compression_ratio_threshold: float
- If the gzip compression ratio is above this value, treat as failed
-
- logprob_threshold: float
- If the average log probability over sampled tokens is below this value, treat as failed
-
- no_speech_threshold: float
- If the no_speech probability is higher than this value AND the average log probability
- over sampled tokens is below `logprob_threshold`, consider the segment as silent
-
- condition_on_previous_text: bool
- if True, the previous output of the model is provided as a prompt for the next window;
- disabling may make the text inconsistent across windows, but the model becomes less prone to
- getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
-
- word_timestamps: bool
- Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
- and include the timestamps for each word in each segment.
-
- prepend_punctuations: str
- If word_timestamps is True, merge these punctuation symbols with the next word
-
- append_punctuations: str
- If word_timestamps is True, merge these punctuation symbols with the previous word
-
- initial_prompt: Optional[str]
- Optional text to provide as a prompt for the first window. This can be used to provide, or
- "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
- to make it more likely to predict those word correctly.
-
- decode_options: dict
- Keyword arguments to construct `DecodingOptions` instances
-
- Returns
- -------
- A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
- the spoken language ("language"), which is detected when `decode_options["language"]` is None.
- """
- # print("HACKED")
- dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
- if model.device == torch.device("cpu"):
- if torch.cuda.is_available():
- warnings.warn("Performing inference on CPU when CUDA is available")
- if dtype == torch.float16:
- warnings.warn("FP16 is not supported on CPU; using FP32 instead")
- dtype = torch.float32
-
- if dtype == torch.float32:
- decode_options["fp16"] = False
-
- # Pad 30-seconds of silence to the input audio, for slicing
- mel = log_mel_spectrogram(audio, padding=0) # log_mel_spectrogram(audio, padding=N_SAMPLES) # 添加16000*30 = 480000个点
- # mel = pad_or_trim(mel, 3000)
- content_frames = mel.shape[-1] # - N_FRAMES # 对应3000帧;真正有内容的是去掉尾部3000的那些数据
-
- # 判断语种
- if decode_options.get("language", None) is None:
- # 如果是单语种模型,直接设成英文
- if not model.is_multilingual:
- decode_options["language"] = "en"
- # 否则需要前传一次
- else:
- if verbose:
- print(
- "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
- )
- mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
- # print(mel_segment.shape)
- _, probs = model.detect_language(mel_segment)
- decode_options["language"] = max(probs, key=probs.get)
- if verbose is not None:
- print(
- f"Detected language: {LANGUAGES[decode_options['language']].title()}"
- )
-
- language: str = decode_options["language"]
- task: str = decode_options.get("task", "transcribe")
- # 输出编码器
- tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
-
- # 词级别时间戳
- if word_timestamps and task == "translate":
- warnings.warn("Word-level timestamps on translations may not be reliable.")
-
- def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
- temperatures = (
- [temperature] if isinstance(temperature, (int, float)) else temperature
- )
- decode_result = None
-
- for t in temperatures:
- kwargs = {**decode_options}
- if t > 0:
- # disable beam_size and patience when t > 0
- kwargs.pop("beam_size", None)
- kwargs.pop("patience", None)
- else:
- # disable best_of when t == 0
- kwargs.pop("best_of", None)
-
- options = DecodingOptions(**kwargs, temperature=t)
- decode_result = model.decode(segment, options)
-
- # 几种解码可能失败的情况。这些情况下会重复解码
- # 感觉是一种KnowHow的东西 或许ChatGPT里有不少这种trick
- needs_fallback = False
- if (
- compression_ratio_threshold is not None
- and decode_result.compression_ratio > compression_ratio_threshold
- ):
- needs_fallback = True # too repetitive
- if (
- logprob_threshold is not None
- and decode_result.avg_logprob < logprob_threshold
- ):
- needs_fallback = True # average log probability is too low
- if (
- no_speech_threshold is not None
- and decode_result.no_speech_prob > no_speech_threshold
- ):
- needs_fallback = False # silence
- if not needs_fallback:
- break
- # print("decode with temperature {} compress rate {:.3f}/{:.3f}, log_prob {:.3f}/{:.3f}, {:.3f}/{:.3f}".format(
- # t,
- # decode_result.compression_ratio, compression_ratio_threshold,
- # -decode_result.avg_logprob, -logprob_threshold,
- # decode_result.no_speech_prob, no_speech_threshold
- # ))
-
- return decode_result
-
- seek = 0
- input_stride = exact_div(
- N_FRAMES, model.dims.n_audio_ctx
- ) # mel frames per output token: 2
- # 这里output token指的应该是CNN输出的那个东西
-
- time_precision = (
- input_stride * HOP_LENGTH / SAMPLE_RATE
- ) # time per output token: 0.02 (seconds)
- all_tokens = []
- all_segments = []
- prompt_reset_since = 0
-
- if initial_prompt is not None:
- initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
- all_tokens.extend(initial_prompt_tokens)
- else:
- initial_prompt_tokens = []
-
- def new_segment(
- *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
- ):
- tokens = tokens.tolist()
- text_tokens = [token for token in tokens if token < tokenizer.eot]
- return {
- "seek": seek,
- "start": start,
- "end": end,
- "text": tokenizer.decode(text_tokens),
- "tokens": tokens,
- "temperature": result.temperature,
- "avg_logprob": result.avg_logprob,
- "compression_ratio": result.compression_ratio,
- "no_speech_prob": result.no_speech_prob,
- }
-
- # show the progress bar when verbose is False (if True, transcribed text will be printed)
- with tqdm.tqdm(
- total=content_frames, unit="frames", disable=verbose is not False
- ) as pbar:
- last_speech_timestamp = 0.0
- while seek < content_frames: # seek:标记mel频谱当前帧的位置 直接跳过Padding上的部分
- # print("seek segments", seek, content_frames)
- time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) # 本片段的开始时间
- # mel_segment = mel[:, seek : seek + N_FRAMES] # 获得当前片段的数据
- mel_segment = mel[:, seek:]
- segment_size = min(N_FRAMES, content_frames - seek) # segment_size: 排除padding的真的长度。content_frames:有内容的段的真正长度 如果不够N_FRAMES的话就会截断
- segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE # 当前片段的时长
- mel_segment = mel_segment.to(model.device).to(dtype) # pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype) # 补到mel_segment帧
-
- decode_options["prompt"] = all_tokens[prompt_reset_since:]
- result: DecodingResult = decode_with_fallback(mel_segment)
- tokens = torch.tensor(result.tokens)
-
- # 跳过静音部分
- if no_speech_threshold is not None:
- # no voice activity check
- should_skip = result.no_speech_prob > no_speech_threshold
- if (
- logprob_threshold is not None
- and result.avg_logprob > logprob_threshold
- ):
- # don't skip if the logprob is high enough, despite the no_speech_prob
- should_skip = False
-
- if should_skip:
- seek += segment_size # fast-forward to the next segment boundary
- continue
-
- previous_seek = seek
- current_segments = []
-
- timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin) # timestamp begin是<|0.00|>的token;bos比文字token大,eos的值比bos还大,所以是ge
- timestamp_tokens[-1] = False
- single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True] # 如果最后是[False,True]:本段里一个句子结束了
-
- consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
- # torch.where(condition) is identical to torch.nonzero(condition, as_tuple=True).
- # timestamp_token就是个一维向量吧 那为啥不直接nonzero
- # 如果有两个连续的时间戳 这个会是一个一维tensor 是这两个连续时间戳的结尾位置
- # 多个的话指向第二个 那如果有三个怎么办?
- # 否则是个0维tensor
-
- consecutive.add_(1) # 0维tensor+1还是0维 哪儿找的这些edge cases js是吧
- if len(consecutive) > 0:
- # if the output contains two consecutive timestamp tokens
- slices = consecutive.tolist()
- if single_timestamp_ending:
- slices.append(len(tokens)) # 把最后一段的结尾也加进去
- # print("many sentenses", consecutive)
- last_slice = 0
- for current_slice in slices:
- sliced_tokens = tokens[last_slice:current_slice]
- # 看起来语音开始帧、语音结束帧的位置会被编码到start_timestamp中
- start_timestamp_pos = (
- sliced_tokens[0].item() - tokenizer.timestamp_begin
- )
- end_timestamp_pos = (
- sliced_tokens[-1].item() - tokenizer.timestamp_begin
- )
- # 获取一个新的语音段
- current_segments.append(
- new_segment(
- start=time_offset + start_timestamp_pos * time_precision,
- end=time_offset + end_timestamp_pos * time_precision,
- tokens=sliced_tokens,
- result=result,
- )
- )
- last_slice = current_slice
-
- if single_timestamp_ending:
- # single timestamp at the end means no speech after the last timestamp.
- seek += segment_size
- else:
- # otherwise, ignore the unfinished segment and seek to the last timestamp
- # 如果语音尚未结束,那么seek变为上一个结束的语段的位置
- # 换句话说就是针对30s长的chunk的语音设计的
- last_timestamp_pos = (
- tokens[last_slice - 1].item() - tokenizer.timestamp_begin
- )
- seek += last_timestamp_pos * input_stride
- else:
- duration = segment_duration
- timestamps = tokens[timestamp_tokens.nonzero().flatten()]
- # print(timestamps)
- if (
- len(timestamps) > 0
- and timestamps[-1].item() != tokenizer.timestamp_begin
- ):
- # no consecutive timestamps but it has a timestamp; use the last one.
- # 取最后一个;假设要么有一个结束的time stamp;要么有一对儿?
- # 如果里面只有一个开始的timestamp 似乎后面的东西都会被丢掉?
- last_timestamp_pos = (
- timestamps[-1].item() - tokenizer.timestamp_begin
- )
- duration = last_timestamp_pos * time_precision
-
- current_segments.append(
- new_segment(
- start=time_offset,
- end=time_offset + duration,
- tokens=tokens,
- result=result,
- )
- )
- seek += segment_size
-
- # 每个token有自己的时间戳
- if word_timestamps:
- add_word_timestamps(
- segments=current_segments,
- model=model,
- tokenizer=tokenizer,
- mel=mel_segment,
- num_frames=segment_size,
- prepend_punctuations=prepend_punctuations,
- append_punctuations=append_punctuations,
- last_speech_timestamp=last_speech_timestamp,
- )
- word_end_timestamps = [
- w["end"] for s in current_segments for w in s["words"]
- ]
- if len(word_end_timestamps) > 0:
- last_speech_timestamp = word_end_timestamps[-1]
- if not single_timestamp_ending and len(word_end_timestamps) > 0:
- seek_shift = round(
- (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
- )
- if seek_shift > 0:
- seek = previous_seek + seek_shift
-
- if verbose:
- for segment in current_segments:
- start, end, text = segment["start"], segment["end"], segment["text"]
- line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
- print(make_safe(line))
-
- # if a segment is instantaneous or does not contain text, clear it
- for i, segment in enumerate(current_segments):
- if segment["start"] == segment["end"] or segment["text"].strip() == "":
- segment["text"] = ""
- segment["tokens"] = []
- segment["words"] = []
-
- # 更新结果
- all_segments.extend(
- [
- {"id": i, **segment}
- for i, segment in enumerate(
- current_segments, start=len(all_segments)
- )
- ]
- )
- all_tokens.extend(
- [token for segment in current_segments for token in segment["tokens"]]
- )
-
- if not condition_on_previous_text or result.temperature > 0.5:
- # do not feed the prompt tokens if a high temperature was used
- prompt_reset_since = len(all_tokens)
-
- # update progress bar
- pbar.update(min(content_frames, seek) - previous_seek)
-
- # print("太长了")
- # break
-
- return dict(
- text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
- segments=all_segments,
- language=language,
- )
-
-
-def cli():
- from . import available_models
-
- # fmt: off
- parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
- parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
- parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
- parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
- parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
- parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
- parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
-
- parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
- parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
-
- parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
- parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
- parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
- parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
- parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
-
- parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
- parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
- parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
- parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
-
- parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
- parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
- parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
- parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
- parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
- parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
- parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
- parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
- parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
- parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
- parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
- # fmt: on
-
- args = parser.parse_args().__dict__
- model_name: str = args.pop("model")
- model_dir: str = args.pop("model_dir")
- output_dir: str = args.pop("output_dir")
- output_format: str = args.pop("output_format")
- device: str = args.pop("device")
- os.makedirs(output_dir, exist_ok=True)
-
- if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
- if args["language"] is not None:
- warnings.warn(
- f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
- )
- args["language"] = "en"
-
- temperature = args.pop("temperature")
- if (increment := args.pop("temperature_increment_on_fallback")) is not None:
- temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
- else:
- temperature = [temperature]
-
- if (threads := args.pop("threads")) > 0:
- torch.set_num_threads(threads)
-
- from . import load_model
-
- model = load_model(model_name, device=device, download_root=model_dir)
-
- writer = get_writer(output_format, output_dir)
- word_options = ["highlight_words", "max_line_count", "max_line_width"]
- if not args["word_timestamps"]:
- for option in word_options:
- if args[option]:
- parser.error(f"--{option} requires --word_timestamps True")
- if args["max_line_count"] and not args["max_line_width"]:
- warnings.warn("--max_line_count has no effect without --max_line_width")
- writer_args = {arg: args.pop(arg) for arg in word_options}
- for audio_path in args.pop("audio"):
- result = transcribe(model, audio_path, temperature=temperature, **args)
- writer(result, audio_path, writer_args)
-
-
-if __name__ == "__main__":
- cli()
diff --git a/whisperlivekit/simul_whisper/whisper/transcribe.py b/whisperlivekit/simul_whisper/whisper/transcribe.py
index f0fe418..0a4cc36 100644
--- a/whisperlivekit/simul_whisper/whisper/transcribe.py
+++ b/whisperlivekit/simul_whisper/whisper/transcribe.py
@@ -1,7 +1,8 @@
import argparse
import os
+import traceback
import warnings
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import numpy as np
import torch
@@ -22,6 +23,7 @@ from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
from .utils import (
exact_div,
format_timestamp,
+ get_end,
get_writer,
make_safe,
optional_float,
@@ -44,9 +46,12 @@ def transcribe(
no_speech_threshold: Optional[float] = 0.6,
condition_on_previous_text: bool = True,
initial_prompt: Optional[str] = None,
+ carry_initial_prompt: bool = False,
word_timestamps: bool = False,
prepend_punctuations: str = "\"'“¿([{-",
append_punctuations: str = "\"'.。,,!!??::”)]}、",
+ clip_timestamps: Union[str, List[float]] = "0",
+ hallucination_silence_threshold: Optional[float] = None,
**decode_options,
):
"""
@@ -98,15 +103,27 @@ def transcribe(
"prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
to make it more likely to predict those word correctly.
+ carry_initial_prompt: bool
+ If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal
+ `decode()` call. If there is not enough context space at the start of the prompt, it is
+ left-sliced to make space.
+
decode_options: dict
Keyword arguments to construct `DecodingOptions` instances
+ clip_timestamps: Union[str, List[float]]
+ Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
+ The last end timestamp defaults to the end of the file.
+
+ hallucination_silence_threshold: Optional[float]
+ When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
+ when a possible hallucination is detected
+
Returns
-------
A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
the spoken language ("language"), which is detected when `decode_options["language"]` is None.
"""
- # print("transcribe")
dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
if model.device == torch.device("cpu"):
if torch.cuda.is_available():
@@ -119,8 +136,9 @@ def transcribe(
decode_options["fp16"] = False
# Pad 30-seconds of silence to the input audio, for slicing
- mel = log_mel_spectrogram(audio, padding=N_SAMPLES)
+ mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)
content_frames = mel.shape[-1] - N_FRAMES
+ content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
if decode_options.get("language", None) is None:
if not model.is_multilingual:
@@ -131,7 +149,6 @@ def transcribe(
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
)
mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
- # print(mel_segment.shape)
_, probs = model.detect_language(mel_segment)
decode_options["language"] = max(probs, key=probs.get)
if verbose is not None:
@@ -141,7 +158,25 @@ def transcribe(
language: str = decode_options["language"]
task: str = decode_options.get("task", "transcribe")
- tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
+ tokenizer = get_tokenizer(
+ model.is_multilingual,
+ num_languages=model.num_languages,
+ language=language,
+ task=task,
+ )
+
+ if isinstance(clip_timestamps, str):
+ clip_timestamps = [
+ float(ts) for ts in (clip_timestamps.split(",") if clip_timestamps else [])
+ ]
+ seek_points: List[int] = [round(ts * FRAMES_PER_SECOND) for ts in clip_timestamps]
+ if len(seek_points) == 0:
+ seek_points.append(0)
+ if len(seek_points) % 2 == 1:
+ seek_points.append(content_frames)
+ seek_clips: List[Tuple[int, int]] = list(zip(seek_points[::2], seek_points[1::2]))
+
+ punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
if word_timestamps and task == "translate":
warnings.warn("Word-level timestamps on translations may not be reliable.")
@@ -179,6 +214,8 @@ def transcribe(
if (
no_speech_threshold is not None
and decode_result.no_speech_prob > no_speech_threshold
+ and logprob_threshold is not None
+ and decode_result.avg_logprob < logprob_threshold
):
needs_fallback = False # silence
if not needs_fallback:
@@ -186,7 +223,8 @@ def transcribe(
return decode_result
- seek = 0
+ clip_idx = 0
+ seek = seek_clips[clip_idx][0]
input_stride = exact_div(
N_FRAMES, model.dims.n_audio_ctx
) # mel frames per output token: 2
@@ -197,9 +235,11 @@ def transcribe(
all_segments = []
prompt_reset_since = 0
+ remaining_prompt_length = model.dims.n_text_ctx // 2 - 1
if initial_prompt is not None:
initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
all_tokens.extend(initial_prompt_tokens)
+ remaining_prompt_length -= len(initial_prompt_tokens)
else:
initial_prompt_tokens = []
@@ -225,16 +265,33 @@ def transcribe(
total=content_frames, unit="frames", disable=verbose is not False
) as pbar:
last_speech_timestamp = 0.0
- while seek < content_frames:
+ # NOTE: This loop is obscurely flattened to make the diff readable.
+ # A later commit should turn this into a simpler nested loop.
+ # for seek_clip_start, seek_clip_end in seek_clips:
+ # while seek < seek_clip_end
+ while clip_idx < len(seek_clips):
+ seek_clip_start, seek_clip_end = seek_clips[clip_idx]
+ if seek < seek_clip_start:
+ seek = seek_clip_start
+ if seek >= seek_clip_end:
+ clip_idx += 1
+ if clip_idx < len(seek_clips):
+ seek = seek_clips[clip_idx][0]
+ continue
time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
- mel_segment = mel[:, seek : seek + N_FRAMES]
- segment_size = min(N_FRAMES, content_frames - seek)
+ window_end_time = float((seek + N_FRAMES) * HOP_LENGTH / SAMPLE_RATE)
+ segment_size = min(N_FRAMES, content_frames - seek, seek_clip_end - seek)
+ mel_segment = mel[:, seek : seek + segment_size]
segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
- # print("melshape", mel_segment.shape)
+ if carry_initial_prompt:
+ nignored = max(len(initial_prompt_tokens), prompt_reset_since)
+ remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:]
+ decode_options["prompt"] = initial_prompt_tokens + remaining_prompt
+ else:
+ decode_options["prompt"] = all_tokens[prompt_reset_since:]
- decode_options["prompt"] = all_tokens[prompt_reset_since:]
result: DecodingResult = decode_with_fallback(mel_segment)
tokens = torch.tensor(result.tokens)
@@ -255,6 +312,30 @@ def transcribe(
previous_seek = seek
current_segments = []
+ # anomalous words are very long/short/improbable
+ def word_anomaly_score(word: dict) -> float:
+ probability = word.get("probability", 0.0)
+ duration = word["end"] - word["start"]
+ score = 0.0
+ if probability < 0.15:
+ score += 1.0
+ if duration < 0.133:
+ score += (0.133 - duration) * 15
+ if duration > 2.0:
+ score += duration - 2.0
+ return score
+
+ def is_segment_anomaly(segment: Optional[dict]) -> bool:
+ if segment is None or not segment["words"]:
+ return False
+ words = [w for w in segment["words"] if w["word"] not in punctuation]
+ words = words[:8]
+ score = sum(word_anomaly_score(w) for w in words)
+ return score >= 3 or score + 0.01 >= len(words)
+
+ def next_words_segment(segments: List[dict]) -> Optional[dict]:
+ return next((s for s in segments if s["words"]), None)
+
timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
@@ -317,9 +398,7 @@ def transcribe(
)
seek += segment_size
- # print("word_timestamps, ", word_timestamps)
if word_timestamps:
- # print("=========run timestamps here=========")
add_word_timestamps(
segments=current_segments,
model=model,
@@ -330,17 +409,71 @@ def transcribe(
append_punctuations=append_punctuations,
last_speech_timestamp=last_speech_timestamp,
)
- word_end_timestamps = [
- w["end"] for s in current_segments for w in s["words"]
- ]
- if len(word_end_timestamps) > 0:
- last_speech_timestamp = word_end_timestamps[-1]
- if not single_timestamp_ending and len(word_end_timestamps) > 0:
- seek_shift = round(
- (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
- )
- if seek_shift > 0:
- seek = previous_seek + seek_shift
+
+ if not single_timestamp_ending:
+ last_word_end = get_end(current_segments)
+ if last_word_end is not None and last_word_end > time_offset:
+ seek = round(last_word_end * FRAMES_PER_SECOND)
+
+ # skip silence before possible hallucinations
+ if hallucination_silence_threshold is not None:
+ threshold = hallucination_silence_threshold
+ if not single_timestamp_ending:
+ last_word_end = get_end(current_segments)
+ if last_word_end is not None and last_word_end > time_offset:
+ remaining_duration = window_end_time - last_word_end
+ if remaining_duration > threshold:
+ seek = round(last_word_end * FRAMES_PER_SECOND)
+ else:
+ seek = previous_seek + segment_size
+
+ # if first segment might be a hallucination, skip leading silence
+ first_segment = next_words_segment(current_segments)
+ if first_segment is not None and is_segment_anomaly(first_segment):
+ gap = first_segment["start"] - time_offset
+ if gap > threshold:
+ seek = previous_seek + round(gap * FRAMES_PER_SECOND)
+ continue
+
+ # skip silence before any possible hallucination that is surrounded
+ # by silence or more hallucinations
+ hal_last_end = last_speech_timestamp
+ for si in range(len(current_segments)):
+ segment = current_segments[si]
+ if not segment["words"]:
+ continue
+ if is_segment_anomaly(segment):
+ next_segment = next_words_segment(
+ current_segments[si + 1 :]
+ )
+ if next_segment is not None:
+ hal_next_start = next_segment["words"][0]["start"]
+ else:
+ hal_next_start = time_offset + segment_duration
+ silence_before = (
+ segment["start"] - hal_last_end > threshold
+ or segment["start"] < threshold
+ or segment["start"] - time_offset < 2.0
+ )
+ silence_after = (
+ hal_next_start - segment["end"] > threshold
+ or is_segment_anomaly(next_segment)
+ or window_end_time - segment["end"] < 2.0
+ )
+ if silence_before and silence_after:
+ seek = round(
+ max(time_offset + 1, segment["start"])
+ * FRAMES_PER_SECOND
+ )
+ if content_duration - segment["end"] < threshold:
+ seek = content_frames
+ current_segments[si:] = []
+ break
+ hal_last_end = segment["end"]
+
+ last_word_end = get_end(current_segments)
+ if last_word_end is not None:
+ last_speech_timestamp = last_word_end
if verbose:
for segment in current_segments:
@@ -384,10 +517,17 @@ def transcribe(
def cli():
from . import available_models
+ def valid_model_name(name):
+ if name in available_models() or os.path.exists(name):
+ return name
+ raise ValueError(
+ f"model should be one of {available_models()} or path to a model checkpoint"
+ )
+
# fmt: off
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
- parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
+ parser.add_argument("--model", default="turbo", type=valid_model_name, help="name of the Whisper model to use")
parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
@@ -405,6 +545,8 @@ def cli():
parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
+ parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_text")
+
parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
@@ -418,7 +560,10 @@ def cli():
parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
+ parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment")
parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
+ parser.add_argument("--clip_timestamps", type=str, default="0", help="comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the file")
+ parser.add_argument("--hallucination_silence_threshold", type=optional_float, help="(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected")
# fmt: on
args = parser.parse_args().__dict__
@@ -450,17 +595,28 @@ def cli():
model = load_model(model_name, device=device, download_root=model_dir)
writer = get_writer(output_format, output_dir)
- word_options = ["highlight_words", "max_line_count", "max_line_width"]
+ word_options = [
+ "highlight_words",
+ "max_line_count",
+ "max_line_width",
+ "max_words_per_line",
+ ]
if not args["word_timestamps"]:
for option in word_options:
if args[option]:
parser.error(f"--{option} requires --word_timestamps True")
if args["max_line_count"] and not args["max_line_width"]:
warnings.warn("--max_line_count has no effect without --max_line_width")
+ if args["max_words_per_line"] and args["max_line_width"]:
+ warnings.warn("--max_words_per_line has no effect with --max_line_width")
writer_args = {arg: args.pop(arg) for arg in word_options}
for audio_path in args.pop("audio"):
- result = transcribe(model, audio_path, temperature=temperature, **args)
- writer(result, audio_path, writer_args)
+ try:
+ result = transcribe(model, audio_path, temperature=temperature, **args)
+ writer(result, audio_path, **writer_args)
+ except Exception as e:
+ traceback.print_exc()
+ print(f"Skipping {audio_path} due to {type(e).__name__}: {str(e)}")
if __name__ == "__main__":
diff --git a/whisperlivekit/simul_whisper/whisper/triton_ops.py b/whisperlivekit/simul_whisper/whisper/triton_ops.py
index edd4564..13d417b 100644
--- a/whisperlivekit/simul_whisper/whisper/triton_ops.py
+++ b/whisperlivekit/simul_whisper/whisper/triton_ops.py
@@ -60,7 +60,7 @@ def median_kernel(filter_width: int):
tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask) # noqa: F821
kernel = triton.JITFunction(kernel.fn)
- kernel.src = kernel.src.replace(
+ new_kernel = kernel.src.replace(
" LOAD_ALL_ROWS_HERE",
"\n".join(
[
@@ -69,7 +69,8 @@ def median_kernel(filter_width: int):
]
),
)
- kernel.src = kernel.src.replace(
+
+ new_kernel = new_kernel.replace(
" BUBBLESORT_HERE",
"\n\n".join(
[
@@ -90,7 +91,14 @@ def median_kernel(filter_width: int):
]
),
)
- kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
+
+ new_kernel = new_kernel.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
+
+ if hasattr(kernel, "_unsafe_update_src") is True:
+ kernel._unsafe_update_src(new_kernel)
+ kernel.hash = None
+ else:
+ kernel.src = new_kernel
return kernel
diff --git a/whisperlivekit/simul_whisper/whisper/utils.py b/whisperlivekit/simul_whisper/whisper/utils.py
index 22260d0..13792f7 100644
--- a/whisperlivekit/simul_whisper/whisper/utils.py
+++ b/whisperlivekit/simul_whisper/whisper/utils.py
@@ -3,7 +3,7 @@ import os
import re
import sys
import zlib
-from typing import Callable, Optional, TextIO
+from typing import Callable, List, Optional, TextIO
system_encoding = sys.getdefaultencoding()
@@ -68,13 +68,29 @@ def format_timestamp(
)
+def get_start(segments: List[dict]) -> Optional[float]:
+ return next(
+ (w["start"] for s in segments for w in s["words"]),
+ segments[0]["start"] if segments else None,
+ )
+
+
+def get_end(segments: List[dict]) -> Optional[float]:
+ return next(
+ (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
+ segments[-1]["end"] if segments else None,
+ )
+
+
class ResultWriter:
extension: str
def __init__(self, output_dir: str):
self.output_dir = output_dir
- def __call__(self, result: dict, audio_path: str, options: dict):
+ def __call__(
+ self, result: dict, audio_path: str, options: Optional[dict] = None, **kwargs
+ ):
audio_basename = os.path.basename(audio_path)
audio_basename = os.path.splitext(audio_basename)[0]
output_path = os.path.join(
@@ -82,16 +98,20 @@ class ResultWriter:
)
with open(output_path, "w", encoding="utf-8") as f:
- self.write_result(result, file=f, options=options)
+ self.write_result(result, file=f, options=options, **kwargs)
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
raise NotImplementedError
class WriteTXT(ResultWriter):
extension: str = "txt"
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
for segment in result["segments"]:
print(segment["text"].strip(), file=file, flush=True)
@@ -100,48 +120,76 @@ class SubtitlesWriter(ResultWriter):
always_include_hours: bool
decimal_marker: str
- def iterate_result(self, result: dict, options: dict):
- raw_max_line_width: Optional[int] = options["max_line_width"]
- max_line_count: Optional[int] = options["max_line_count"]
- highlight_words: bool = options["highlight_words"]
- max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
- preserve_segments = max_line_count is None or raw_max_line_width is None
+ def iterate_result(
+ self,
+ result: dict,
+ options: Optional[dict] = None,
+ *,
+ max_line_width: Optional[int] = None,
+ max_line_count: Optional[int] = None,
+ highlight_words: bool = False,
+ max_words_per_line: Optional[int] = None,
+ ):
+ options = options or {}
+ max_line_width = max_line_width or options.get("max_line_width")
+ max_line_count = max_line_count or options.get("max_line_count")
+ highlight_words = highlight_words or options.get("highlight_words", False)
+ max_words_per_line = max_words_per_line or options.get("max_words_per_line")
+ preserve_segments = max_line_count is None or max_line_width is None
+ max_line_width = max_line_width or 1000
+ max_words_per_line = max_words_per_line or 1000
def iterate_subtitles():
line_len = 0
line_count = 1
# the next subtitle to yield (a list of word timings with whitespace)
- subtitle: list[dict] = []
- last = result["segments"][0]["words"][0]["start"]
+ subtitle: List[dict] = []
+ last: float = get_start(result["segments"]) or 0.0
for segment in result["segments"]:
- for i, original_timing in enumerate(segment["words"]):
- timing = original_timing.copy()
- long_pause = not preserve_segments and timing["start"] - last > 3.0
- has_room = line_len + len(timing["word"]) <= max_line_width
- seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
- if line_len > 0 and has_room and not long_pause and not seg_break:
- # line continuation
- line_len += len(timing["word"])
- else:
- # new line
- timing["word"] = timing["word"].strip()
+ chunk_index = 0
+ words_count = max_words_per_line
+ while chunk_index < len(segment["words"]):
+ remaining_words = len(segment["words"]) - chunk_index
+ if max_words_per_line > len(segment["words"]) - chunk_index:
+ words_count = remaining_words
+ for i, original_timing in enumerate(
+ segment["words"][chunk_index : chunk_index + words_count]
+ ):
+ timing = original_timing.copy()
+ long_pause = (
+ not preserve_segments and timing["start"] - last > 3.0
+ )
+ has_room = line_len + len(timing["word"]) <= max_line_width
+ seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
if (
- len(subtitle) > 0
- and max_line_count is not None
- and (long_pause or line_count >= max_line_count)
- or seg_break
+ line_len > 0
+ and has_room
+ and not long_pause
+ and not seg_break
):
- # subtitle break
- yield subtitle
- subtitle = []
- line_count = 1
- elif line_len > 0:
- # line break
- line_count += 1
- timing["word"] = "\n" + timing["word"]
- line_len = len(timing["word"].strip())
- subtitle.append(timing)
- last = timing["start"]
+ # line continuation
+ line_len += len(timing["word"])
+ else:
+ # new line
+ timing["word"] = timing["word"].strip()
+ if (
+ len(subtitle) > 0
+ and max_line_count is not None
+ and (long_pause or line_count >= max_line_count)
+ or seg_break
+ ):
+ # subtitle break
+ yield subtitle
+ subtitle = []
+ line_count = 1
+ elif line_len > 0:
+ # line break
+ line_count += 1
+ timing["word"] = "\n" + timing["word"]
+ line_len = len(timing["word"].strip())
+ subtitle.append(timing)
+ last = timing["start"]
+ chunk_index += max_words_per_line
if len(subtitle) > 0:
yield subtitle
@@ -161,9 +209,11 @@ class SubtitlesWriter(ResultWriter):
yield start, end, "".join(
[
- re.sub(r"^(\s*)(.*)$", r"\1\2", word)
- if j == i
- else word
+ (
+ re.sub(r"^(\s*)(.*)$", r"\1\2", word)
+ if j == i
+ else word
+ )
for j, word in enumerate(all_words)
]
)
@@ -190,9 +240,11 @@ class WriteVTT(SubtitlesWriter):
always_include_hours: bool = False
decimal_marker: str = "."
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
print("WEBVTT\n", file=file)
- for start, end, text in self.iterate_result(result, options):
+ for start, end, text in self.iterate_result(result, options, **kwargs):
print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
@@ -201,9 +253,11 @@ class WriteSRT(SubtitlesWriter):
always_include_hours: bool = True
decimal_marker: str = ","
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
for i, (start, end, text) in enumerate(
- self.iterate_result(result, options), start=1
+ self.iterate_result(result, options, **kwargs), start=1
):
print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
@@ -220,7 +274,9 @@ class WriteTSV(ResultWriter):
extension: str = "tsv"
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
print("start", "end", "text", sep="\t", file=file)
for segment in result["segments"]:
print(round(1000 * segment["start"]), file=file, end="\t")
@@ -231,7 +287,9 @@ class WriteTSV(ResultWriter):
class WriteJSON(ResultWriter):
extension: str = "json"
- def write_result(self, result: dict, file: TextIO, options: dict):
+ def write_result(
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
json.dump(result, file)
@@ -249,9 +307,11 @@ def get_writer(
if output_format == "all":
all_writers = [writer(output_dir) for writer in writers.values()]
- def write_all(result: dict, file: TextIO, options: dict):
+ def write_all(
+ result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+ ):
for writer in all_writers:
- writer(result, file, options)
+ writer(result, file, options, **kwargs)
return write_all
diff --git a/whisperlivekit/simul_whisper/whisper/version.py b/whisperlivekit/simul_whisper/whisper/version.py
index c43bf6f..67426aa 100644
--- a/whisperlivekit/simul_whisper/whisper/version.py
+++ b/whisperlivekit/simul_whisper/whisper/version.py
@@ -1 +1 @@
-__version__ = "20230918"
+__version__ = "20250625"