diff --git a/README.md b/README.md
index 4b2a633..b60a8e5 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,8 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac
```bash
pip install whisperlivekit
```
+> You can also clone the repo and `pip install -e .` for the latest version.
+
> **FFmpeg is required** and must be installed before using WhisperLiveKit
>
@@ -148,7 +150,8 @@ The rest I don't recommend. But below are your options.
|-----------|-------------|---------|
| `--model` | Whisper model size. | `small` |
| `--language` | Source language code or `auto` | `auto` |
-| `--task` | `transcribe` or `translate` | `transcribe` |
+| `--task` | Set to `translate` to translate to English | `transcribe` |
+| `--target-language` | Target language for translation. [NOT FUNCTIONAL YET] | `""` |
| `--backend` | Processing backend | `simulstreaming` |
| `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` |
| `--no-vac` | Disable Voice Activity Controller | `False` |
diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py
index 8ce714b..3a6b3c1 100644
--- a/whisperlivekit/core.py
+++ b/whisperlivekit/core.py
@@ -33,6 +33,7 @@ class TranscriptionEngine:
"model_dir": None,
"lan": "auto",
"task": "transcribe",
+ "target_language": "",
"backend": "faster-whisper",
"vac": True,
"vac_chunk_size": 0.04,
@@ -133,6 +134,12 @@ class TranscriptionEngine:
else:
raise ValueError(f"Unknown diarization backend: {self.args.diarization_backend}")
+        if self.args.target_language:
+            # The defaults above store the source language as "lan", not "language" -- TODO confirm args has no "language".
+            if self.args.lan == 'auto':
+                raise ValueError('Translation cannot be set with language auto')
+            from whisperlivekit.translation.translation import load_model  # imported early; not used yet
+
TranscriptionEngine._initialized = True
diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py
index 023f951..66a9f41 100644
--- a/whisperlivekit/parse_args.py
+++ b/whisperlivekit/parse_args.py
@@ -112,6 +112,15 @@ def parse_args():
choices=["transcribe", "translate"],
help="Transcribe or translate.",
)
+
+ parser.add_argument(
+ "--target-language",
+ type=str,
+ default="",
+ dest="target_language",
+ help="Target language for translation. Not functional yet.",
+ )
+
parser.add_argument(
"--backend",
type=str,
diff --git a/whisperlivekit/translation/__init__.py b/whisperlivekit/translation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/whisperlivekit/translation/mapping_languages.py b/whisperlivekit/translation/mapping_languages.py
new file mode 100644
index 0000000..da6881d
--- /dev/null
+++ b/whisperlivekit/translation/mapping_languages.py
@@ -0,0 +1,182 @@
+"""
+Language table linking display names, NLLB codes and Crowdin codes; adapted from https://store.crowdin.com/custom-mt
+"""
+
+LANGUAGES = [
+ {"name": "Afrikaans", "nllb": "afr_Latn", "crowdin": "af"},
+ {"name": "Akan", "nllb": "aka_Latn", "crowdin": "ak"},
+ {"name": "Amharic", "nllb": "amh_Ethi", "crowdin": "am"},
+ {"name": "Assamese", "nllb": "asm_Beng", "crowdin": "as"},
+ {"name": "Asturian", "nllb": "ast_Latn", "crowdin": "ast"},
+ {"name": "Bashkir", "nllb": "bak_Cyrl", "crowdin": "ba"},
+ {"name": "Bambara", "nllb": "bam_Latn", "crowdin": "bm"},
+ {"name": "Balinese", "nllb": "ban_Latn", "crowdin": "ban"},
+ {"name": "Belarusian", "nllb": "bel_Cyrl", "crowdin": "be"},
+ {"name": "Bengali", "nllb": "ben_Beng", "crowdin": "bn"},
+ {"name": "Bosnian", "nllb": "bos_Latn", "crowdin": "bs"},
+ {"name": "Bulgarian", "nllb": "bul_Cyrl", "crowdin": "bg"},
+ {"name": "Catalan", "nllb": "cat_Latn", "crowdin": "ca"},
+ {"name": "Cebuano", "nllb": "ceb_Latn", "crowdin": "ceb"},
+ {"name": "Czech", "nllb": "ces_Latn", "crowdin": "cs"},
+ {"name": "Welsh", "nllb": "cym_Latn", "crowdin": "cy"},
+ {"name": "Danish", "nllb": "dan_Latn", "crowdin": "da"},
+ {"name": "German", "nllb": "deu_Latn", "crowdin": "de"},
+ {"name": "Dzongkha", "nllb": "dzo_Tibt", "crowdin": "dz"},
+ {"name": "Greek", "nllb": "ell_Grek", "crowdin": "el"},
+ {"name": "English", "nllb": "eng_Latn", "crowdin": "en"},
+ {"name": "Esperanto", "nllb": "epo_Latn", "crowdin": "eo"},
+ {"name": "Estonian", "nllb": "est_Latn", "crowdin": "et"},
+ {"name": "Basque", "nllb": "eus_Latn", "crowdin": "eu"},
+ {"name": "Ewe", "nllb": "ewe_Latn", "crowdin": "ee"},
+ {"name": "Faroese", "nllb": "fao_Latn", "crowdin": "fo"},
+ {"name": "Fijian", "nllb": "fij_Latn", "crowdin": "fj"},
+ {"name": "Finnish", "nllb": "fin_Latn", "crowdin": "fi"},
+ {"name": "French", "nllb": "fra_Latn", "crowdin": "fr"},
+ {"name": "Friulian", "nllb": "fur_Latn", "crowdin": "fur-IT"},
+ {"name": "Scottish Gaelic", "nllb": "gla_Latn", "crowdin": "gd"},
+ {"name": "Irish", "nllb": "gle_Latn", "crowdin": "ga-IE"},
+ {"name": "Galician", "nllb": "glg_Latn", "crowdin": "gl"},
+ {"name": "Guarani", "nllb": "grn_Latn", "crowdin": "gn"},
+ {"name": "Gujarati", "nllb": "guj_Gujr", "crowdin": "gu-IN"},
+ {"name": "Haitian Creole", "nllb": "hat_Latn", "crowdin": "ht"},
+ {"name": "Hausa", "nllb": "hau_Latn", "crowdin": "ha"},
+ {"name": "Hebrew", "nllb": "heb_Hebr", "crowdin": "he"},
+ {"name": "Hindi", "nllb": "hin_Deva", "crowdin": "hi"},
+ {"name": "Croatian", "nllb": "hrv_Latn", "crowdin": "hr"},
+ {"name": "Hungarian", "nllb": "hun_Latn", "crowdin": "hu"},
+ {"name": "Armenian", "nllb": "hye_Armn", "crowdin": "hy-AM"},
+ {"name": "Igbo", "nllb": "ibo_Latn", "crowdin": "ig"},
+ {"name": "Indonesian", "nllb": "ind_Latn", "crowdin": "id"},
+ {"name": "Icelandic", "nllb": "isl_Latn", "crowdin": "is"},
+ {"name": "Italian", "nllb": "ita_Latn", "crowdin": "it"},
+ {"name": "Javanese", "nllb": "jav_Latn", "crowdin": "jv"},
+ {"name": "Japanese", "nllb": "jpn_Jpan", "crowdin": "ja"},
+ {"name": "Kabyle", "nllb": "kab_Latn", "crowdin": "kab"},
+ {"name": "Kannada", "nllb": "kan_Knda", "crowdin": "kn"},
+ {"name": "Georgian", "nllb": "kat_Geor", "crowdin": "ka"},
+ {"name": "Kazakh", "nllb": "kaz_Cyrl", "crowdin": "kk"},
+ {"name": "Khmer", "nllb": "khm_Khmr", "crowdin": "km"},
+ {"name": "Kinyarwanda", "nllb": "kin_Latn", "crowdin": "rw"},
+ {"name": "Kyrgyz", "nllb": "kir_Cyrl", "crowdin": "ky"},
+ {"name": "Korean", "nllb": "kor_Hang", "crowdin": "ko"},
+ {"name": "Lao", "nllb": "lao_Laoo", "crowdin": "lo"},
+ {"name": "Ligurian", "nllb": "lij_Latn", "crowdin": "lij"},
+ {"name": "Limburgish", "nllb": "lim_Latn", "crowdin": "li"},
+ {"name": "Lingala", "nllb": "lin_Latn", "crowdin": "ln"},
+ {"name": "Lithuanian", "nllb": "lit_Latn", "crowdin": "lt"},
+ {"name": "Luxembourgish", "nllb": "ltz_Latn", "crowdin": "lb"},
+ {"name": "Maithili", "nllb": "mai_Deva", "crowdin": "mai"},
+ {"name": "Malayalam", "nllb": "mal_Mlym", "crowdin": "ml-IN"},
+ {"name": "Marathi", "nllb": "mar_Deva", "crowdin": "mr"},
+ {"name": "Macedonian", "nllb": "mkd_Cyrl", "crowdin": "mk"},
+ {"name": "Maltese", "nllb": "mlt_Latn", "crowdin": "mt"},
+ {"name": "Mossi", "nllb": "mos_Latn", "crowdin": "mos"},
+ {"name": "Maori", "nllb": "mri_Latn", "crowdin": "mi"},
+ {"name": "Burmese", "nllb": "mya_Mymr", "crowdin": "my"},
+ {"name": "Dutch", "nllb": "nld_Latn", "crowdin": "nl"},
+ {"name": "Norwegian Nynorsk", "nllb": "nno_Latn", "crowdin": "nn-NO"},
+ {"name": "Nepali", "nllb": "npi_Deva", "crowdin": "ne-NP"},
+ {"name": "Northern Sotho", "nllb": "nso_Latn", "crowdin": "nso"},
+ {"name": "Occitan", "nllb": "oci_Latn", "crowdin": "oc"},
+ {"name": "Odia", "nllb": "ory_Orya", "crowdin": "or"},
+ {"name": "Papiamento", "nllb": "pap_Latn", "crowdin": "pap"},
+ {"name": "Polish", "nllb": "pol_Latn", "crowdin": "pl"},
+ {"name": "Portuguese", "nllb": "por_Latn", "crowdin": "pt-PT"},
+ {"name": "Dari", "nllb": "prs_Arab", "crowdin": "fa-AF"},
+ {"name": "Romanian", "nllb": "ron_Latn", "crowdin": "ro"},
+ {"name": "Rundi", "nllb": "run_Latn", "crowdin": "rn"},
+ {"name": "Russian", "nllb": "rus_Cyrl", "crowdin": "ru"},
+ {"name": "Sango", "nllb": "sag_Latn", "crowdin": "sg"},
+ {"name": "Sanskrit", "nllb": "san_Deva", "crowdin": "sa"},
+ {"name": "Santali", "nllb": "sat_Olck", "crowdin": "sat"},
+ {"name": "Sinhala", "nllb": "sin_Sinh", "crowdin": "si-LK"},
+ {"name": "Slovak", "nllb": "slk_Latn", "crowdin": "sk"},
+ {"name": "Slovenian", "nllb": "slv_Latn", "crowdin": "sl"},
+ {"name": "Shona", "nllb": "sna_Latn", "crowdin": "sn"},
+ {"name": "Sindhi", "nllb": "snd_Arab", "crowdin": "sd"},
+ {"name": "Somali", "nllb": "som_Latn", "crowdin": "so"},
+ {"name": "Southern Sotho", "nllb": "sot_Latn", "crowdin": "st"},
+ {"name": "Spanish", "nllb": "spa_Latn", "crowdin": "es-ES"},
+ {"name": "Sardinian", "nllb": "srd_Latn", "crowdin": "sc"},
+ {"name": "Swati", "nllb": "ssw_Latn", "crowdin": "ss"},
+ {"name": "Sundanese", "nllb": "sun_Latn", "crowdin": "su"},
+ {"name": "Swedish", "nllb": "swe_Latn", "crowdin": "sv-SE"},
+ {"name": "Swahili", "nllb": "swh_Latn", "crowdin": "sw"},
+ {"name": "Tamil", "nllb": "tam_Taml", "crowdin": "ta"},
+ {"name": "Tatar", "nllb": "tat_Cyrl", "crowdin": "tt-RU"},
+ {"name": "Telugu", "nllb": "tel_Telu", "crowdin": "te"},
+ {"name": "Tajik", "nllb": "tgk_Cyrl", "crowdin": "tg"},
+ {"name": "Tagalog", "nllb": "tgl_Latn", "crowdin": "tl"},
+ {"name": "Thai", "nllb": "tha_Thai", "crowdin": "th"},
+ {"name": "Tigrinya", "nllb": "tir_Ethi", "crowdin": "ti"},
+ {"name": "Tswana", "nllb": "tsn_Latn", "crowdin": "tn"},
+ {"name": "Tsonga", "nllb": "tso_Latn", "crowdin": "ts"},
+ {"name": "Turkmen", "nllb": "tuk_Latn", "crowdin": "tk"},
+ {"name": "Turkish", "nllb": "tur_Latn", "crowdin": "tr"},
+ {"name": "Uyghur", "nllb": "uig_Arab", "crowdin": "ug"},
+ {"name": "Ukrainian", "nllb": "ukr_Cyrl", "crowdin": "uk"},
+ {"name": "Venetian", "nllb": "vec_Latn", "crowdin": "vec"},
+ {"name": "Vietnamese", "nllb": "vie_Latn", "crowdin": "vi"},
+ {"name": "Wolof", "nllb": "wol_Latn", "crowdin": "wo"},
+ {"name": "Xhosa", "nllb": "xho_Latn", "crowdin": "xh"},
+ {"name": "Yoruba", "nllb": "yor_Latn", "crowdin": "yo"},
+ {"name": "Zulu", "nllb": "zul_Latn", "crowdin": "zu"},
+]
+
+NAME_TO_NLLB = {lang["name"]: lang["nllb"] for lang in LANGUAGES}
+NAME_TO_CROWDIN = {lang["name"]: lang["crowdin"] for lang in LANGUAGES}
+CROWDIN_TO_NLLB = {lang["crowdin"]: lang["nllb"] for lang in LANGUAGES}
+NLLB_TO_CROWDIN = {lang["nllb"]: lang["crowdin"] for lang in LANGUAGES}
+CROWDIN_TO_NAME = {lang["crowdin"]: lang["name"] for lang in LANGUAGES}
+NLLB_TO_NAME = {lang["nllb"]: lang["name"] for lang in LANGUAGES}
+
+
+def get_nllb_code(crowdin_code):
+    """Return the NLLB code for a Crowdin code; unmapped codes come back unchanged."""
+    return CROWDIN_TO_NLLB.get(crowdin_code, crowdin_code)
+
+
+def get_crowdin_code(nllb_code):
+    """Return the Crowdin code for an NLLB code, or None when it is not in the table."""
+    return NLLB_TO_CROWDIN.get(nllb_code)
+
+
+def get_language_name_by_crowdin(crowdin_code):
+    """Return the language name for a Crowdin code, or None when it is not in the table."""
+    return CROWDIN_TO_NAME.get(crowdin_code)
+
+
+def get_language_name_by_nllb(nllb_code):
+    """Return the language name for an NLLB code, or None when it is not in the table."""
+    return NLLB_TO_NAME.get(nllb_code)
+
+
+def get_language_info(identifier, identifier_type="auto"):
+    """Return the first LANGUAGES entry matching identifier, or None.
+
+    identifier_type is "name" (case-insensitive), "nllb", "crowdin", or "auto"
+    (a match on any of the three); other values match nothing.
+    """
+    matchers = {
+        "name": lambda lang: lang["name"].lower() == identifier.lower(),
+        "nllb": lambda lang: lang["nllb"] == identifier,
+        "crowdin": lambda lang: lang["crowdin"] == identifier,
+    }
+    # Select every matcher for "auto", otherwise only the one named by identifier_type.
+    checks = [fn for key, fn in matchers.items() if identifier_type in ("auto", key)]
+    return next((lang for lang in LANGUAGES if any(fn(lang) for fn in checks)), None)
+
+
+def list_all_languages():
+    """Return every language name, in table order."""
+    return [lang["name"] for lang in LANGUAGES]
+
+
+def list_all_nllb_codes():
+    """Return every NLLB code, in table order."""
+    return [lang["nllb"] for lang in LANGUAGES]
+
+
+def list_all_crowdin_codes():
+    """Return every Crowdin code, in table order."""
+    return [lang["crowdin"] for lang in LANGUAGES]
\ No newline at end of file
diff --git a/whisperlivekit/translation/translation.py b/whisperlivekit/translation/translation.py
new file mode 100644
index 0000000..91c76fb
--- /dev/null
+++ b/whisperlivekit/translation/translation.py
@@ -0,0 +1,72 @@
+"""Translate text with an NLLB-200 model run through CTranslate2."""
+from dataclasses import dataclass
+
+import ctranslate2
+import huggingface_hub
+import transformers
+
+# Hugging Face repository holding the CTranslate2 conversion of the model.
+MODEL_REPO = "entai2965/nllb-200-distilled-600M-ctranslate2"
+# Download directory, resolved relative to the current working directory.
+MODEL_DIR = "nllb-200-distilled-600M-ctranslate2"
+
+# Not read anywhere in this module: load_model() takes its own src_lang argument.
+src_lang = "eng_Latn"
+
+
+@dataclass
+class TranslationModel:
+    """Pair of a CTranslate2 translator and the tokenizer loaded alongside it."""
+
+    translator: ctranslate2.Translator
+    tokenizer: transformers.AutoTokenizer
+
+
+def load_model(src_lang, device="cpu"):
+    """Download the model files if needed and load translator and tokenizer.
+
+    Args:
+        src_lang: NLLB code of the source language (e.g. "eng_Latn"), given to the tokenizer.
+        device: Device name passed to ctranslate2.Translator; defaults to "cpu".
+
+    Returns:
+        TranslationModel: the loaded translator and tokenizer.
+    """
+    # snapshot_download returns the folder it populated; reuse that path instead of
+    # repeating the relative directory name for each loader.
+    model_path = huggingface_hub.snapshot_download(MODEL_REPO, local_dir=MODEL_DIR)
+    translator = ctranslate2.Translator(model_path, device=device)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_path, src_lang=src_lang, clean_up_tokenization_spaces=True
+    )
+    return TranslationModel(translator=translator, tokenizer=tokenizer)
+
+
+def translate(input, translation_model, tgt_lang):
+    """Translate text into the target language and return the decoded string.
+
+    Args:
+        input: Text to translate. A falsy value returns "" without running the model.
+            (The name shadows the builtin but is kept so keyword callers keep working.)
+        translation_model: TranslationModel as returned by load_model().
+        tgt_lang: NLLB code of the target language, e.g. "fra_Latn".
+
+    Returns:
+        str: The decoded text of the first hypothesis.
+    """
+    if not input:
+        return ""
+    tokenizer = translation_model.tokenizer
+    source = tokenizer.convert_ids_to_tokens(tokenizer.encode(input))
+    # The target language code is supplied as the decoding prefix ...
+    results = translation_model.translator.translate_batch([source], target_prefix=[[tgt_lang]])
+    # ... so the first token of the hypothesis is skipped before decoding.
+    target = results[0].hypotheses[0][1:]
+    return tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
+
+
+if __name__ == '__main__':
+    # Manual smoke test: translate an English greeting to French.
+    translation_model = load_model("eng_Latn")
+    result = translate('Hello world', translation_model=translation_model, tgt_lang="fra_Latn")
+    print(result)
diff --git a/whisperlivekit/web/src/settings.svg b/whisperlivekit/web/src/settings.svg
new file mode 100644
index 0000000..7f14a28
--- /dev/null
+++ b/whisperlivekit/web/src/settings.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/whisperlivekit/web/src/translate.svg b/whisperlivekit/web/src/translate.svg
new file mode 100644
index 0000000..22e0fde
--- /dev/null
+++ b/whisperlivekit/web/src/translate.svg
@@ -0,0 +1 @@
+
\ No newline at end of file