diff --git a/README.md b/README.md index 4b2a633..b60a8e5 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,8 @@ Real-time speech transcription directly to your browser, with a ready-to-use bac ```bash pip install whisperlivekit ``` +> You can also clone the repo and `pip install -e .` for the latest version. + > **FFmpeg is required** and must be installed before using WhisperLiveKit > @@ -148,7 +150,8 @@ The rest I don't recommend. But below are your options. |-----------|-------------|---------| | `--model` | Whisper model size. | `small` | | `--language` | Source language code or `auto` | `auto` | -| `--task` | `transcribe` or `translate` | `transcribe` | +| `--task` | Set to `translate` to translate to English | `transcribe` | +| `--target-language` | Target language for translation [NOT FUNCTIONAL YET] | `""` | | `--backend` | Processing backend | `simulstreaming` | | `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` | | `--no-vac` | Disable Voice Activity Controller | `False` | diff --git a/whisperlivekit/core.py b/whisperlivekit/core.py index 8ce714b..3a6b3c1 100644 --- a/whisperlivekit/core.py +++ b/whisperlivekit/core.py @@ -33,6 +33,7 @@ class TranscriptionEngine: "model_dir": None, "lan": "auto", "task": "transcribe", + "target_language": "", "backend": "faster-whisper", "vac": True, "vac_chunk_size": 0.04, @@ -133,6 +134,12 @@ class TranscriptionEngine: else: raise ValueError(f"Unknown diarization backend: {self.args.diarization_backend}") + if self.args.target_language: + if self.args.lan == 'auto': + raise ValueError('Translation cannot be set with language auto') + else: + from whisperlivekit.translation.translation import load_model + TranscriptionEngine._initialized = True diff --git a/whisperlivekit/parse_args.py b/whisperlivekit/parse_args.py index 023f951..66a9f41 100644 --- a/whisperlivekit/parse_args.py +++ b/whisperlivekit/parse_args.py @@ -112,6 +112,15 @@ def parse_args(): choices=["transcribe", "translate"], help="Transcribe or translate.", ) + + 
parser.add_argument( + "--target-language", + type=str, + default="", + dest="target_language", + help="Target language for translation. Not functional yet.", + ) + parser.add_argument( "--backend", type=str, diff --git a/whisperlivekit/translation/__init__.py b/whisperlivekit/translation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/whisperlivekit/translation/mapping_languages.py b/whisperlivekit/translation/mapping_languages.py new file mode 100644 index 0000000..da6881d --- /dev/null +++ b/whisperlivekit/translation/mapping_languages.py @@ -0,0 +1,182 @@ +""" +adapted from https://store.crowdin.com/custom-mt +""" + +LANGUAGES = [ + {"name": "Afrikaans", "nllb": "afr_Latn", "crowdin": "af"}, + {"name": "Akan", "nllb": "aka_Latn", "crowdin": "ak"}, + {"name": "Amharic", "nllb": "amh_Ethi", "crowdin": "am"}, + {"name": "Assamese", "nllb": "asm_Beng", "crowdin": "as"}, + {"name": "Asturian", "nllb": "ast_Latn", "crowdin": "ast"}, + {"name": "Bashkir", "nllb": "bak_Cyrl", "crowdin": "ba"}, + {"name": "Bambara", "nllb": "bam_Latn", "crowdin": "bm"}, + {"name": "Balinese", "nllb": "ban_Latn", "crowdin": "ban"}, + {"name": "Belarusian", "nllb": "bel_Cyrl", "crowdin": "be"}, + {"name": "Bengali", "nllb": "ben_Beng", "crowdin": "bn"}, + {"name": "Bosnian", "nllb": "bos_Latn", "crowdin": "bs"}, + {"name": "Bulgarian", "nllb": "bul_Cyrl", "crowdin": "bg"}, + {"name": "Catalan", "nllb": "cat_Latn", "crowdin": "ca"}, + {"name": "Cebuano", "nllb": "ceb_Latn", "crowdin": "ceb"}, + {"name": "Czech", "nllb": "ces_Latn", "crowdin": "cs"}, + {"name": "Welsh", "nllb": "cym_Latn", "crowdin": "cy"}, + {"name": "Danish", "nllb": "dan_Latn", "crowdin": "da"}, + {"name": "German", "nllb": "deu_Latn", "crowdin": "de"}, + {"name": "Dzongkha", "nllb": "dzo_Tibt", "crowdin": "dz"}, + {"name": "Greek", "nllb": "ell_Grek", "crowdin": "el"}, + {"name": "English", "nllb": "eng_Latn", "crowdin": "en"}, + {"name": "Esperanto", "nllb": "epo_Latn", "crowdin": "eo"}, + {"name": 
"Estonian", "nllb": "est_Latn", "crowdin": "et"}, + {"name": "Basque", "nllb": "eus_Latn", "crowdin": "eu"}, + {"name": "Ewe", "nllb": "ewe_Latn", "crowdin": "ee"}, + {"name": "Faroese", "nllb": "fao_Latn", "crowdin": "fo"}, + {"name": "Fijian", "nllb": "fij_Latn", "crowdin": "fj"}, + {"name": "Finnish", "nllb": "fin_Latn", "crowdin": "fi"}, + {"name": "French", "nllb": "fra_Latn", "crowdin": "fr"}, + {"name": "Friulian", "nllb": "fur_Latn", "crowdin": "fur-IT"}, + {"name": "Scottish Gaelic", "nllb": "gla_Latn", "crowdin": "gd"}, + {"name": "Irish", "nllb": "gle_Latn", "crowdin": "ga-IE"}, + {"name": "Galician", "nllb": "glg_Latn", "crowdin": "gl"}, + {"name": "Guarani", "nllb": "grn_Latn", "crowdin": "gn"}, + {"name": "Gujarati", "nllb": "guj_Gujr", "crowdin": "gu-IN"}, + {"name": "Haitian Creole", "nllb": "hat_Latn", "crowdin": "ht"}, + {"name": "Hausa", "nllb": "hau_Latn", "crowdin": "ha"}, + {"name": "Hebrew", "nllb": "heb_Hebr", "crowdin": "he"}, + {"name": "Hindi", "nllb": "hin_Deva", "crowdin": "hi"}, + {"name": "Croatian", "nllb": "hrv_Latn", "crowdin": "hr"}, + {"name": "Hungarian", "nllb": "hun_Latn", "crowdin": "hu"}, + {"name": "Armenian", "nllb": "hye_Armn", "crowdin": "hy-AM"}, + {"name": "Igbo", "nllb": "ibo_Latn", "crowdin": "ig"}, + {"name": "Indonesian", "nllb": "ind_Latn", "crowdin": "id"}, + {"name": "Icelandic", "nllb": "isl_Latn", "crowdin": "is"}, + {"name": "Italian", "nllb": "ita_Latn", "crowdin": "it"}, + {"name": "Javanese", "nllb": "jav_Latn", "crowdin": "jv"}, + {"name": "Japanese", "nllb": "jpn_Jpan", "crowdin": "ja"}, + {"name": "Kabyle", "nllb": "kab_Latn", "crowdin": "kab"}, + {"name": "Kannada", "nllb": "kan_Knda", "crowdin": "kn"}, + {"name": "Georgian", "nllb": "kat_Geor", "crowdin": "ka"}, + {"name": "Kazakh", "nllb": "kaz_Cyrl", "crowdin": "kk"}, + {"name": "Khmer", "nllb": "khm_Khmr", "crowdin": "km"}, + {"name": "Kinyarwanda", "nllb": "kin_Latn", "crowdin": "rw"}, + {"name": "Kyrgyz", "nllb": "kir_Cyrl", "crowdin": "ky"}, + 
{"name": "Korean", "nllb": "kor_Hang", "crowdin": "ko"}, + {"name": "Lao", "nllb": "lao_Laoo", "crowdin": "lo"}, + {"name": "Ligurian", "nllb": "lij_Latn", "crowdin": "lij"}, + {"name": "Limburgish", "nllb": "lim_Latn", "crowdin": "li"}, + {"name": "Lingala", "nllb": "lin_Latn", "crowdin": "ln"}, + {"name": "Lithuanian", "nllb": "lit_Latn", "crowdin": "lt"}, + {"name": "Luxembourgish", "nllb": "ltz_Latn", "crowdin": "lb"}, + {"name": "Maithili", "nllb": "mai_Deva", "crowdin": "mai"}, + {"name": "Malayalam", "nllb": "mal_Mlym", "crowdin": "ml-IN"}, + {"name": "Marathi", "nllb": "mar_Deva", "crowdin": "mr"}, + {"name": "Macedonian", "nllb": "mkd_Cyrl", "crowdin": "mk"}, + {"name": "Maltese", "nllb": "mlt_Latn", "crowdin": "mt"}, + {"name": "Mossi", "nllb": "mos_Latn", "crowdin": "mos"}, + {"name": "Maori", "nllb": "mri_Latn", "crowdin": "mi"}, + {"name": "Burmese", "nllb": "mya_Mymr", "crowdin": "my"}, + {"name": "Dutch", "nllb": "nld_Latn", "crowdin": "nl"}, + {"name": "Norwegian Nynorsk", "nllb": "nno_Latn", "crowdin": "nn-NO"}, + {"name": "Nepali", "nllb": "npi_Deva", "crowdin": "ne-NP"}, + {"name": "Northern Sotho", "nllb": "nso_Latn", "crowdin": "nso"}, + {"name": "Occitan", "nllb": "oci_Latn", "crowdin": "oc"}, + {"name": "Odia", "nllb": "ory_Orya", "crowdin": "or"}, + {"name": "Papiamento", "nllb": "pap_Latn", "crowdin": "pap"}, + {"name": "Polish", "nllb": "pol_Latn", "crowdin": "pl"}, + {"name": "Portuguese", "nllb": "por_Latn", "crowdin": "pt-PT"}, + {"name": "Dari", "nllb": "prs_Arab", "crowdin": "fa-AF"}, + {"name": "Romanian", "nllb": "ron_Latn", "crowdin": "ro"}, + {"name": "Rundi", "nllb": "run_Latn", "crowdin": "rn"}, + {"name": "Russian", "nllb": "rus_Cyrl", "crowdin": "ru"}, + {"name": "Sango", "nllb": "sag_Latn", "crowdin": "sg"}, + {"name": "Sanskrit", "nllb": "san_Deva", "crowdin": "sa"}, + {"name": "Santali", "nllb": "sat_Olck", "crowdin": "sat"}, + {"name": "Sinhala", "nllb": "sin_Sinh", "crowdin": "si-LK"}, + {"name": "Slovak", "nllb": 
"slk_Latn", "crowdin": "sk"}, + {"name": "Slovenian", "nllb": "slv_Latn", "crowdin": "sl"}, + {"name": "Shona", "nllb": "sna_Latn", "crowdin": "sn"}, + {"name": "Sindhi", "nllb": "snd_Arab", "crowdin": "sd"}, + {"name": "Somali", "nllb": "som_Latn", "crowdin": "so"}, + {"name": "Southern Sotho", "nllb": "sot_Latn", "crowdin": "st"}, + {"name": "Spanish", "nllb": "spa_Latn", "crowdin": "es-ES"}, + {"name": "Sardinian", "nllb": "srd_Latn", "crowdin": "sc"}, + {"name": "Swati", "nllb": "ssw_Latn", "crowdin": "ss"}, + {"name": "Sundanese", "nllb": "sun_Latn", "crowdin": "su"}, + {"name": "Swedish", "nllb": "swe_Latn", "crowdin": "sv-SE"}, + {"name": "Swahili", "nllb": "swh_Latn", "crowdin": "sw"}, + {"name": "Tamil", "nllb": "tam_Taml", "crowdin": "ta"}, + {"name": "Tatar", "nllb": "tat_Cyrl", "crowdin": "tt-RU"}, + {"name": "Telugu", "nllb": "tel_Telu", "crowdin": "te"}, + {"name": "Tajik", "nllb": "tgk_Cyrl", "crowdin": "tg"}, + {"name": "Tagalog", "nllb": "tgl_Latn", "crowdin": "tl"}, + {"name": "Thai", "nllb": "tha_Thai", "crowdin": "th"}, + {"name": "Tigrinya", "nllb": "tir_Ethi", "crowdin": "ti"}, + {"name": "Tswana", "nllb": "tsn_Latn", "crowdin": "tn"}, + {"name": "Tsonga", "nllb": "tso_Latn", "crowdin": "ts"}, + {"name": "Turkmen", "nllb": "tuk_Latn", "crowdin": "tk"}, + {"name": "Turkish", "nllb": "tur_Latn", "crowdin": "tr"}, + {"name": "Uyghur", "nllb": "uig_Arab", "crowdin": "ug"}, + {"name": "Ukrainian", "nllb": "ukr_Cyrl", "crowdin": "uk"}, + {"name": "Venetian", "nllb": "vec_Latn", "crowdin": "vec"}, + {"name": "Vietnamese", "nllb": "vie_Latn", "crowdin": "vi"}, + {"name": "Wolof", "nllb": "wol_Latn", "crowdin": "wo"}, + {"name": "Xhosa", "nllb": "xho_Latn", "crowdin": "xh"}, + {"name": "Yoruba", "nllb": "yor_Latn", "crowdin": "yo"}, + {"name": "Zulu", "nllb": "zul_Latn", "crowdin": "zu"}, +] + +NAME_TO_NLLB = {lang["name"]: lang["nllb"] for lang in LANGUAGES} +NAME_TO_CROWDIN = {lang["name"]: lang["crowdin"] for lang in LANGUAGES} +CROWDIN_TO_NLLB = 
{lang["crowdin"]: lang["nllb"] for lang in LANGUAGES} +NLLB_TO_CROWDIN = {lang["nllb"]: lang["crowdin"] for lang in LANGUAGES} +CROWDIN_TO_NAME = {lang["crowdin"]: lang["name"] for lang in LANGUAGES} +NLLB_TO_NAME = {lang["nllb"]: lang["name"] for lang in LANGUAGES} + + +def get_nllb_code(crowdin_code): + return CROWDIN_TO_NLLB.get(crowdin_code, crowdin_code) + + +def get_crowdin_code(nllb_code): + return NLLB_TO_CROWDIN.get(nllb_code) + + +def get_language_name_by_crowdin(crowdin_code): + return CROWDIN_TO_NAME.get(crowdin_code) + + +def get_language_name_by_nllb(nllb_code): + return NLLB_TO_NAME.get(nllb_code) + + +def get_language_info(identifier, identifier_type="auto"): + if identifier_type == "auto": + for lang in LANGUAGES: + if (lang["name"].lower() == identifier.lower() or + lang["nllb"] == identifier or + lang["crowdin"] == identifier): + return lang + elif identifier_type == "name": + for lang in LANGUAGES: + if lang["name"].lower() == identifier.lower(): + return lang + elif identifier_type == "nllb": + for lang in LANGUAGES: + if lang["nllb"] == identifier: + return lang + elif identifier_type == "crowdin": + for lang in LANGUAGES: + if lang["crowdin"] == identifier: + return lang + + return None + + +def list_all_languages(): + return [lang["name"] for lang in LANGUAGES] + + +def list_all_nllb_codes(): + return [lang["nllb"] for lang in LANGUAGES] + + +def list_all_crowdin_codes(): + return [lang["crowdin"] for lang in LANGUAGES] \ No newline at end of file diff --git a/whisperlivekit/translation/translation.py b/whisperlivekit/translation/translation.py new file mode 100644 index 0000000..91c76fb --- /dev/null +++ b/whisperlivekit/translation/translation.py @@ -0,0 +1,37 @@ +import ctranslate2 +import transformers +from dataclasses import dataclass +import huggingface_hub + +src_lang = "eng_Latn" + +@dataclass +class TranslationModel(): + translator: ctranslate2.Translator + tokenizer: transformers.AutoTokenizer + +def load_model(src_lang): + 
huggingface_hub.snapshot_download('entai2965/nllb-200-distilled-600M-ctranslate2',local_dir='nllb-200-distilled-600M-ctranslate2') + translator = ctranslate2.Translator("nllb-200-distilled-600M-ctranslate2",device="cpu") + tokenizer = transformers.AutoTokenizer.from_pretrained("nllb-200-distilled-600M-ctranslate2", src_lang=src_lang, clean_up_tokenization_spaces=True) + return TranslationModel( + translator=translator, + tokenizer=tokenizer + ) + +def translate(input, translation_model, tgt_lang): + if not input: + return "" + source = translation_model.tokenizer.convert_ids_to_tokens(translation_model.tokenizer.encode(input)) + target_prefix = [tgt_lang] + results = translation_model.translator.translate_batch([source], target_prefix=[target_prefix]) + target = results[0].hypotheses[0][1:] + return translation_model.tokenizer.decode(translation_model.tokenizer.convert_tokens_to_ids(target)) + + +if __name__ == '__main__': + tgt_lang = "fra_Latn" + src_lang = "eng_Latn" + translation_model = load_model(src_lang) + result = translate('Hello world', translation_model=translation_model, tgt_lang=tgt_lang) + print(result) \ No newline at end of file diff --git a/whisperlivekit/web/src/settings.svg b/whisperlivekit/web/src/settings.svg new file mode 100644 index 0000000..7f14a28 --- /dev/null +++ b/whisperlivekit/web/src/settings.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/whisperlivekit/web/src/translate.svg b/whisperlivekit/web/src/translate.svg new file mode 100644 index 0000000..22e0fde --- /dev/null +++ b/whisperlivekit/web/src/translate.svg @@ -0,0 +1 @@ + \ No newline at end of file