diff --git a/README.md b/README.md index 8fc841e..58276b7 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@

-Real-time speech transcription directly to your browser, with a ready-to-use backend+server and a simple frontend. ✨ +Real-time transcription directly to your browser, with a ready-to-use backend+server and a simple frontend. #### Powered by Leading Research: @@ -142,7 +142,7 @@ async def websocket_endpoint(websocket: WebSocket): | `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/available_models.md) | `small` | | `--model-path` | .pt file/directory containing whisper model. Overrides `--model`. Recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/models_compatible_formats.md) | `None` | | `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` | -| `--target-language` | If sets, activates translation using NLLB. Ex: `fr`. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` | +| `--target-language` | If set, translates the transcription into the given language using NLLB. Ex: `fr`. [204 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/docs/supported_languages.md). If you want to translate to English, use `--task translate` instead, since Whisper can do it directly. | `None` | | `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` | | `--diarization` | Enable speaker identification | `False` | | `--backend` | Processing backend. 
You can switch to `faster-whisper` if `simulstreaming` does not work correctly | `simulstreaming` | diff --git a/docs/supported_languages.md b/docs/supported_languages.md new file mode 100644 index 0000000..a04443e --- /dev/null +++ b/docs/supported_languages.md @@ -0,0 +1,265 @@ +# Supported Languages + +WhisperLiveKit supports translation into **204 languages** from the FLORES-200 dataset through the NLLB (No Language Left Behind) translation system. + +## How to Specify Languages + +You can specify languages in **three different ways**: + +1. **Language Name** (case-insensitive): `"English"`, `"French"`, `"Spanish"` +2. **ISO Language Code** (exactly as listed in the table below; some carry a region suffix): `"en"`, `"fr"`, `"es-ES"` +3. **NLLB Code** (FLORES-200): `"eng_Latn"`, `"fra_Latn"`, `"spa_Latn"` + +## Usage Examples + +### Command Line +```bash +# Using language name +whisperlivekit-server --target-language "French" + +# Using ISO code +whisperlivekit-server --target-language fr + +# Using NLLB code +whisperlivekit-server --target-language fra_Latn +``` + +### Python API +```python +from whisperlivekit.translation import get_language_info + +# Get language information by name +lang_info = get_language_info("French") +print(lang_info) +# {'name': 'French', 'nllb': 'fra_Latn', 'language_code': 'fr'} + +# Get language information by ISO code +lang_info = get_language_info("fr") + +# Get language information by NLLB code +lang_info = get_language_info("fra_Latn") + +# All three return the same result +``` + +## Complete Language List + +The following table lists all 204 supported languages with their corresponding codes: + +| Language Name | ISO Code | NLLB Code | +|---------------|----------|-----------| +| Acehnese (Arabic script) | ace_Arab | ace_Arab | +| Acehnese (Latin script) | ace_Latn | ace_Latn | +| Mesopotamian Arabic | acm_Arab | acm_Arab | +| Ta'izzi-Adeni Arabic | acq_Arab | acq_Arab | +| Tunisian Arabic | aeb_Arab | aeb_Arab | +| Afrikaans | af | afr_Latn | +| South Levantine Arabic | ajp_Arab | ajp_Arab | +| 
Akan | ak | aka_Latn | +| Tosk Albanian | als | als_Latn | +| Amharic | am | amh_Ethi | +| North Levantine Arabic | apc_Arab | apc_Arab | +| Modern Standard Arabic | ar | arb_Arab | +| Modern Standard Arabic (Romanized) | arb_Latn | arb_Latn | +| Najdi Arabic | ars_Arab | ars_Arab | +| Moroccan Arabic | ary_Arab | ary_Arab | +| Egyptian Arabic | arz_Arab | arz_Arab | +| Assamese | as | asm_Beng | +| Asturian | ast | ast_Latn | +| Awadhi | awa | awa_Deva | +| Central Aymara | ay | ayr_Latn | +| South Azerbaijani | azb | azb_Arab | +| North Azerbaijani | az | azj_Latn | +| Bashkir | ba | bak_Cyrl | +| Bambara | bm | bam_Latn | +| Balinese | ban | ban_Latn | +| Belarusian | be | bel_Cyrl | +| Bemba | bem | bem_Latn | +| Bengali | bn | ben_Beng | +| Bhojpuri | bho | bho_Deva | +| Banjar (Arabic script) | bjn_Arab | bjn_Arab | +| Banjar (Latin script) | bjn_Latn | bjn_Latn | +| Standard Tibetan | bo | bod_Tibt | +| Bosnian | bs | bos_Latn | +| Buginese | bug | bug_Latn | +| Bulgarian | bg | bul_Cyrl | +| Catalan | ca | cat_Latn | +| Cebuano | ceb | ceb_Latn | +| Czech | cs | ces_Latn | +| Chokwe | cjk | cjk_Latn | +| Central Kurdish | ckb | ckb_Arab | +| Crimean Tatar | crh | crh_Latn | +| Welsh | cy | cym_Latn | +| Danish | da | dan_Latn | +| German | de | deu_Latn | +| Southwestern Dinka | dik | dik_Latn | +| Dyula | dyu | dyu_Latn | +| Dzongkha | dz | dzo_Tibt | +| Greek | el | ell_Grek | +| English | en | eng_Latn | +| Esperanto | eo | epo_Latn | +| Estonian | et | est_Latn | +| Basque | eu | eus_Latn | +| Ewe | ee | ewe_Latn | +| Faroese | fo | fao_Latn | +| Fijian | fj | fij_Latn | +| Finnish | fi | fin_Latn | +| Fon | fon | fon_Latn | +| French | fr | fra_Latn | +| Friulian | fur-IT | fur_Latn | +| Nigerian Fulfulde | fuv | fuv_Latn | +| West Central Oromo | om | gaz_Latn | +| Scottish Gaelic | gd | gla_Latn | +| Irish | ga-IE | gle_Latn | +| Galician | gl | glg_Latn | +| Guarani | gn | grn_Latn | +| Gujarati | gu-IN | guj_Gujr | +| Haitian Creole | ht | hat_Latn 
| +| Hausa | ha | hau_Latn | +| Hebrew | he | heb_Hebr | +| Hindi | hi | hin_Deva | +| Chhattisgarhi | hne | hne_Deva | +| Croatian | hr | hrv_Latn | +| Hungarian | hu | hun_Latn | +| Armenian | hy-AM | hye_Armn | +| Igbo | ig | ibo_Latn | +| Ilocano | ilo | ilo_Latn | +| Indonesian | id | ind_Latn | +| Icelandic | is | isl_Latn | +| Italian | it | ita_Latn | +| Javanese | jv | jav_Latn | +| Japanese | ja | jpn_Jpan | +| Kabyle | kab | kab_Latn | +| Jingpho | kac | kac_Latn | +| Kamba | kam | kam_Latn | +| Kannada | kn | kan_Knda | +| Kashmiri (Arabic script) | kas_Arab | kas_Arab | +| Kashmiri (Devanagari script) | kas_Deva | kas_Deva | +| Georgian | ka | kat_Geor | +| Kazakh | kk | kaz_Cyrl | +| Kabiyè | kbp | kbp_Latn | +| Kabuverdianu | kea | kea_Latn | +| Halh Mongolian | mn | khk_Cyrl | +| Khmer | km | khm_Khmr | +| Kikuyu | ki | kik_Latn | +| Kinyarwanda | rw | kin_Latn | +| Kyrgyz | ky | kir_Cyrl | +| Kimbundu | kmb | kmb_Latn | +| Northern Kurdish | kmr | kmr_Latn | +| Central Kanuri (Arabic script) | knc_Arab | knc_Arab | +| Central Kanuri (Latin script) | knc_Latn | knc_Latn | +| Kikongo | kg | kon_Latn | +| Korean | ko | kor_Hang | +| Lao | lo | lao_Laoo | +| Ligurian | lij | lij_Latn | +| Limburgish | li | lim_Latn | +| Lingala | ln | lin_Latn | +| Lithuanian | lt | lit_Latn | +| Lombard | lmo | lmo_Latn | +| Latgalian | ltg | ltg_Latn | +| Luxembourgish | lb | ltz_Latn | +| Luba-Kasai | lua | lua_Latn | +| Ganda | lg | lug_Latn | +| Luo | luo | luo_Latn | +| Mizo | lus | lus_Latn | +| Standard Latvian | lv | lvs_Latn | +| Magahi | mag | mag_Deva | +| Maithili | mai | mai_Deva | +| Malayalam | ml-IN | mal_Mlym | +| Marathi | mr | mar_Deva | +| Minangkabau (Arabic script) | min_Arab | min_Arab | +| Minangkabau (Latin script) | min_Latn | min_Latn | +| Macedonian | mk | mkd_Cyrl | +| Maltese | mt | mlt_Latn | +| Meitei (Bengali script) | mni | mni_Beng | +| Mossi | mos | mos_Latn | +| Maori | mi | mri_Latn | +| Burmese | my | mya_Mymr | +| Dutch | nl | 
nld_Latn | +| Norwegian Nynorsk | nn-NO | nno_Latn | +| Norwegian Bokmål | nb | nob_Latn | +| Nepali | ne-NP | npi_Deva | +| Northern Sotho | nso | nso_Latn | +| Nuer | nus | nus_Latn | +| Nyanja | ny | nya_Latn | +| Occitan | oc | oci_Latn | +| Odia | or | ory_Orya | +| Pangasinan | pag | pag_Latn | +| Eastern Panjabi | pa | pan_Guru | +| Papiamento | pap | pap_Latn | +| Southern Pashto | pbt | pbt_Arab | +| Western Persian | fa | pes_Arab | +| Plateau Malagasy | mg | plt_Latn | +| Polish | pl | pol_Latn | +| Portuguese | pt-PT | por_Latn | +| Dari | fa-AF | prs_Arab | +| Ayacucho Quechua | qu | quy_Latn | +| Romanian | ro | ron_Latn | +| Rundi | rn | run_Latn | +| Russian | ru | rus_Cyrl | +| Sango | sg | sag_Latn | +| Sanskrit | sa | san_Deva | +| Santali | sat | sat_Olck | +| Sicilian | scn | scn_Latn | +| Shan | shn | shn_Mymr | +| Sinhala | si-LK | sin_Sinh | +| Slovak | sk | slk_Latn | +| Slovenian | sl | slv_Latn | +| Samoan | sm | smo_Latn | +| Shona | sn | sna_Latn | +| Sindhi | sd | snd_Arab | +| Somali | so | som_Latn | +| Southern Sotho | st | sot_Latn | +| Spanish | es-ES | spa_Latn | +| Sardinian | sc | srd_Latn | +| Serbian | sr | srp_Cyrl | +| Swati | ss | ssw_Latn | +| Sundanese | su | sun_Latn | +| Swedish | sv-SE | swe_Latn | +| Swahili | sw | swh_Latn | +| Silesian | szl | szl_Latn | +| Tamil | ta | tam_Taml | +| Tamasheq (Latin script) | taq_Latn | taq_Latn | +| Tamasheq (Tifinagh script) | taq_Tfng | taq_Tfng | +| Tatar | tt-RU | tat_Cyrl | +| Telugu | te | tel_Telu | +| Tajik | tg | tgk_Cyrl | +| Tagalog | tl | tgl_Latn | +| Thai | th | tha_Thai | +| Tigrinya | ti | tir_Ethi | +| Tok Pisin | tpi | tpi_Latn | +| Tswana | tn | tsn_Latn | +| Tsonga | ts | tso_Latn | +| Turkmen | tk | tuk_Latn | +| Tumbuka | tum | tum_Latn | +| Turkish | tr | tur_Latn | +| Twi | tw | twi_Latn | +| Central Atlas Tamazight | tzm | tzm_Tfng | +| Uyghur | ug | uig_Arab | +| Ukrainian | uk | ukr_Cyrl | +| Umbundu | umb | umb_Latn | +| Urdu | ur | urd_Arab | +| 
Northern Uzbek | uz | uzn_Latn | +| Venetian | vec | vec_Latn | +| Vietnamese | vi | vie_Latn | +| Waray | war | war_Latn | +| Wolof | wo | wol_Latn | +| Xhosa | xh | xho_Latn | +| Eastern Yiddish | yi | ydd_Hebr | +| Yoruba | yo | yor_Latn | +| Yue Chinese | yue | yue_Hant | +| Chinese (Simplified) | zh-CN | zho_Hans | +| Chinese (Traditional) | zh-TW | zho_Hant | +| Standard Malay | ms | zsm_Latn | +| Zulu | zu | zul_Latn | + +## Special Features + +### Multiple Script Support +Several languages are available in multiple scripts (e.g., Arabic and Latin): +- **Acehnese**: Arabic (`ace_Arab`) and Latin (`ace_Latn`) +- **Banjar**: Arabic (`bjn_Arab`) and Latin (`bjn_Latn`) +- **Kashmiri**: Arabic (`kas_Arab`) and Devanagari (`kas_Deva`) +- **Minangkabau**: Arabic (`min_Arab`) and Latin (`min_Latn`) +- **Tamasheq**: Latin (`taq_Latn`) and Tifinagh (`taq_Tfng`) +- **Central Kanuri**: Arabic (`knc_Arab`) and Latin (`knc_Latn`) \ No newline at end of file diff --git a/whisperlivekit/translation/mapping_languages.py b/whisperlivekit/translation/mapping_languages.py index becd657..c7bff7e 100644 --- a/whisperlivekit/translation/mapping_languages.py +++ b/whisperlivekit/translation/mapping_languages.py @@ -1,146 +1,228 @@ -""" -adapted from https://store.crowdin.com/custom-mt -""" - LANGUAGES = [ - {"name": "Afrikaans", "nllb": "afr_Latn", "crowdin": "af"}, - {"name": "Akan", "nllb": "aka_Latn", "crowdin": "ak"}, - {"name": "Amharic", "nllb": "amh_Ethi", "crowdin": "am"}, - {"name": "Assamese", "nllb": "asm_Beng", "crowdin": "as"}, - {"name": "Asturian", "nllb": "ast_Latn", "crowdin": "ast"}, - {"name": "Bashkir", "nllb": "bak_Cyrl", "crowdin": "ba"}, - {"name": "Bambara", "nllb": "bam_Latn", "crowdin": "bm"}, - {"name": "Balinese", "nllb": "ban_Latn", "crowdin": "ban"}, - {"name": "Belarusian", "nllb": "bel_Cyrl", "crowdin": "be"}, - {"name": "Bengali", "nllb": "ben_Beng", "crowdin": "bn"}, - {"name": "Bosnian", "nllb": "bos_Latn", "crowdin": "bs"}, - {"name": 
"Bulgarian", "nllb": "bul_Cyrl", "crowdin": "bg"}, - {"name": "Catalan", "nllb": "cat_Latn", "crowdin": "ca"}, - {"name": "Cebuano", "nllb": "ceb_Latn", "crowdin": "ceb"}, - {"name": "Czech", "nllb": "ces_Latn", "crowdin": "cs"}, - {"name": "Welsh", "nllb": "cym_Latn", "crowdin": "cy"}, - {"name": "Danish", "nllb": "dan_Latn", "crowdin": "da"}, - {"name": "German", "nllb": "deu_Latn", "crowdin": "de"}, - {"name": "Dzongkha", "nllb": "dzo_Tibt", "crowdin": "dz"}, - {"name": "Greek", "nllb": "ell_Grek", "crowdin": "el"}, - {"name": "English", "nllb": "eng_Latn", "crowdin": "en"}, - {"name": "Esperanto", "nllb": "epo_Latn", "crowdin": "eo"}, - {"name": "Estonian", "nllb": "est_Latn", "crowdin": "et"}, - {"name": "Basque", "nllb": "eus_Latn", "crowdin": "eu"}, - {"name": "Ewe", "nllb": "ewe_Latn", "crowdin": "ee"}, - {"name": "Faroese", "nllb": "fao_Latn", "crowdin": "fo"}, - {"name": "Fijian", "nllb": "fij_Latn", "crowdin": "fj"}, - {"name": "Finnish", "nllb": "fin_Latn", "crowdin": "fi"}, - {"name": "French", "nllb": "fra_Latn", "crowdin": "fr"}, - {"name": "Friulian", "nllb": "fur_Latn", "crowdin": "fur-IT"}, - {"name": "Scottish Gaelic", "nllb": "gla_Latn", "crowdin": "gd"}, - {"name": "Irish", "nllb": "gle_Latn", "crowdin": "ga-IE"}, - {"name": "Galician", "nllb": "glg_Latn", "crowdin": "gl"}, - {"name": "Guarani", "nllb": "grn_Latn", "crowdin": "gn"}, - {"name": "Gujarati", "nllb": "guj_Gujr", "crowdin": "gu-IN"}, - {"name": "Haitian Creole", "nllb": "hat_Latn", "crowdin": "ht"}, - {"name": "Hausa", "nllb": "hau_Latn", "crowdin": "ha"}, - {"name": "Hebrew", "nllb": "heb_Hebr", "crowdin": "he"}, - {"name": "Hindi", "nllb": "hin_Deva", "crowdin": "hi"}, - {"name": "Croatian", "nllb": "hrv_Latn", "crowdin": "hr"}, - {"name": "Hungarian", "nllb": "hun_Latn", "crowdin": "hu"}, - {"name": "Armenian", "nllb": "hye_Armn", "crowdin": "hy-AM"}, - {"name": "Igbo", "nllb": "ibo_Latn", "crowdin": "ig"}, - {"name": "Indonesian", "nllb": "ind_Latn", "crowdin": "id"}, - {"name": 
"Icelandic", "nllb": "isl_Latn", "crowdin": "is"}, - {"name": "Italian", "nllb": "ita_Latn", "crowdin": "it"}, - {"name": "Javanese", "nllb": "jav_Latn", "crowdin": "jv"}, - {"name": "Japanese", "nllb": "jpn_Jpan", "crowdin": "ja"}, - {"name": "Kabyle", "nllb": "kab_Latn", "crowdin": "kab"}, - {"name": "Kannada", "nllb": "kan_Knda", "crowdin": "kn"}, - {"name": "Georgian", "nllb": "kat_Geor", "crowdin": "ka"}, - {"name": "Kazakh", "nllb": "kaz_Cyrl", "crowdin": "kk"}, - {"name": "Khmer", "nllb": "khm_Khmr", "crowdin": "km"}, - {"name": "Kinyarwanda", "nllb": "kin_Latn", "crowdin": "rw"}, - {"name": "Kyrgyz", "nllb": "kir_Cyrl", "crowdin": "ky"}, - {"name": "Korean", "nllb": "kor_Hang", "crowdin": "ko"}, - {"name": "Lao", "nllb": "lao_Laoo", "crowdin": "lo"}, - {"name": "Ligurian", "nllb": "lij_Latn", "crowdin": "lij"}, - {"name": "Limburgish", "nllb": "lim_Latn", "crowdin": "li"}, - {"name": "Lingala", "nllb": "lin_Latn", "crowdin": "ln"}, - {"name": "Lithuanian", "nllb": "lit_Latn", "crowdin": "lt"}, - {"name": "Luxembourgish", "nllb": "ltz_Latn", "crowdin": "lb"}, - {"name": "Maithili", "nllb": "mai_Deva", "crowdin": "mai"}, - {"name": "Malayalam", "nllb": "mal_Mlym", "crowdin": "ml-IN"}, - {"name": "Marathi", "nllb": "mar_Deva", "crowdin": "mr"}, - {"name": "Macedonian", "nllb": "mkd_Cyrl", "crowdin": "mk"}, - {"name": "Maltese", "nllb": "mlt_Latn", "crowdin": "mt"}, - {"name": "Mossi", "nllb": "mos_Latn", "crowdin": "mos"}, - {"name": "Maori", "nllb": "mri_Latn", "crowdin": "mi"}, - {"name": "Burmese", "nllb": "mya_Mymr", "crowdin": "my"}, - {"name": "Dutch", "nllb": "nld_Latn", "crowdin": "nl"}, - {"name": "Norwegian Nynorsk", "nllb": "nno_Latn", "crowdin": "nn-NO"}, - {"name": "Nepali", "nllb": "npi_Deva", "crowdin": "ne-NP"}, - {"name": "Northern Sotho", "nllb": "nso_Latn", "crowdin": "nso"}, - {"name": "Occitan", "nllb": "oci_Latn", "crowdin": "oc"}, - {"name": "Odia", "nllb": "ory_Orya", "crowdin": "or"}, - {"name": "Papiamento", "nllb": "pap_Latn", 
"crowdin": "pap"}, - {"name": "Polish", "nllb": "pol_Latn", "crowdin": "pl"}, - {"name": "Portuguese", "nllb": "por_Latn", "crowdin": "pt-PT"}, - {"name": "Dari", "nllb": "prs_Arab", "crowdin": "fa-AF"}, - {"name": "Romanian", "nllb": "ron_Latn", "crowdin": "ro"}, - {"name": "Rundi", "nllb": "run_Latn", "crowdin": "rn"}, - {"name": "Russian", "nllb": "rus_Cyrl", "crowdin": "ru"}, - {"name": "Sango", "nllb": "sag_Latn", "crowdin": "sg"}, - {"name": "Sanskrit", "nllb": "san_Deva", "crowdin": "sa"}, - {"name": "Santali", "nllb": "sat_Olck", "crowdin": "sat"}, - {"name": "Sinhala", "nllb": "sin_Sinh", "crowdin": "si-LK"}, - {"name": "Slovak", "nllb": "slk_Latn", "crowdin": "sk"}, - {"name": "Slovenian", "nllb": "slv_Latn", "crowdin": "sl"}, - {"name": "Shona", "nllb": "sna_Latn", "crowdin": "sn"}, - {"name": "Sindhi", "nllb": "snd_Arab", "crowdin": "sd"}, - {"name": "Somali", "nllb": "som_Latn", "crowdin": "so"}, - {"name": "Southern Sotho", "nllb": "sot_Latn", "crowdin": "st"}, - {"name": "Spanish", "nllb": "spa_Latn", "crowdin": "es-ES"}, - {"name": "Sardinian", "nllb": "srd_Latn", "crowdin": "sc"}, - {"name": "Swati", "nllb": "ssw_Latn", "crowdin": "ss"}, - {"name": "Sundanese", "nllb": "sun_Latn", "crowdin": "su"}, - {"name": "Swedish", "nllb": "swe_Latn", "crowdin": "sv-SE"}, - {"name": "Swahili", "nllb": "swh_Latn", "crowdin": "sw"}, - {"name": "Tamil", "nllb": "tam_Taml", "crowdin": "ta"}, - {"name": "Tatar", "nllb": "tat_Cyrl", "crowdin": "tt-RU"}, - {"name": "Telugu", "nllb": "tel_Telu", "crowdin": "te"}, - {"name": "Tajik", "nllb": "tgk_Cyrl", "crowdin": "tg"}, - {"name": "Tagalog", "nllb": "tgl_Latn", "crowdin": "tl"}, - {"name": "Thai", "nllb": "tha_Thai", "crowdin": "th"}, - {"name": "Tigrinya", "nllb": "tir_Ethi", "crowdin": "ti"}, - {"name": "Tswana", "nllb": "tsn_Latn", "crowdin": "tn"}, - {"name": "Tsonga", "nllb": "tso_Latn", "crowdin": "ts"}, - {"name": "Turkmen", "nllb": "tuk_Latn", "crowdin": "tk"}, - {"name": "Turkish", "nllb": "tur_Latn", 
"crowdin": "tr"}, - {"name": "Uyghur", "nllb": "uig_Arab", "crowdin": "ug"}, - {"name": "Ukrainian", "nllb": "ukr_Cyrl", "crowdin": "uk"}, - {"name": "Venetian", "nllb": "vec_Latn", "crowdin": "vec"}, - {"name": "Vietnamese", "nllb": "vie_Latn", "crowdin": "vi"}, - {"name": "Wolof", "nllb": "wol_Latn", "crowdin": "wo"}, - {"name": "Xhosa", "nllb": "xho_Latn", "crowdin": "xh"}, - {"name": "Yoruba", "nllb": "yor_Latn", "crowdin": "yo"}, - {"name": "Zulu", "nllb": "zul_Latn", "crowdin": "zu"}, + {"name": "Acehnese (Arabic script)", "nllb": "ace_Arab", "language_code": "ace_Arab"}, + {"name": "Acehnese (Latin script)", "nllb": "ace_Latn", "language_code": "ace_Latn"}, + {"name": "Mesopotamian Arabic", "nllb": "acm_Arab", "language_code": "acm_Arab"}, + {"name": "Ta'izzi-Adeni Arabic", "nllb": "acq_Arab", "language_code": "acq_Arab"}, + {"name": "Tunisian Arabic", "nllb": "aeb_Arab", "language_code": "aeb_Arab"}, + {"name": "Afrikaans", "nllb": "afr_Latn", "language_code": "af"}, + {"name": "South Levantine Arabic", "nllb": "ajp_Arab", "language_code": "ajp_Arab"}, + {"name": "Akan", "nllb": "aka_Latn", "language_code": "ak"}, + {"name": "Tosk Albanian", "nllb": "als_Latn", "language_code": "als"}, + {"name": "Amharic", "nllb": "amh_Ethi", "language_code": "am"}, + {"name": "North Levantine Arabic", "nllb": "apc_Arab", "language_code": "apc_Arab"}, + {"name": "Modern Standard Arabic", "nllb": "arb_Arab", "language_code": "ar"}, + {"name": "Modern Standard Arabic (Romanized)", "nllb": "arb_Latn", "language_code": "arb_Latn"}, + {"name": "Najdi Arabic", "nllb": "ars_Arab", "language_code": "ars_Arab"}, + {"name": "Moroccan Arabic", "nllb": "ary_Arab", "language_code": "ary_Arab"}, + {"name": "Egyptian Arabic", "nllb": "arz_Arab", "language_code": "arz_Arab"}, + {"name": "Assamese", "nllb": "asm_Beng", "language_code": "as"}, + {"name": "Asturian", "nllb": "ast_Latn", "language_code": "ast"}, + {"name": "Awadhi", "nllb": "awa_Deva", "language_code": "awa"}, + {"name": 
"Central Aymara", "nllb": "ayr_Latn", "language_code": "ay"}, + {"name": "South Azerbaijani", "nllb": "azb_Arab", "language_code": "azb"}, + {"name": "North Azerbaijani", "nllb": "azj_Latn", "language_code": "az"}, + {"name": "Bashkir", "nllb": "bak_Cyrl", "language_code": "ba"}, + {"name": "Bambara", "nllb": "bam_Latn", "language_code": "bm"}, + {"name": "Balinese", "nllb": "ban_Latn", "language_code": "ban"}, + {"name": "Belarusian", "nllb": "bel_Cyrl", "language_code": "be"}, + {"name": "Bemba", "nllb": "bem_Latn", "language_code": "bem"}, + {"name": "Bengali", "nllb": "ben_Beng", "language_code": "bn"}, + {"name": "Bhojpuri", "nllb": "bho_Deva", "language_code": "bho"}, + {"name": "Banjar (Arabic script)", "nllb": "bjn_Arab", "language_code": "bjn_Arab"}, + {"name": "Banjar (Latin script)", "nllb": "bjn_Latn", "language_code": "bjn_Latn"}, + {"name": "Standard Tibetan", "nllb": "bod_Tibt", "language_code": "bo"}, + {"name": "Bosnian", "nllb": "bos_Latn", "language_code": "bs"}, + {"name": "Buginese", "nllb": "bug_Latn", "language_code": "bug"}, + {"name": "Bulgarian", "nllb": "bul_Cyrl", "language_code": "bg"}, + {"name": "Catalan", "nllb": "cat_Latn", "language_code": "ca"}, + {"name": "Cebuano", "nllb": "ceb_Latn", "language_code": "ceb"}, + {"name": "Czech", "nllb": "ces_Latn", "language_code": "cs"}, + {"name": "Chokwe", "nllb": "cjk_Latn", "language_code": "cjk"}, + {"name": "Central Kurdish", "nllb": "ckb_Arab", "language_code": "ckb"}, + {"name": "Crimean Tatar", "nllb": "crh_Latn", "language_code": "crh"}, + {"name": "Welsh", "nllb": "cym_Latn", "language_code": "cy"}, + {"name": "Danish", "nllb": "dan_Latn", "language_code": "da"}, + {"name": "German", "nllb": "deu_Latn", "language_code": "de"}, + {"name": "Southwestern Dinka", "nllb": "dik_Latn", "language_code": "dik"}, + {"name": "Dyula", "nllb": "dyu_Latn", "language_code": "dyu"}, + {"name": "Dzongkha", "nllb": "dzo_Tibt", "language_code": "dz"}, + {"name": "Greek", "nllb": "ell_Grek", 
"language_code": "el"}, + {"name": "English", "nllb": "eng_Latn", "language_code": "en"}, + {"name": "Esperanto", "nllb": "epo_Latn", "language_code": "eo"}, + {"name": "Estonian", "nllb": "est_Latn", "language_code": "et"}, + {"name": "Basque", "nllb": "eus_Latn", "language_code": "eu"}, + {"name": "Ewe", "nllb": "ewe_Latn", "language_code": "ee"}, + {"name": "Faroese", "nllb": "fao_Latn", "language_code": "fo"}, + {"name": "Fijian", "nllb": "fij_Latn", "language_code": "fj"}, + {"name": "Finnish", "nllb": "fin_Latn", "language_code": "fi"}, + {"name": "Fon", "nllb": "fon_Latn", "language_code": "fon"}, + {"name": "French", "nllb": "fra_Latn", "language_code": "fr"}, + {"name": "Friulian", "nllb": "fur_Latn", "language_code": "fur-IT"}, + {"name": "Nigerian Fulfulde", "nllb": "fuv_Latn", "language_code": "fuv"}, + {"name": "West Central Oromo", "nllb": "gaz_Latn", "language_code": "om"}, + {"name": "Scottish Gaelic", "nllb": "gla_Latn", "language_code": "gd"}, + {"name": "Irish", "nllb": "gle_Latn", "language_code": "ga-IE"}, + {"name": "Galician", "nllb": "glg_Latn", "language_code": "gl"}, + {"name": "Guarani", "nllb": "grn_Latn", "language_code": "gn"}, + {"name": "Gujarati", "nllb": "guj_Gujr", "language_code": "gu-IN"}, + {"name": "Haitian Creole", "nllb": "hat_Latn", "language_code": "ht"}, + {"name": "Hausa", "nllb": "hau_Latn", "language_code": "ha"}, + {"name": "Hebrew", "nllb": "heb_Hebr", "language_code": "he"}, + {"name": "Hindi", "nllb": "hin_Deva", "language_code": "hi"}, + {"name": "Chhattisgarhi", "nllb": "hne_Deva", "language_code": "hne"}, + {"name": "Croatian", "nllb": "hrv_Latn", "language_code": "hr"}, + {"name": "Hungarian", "nllb": "hun_Latn", "language_code": "hu"}, + {"name": "Armenian", "nllb": "hye_Armn", "language_code": "hy-AM"}, + {"name": "Igbo", "nllb": "ibo_Latn", "language_code": "ig"}, + {"name": "Ilocano", "nllb": "ilo_Latn", "language_code": "ilo"}, + {"name": "Indonesian", "nllb": "ind_Latn", "language_code": "id"}, + {"name": 
"Icelandic", "nllb": "isl_Latn", "language_code": "is"}, + {"name": "Italian", "nllb": "ita_Latn", "language_code": "it"}, + {"name": "Javanese", "nllb": "jav_Latn", "language_code": "jv"}, + {"name": "Japanese", "nllb": "jpn_Jpan", "language_code": "ja"}, + {"name": "Kabyle", "nllb": "kab_Latn", "language_code": "kab"}, + {"name": "Jingpho", "nllb": "kac_Latn", "language_code": "kac"}, + {"name": "Kamba", "nllb": "kam_Latn", "language_code": "kam"}, + {"name": "Kannada", "nllb": "kan_Knda", "language_code": "kn"}, + {"name": "Kashmiri (Arabic script)", "nllb": "kas_Arab", "language_code": "kas_Arab"}, + {"name": "Kashmiri (Devanagari script)", "nllb": "kas_Deva", "language_code": "kas_Deva"}, + {"name": "Georgian", "nllb": "kat_Geor", "language_code": "ka"}, + {"name": "Kazakh", "nllb": "kaz_Cyrl", "language_code": "kk"}, + {"name": "Kabiyè", "nllb": "kbp_Latn", "language_code": "kbp"}, + {"name": "Kabuverdianu", "nllb": "kea_Latn", "language_code": "kea"}, + {"name": "Halh Mongolian", "nllb": "khk_Cyrl", "language_code": "mn"}, + {"name": "Khmer", "nllb": "khm_Khmr", "language_code": "km"}, + {"name": "Kikuyu", "nllb": "kik_Latn", "language_code": "ki"}, + {"name": "Kinyarwanda", "nllb": "kin_Latn", "language_code": "rw"}, + {"name": "Kyrgyz", "nllb": "kir_Cyrl", "language_code": "ky"}, + {"name": "Kimbundu", "nllb": "kmb_Latn", "language_code": "kmb"}, + {"name": "Northern Kurdish", "nllb": "kmr_Latn", "language_code": "kmr"}, + {"name": "Central Kanuri (Arabic script)", "nllb": "knc_Arab", "language_code": "knc_Arab"}, + {"name": "Central Kanuri (Latin script)", "nllb": "knc_Latn", "language_code": "knc_Latn"}, + {"name": "Kikongo", "nllb": "kon_Latn", "language_code": "kg"}, + {"name": "Korean", "nllb": "kor_Hang", "language_code": "ko"}, + {"name": "Lao", "nllb": "lao_Laoo", "language_code": "lo"}, + {"name": "Ligurian", "nllb": "lij_Latn", "language_code": "lij"}, + {"name": "Limburgish", "nllb": "lim_Latn", "language_code": "li"}, + {"name": "Lingala", 
"nllb": "lin_Latn", "language_code": "ln"}, + {"name": "Lithuanian", "nllb": "lit_Latn", "language_code": "lt"}, + {"name": "Lombard", "nllb": "lmo_Latn", "language_code": "lmo"}, + {"name": "Latgalian", "nllb": "ltg_Latn", "language_code": "ltg"}, + {"name": "Luxembourgish", "nllb": "ltz_Latn", "language_code": "lb"}, + {"name": "Luba-Kasai", "nllb": "lua_Latn", "language_code": "lua"}, + {"name": "Ganda", "nllb": "lug_Latn", "language_code": "lg"}, + {"name": "Luo", "nllb": "luo_Latn", "language_code": "luo"}, + {"name": "Mizo", "nllb": "lus_Latn", "language_code": "lus"}, + {"name": "Standard Latvian", "nllb": "lvs_Latn", "language_code": "lv"}, + {"name": "Magahi", "nllb": "mag_Deva", "language_code": "mag"}, + {"name": "Maithili", "nllb": "mai_Deva", "language_code": "mai"}, + {"name": "Malayalam", "nllb": "mal_Mlym", "language_code": "ml-IN"}, + {"name": "Marathi", "nllb": "mar_Deva", "language_code": "mr"}, + {"name": "Minangkabau (Arabic script)", "nllb": "min_Arab", "language_code": "min_Arab"}, + {"name": "Minangkabau (Latin script)", "nllb": "min_Latn", "language_code": "min_Latn"}, + {"name": "Macedonian", "nllb": "mkd_Cyrl", "language_code": "mk"}, + {"name": "Maltese", "nllb": "mlt_Latn", "language_code": "mt"}, + {"name": "Meitei (Bengali script)", "nllb": "mni_Beng", "language_code": "mni"}, + {"name": "Mossi", "nllb": "mos_Latn", "language_code": "mos"}, + {"name": "Maori", "nllb": "mri_Latn", "language_code": "mi"}, + {"name": "Burmese", "nllb": "mya_Mymr", "language_code": "my"}, + {"name": "Dutch", "nllb": "nld_Latn", "language_code": "nl"}, + {"name": "Norwegian Nynorsk", "nllb": "nno_Latn", "language_code": "nn-NO"}, + {"name": "Norwegian Bokmål", "nllb": "nob_Latn", "language_code": "nb"}, + {"name": "Nepali", "nllb": "npi_Deva", "language_code": "ne-NP"}, + {"name": "Northern Sotho", "nllb": "nso_Latn", "language_code": "nso"}, + {"name": "Nuer", "nllb": "nus_Latn", "language_code": "nus"}, + {"name": "Nyanja", "nllb": "nya_Latn", 
"language_code": "ny"}, + {"name": "Occitan", "nllb": "oci_Latn", "language_code": "oc"}, + {"name": "Odia", "nllb": "ory_Orya", "language_code": "or"}, + {"name": "Pangasinan", "nllb": "pag_Latn", "language_code": "pag"}, + {"name": "Eastern Panjabi", "nllb": "pan_Guru", "language_code": "pa"}, + {"name": "Papiamento", "nllb": "pap_Latn", "language_code": "pap"}, + {"name": "Southern Pashto", "nllb": "pbt_Arab", "language_code": "pbt"}, + {"name": "Western Persian", "nllb": "pes_Arab", "language_code": "fa"}, + {"name": "Plateau Malagasy", "nllb": "plt_Latn", "language_code": "mg"}, + {"name": "Polish", "nllb": "pol_Latn", "language_code": "pl"}, + {"name": "Portuguese", "nllb": "por_Latn", "language_code": "pt-PT"}, + {"name": "Dari", "nllb": "prs_Arab", "language_code": "fa-AF"}, + {"name": "Ayacucho Quechua", "nllb": "quy_Latn", "language_code": "qu"}, + {"name": "Romanian", "nllb": "ron_Latn", "language_code": "ro"}, + {"name": "Rundi", "nllb": "run_Latn", "language_code": "rn"}, + {"name": "Russian", "nllb": "rus_Cyrl", "language_code": "ru"}, + {"name": "Sango", "nllb": "sag_Latn", "language_code": "sg"}, + {"name": "Sanskrit", "nllb": "san_Deva", "language_code": "sa"}, + {"name": "Santali", "nllb": "sat_Olck", "language_code": "sat"}, + {"name": "Sicilian", "nllb": "scn_Latn", "language_code": "scn"}, + {"name": "Shan", "nllb": "shn_Mymr", "language_code": "shn"}, + {"name": "Sinhala", "nllb": "sin_Sinh", "language_code": "si-LK"}, + {"name": "Slovak", "nllb": "slk_Latn", "language_code": "sk"}, + {"name": "Slovenian", "nllb": "slv_Latn", "language_code": "sl"}, + {"name": "Samoan", "nllb": "smo_Latn", "language_code": "sm"}, + {"name": "Shona", "nllb": "sna_Latn", "language_code": "sn"}, + {"name": "Sindhi", "nllb": "snd_Arab", "language_code": "sd"}, + {"name": "Somali", "nllb": "som_Latn", "language_code": "so"}, + {"name": "Southern Sotho", "nllb": "sot_Latn", "language_code": "st"}, + {"name": "Spanish", "nllb": "spa_Latn", "language_code": "es-ES"}, 
+ {"name": "Sardinian", "nllb": "srd_Latn", "language_code": "sc"}, + {"name": "Serbian", "nllb": "srp_Cyrl", "language_code": "sr"}, + {"name": "Swati", "nllb": "ssw_Latn", "language_code": "ss"}, + {"name": "Sundanese", "nllb": "sun_Latn", "language_code": "su"}, + {"name": "Swedish", "nllb": "swe_Latn", "language_code": "sv-SE"}, + {"name": "Swahili", "nllb": "swh_Latn", "language_code": "sw"}, + {"name": "Silesian", "nllb": "szl_Latn", "language_code": "szl"}, + {"name": "Tamil", "nllb": "tam_Taml", "language_code": "ta"}, + {"name": "Tamasheq (Latin script)", "nllb": "taq_Latn", "language_code": "taq_Latn"}, + {"name": "Tamasheq (Tifinagh script)", "nllb": "taq_Tfng", "language_code": "taq_Tfng"}, + {"name": "Tatar", "nllb": "tat_Cyrl", "language_code": "tt-RU"}, + {"name": "Telugu", "nllb": "tel_Telu", "language_code": "te"}, + {"name": "Tajik", "nllb": "tgk_Cyrl", "language_code": "tg"}, + {"name": "Tagalog", "nllb": "tgl_Latn", "language_code": "tl"}, + {"name": "Thai", "nllb": "tha_Thai", "language_code": "th"}, + {"name": "Tigrinya", "nllb": "tir_Ethi", "language_code": "ti"}, + {"name": "Tok Pisin", "nllb": "tpi_Latn", "language_code": "tpi"}, + {"name": "Tswana", "nllb": "tsn_Latn", "language_code": "tn"}, + {"name": "Tsonga", "nllb": "tso_Latn", "language_code": "ts"}, + {"name": "Turkmen", "nllb": "tuk_Latn", "language_code": "tk"}, + {"name": "Tumbuka", "nllb": "tum_Latn", "language_code": "tum"}, + {"name": "Turkish", "nllb": "tur_Latn", "language_code": "tr"}, + {"name": "Twi", "nllb": "twi_Latn", "language_code": "tw"}, + {"name": "Central Atlas Tamazight", "nllb": "tzm_Tfng", "language_code": "tzm"}, + {"name": "Uyghur", "nllb": "uig_Arab", "language_code": "ug"}, + {"name": "Ukrainian", "nllb": "ukr_Cyrl", "language_code": "uk"}, + {"name": "Umbundu", "nllb": "umb_Latn", "language_code": "umb"}, + {"name": "Urdu", "nllb": "urd_Arab", "language_code": "ur"}, + {"name": "Northern Uzbek", "nllb": "uzn_Latn", "language_code": "uz"}, + {"name": 
"Venetian", "nllb": "vec_Latn", "language_code": "vec"}, + {"name": "Vietnamese", "nllb": "vie_Latn", "language_code": "vi"}, + {"name": "Waray", "nllb": "war_Latn", "language_code": "war"}, + {"name": "Wolof", "nllb": "wol_Latn", "language_code": "wo"}, + {"name": "Xhosa", "nllb": "xho_Latn", "language_code": "xh"}, + {"name": "Eastern Yiddish", "nllb": "ydd_Hebr", "language_code": "yi"}, + {"name": "Yoruba", "nllb": "yor_Latn", "language_code": "yo"}, + {"name": "Yue Chinese", "nllb": "yue_Hant", "language_code": "yue"}, + {"name": "Chinese (Simplified)", "nllb": "zho_Hans", "language_code": "zh-CN"}, + {"name": "Chinese (Traditional)", "nllb": "zho_Hant", "language_code": "zh-TW"}, + {"name": "Standard Malay", "nllb": "zsm_Latn", "language_code": "ms"}, + {"name": "Zulu", "nllb": "zul_Latn", "language_code": "zu"}, ] NAME_TO_NLLB = {lang["name"]: lang["nllb"] for lang in LANGUAGES} -NAME_TO_CROWDIN = {lang["name"]: lang["crowdin"] for lang in LANGUAGES} -CROWDIN_TO_NLLB = {lang["crowdin"]: lang["nllb"] for lang in LANGUAGES} -NLLB_TO_CROWDIN = {lang["nllb"]: lang["crowdin"] for lang in LANGUAGES} -CROWDIN_TO_NAME = {lang["crowdin"]: lang["name"] for lang in LANGUAGES} +NAME_TO_LANGUAGE_CODE = {lang["name"]: lang["language_code"] for lang in LANGUAGES} +LANGUAGE_CODE_TO_NLLB = {lang["language_code"]: lang["nllb"] for lang in LANGUAGES} +NLLB_TO_LANGUAGE_CODE = {lang["nllb"]: lang["language_code"] for lang in LANGUAGES} +LANGUAGE_CODE_TO_NAME = {lang["language_code"]: lang["name"] for lang in LANGUAGES} NLLB_TO_NAME = {lang["nllb"]: lang["name"] for lang in LANGUAGES} -def get_nllb_code(crowdin_code): - return CROWDIN_TO_NLLB.get(crowdin_code, None) +def get_nllb_code(language_code_code): + return LANGUAGE_CODE_TO_NLLB.get(language_code_code, None) -def get_crowdin_code(nllb_code): - return NLLB_TO_CROWDIN.get(nllb_code) +def get_language_code_code(nllb_code): + return NLLB_TO_LANGUAGE_CODE.get(nllb_code) -def get_language_name_by_crowdin(crowdin_code): - return 
CROWDIN_TO_NAME.get(crowdin_code) +def get_language_name_by_language_code(language_code_code): + return LANGUAGE_CODE_TO_NAME.get(language_code_code) def get_language_name_by_nllb(nllb_code): @@ -152,7 +234,7 @@ def get_language_info(identifier, identifier_type="auto"): for lang in LANGUAGES: if (lang["name"].lower() == identifier.lower() or lang["nllb"] == identifier or - lang["crowdin"] == identifier): + lang["language_code"] == identifier): return lang elif identifier_type == "name": for lang in LANGUAGES: @@ -162,9 +244,9 @@ def get_language_info(identifier, identifier_type="auto"): for lang in LANGUAGES: if lang["nllb"] == identifier: return lang - elif identifier_type == "crowdin": + elif identifier_type == "language_code": for lang in LANGUAGES: - if lang["crowdin"] == identifier: + if lang["language_code"] == identifier: return lang return None @@ -178,5 +260,5 @@ def list_all_nllb_codes(): return [lang["nllb"] for lang in LANGUAGES] -def list_all_crowdin_codes(): - return [lang["crowdin"] for lang in LANGUAGES] \ No newline at end of file +def list_all_language_code_codes(): + return [lang["language_code"] for lang in LANGUAGES]