-- Documentation for this module may be created at Модул:uk-pronunciation/doc

-- Module table: public functions are attached to it below and it is returned
-- at the end of the file.
local export = {}

-- Single Cyrillic characters that map to IPA sounds. Characters that are not
-- keys here (length marks, the "!" placeholder, combining accents, ...) are
-- left untouched by the table lookup.
-- NOTE(review): the typographic apostrophe "’" is mapped to "j", whereas the
-- ASCII apostrophe is turned into "!" and dropped at the end — confirm that
-- this difference is intended.
local phonetic_chars_map = {
    ["а"] = "ɑ",     ["б"] = "b",     ["в"] = "ʋ",     ["г"] = "ɦ",     ["ґ"] = "ɡ",
    ["д"] = "d",     ["е"] = "ɛ",     ["є"] = "jɛ",    ["ж"] = "ʒ",     ["з"] = "z",
    ["и"] = "ɪ",     ["і"] = "i",     ["ї"] = "ji",    ["й"] = "j",     ["к"] = "k",
    ["л"] = "l",     ["м"] = "m",     ["н"] = "n",     ["о"] = "ɔ",     ["п"] = "p",
    ["р"] = "r",     ["с"] = "s",     ["т"] = "t",     ["у"] = "u",     ["ф"] = "f",
    ["х"] = "x",     ["ц"] = "t͡s",    ["ч"] = "t͡ʃ",    ["ш"] = "ʃ",     ["щ"] = "ʃt͡ʃ",
    ["ь"] = "ʲ",     ["ю"] = "ju",    ["я"] = "jɑ",    ["’"] = "j",
}

-- Consonant cluster simplifications that always occur, applied in order to the
-- Cyrillic text.
local cluster_simplifications = {
    { "нтськ", "ньськ" },
    { "стськ", "ськ" },
    { "нтст", "нст" },
    { "стч", "шч" },
    { "стд", "зд" },
    { "стс", "сː" },
    { "^зш", "шː" },
    { "зш", "жш" },
    { "^зч", "шч" },
    { "зч", "жч" },
}

-- Multi-letter sequences that map to IPA sounds. The three-letter sequences
-- come first so that they are consumed before their two-letter prefixes.
-- Dental plosives assimilate to following hissing/hushing consonants, which is
-- not noted in the spelling.
local phonetic_sequences = {
    { "дзь", "d͡zʲ" },
    { "тьс", "t͡sʲː" },
    { "дж", "d͡ʒ" },
    { "дз", "d͡z" },
    { "дс", "d͡zs" },
    { "дш", "d͡ʒʃ" },
    { "дч", "d͡ʒt͡ʃ" },
    { "дц", "d͡zt͡s" },
    { "тс", "t͡s" },
    { "тш", "t͡ʃʃ" },
    { "тч", "t͡ʃː" },
    { "тц", "t͡sː" },
}

-- Regressive voicing: voiceless + voiced = voiced + voiced.
-- The affricate rules must run before the plain "s"/"sʲ"/"ʃ" rules; otherwise
-- the fricative half of "t͡s"/"t͡ʃ" is voiced on its own (giving "t͡z"/"t͡ʒ") and
-- the affricate rules can never match.
-- should /ʋ/ be included as voiced? Орфоепічний словник doesn't voice initial
-- cluster of шв (p. 116)
local VOICED = "([bdzʒɡɦ]+)"
local voicing_rules = {
    { "ʃt͡ʃ" .. VOICED, "ʒd͡ʒ%1" },
    { "t͡sʲ" .. VOICED, "d͡zʲ%1" },
    { "t͡s" .. VOICED, "d͡z%1" },
    { "t͡ʃ" .. VOICED, "d͡ʒ%1" },
    { "p" .. VOICED, "b%1" },
    { "f" .. VOICED, "v%1" },
    { "t" .. VOICED, "d%1" },
    { "tʲ" .. VOICED, "dʲ%1" },
    { "s" .. VOICED, "z%1" },
    { "sʲ" .. VOICED, "zʲ%1" },
    { "ʃ" .. VOICED, "ʒ%1" },
    { "k" .. VOICED, "ɡ%1" },
    { "x" .. VOICED, "ɦ%1" },
}

-- Applies an ordered list of { pattern, replacement } pairs to `text`.
local function apply_rules(text, rules)
    local gsub = mw.ustring.gsub
    for _, rule in ipairs(rules) do
        text = gsub(text, rule[1], rule[2])
    end
    return text
end

-- Replaces every occurrence of `vowel` that is not immediately preceded by the
-- IPA stress mark with `allophone`; stressed occurrences are kept as they are.
local function reduce_unstressed(text, vowel, allophone)
    return (mw.ustring.gsub(text, "(ˈ?)" .. vowel, function(stress)
        if stress == "" then
            return allophone
        end
        return stress .. vowel
    end))
end

--- Converts a Ukrainian word (Cyrillic, optionally with combining acute/grave
-- accents marking stress) into a phonetic IPA transcription.
--
-- @param word   either the word as a string, or a frame-like table; in the
--               latter case the word is read from `args[1]` and the accent
--               option from `args.accent`, falling back to the parent frame.
-- @param accent pass "off" to suppress the automatic stress mark that is
--               otherwise added to words with exactly one vowel.
-- @return the transcription as a string; the stress mark is placed directly
--         before the stressed vowel.
-- Raises an error when no word (or an empty word) is supplied.
function export.pronunciation(word, accent)
    if type(word) == "table" then
        word, accent =
            word.args[1] or word:getParent().args[1],
            word.args.accent or word:getParent().args.accent
    end
    if not word or (word == "") then
        error("Please put the word as the first positional parameter!")
    end

    local gsub = mw.ustring.gsub
    local phonetic = mw.ustring.lower(word)

    -- first apply consonant cluster simplifications that always occur orthographically
    phonetic = apply_rules(phonetic, cluster_simplifications)

    -- then long consonants that are orthographically geminated.
    phonetic = gsub(phonetic, "([бвгґд])%1", "%1ː")
    -- джж / дзз sequences encode diphonemic дж / дз followed by ж / з, so a
    -- preceding д blocks the lengthening. Word-initial pairs are handled
    -- separately because they have no preceding character to capture.
    phonetic = gsub(phonetic, "^жж", "жː")
    phonetic = gsub(phonetic, "([^д])жж", "%1жː")
    phonetic = gsub(phonetic, "^зз", "зː")
    phonetic = gsub(phonetic, "([^д])зз", "%1зː")
    phonetic = gsub(phonetic, "([йклмнпрстфхцчшщ])%1", "%1ː")
    phonetic = gsub(phonetic, "дждж", "джː")
    phonetic = gsub(phonetic, "дздз", "дзː")

    -- remap apostrophe to '!' so that it doesn't conflict with IPA stress mark
    phonetic = gsub(phonetic, "'", "!")

    -- replace multiple letter sequences, then single letters of the alphabet
    phonetic = apply_rules(phonetic, phonetic_sequences)
    phonetic = gsub(phonetic, ".", phonetic_chars_map)

    -- add IPA stress mark and remove the combining acute (U+0301) / grave (U+0300)
    phonetic = gsub(phonetic, "([ɑɛiɪuɔ][\204\129\204\128])", "ˈ%1")
    phonetic = gsub(phonetic, "([\204\129\204\128])", "")

    -- add accent if the word is monosyllabic and not "|accent=off"; skip words
    -- whose written accent already produced a stress mark, so it is not doubled
    local _, vowel_count = gsub(phonetic, "[ɑɛiɪuɔ]", "")
    if vowel_count == 1 and accent ~= "off"
        and not mw.ustring.find(phonetic, "ˈ", 1, true) then
        phonetic = gsub(phonetic, "([ɑɛiɪuɔ])", "ˈ%1")
    end

    -- palatalizable consonants before /i/ or /j/ become palatalized
    phonetic = gsub(phonetic, "([tdsznlrbpʋfmkɦxʃʒ])([ː]?)([ˈ]?)i", "%1ʲ%2%3i")
    phonetic = gsub(phonetic, "([tdsznlrbpʋfmkɦxʃʒ])([ː]?)j", "%1ʲ%2")

    -- eliminate garbage sequences of [ʲːj] resulting from -тьс- cluster followed by [j]
    phonetic = gsub(phonetic, "ʲːj", "ʲː")

    -- consonant simplification: ст + ц' → [с'ц']. We do it here because of palatalization.
    -- Due to the т +ц → [ц:] rule length is present. According to Орфоепскі словник p. 13,
    -- both forms are proper, without length in normal (colloquial) speech and with length
    -- in slow speech, so we parenthesize the length as optional. When there is
    -- no length mark nothing is parenthesized (no empty "()" in the output).
    phonetic = gsub(phonetic, "st͡sʲ(ː?)", function(length)
        if length == "" then
            return "sʲt͡sʲ"
        end
        return "sʲt͡sʲ(ː)"
    end)

    -- assimilation: voiceless + voiced = voiced + voiced
    phonetic = apply_rules(phonetic, voicing_rules)

    -- In the sequence of two consonants, of which the second is soft, the first is pronounced soft too
    -- unless the first consonant is a labial, namely б, п, в, ф, м.
    phonetic = gsub(phonetic, "([tdsznl])(.)ʲ", "%1ʲ%2ʲ")
    phonetic = gsub(phonetic, "([tdsznl])t͡sʲ", "%1ʲt͡sʲ")
    phonetic = gsub(phonetic, "([tdsznl])d͡zʲ", "%1ʲd͡zʲ")
    phonetic = gsub(phonetic, "t͡s(.)ʲ", "t͡sʲ%1ʲ")
    phonetic = gsub(phonetic, "d͡z(.)ʲ", "d͡zʲ%1ʲ")
    phonetic = gsub(phonetic, "d͡zt͡sʲ", "d͡zʲt͡sʲ")
    phonetic = gsub(phonetic, "t͡sd͡zʲ", "t͡sʲd͡zʲ")

    -- Hushing consonants ж, ч, ш assimilate to the following hissing consonants, giving a long hissing consonant:
    -- [ʒ] + [t͡sʲ] → [zʲt͡sʲ], [t͡ʃ] + [t͡sʲ] → [t͡sʲː], [ʃ] + [t͡sʲ] → [sʲt͡sʲ], [ʃ] + [sʲ] → [sʲː]
    phonetic = gsub(phonetic, "ʒt͡sʲ", "zʲt͡sʲ")
    phonetic = gsub(phonetic, "t͡ʃt͡sʲ", "t͡sʲː")
    phonetic = gsub(phonetic, "ʃt͡sʲ", "sʲt͡sʲ")
    phonetic = gsub(phonetic, "ʃsʲ", "sʲː")

    -- Hissing consonants before hushing consonants within a word assimilate - on зш and зч word-initially and
    -- word-medially see above.
    -- [s] + [ʃ] → [ʃː],  [z] + [ʃ] → [ʒʃ], [z] + [t͡s] → [ʒt͡s]
    -- [z] + [d͡ʒ] → [ʒd͡ʒ]
    -- NOTE(review): the third rule rewrites [z] before the hissing affricate
    -- [t͡s], although this section is about hushing consonants — confirm
    -- against the reference dictionary whether [t͡ʃ] was meant.
    phonetic = gsub(phonetic, "zʒ", "ʒː")
    phonetic = gsub(phonetic, "sʃ", "ʃː")
    phonetic = gsub(phonetic, "zt͡s", "ʒt͡s")
    phonetic = gsub(phonetic, "zd͡ʒ", "ʒd͡ʒ")

    -- cleanup: excessive palatalization: CʲCʲCʲ → CCʲCʲ
    phonetic = gsub(phonetic, "([^ɑɛiɪuɔ]+)ʲ([^ɑɛiɪuɔ]+)ʲ([^ɑɛiɪuɔ]+)ʲ", "%1%2ʲ%3ʲ")

    -- unstressed /ɑ/ has an allophone [ɐ]
    phonetic = reduce_unstressed(phonetic, "ɑ", "ɐ")
    -- unstressed /u/ has an allophone [ʊ]
    phonetic = reduce_unstressed(phonetic, "u", "ʊ")
    -- unstressed /ɔ/ has by assimilation an allophone [o] before a stressed syllable with /u/ or /i/
    -- (the consonant class uses the IPA letter ɡ, matching what the character map emits)
    phonetic = gsub(phonetic, "ɔ([bdzʒɡɦmnlrpftskxʲʃ͡]+)ˈ([uiʊ]+)", "o%1ˈ%2")
    -- one allophone [e] covers unstressed /ɛ/ and /ɪ/
    phonetic = reduce_unstressed(phonetic, "ɛ", "e")
    phonetic = reduce_unstressed(phonetic, "ɪ", "e")

    -- /ʋ/ has an allophone [u̯] in a syllable coda
    -- NOTE(review): the pattern matches every /ʋ/ that follows a vowel, also
    -- when another vowel comes next — confirm that this is intended.
    phonetic = gsub(phonetic, "([ɑɛiɪuɔɐoʊe]+)ʋ", "%1u̯")
    -- /ʋ/ has an allophone [w] before /ɔ, u/ and voiced consonants (not after a vowel)
    phonetic = gsub(phonetic, "ʋ([ˈ]?)([ɔuoʊbdzʒɡɦmnlr]+)", "w%1%2")
    -- /ʋ/ has an allophone [ʍ] before voiceless consonants (not after a vowel)
    phonetic = gsub(phonetic, "ʋ([pftskxʃ]+)", "ʍ%1")

    -- in a syllable-final position (i.e. the first position of a syllable coda) /j/ has an allophone [i̯]:
    phonetic = gsub(phonetic, "([ɑɛiɪuɔɐoʊe]+)j([ˈ]?)([bdzʒɡɦmnlrpftskxʃ]+)", "%1i̯%2%3")
    phonetic = gsub(phonetic, "([ɑɛiɪuɔɐoʊe]+)j$", "%1i̯")
    -- also at the beginning of a word before a consonant
    phonetic = gsub(phonetic, "^j([bdzʒɡɦmnlrpftskxʃ]+)", "i̯%1")

    -- remove old orthographic apostrophe
    phonetic = gsub(phonetic, "!", "")
    return phonetic
end

-- Hand the module table (and the functions attached to it) to whoever loads this module.
return export