বিষয়বস্তুতে চলুন

মডিউল:hi-translit

উইকিঅভিধান, মুক্ত অভিধান থেকে

এই মডিউলের জন্য মডিউল:hi-translit/নথি-এ নথিপত্র তৈরি করা হয়ে থাকতে পারে

-- Transliteration for Hindi (possibly other languages using Devanagari script, except for Sanskrit)
local export = {}

local m_str_utils = require("Module:string utilities")

local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local match = m_str_utils.match
local plain_gsub = m_str_utils.plain_gsub
local reverse = m_str_utils.reverse
local toNFC = mw.ustring.toNFC

-- Per-character output table used by export.tr via gsub(text, '.़?', conv):
-- every key is one character, optionally followed by a nukta (़). Characters
-- without an entry are left unchanged by that gsub.
-- Most targets are Bengali letters; some entries still carry Latin
-- transliteration values (q, x, ai, au, ...) and are kept as they are.
local conv = {
	-- consonants
	['क']='ক', ['ख']='খ', ['ग']='গ', ['घ']='ঘ', ['ङ']='ঙ',
	['च']='চ', ['छ']='ছ', ['ज']='জ', ['झ']='ঝ', ['ञ']='ঞ',
	['ट']='ট', ['ठ']='ঠ', ['ड']='ড', ['ढ']='ঢ', ['ण']='ণ',
	['त']='ত', ['थ']='থ', ['द']='দ', ['ध']='ধ', ['न']='ন',
	['प']='প', ['फ']='ফ', ['ब']='ব', ['भ']='ভ', ['म']='ম',
	['य']='য়', ['र']='র', ['ल']='ল', ['व']='ৱ', ['ळ']='ল়',
	['श']='শ', ['ष']='ষ', ['स']='স', ['ह']='হ',
	-- consonants with nukta (still transliterated to Latin)
	['क़'] = 'q', ['ख़'] = 'x', ['ग़'] = 'ġ', ['ऴ'] = 'ḻ',
	['ज़'] = 'z', ['ष़'] = 'ẓ', ['झ़'] = 'ź', ['ड़'] = 'ṛ', ['ढ़'] = 'ṛh',
	['फ़'] = 'f', ['ऩ'] = 'ṉ', ['ऱ'] = 'ṟ', ['य़'] = 'ẏ', ['व़'] = 'w',
	-- ['ज्ञ'] = 'jñ',

	-- dependent vowel signs (matras)
	['ि'] = 'ি', ['ु'] = 'ু', ['े'] = 'ে', ['ो'] = 'ো',
	['ॊ'] = 'ǒ', ['ॆ'] = 'ě',
	['ा'] = 'া', ['ी'] = 'ী', ['ू'] = 'ূ',
	['ृ'] = 'ৃ',
	['ै'] = 'ai', ['ौ'] = 'au',
	['ॉ'] = 'ŏ',
	['ॅ'] = 'ĕ',

	-- independent vowel letters
	['अ'] = 'অ', ['इ'] = 'ই', ['उ'] = 'উ', ['ए'] = 'এ', ['ओ'] = 'ও',
	-- 'ऊ' (long u) must map to 'ঊ'; it previously shared 'ঋ' with 'ऋ', which
	-- made the two vowels indistinguishable in the output.
	-- NOTE(review): 'ऎ' and 'ऒ' (short e/o) map to 'ঐ'/'ঔ' here while their
	-- matra forms above map to 'ě'/'ǒ' -- confirm which target is intended.
	['आ'] = 'আ', ['ई'] = 'ঈ', ['ऊ'] = 'ঊ', ['ऎ'] = 'ঐ', ['ऒ'] = 'ঔ',
	['ऋ'] = 'ঋ',
	['ऐ'] = 'ai', ['औ'] = 'au',
	['ऑ'] = 'ŏ',
	['ऍ'] = 'ĕ',

	['ॐ'] = 'om',

	-- chandrabindu
	['ँ'] = 'ঁ',

	-- anusvara (becomes a combining tilde)
	['ं'] = '̃',

	-- visarga
	['ः'] = 'ঃ',

	-- virama (dropped)
	['्'] = '',

	-- numerals
	['०'] = '০', ['१'] = '১', ['२'] = '২', ['३'] = '৩', ['४'] = '৪',
	['५'] = '৫', ['६'] = '৬', ['७'] = '৭', ['८'] = '৮', ['९'] = '৯',

	-- punctuation
	['।'] = '.', -- danda
	['॥'] = '.', -- double danda
	['+'] = '', -- compound separator

	-- abbreviation sign
	['॰'] = '.',
}

-- Lookup used by export.tr for an anusvara in the short-vowel rules: the key is
-- the following consonant (with its nukta, if any) and the value is the sign
-- that takes the anusvara's place. Consonants with no entry make export.tr
-- fall back to a combining tilde.
local nasal_assim_short = {}
do
	-- Grouped by the replacement sign; every consonant appears exactly once.
	local followers_by_nasal = {
		['ङ'] = { 'क', 'ख', 'ग', 'घ', 'ङ', 'क़', 'ख़', 'ग़' },
		['ञ'] = { 'च', 'छ', 'ज', 'झ', 'ञ', 'य', 'श' },
		['ण'] = { 'ट', 'ठ', 'ड', 'ढ', 'ण', 'ष' },
		['न'] = { 'त', 'थ', 'द', 'ध', 'न', 'र', 'ल', 'स', 'ज़' },
		['म'] = { 'प', 'फ', 'ब', 'भ', 'म', 'व', 'फ़' },
		['ँ'] = { 'ह', 'ड़', 'ढ़' },
	}
	for nasal, followers in pairs(followers_by_nasal) do
		for i = 1, #followers do
			nasal_assim_short[followers[i]] = nasal
		end
	end
end

-- Lookup used by export.tr for an anusvara after a long vowel: the key is the
-- following consonant (with its nukta, if any) and the value is the sign that
-- takes the anusvara's place. Compared with nasal_assim_short, many consonants
-- map to chandrabindu (ँ) instead of a nasal consonant. Consonants with no
-- entry make export.tr fall back to a combining tilde.
local nasal_assim_long = {}
do
	-- Grouped by the replacement sign; every consonant appears exactly once.
	local followers_by_nasal = {
		['ङ'] = { 'ग', 'घ', 'क़', 'ख़', 'ग़' },
		['ञ'] = { 'ज', 'झ' },
		['ण'] = { 'ड', 'ढ' },
		['न'] = { 'द', 'ध', 'ज़' },
		['म'] = { 'ब', 'भ', 'फ़' },
		['ँ'] = {
			'क', 'ख', 'ङ',
			'च', 'छ', 'ञ',
			'ट', 'ठ', 'ण',
			'त', 'थ', 'न',
			'प', 'फ', 'म',
			'ह', 'ड़', 'ढ़',
		},
	}
	for nasal, followers in pairs(followers_by_nasal) do
		for i = 1, #followers do
			nasal_assim_long[followers[i]] = nasal
		end
	end
end

-- Word-final clusters that are "permitted" to end a word without an extra
-- schwa, even though their last consonant is in special_cons (which would
-- otherwise cause the schwa to be kept).
-- NOTE: keys are spelled in REVERSED order because export.tr looks them up on
-- the reversed word: 'म्ल' is written "ml" but applies to words ending in "lm".
-- Covered: the clusters [rl][mnv] (rm, rn, rv, lm, ln, lv) plus rṇ, and a few
-- word-final geminates.
local perm_cl = {}
do
	local reversed_clusters = {
		-- l + m/v/n
		'म्ल', 'व्ल', 'न्ल',
		-- r + m/v/n/ṇ
		'म्र', 'व्र', 'न्र', 'ण्र',
		-- geminates
		'न्न', 'म्म', 'ण्ण', 'ल्ल', 'र्र',
	}
	for i = 1, #reversed_clusters do
		perm_cl[reversed_clusters[i]] = true
	end
end

-- Pattern fragments (the contents of [...] character classes) shared by the
-- rules below. 'a' is the schwa marker that export.tr inserts after bare
-- consonants; the apostrophe is listed with the vowels so that it counts as a
-- vowel context. '*' is treated as a vowel too (presumably a manual schwa
-- marker -- TODO confirm).
local all_cons = 'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवषशसह'
-- Final consonants after which a word-final "consonant + virama + consonant"
-- cluster keeps its schwa, unless the cluster is listed in perm_cl.
local special_cons = 'यरलवहनमञण'
-- NOTE: despite the names, `vowel` holds the dependent signs (matras) plus the
-- schwa markers, and `vowel_sign` holds the independent vowel letters.
local vowel = '*aिुृेोाीूैौॉॅॆॊ\''
local vowel_sign = 'अइउएओआईऊऋऐऔऑऍ\''
local long_vowel = 'ाीूेैोौआईऊएऐओऔ'
local short_vowel = '*aिुृॆॊॅॉअइउऋऍऑऎऒ\''

-- Medial schwa syncope, matched against the REVERSED word:
--   vowel, consonant, schwa 'a', consonant, (optional nasal sign +) vowel.
-- The schwa is matched but not captured, so replacing a match with
-- '%1%2%3%4' deletes it. Without the literal 'a' the replacement would
-- reproduce the match unchanged and the `while` loop in tr_word could never
-- terminate on a match.
local syncope_pattern = '([' .. vowel .. vowel_sign .. '])(़?[' .. all_cons .. '])a(़?[' .. all_cons .. '])([ंँ]?[' .. vowel .. vowel_sign .. '])'

-- The special_vowel category consists of ī/e/o/ai/au. These vowels are often
-- written with anusvara for 'aesthetic' reasons, as the vowel diacritic gets
-- in the way of the candrabindu. For ī/e/o/ai/au, anusvara will trigger
-- nasal_assim_long and candrabindu will force a nasal vowel. To force a nasal
-- consonant before voiceless stops (mostly in loanwords), respell with
-- homorganic nasal ङ/ञ/ण/न/म + virama ्
-- Exception: vowel 'e' when written as standalone ए
local special_vowel = 'ीेैोौईऐओऔ'
local normal_vowel = '*aिुाूृॆॅॊॉअइउआऊऋऎऍऒऑए\''

-- To detect words, include Unicode 0900-0963 and 0971-097F (plus 'a' and '*'),
-- excluding e.g. danda and abbreviation dot ॰
local word_pattern = "[ऀ-ॣॱ-ॿa*]+"

-- Replacement callback: keep the vowel and mark it nasalised with a combining
-- tilde in front of the following consonant.
local function nasalised(prev, succ)
	return prev .. "̃" .. succ
end

-- Applies the word-level rules (final schwa deletion, medial schwa syncope,
-- anusvara/chandrabindu resolution) to one word in which schwas have already
-- been made explicit as 'a'. Returns the rewritten word.
local function tr_word(word)
	-- Work on the reversed word so word-final rules can be anchored with '^'.
	word = reverse(word)
	-- Word-final schwa: the pattern consumes the final 'a'; the callback puts
	-- it back only after a non-permitted "consonant + virama + special
	-- consonant" cluster, or after य preceded by an i-vowel.
	word = gsub(word, '^a(़?)([' .. all_cons .. '])(.)(.?)', function(opt, first, second, third)
		local keep_schwa = (match(first, '[' .. special_cons .. ']') and match(second, '्') and not perm_cl[first .. second .. third])
			or match(first .. second, 'य[ीिई]')
		return (keep_schwa and 'a' or "") .. opt .. first .. second .. third
	end)
	-- Repeat until stable: one deletion can expose another syncope site. Each
	-- pass removes at least one 'a', so the loop terminates.
	while match(word, syncope_pattern) do
		word = gsub(word, syncope_pattern, '%1%2%3%4')
	end
	word = reverse(word)

	-- Candrabindu after a special vowel always yields a nasal vowel.
	word = gsub(word, '([' .. special_vowel .. '])ँ(.़?)', nasalised)
	-- sometimes chandrabindu != anusvara
	word = gsub(word, '([' .. normal_vowel .. '])ं([सशषवयकखटतथदडपचछ]़?)', function(prev, succ)
		return prev .. (nasal_assim_short[succ] or "̃") .. succ
	end)
	word = gsub(word, '([' .. normal_vowel .. '])ँ([सशषवयकखटतथदडपचछ]़?)', nasalised)
	-- force chandrabindu to behave as anusvara
	word = gsub(word, 'ँ', 'ं')
	word = gsub(word, '([' .. short_vowel .. '])ं(.़?)', function(prev, succ)
		return prev .. (nasal_assim_short[succ] or "̃") .. succ
	end)
	word = gsub(word, '([' .. long_vowel .. '])ं(.़?)', function(prev, succ)
		return prev .. (nasal_assim_long[succ] or "̃") .. succ
	end)
	return word
end

-- Transliterates Devanagari `text` using the `conv` table.
-- @param text  string to transliterate
-- @param lang  unused in this function; kept for the transliteration-module interface
-- @param sc    unused in this function; kept for the transliteration-module interface
-- @return the transliterated string, NFC-normalised
function export.tr(text, lang, sc)
	-- treat anusvara + nasal as geminate nasal after short vowels
	text = gsub(text, '([' .. short_vowel .. all_cons .. '])ं([नम])', '%1%2्%2')
	-- word-final apostrophe (e.g. from bold formatting) does not delete schwa
	text = gsub(text, '([' .. all_cons .. ']़?)(\'%A)', '%1a%2')
	text = gsub(text, '([' .. all_cons .. ']़?)(\')$', '%1a%2')
	-- make the inherent schwa explicit: a consonant that is not followed by a
	-- vowel sign or virama gets an 'a'
	text = gsub(text, '([' .. all_cons .. ']़?)([' .. vowel .. '्]?)', function(c, d)
		return c .. (d == "" and 'a' or d) end)
	-- Rewrite each word in place. A callback replaces exactly the matched
	-- occurrence, so a short word can no longer be substituted inside a longer
	-- word that merely contains it.
	text = gsub(text, word_pattern, tr_word)
	-- map every character (plus optional nukta) through conv; unmapped
	-- characters are left unchanged
	text = gsub(text, '.़?', conv)
	text = gsub(text, 'a([iu])̃', 'a͠%1')
	text = gsub(text, 'ñz', 'nz')
	-- text = gsub(text, '%*', 'a')
	return toNFC(text)
end

return export