وحدة:ar-utilities

يمكن إنشاء صفحة توثيق الوحدة في وحدة:ar-utilities/شرح
local m_links = require("Module:links")
local ar_translit = require("Module:ar-translit")

local export = {}

local lang = require("Module:languages").getByCode("ar")

local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local u = mw.ustring.char

-- "If Not Empty": normalise the empty string to nil so that blank template
-- arguments can be treated like missing ones. Any other value (including
-- nil and false) is handed back unchanged.
local function ine(x)
    if x ~= "" then
        return x
    end
    return nil
end

-- Version of rsubn() that discards all but the first return value, i.e. it
-- returns only the substituted string and drops the replacement count.
--   term: string to operate on
--   foo:  pattern to search for
--   bar:  replacement (anything rsubn accepts as its third argument)
-- Declared `local` so that it does not leak into the global environment
-- shared with other modules.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate rsubn's results to exactly one value.
	return (rsubn(term, foo, bar))
end

-- Synthesize a minimal frame object so that exported functions meant to be
-- called from templates can also be called from the debug console.
--   parargs: table exposed as the `args` of the parent frame
--   args:    table exposed as the `args` of the frame itself
-- Returns a table with an `args` field and a `getParent` function; the
-- latter ignores its arguments, so it works with both `.` and `:` calls.
-- Declared `local` so that it does not leak into the global environment.
local function debug_frame(parargs, args)
	local parent = {args = parargs}
	return {
		args = args,
		getParent = function()
			return parent
		end,
	}
end

--------------------------- hamza processing ------------------------------

-- hamza variants (each constant is a single Unicode character)
local HAMZA            = u(0x0621) -- hamza on the line (stand-alone hamza) = ء
local HAMZA_ON_ALIF    = u(0x0623) -- hamza above ʾalif = أ
local HAMZA_ON_WAW     = u(0x0624) -- hamza above wāw = ؤ
local HAMZA_UNDER_ALIF = u(0x0625) -- hamza below ʾalif = إ
local HAMZA_ON_YA      = u(0x0626) -- hamza above yāʾ = ئ
-- Pattern (character class) matching any one of the five hamza forms above
local HAMZA_ANY        = "[" .. HAMZA .. HAMZA_ON_ALIF .. HAMZA_UNDER_ALIF .. HAMZA_ON_WAW .. HAMZA_ON_YA .. "]"
-- Stand-in character for "some hamza"; export.process_hamza replaces it with
-- a concrete hamza form chosen from the surrounding letters.
local HAMZA_PH         = u(0xFFF0) -- hamza placeholder

-- diacritics
local A  = u(0x064E) -- fatḥa
local AN = u(0x064B) -- fatḥatān (fatḥa tanwīn)
local U  = u(0x064F) -- ḍamma
local UN = u(0x064C) -- ḍammatān (ḍamma tanwīn)
local I  = u(0x0650) -- kasra
local IN = u(0x064D) -- kasratān (kasra tanwīn)
local SK = u(0x0652) -- sukūn = no vowel
local SH = u(0x0651) -- šadda = gemination of consonants
local DAGGER_ALIF = u(0x0670) -- dagger (superscript) ʾalif
-- Pattern (character class) matching any single diacritic other than šadda
local DIACRITIC_ANY_BUT_SH = "[" .. A .. I .. U .. AN .. IN .. UN .. SK .. DAGGER_ALIF .. "]"
-- Pattern matching short vowels
local AIU = "[" .. A .. I .. U .. "]"
-- Pattern matching any diacritics that may be on a consonant: an optional
-- šadda followed by one non-šadda diacritic. Note that a quantifier appended
-- to this pattern binds only to the trailing character class.
local DIACRITIC = SH .. "?" .. DIACRITIC_ANY_BUT_SH

-- various letters and signs
local ALIF   = u(0x0627) -- ʾalif = ا
local AMAQ   = u(0x0649) -- ʾalif maqṣūra = ى
local AMAD   = u(0x0622) -- ʾalif madda = آ
local WAW    = u(0x0648) -- wāw = و
local YA     = u(0x064A) -- yā = ي

-- Move every shadda in `text` so that it precedes the diacritic it is
-- attached to, and return the resulting string.
--
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- detection process inconvenient, so undo it. (For example, the code in
-- remove_in would fail to detect the -in in مُتَرَبٍّ because the shadda
-- would come after the -in.)
--
-- Declared `local` so that it does not leak into the global environment.
local function reorder_shadda(text)
	-- diacritic + shadda  ->  shadda + diacritic
	return rsub(text, "(" .. DIACRITIC_ANY_BUT_SH .. ")" .. SH, SH .. "%1")
end

-- Ordered list of substitution rules used by export.process_hamza to turn
-- each hamza placeholder (HAMZA_PH) into a concrete hamza form. Every entry
-- is a pair {pattern, replacement}; the replacement is either a string (with
-- %1-style capture references) or a function receiving the captures. The
-- rules are applied one after another to the whole term, so their order
-- matters: earlier rules consume placeholders before later ones see them.
local hamza_subs = {
	--------------------------- handle initial hamza --------------------------
	-- put initial hamza on a seat according to following vowel.
	-- "initial" means at the start of the string or directly after a space.
 	{"^" .. HAMZA_PH .. "([" .. I .. YA .. "])", HAMZA_UNDER_ALIF .. "%1"},
 	{" " .. HAMZA_PH .. "([" .. I .. YA .. "])", " " .. HAMZA_UNDER_ALIF .. "%1"},
 	{"^" .. HAMZA_PH, HAMZA_ON_ALIF}, -- if no vowel, assume a
 	{" " .. HAMZA_PH, " " .. HAMZA_ON_ALIF}, -- if no vowel, assume a

	----------------------------- handle final hamza --------------------------
	-- "final" hamza may be followed by a short vowel or tanwīn sequence
	-- use a previous short vowel to get the seat
	-- (this rule: hamza at the very end of the string)
	{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "?)$",
		-- v = preceding short vowel, ham = the placeholder (overwritten with
		-- the chosen seat), diacrit = whatever followed the placeholder
		function(v, ham, diacrit)
			ham = v == I and HAMZA_ON_YA or v == U and HAMZA_ON_WAW or HAMZA_ON_ALIF
			return v .. ham .. diacrit
		end
	},
	-- same as above, but for a hamza at the end of a word that is followed by
	-- a space; the space is part of the third capture and is kept as is.
	{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "? )",
		function(v, ham, diacrit)
			ham = v == I and HAMZA_ON_YA or v == U and HAMZA_ON_WAW or HAMZA_ON_ALIF
			return v .. ham .. diacrit
		end
	},
	-- else hamza is on the line
	-- (only anchored at the end of the string; no short vowel precedes here)
	{HAMZA_PH .. "(" .. DIACRITIC .. "?)$", HAMZA .. "%1"},

	---------------------------- handle medial hamza --------------------------
	-- if long vowel or diphthong precedes, we need to ignore it.
	{"([" .. AMAD .. ALIF .. WAW .. YA .. "]" .. SK .. "?)(" .. HAMZA_PH .. ")(" .. SH .. "?)([^ ])",
		-- prec = preceding long-vowel letter (plus optional sukūn),
		-- shad = optional šadda, v2 = the single non-space character after it
		function(prec, ham, shad, v2)
			ham = (v2 == I or v2 == YA) and HAMZA_ON_YA or
				(v2 == U or v2 == WAW) and HAMZA_ON_WAW or
				rfind(prec, YA) and HAMZA_ON_YA or
				HAMZA
			return prec .. ham ..shad .. v2
		end
	},
	-- otherwise, seat of medial hamza relates to vowels on one or both sides.
	-- v1 = single non-space character before the placeholder; v2 = optional
	-- fatḥatān plus one non-space character after it (so v2 may be two
	-- characters long, which is what the AN .. ALIF comparison relies on).
 	{"([^ ])(" .. HAMZA_PH .. ")(" .. SH .. "?)(" .. AN .. "?[^ ])",
		function(v1, ham, shad, v2)
			ham = (v1 == I or v2 == I or v2 == YA) and HAMZA_ON_YA or
				(v1 == U or v2 == U or v2 == WAW) and HAMZA_ON_WAW or
				-- special exception for the accusative ending, in words like
				-- جُزْءًا (juzʾan). By the rules of Thackston pp. 281-282 a
				-- hamza-on-alif should appear, but that would result in
				-- two alifs in a row, which is generally forbidden.
				-- According to Haywood/Nahmad pp. 114-115, after sukūn before
				-- the accusative ending (including when a pronominal suffix
				-- follows) hamza is written on yāʾ if the previous letter
				-- is connecting, else on the line. The only examples they
				-- give involve preceding non-connecting z (جُزْءًا juzʾan and
				-- (جُزْءَهُ juzʾahu) and preceding diphthongs, with the only
				-- connecting letter being yāʾ, where we have hamza-on-yāʾ
				-- anyway by the preceding regexp. Haywood/Nahmad's rule seems
				-- too complicated, and since it conflicts with Thackston,
				-- we only implement the case where otherwise two alifs would
				-- appear with the indefinite accusative ending.
				v2 == AN .. ALIF and HAMZA or
				HAMZA_ON_ALIF
			return v1 .. ham .. shad .. v2
		end
	},
	
	--------------------------- handle alif madda -----------------------------
	-- hamza-on-alif + optional fatḥa + alif collapses into a single alif madda
	{HAMZA_ON_ALIF .. A .. "?" .. ALIF, AMAD},

	----------------------- catch any remaining hamzas ------------------------
	-- any placeholder not matched by an earlier rule becomes hamza on the line
	{HAMZA_PH, HAMZA}
}

-- Replace every hamza placeholder (HAMZA_PH) in `term` with the appropriate
-- hamza form and return a list of resulting spellings. The list normally
-- holds one string; it holds two when the result contains a problematic
-- hamza-on-wāw + ḍamma + wāw sequence, in which case the preferred
-- alternative comes first.
function export.process_hamza(term)
	-- Apply the substitution rules in order; each rule sees the output of
	-- the previous one.
	for idx = 1, #hamza_subs do
		local rule = hamza_subs[idx]
		term = rsub(term, rule[1], rule[2])
	end

	-- The problematic sequence: hamza-on-wāw + ḍamma + wāw.
	local tail = "ؤُو"

	-- Return `term` with `before .. tail` rewritten to `after`.
	local function respell(before, after)
		return rsub(term, before .. tail, after)
	end

	-- wāw + hamza-on-wāw + wāw is especially problematic: it yields two
	-- alternatives, and the original sequence is not one of them.
	if rfind(term, WAW .. tail) then
		return {respell(WAW, WAW .. "ئُو"), respell(WAW, WAW .. "ءُو")}
	end

	-- In the remaining cases the preferred alternative uses some other kind
	-- of hamza, and the original spelling is kept as the second option.
	if rfind(term, YA .. tail) then
		return {respell(YA, YA .. "ئُو"), term}
	end

	if rfind(term, ALIF .. tail) then
		-- John Mace's "Arabic Verbs" is inconsistent here: past-tense parts
		-- prefer hamza on the line while non-past parts prefer
		-- hamza-on-yāʾ, although the vowel sequence is the same. Tense
		-- information is not available at this point, so one is picked.
		return {respell(ALIF, ALIF .. "ئُو"), term}
	end

	if rfind(term, A .. tail) then
		return {respell(A, A .. HAMZA_ON_ALIF .. U .. WAW), term}
	end

	-- U + hamza-on-wāw + U + wāw has no alternative spelling, and
	-- I + hamza-on-wāw + U + wāw does not occur (hamza-on-yāʾ is used
	-- instead), so everything else is returned as is.
	return {term}
end

----------------------------------- misc junk ---------------------------------

-- U+200E; export.linkify_bold appends it directly after the bold Arabic text.
local LRM = u(0x200E) -- left-to-right mark

-- Turn `text` into a full link to the Arabic entry, with automatic
-- transliteration suppressed (tr = "-").
-- An empty string or an "&mdash;" placeholder is returned unchanged instead
-- of being linked.
-- (Fix: the check previously tested an undefined global `word` instead of
-- the `text` parameter, so it could never succeed.)
local function links(text)
	if text == "" or text == "&mdash;" then
		return text
	end
	return m_links.full_link(text, nil, lang, nil, nil, nil, {tr = "-"}, false)
end

-- Format a list of gender codes for display.
--   lang:    language object passed through to the gender formatter
--   sc:      accepted but not used
--   genders: list of gender codes; may be nil or empty
-- Returns "" when there is nothing to show, otherwise a non-breaking space
-- followed by the output of format_list from Module:gender and number.
local function format_genders(lang, sc, genders)
	if not genders or #genders == 0 then
		return ""
	end
	-- Loaded lazily, only when there is actually something to format.
	local gender_module = require("Module:gender and number")
	return "&nbsp;" .. gender_module.format_list(genders, lang)
end

-- Template entry point: build a link to an Arabic root.
-- The root letters are taken from the parent frame's arguments:
--   * no first argument, on a page whose namespace text is "Template":
--     the literal placeholders {{{1}}} {{{2}}} {{{3}}} are used;
--   * arguments 1 and 2 both given: each of arguments 1-4 that is non-empty
--     is one root letter;
--   * only argument 1 given: it is split on spaces;
--   * otherwise the current page's full title is split on spaces.
-- The named arguments "gloss" and "sort" supply a gloss and a category sort
-- key. When the current page is the root's own page, the root category is
-- appended to the returned link.
function export.ar_root(frame)
	local args = frame:getParent().args
	local title = mw.title.getCurrentTitle()
	local first = ine(args[1])

	local roots
	if not first and title.nsText == "Template" then
		roots = {"{{{1}}}", "{{{2}}}", "{{{3}}}"}
	elseif first and ine(args[2]) then
		roots = {args[1], args[2]}
		-- Third and fourth radicals are optional.
		for _, position in ipairs({3, 4}) do
			if ine(args[position]) then
				roots[#roots + 1] = args[position]
			end
		end
	elseif first then
		roots = mw.text.split(args[1], " ")
	else
		roots = mw.text.split(title.fullText, " ")
	end

	-- Letters are joined with spaces for display and with hyphens for the
	-- transliteration; "-" is the fallback when no transliteration results.
	local joined_root = table.concat(roots, " ")
	local joined_tr = ar_translit.tr(table.concat(roots, "-"), nil, nil, nil, nil, "force") or "-"

	local link_options = {tr = joined_tr, gloss = ine(args["gloss"])}
	local link_text = m_links.full_link(joined_root, nil, lang, nil, nil, nil, link_options, false)

	-- Categorise only on the root's own page.
	if title.fullText == joined_root then
		local sort_key = ine(args["sort"]) or joined_root
		link_text = link_text .. "[[تصنيف:Arabic roots|" .. sort_key .. "]]"
	end
	return link_text
end

-- Debug-console variant of export.ar_root: takes the parent-frame arguments
-- and the frame arguments as plain tables and wraps them in a synthetic
-- frame before delegating.
function export.ar_root2(parargs, args)
	local fake_frame = debug_frame(parargs, args)
	return export.ar_root(fake_frame)
end

-- Produce a bold, linked Arabic term followed by its transliteration in
-- parentheses, a gender annotation and any tracking categories.
--   arabic:     the Arabic text, or a frame table when invoked from a
--               template (then arguments 1, 2, "novocalize" and "g" of that
--               frame supply the four parameters; empty strings count as
--               absent)
--   tr:         manual transliteration (optional)
--   novocalize: if set, never try to derive vowels from `tr`
--   gender:     gender code; "mf" is expanded to masculine plus feminine
-- Returns the assembled wikitext string.
function export.linkify_bold(arabic, tr, novocalize, gender)
	require("Module:debug").track("ar-utilities linkify bold")

	-- make it possible to call this function from a template
	if type(arabic) == "table" then
		local frame_args = arabic.args
		local function nonblank(value)
			if value and value ~= "" then
				return value
			end
			return nil
		end
		arabic, tr, novocalize, gender =
			nonblank(frame_args[1]),
			nonblank(frame_args[2]),
			nonblank(frame_args["novocalize"]),
			nonblank(frame_args["g"])
	end

	local category_links = {}
	local auto_translit = ar_translit.tr(arabic, nil, nil, nil)

	-- A manual transliteration that disagrees with the automatic one hints
	-- at missing vowels, so try to reverse-transliterate it back onto the
	-- Arabic text.
	if tr and tr ~= auto_translit and not novocalize then
		category_links[#category_links + 1] = "[[تصنيف:Arabic terms with reverse-transliteration]]"

		local vocalized = ar_translit.tr_latin_matching(tr, arabic)
		if not vocalized then
			category_links[#category_links + 1] = "[[تصنيف:Arabic terms where vowel insertion failed]]"
		else
			-- The vocalised form must reduce to the same entry name as the
			-- form that was passed in.
			assert(lang:makeEntryName(arabic) == lang:makeEntryName(vocalized), "Reverse-transliterated Arabic has different non-vowel form from the one given!")
			arabic = vocalized
		end
	end

	tr = tr or auto_translit

	local bold_part = "<b lang=\"ar\" class=\"Arab\">" .. links(arabic) .. "</b>" .. LRM
	local translit_part = ""
	if tr then
		translit_part = " (" .. tr .. ")"
	end

	local gender_list
	if gender == "mf" then
		gender_list = {"m", "f"}
	else
		gender_list = {gender}
	end
	local gender_part = format_genders(lang, nil, gender_list)

	return bold_part .. translit_part .. gender_part .. table.concat(category_links)
end

-- Strip a final kasratān (-in) from a lemma. Used in {{ar-adj-in}} so that
-- the user can give the full lemma instead of truncating the -in ending by
-- hand. FIXME: Move ar-adj-in into Lua.
-- Reads the lemma from frame.args[1] and raises an error if it is missing.
function export.remove_in(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- Put any shadda before the tanwīn first so that the ending is really
	-- the last character of the string.
	local normalized = reorder_shadda(lemma)
	return rsub(normalized, IN .. "$", "")
end

-- Strip a final fatḥatān + ʾalif maqṣūra (-an) from a lemma. Used in
-- {{ar-adj-an}} so that the user can give the full lemma instead of
-- truncating the -an ending by hand. FIXME: Move ar-adj-an into Lua.
-- Reads the lemma from frame.args[1] and raises an error if it is missing.
function export.remove_an(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- Put any shadda before the other diacritics before matching the ending.
	local normalized = reorder_shadda(lemma)
	return rsub(normalized, AN .. AMAQ .. "$", "")
end

-- Intended to compare two words and find the alternation pattern (vowel
-- changes, prefixes, suffixes etc.).
-- Still a WIP: at present this is a stub that ignores both arguments and
-- always returns nil.
function export.find_pattern(word1, word2)
	return nil
end

return export