-- Module:Unicode data/build
-- From Wiktionary, the free dictionary.
local export = {}

local unpack = unpack or table.unpack -- Lua 5.2 compatibility

local m_str_utils = require("Module:string utilities")

local byte = string.byte
local dump = mw.dumpObject
local format = string.format
local gmatch = m_str_utils.gmatch
local gsplit = m_str_utils.gsplit
local gsub = string.gsub
local match = string.match
local sub = string.sub
local table_concat = table.concat
local table_insert = table.insert
local table_sort = table.sort
local tonumber = tonumber
local trim = m_str_utils.trim
local u = m_str_utils.char
local u_codepoint = mw.ustring.codepoint

-- To update Unicode data:
-- 1. Update all raw data files (see [[Special:PrefixIndex/Module:Unicode_data/raw/]])
-- 2. Update unicode_ver below
-- 3. Run the _formatted functions, copy the return values of each to the corresponding data modules, and save them
-- 4. Update Unicode_version in [[Module:character list]] and [[Appendix:Unicode]]
--
-- unicode_ver is embedded in the "Generated by ..." header comment that every
-- *_formatted function in this module emits.
local unicode_ver = "17.0" -- update as needed once raw data files have been updated.

-- Highest code point scanned when runs of equal values are collapsed into
-- singles and ranges (see gather_singles_ranges).
local CODEPOINT_MAX = 0x10FFFF

-- Fetch the raw (unparsed) content of the wiki page named `mod`.
--
-- @param mod  full page name, e.g. "Module:Unicode data/raw/Blocks.txt"
-- @return     the page content as a string
-- Raises an error naming the page when the title is invalid or the page has no
-- content, rather than failing later with an unrelated nil-index or pattern
-- error far from the cause.
local function get_text(mod)
	local title = mw.title.new(mod)
	if not title then
		error("Invalid page title: " .. tostring(mod))
	end
	local content = title:getContent()
	if not content then
		error("Page does not exist or has no content: " .. tostring(mod))
	end
	return content
end

-- Trim a raw field; a field that is empty after trimming is reported as nil.
local function normalize_value(val)
	local trimmed = trim(val)
	if trimmed == "" then
		return nil
	end
	return trimmed
end

-- Split `line` on ";" and return every field as a separate return value.
-- Each field is passed through normalize_value, so empty fields come back as
-- nil; an explicit field count is kept so those nils are not lost on unpack.
local function unpack_line(line)
	local fields, count = {}, 0
	local rest = line
	while true do
		local head, tail = match(rest, "^([^;]*);(.*)$")
		if not head then
			break
		end
		count = count + 1
		fields[count] = normalize_value(head)
		rest = tail
	end
	-- Whatever follows the last ";" (or the whole line if there was none).
	count = count + 1
	fields[count] = normalize_value(rest)
	return unpack(fields, 1, count)
end

-- Return an iterator over the lines of page `mod`; runs of newlines are
-- skipped, so empty lines are never yielded.
local function iterate_lines(mod)
	local text = get_text(mod)
	return gmatch(text, "[^\n]+")
end

-- Return an iterator over the data records of page `mod`.
--
-- For every line, everything from the first "#" onwards is discarded as a
-- comment; lines that then contain nothing but whitespace are skipped. Each
-- remaining line is split on ";" by unpack_line and its fields are returned as
-- multiple values, suitable for `for a, b, c in iterate_unicode_data(...)`.
--
-- Whitespace-only lines must be skipped here: unpack_line would turn them into
-- a nil first value, which a generic `for` treats as end of iteration, silently
-- dropping the rest of the file.
local function iterate_unicode_data(mod)
	local iter = iterate_lines(mod)
	return function()
		while true do
			local line = iter()
			if line == nil then
				return nil
			end
			line = gsub(line, "#.*", "")
			if match(line, "%S") then
				return unpack_line(line)
			end
		end
	end
end

-- Characters that have a short backslash escape in Lua source.
local escapes = {
	["\a"] = "\\a",
	["\b"] = "\\b",
	["\t"] = "\\t",
	["\n"] = "\\n",
	["\v"] = "\\v",
	["\f"] = "\\f",
	["\r"] = "\\r",
	["\""] = "\\\"",
	["\\"] = "\\\\",
}

-- Return the Lua-source escape for the single byte `ch`: the short escape if
-- one is listed above, otherwise a three-digit decimal escape such as \065.
local function escape_byte(ch)
	local short = escapes[ch]
	if short then
		return short
	end
	return format("\\%03d", byte(ch))
end

-- Escape control characters, double quotes, backslashes and every byte in the
-- range 128-255 of `str`, so the result is pure ASCII Lua-string content.
local function escape(str)
	local escaped = gsub(str, "[%c\"\\\128-\255]", escape_byte)
	return escaped
end

-- Render `str` as a double-quoted Lua string literal. Control characters,
-- double quotes and backslashes are escaped; other bytes are kept as they are.
local function make_lua_string(str)
	local body = gsub(str, "[%c\"\\]", escape_byte)
	return '"' .. body .. '"'
end

-- Store `v` in `data` under the character(s) named by `k`.
-- `k` is either one hexadecimal code point ("0041") or an inclusive range
-- ("0041..005A"); keys of `data` are the characters themselves (via u()).
local function add(data, k, v)
	local first, last = match(k, "^(.-)%.%.(.*)$")
	if not first then
		data[u(tonumber(k, 16))] = v
		return
	end
	for cp = tonumber(first, 16), tonumber(last, 16) do
		data[u(cp)] = v
	end
end

-- Store `v` in `data` under the numeric code point(s) named by `k`.
-- `k` is either one hexadecimal code point ("0041") or an inclusive range
-- ("0041..005A"); keys of `data` are the code points as numbers.
local function add_num(data, k, v)
	local first, last = match(k, "^(.-)%.%.(.*)$")
	if not first then
		data[tonumber(k, 16)] = v
		return
	end
	for cp = tonumber(first, 16), tonumber(last, 16) do
		data[cp] = v
	end
end

-- Collapse a code-point-indexed table into runs of equal values.
--
-- @param arr                table indexed by code point (0..CODEPOINT_MAX)
-- @param max_singles_group  longest run still emitted as individual entries
--                           (defaults to 1)
-- @return singles  list of { codepoint, value }
-- @return ranges   list of { first, last, value } for runs longer than
--                  max_singles_group
-- Runs whose value is nil (gaps in `arr`) are not emitted at all.
local function gather_singles_ranges(arr, max_singles_group)
	local threshold = max_singles_group or 1
	local singles, ranges = {}, {}
	local run_start, run_val = nil, nil

	-- Scan one index past CODEPOINT_MAX so that a change of value is seen
	-- there and the final run gets flushed.
	for cp = 0, CODEPOINT_MAX + 1 do
		local cur = arr[cp]
		if cur ~= run_val then
			-- The run [run_start, cp - 1] has just ended.
			if run_start ~= nil and run_val ~= nil then
				if cp - run_start <= threshold then
					for single = run_start, cp - 1 do
						singles[#singles + 1] = { single, run_val }
					end
				else
					ranges[#ranges + 1] = { run_start, cp - 1, run_val }
				end
			end
			run_start, run_val = cp, cur
		elseif run_start == nil then
			-- Only happens at the very start, while still inside a leading gap.
			run_start = cp
		end
	end

	return singles, ranges
end

-- Build a map from character to canonical combining class, read from the raw
-- DerivedCombiningClass.txt page. Characters of class 0 are left out.
function export.combining_classes()
	local classes = {}
	local source = "Module:Unicode data/raw/DerivedCombiningClass.txt"
	for cp, class in iterate_unicode_data(source) do
		local ccc = tonumber(class)
		if ccc ~= 0 then
			add(classes, cp, ccc)
		end
	end
	return classes
end

-- output goes to [[Module:Unicode data/combining classes]]
-- Returns the Lua source of that data module: one `["<escaped char>"] = class`
-- row per character, ordered by code point.
function export.combining_classes_formatted()
	local entries = {}
	for ch, class in pairs(export.combining_classes()) do
		entries[#entries + 1] = { u_codepoint(ch), class }
	end
	table_sort(entries, function (a, b) return a[1] < b[1] end)

	local rows = {}
	for i = 1, #entries do
		local codepoint, class = entries[i][1], entries[i][2]
		rows[i] = format("\t[\"%s\"] = %d,\t-- U+%04X", escape(u(codepoint)), class, codepoint)
	end

	local header = "-- Generated by [[Module:Unicode data/build]] combining_classes_formatted (Unicode version " .. unicode_ver .. ")\n"
	return header .. "return {\n" .. table_concat(rows, "\n") .. "\n}"
end

-- Read the raw Blocks.txt page and return the Unicode blocks in file order.
--
-- @return list of { first_codepoint, last_codepoint, block_name }, with the
--         code points as numbers.
function export.blocks()
	local data = {}
	for cp, block_name in iterate_unicode_data("Module:Unicode data/raw/Blocks.txt") do
		local cp1, cp2 = match(cp, "^(.-)%.%.(.*)$")
		if cp1 then
			cp1, cp2 = tonumber(cp1, 16), tonumber(cp2, 16)
		else
			-- A lone code point is a block of length one. (This previously
			-- read an undefined variable `k` instead of `cp`.)
			cp1 = tonumber(cp, 16)
			cp2 = cp1
		end
		table_insert(data, { cp1, cp2, block_name })
	end
	return data
end

-- output goes to [[Module:Unicode data/blocks]]
-- Returns the Lua source of that data module: one `{ first, last, "name" }`
-- row per block, followed by a `length` field holding the number of rows.
function export.blocks_formatted()
	local rows = {}
	for i, block in ipairs(export.blocks()) do
		rows[i] = format("\t{ 0x%06X, 0x%06X, %-60s },", block[1], block[2], make_lua_string(block[3]))
	end

	local header = "-- Generated by [[Module:Unicode data/build]] blocks_formatted (Unicode version " .. unicode_ver .. ")\n"
	local body = "local blocks = {\n" .. table_concat(rows, "\n") .. "\n}\n"
	return header .. body .. "blocks.length = #blocks\n\nreturn blocks"
end

-- Read the raw NameAliases.txt page.
--
-- @return table mapping numeric code point -> { [alias] = alias_type }
-- Raises an error if the same alias is listed twice for one code point.
function export.name_aliases()
	local aliases_by_cp = {}
	for cp, alias, tp in iterate_unicode_data("Module:Unicode data/raw/NameAliases.txt") do
		local codepoint = tonumber(cp, 16)
		local entry = aliases_by_cp[codepoint]
		if entry ~= nil then
			if entry[alias] ~= nil then
				error(format("Alias %s specified more than once in the data", dump(alias)))
			end
		else
			entry = {}
			aliases_by_cp[codepoint] = entry
		end
		entry[alias] = tp
	end
	return aliases_by_cp
end

-- output goes to [[Module:Unicode data/aliases]]
-- Returns the Lua source of that data module. Alias types are emitted as bare
-- identifiers, which the generated header declares as local string variables;
-- an alias type missing from `defnames` is therefore an error.
function export.name_aliases_formatted()
	local data = export.name_aliases()

	local defnames = { "correction", "control", "alternate", "figment", "abbreviation" }
	local has_def, defstrings = {}, {}
	for i, name in ipairs(defnames) do
		has_def[name] = true
		defstrings[i] = make_lua_string(name)
	end
	local defs = "local " .. table_concat(defnames, ", ") .. " =\n\t" .. table_concat(defstrings, ", ")

	-- Walk the code points in numeric order so the output is sorted.
	local rows = {}
	for cp = 0, CODEPOINT_MAX do
		local aliases = data[cp]
		if aliases then
			local subrows = {}
			for alias, kind in pairs(aliases) do
				if not has_def[kind] then
					error(format("Missing string '%s' from defnames", kind))
				end
				subrows[#subrows + 1] = format('\t\t{ %12s, %s };', kind, make_lua_string(alias))
			end
			-- pairs() order is unspecified; sort for a stable result.
			table_sort(subrows)
			rows[#rows + 1] = format("\t[0x%06X] = {\n%s\n\t};", cp, table_concat(subrows, "\n"))
		end
	end

	local header = "-- Generated by [[Module:Unicode data/build]] name_aliases_formatted (Unicode version " .. unicode_ver .. ")\n"
	return header .. defs .. "\n\nreturn {\n" .. table_concat(rows, "\n") .. "\n}"
end

-- Collect the Unicode script property for every code point.
--
-- Sources:
--   https://www.unicode.org/Public/UNIDATA/Scripts.txt (script names),
--   https://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt (script
--   codes corresponding to the names; see [[Script (Unicode)]]),
--   https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt (script
--   extensions; not currently used).
--
-- @return table with
--   singles = list of { codepoint, script_code }
--   ranges  = list of { first, last, script_code }
--   aliases = map script_code -> script name with "_" replaced by spaces
function export.scripts()
	local code_to_name = {}
	local name_to_code = {}
	for kind, sc, name in iterate_unicode_data("Module:Unicode data/raw/PropertyValueAliases.txt") do
		if kind == "sc" then
			local display_name = gsub(name, "_", " ")
			code_to_name[sc] = display_name
			name_to_code[name] = sc
		end
	end

	-- Scripts.txt uses the long names; store the short codes instead.
	local script_of = {}
	for cp, script in iterate_unicode_data("Module:Unicode data/raw/Scripts.txt") do
		add_num(script_of, cp, name_to_code[script])
	end

	local singles, ranges = gather_singles_ranges(script_of, 2)
	local result = {}
	result.singles = singles
	result.ranges = ranges
	result.aliases = code_to_name
	return result
end

-- output goes to [[Module:Unicode data/scripts]]
-- Returns the Lua source of that data module, with three groups: `singles`,
-- `ranges` and `aliases` (sorted by script code).
function export.scripts_formatted()
	local data = export.scripts()
	local groups = {}

	local single_rows = {}
	for i, single in ipairs(data.singles) do
		single_rows[i] = format('\t\t[0x%05X] = "%s",', single[1], single[2])
	end
	groups[1] = "\tsingles = {\n" .. table_concat(single_rows, "\n") .. "\n\t},"

	local range_rows = {}
	for i, range in ipairs(data.ranges) do
		range_rows[i] = format('\t\t{ 0x%05X, 0x%05X, "%s" },', range[1], range[2], range[3])
	end
	groups[2] = "\tranges = {\n" .. table_concat(range_rows, "\n") .. "\n\t},"

	-- pairs() order is unspecified, so sort the script codes first.
	local codes = {}
	for code in pairs(data.aliases) do
		codes[#codes + 1] = code
	end
	table_sort(codes)
	local alias_rows = {}
	for i, code in ipairs(codes) do
		alias_rows[i] = format('\t\t%s = %s,', code, make_lua_string(data.aliases[code]))
	end
	groups[3] = "\t--Scripts.txt gives full names; here we consider them aliases to save space.\n\taliases = {\n" .. table_concat(alias_rows, "\n") .. "\n\t},"

	local header = "-- Generated by [[Module:Unicode data/build]] scripts_formatted (Unicode version " .. unicode_ver .. ")\n\n"
	local footer = [=[
-- Required for binary search function in [[Module:Language/scripts]].
-- Cannot get length of module loaded with mw.loadData.
data.ranges.length = #data.ranges

return data
]=]
	return header .. "local data = {\n" .. table_concat(groups, "\n\n") .. "\n}\n\n" .. footer
end

-- Collect the General_Category property for every code point.
--
-- @param kind_filter  optional table; when given, only category codes `gc`
--                     for which kind_filter[gc] is truthy are kept
-- @return table with
--   singles    = list of { codepoint, gc }
--   ranges     = list of { first, last, gc }
--   long_names = map gc -> long name from PropertyValueAliases.txt
function export.category(kind_filter)
	local function wanted(gc)
		return not kind_filter or kind_filter[gc]
	end

	local long_names = {}
	for kind, gc, name in iterate_unicode_data("Module:Unicode data/raw/PropertyValueAliases.txt") do
		if kind == "gc" and wanted(gc) then
			long_names[gc] = name
		end
	end

	local category_of = {}
	for cp, gc in iterate_unicode_data("Module:Unicode data/raw/DerivedGeneralCategory.txt") do
		if wanted(gc) then
			add_num(category_of, cp, gc)
		end
	end

	local singles, ranges = gather_singles_ranges(category_of, 1)
	local result = {}
	result.singles = singles
	result.ranges = ranges
	result.long_names = long_names
	return result
end

-- output goes to [[Module:Unicode data/category]]
-- Returns the Lua source of that data module, with three groups: `singles`,
-- `ranges` and `long_names` (sorted by category code).
function export.category_formatted()
	local data = export.category()
	
	local formatted_groups = {}
	local formatted_rows
	
	formatted_rows = {}
	for _, tuple in ipairs(data.singles) do
		table_insert(formatted_rows, format('\t\t[0x%05X] = "%s",', unpack(tuple)))
	end
	table_insert(formatted_groups, "\tsingles = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")
	
	formatted_rows = {}
	for _, tuple in ipairs(data.ranges) do
		table_insert(formatted_rows, format('\t\t{ 0x%06X, 0x%06X, "%s" },', unpack(tuple)))
	end
	table_insert(formatted_groups, "\tranges = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")
	
	formatted_rows = {}
	local long_name_pairs = {}
	for k, v in pairs(data.long_names) do
		table_insert(long_name_pairs, { k, v })
	end
	-- pairs() order is unspecified; sort by category code for stable output.
	table_sort(long_name_pairs, function (a, b) return a[1] < b[1] end)
	for _, tuple in ipairs(long_name_pairs) do
		-- "%-2s" left-justifies the code in two columns. The former "%-02s"
		-- also set the "0" flag, which is meaningless for %s and is rejected
		-- as an invalid conversion by newer Lua versions.
		table_insert(formatted_rows, format('\t\t%-2s = %s,', tuple[1], make_lua_string(tuple[2])))
	end
	table_insert(formatted_groups, "\tlong_names = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")
	
	return "-- Generated by [[Module:Unicode data/build]] category_formatted (Unicode version " .. unicode_ver .. ")\n\nlocal data = {\n" .. table_concat(formatted_groups, "\n\n") .. "\n}\n\n" .. [=[
data.ranges.length = #data.ranges
return data
]=]
end

-- output goes to [[Module:Unicode data/control]]
-- Returns the Lua source of that data module. Only the categories listed in
-- `control_classes` are included. Category codes are emitted as bare
-- identifiers; the generated header declares them as local variables holding
-- the descriptive label, one `local` statement per first letter of the code.
function export.control_formatted()
	local control_classes = {
		{ "Cc", "control" },
		{ "Cf", "format" },
		{ "Cs", "surrogate" },
		{ "Co", "private-use" },
		{ "Cn", "unassigned" },
		{ "Zs", "space-separator" },
		{ "Zl", "line-separator" },
		{ "Zp", "paragraph-separator" },
	}

	local wanted = {}    -- category code -> label; also the filter for export.category
	local letters = {}   -- first letters, in order of first appearance
	local by_letter = {} -- first letter -> { vars = codes, values = quoted labels }

	for _, pair in ipairs(control_classes) do
		local code, label = pair[1], pair[2]
		wanted[code] = label
		local letter = sub(code, 1, 1)
		local bucket = by_letter[letter]
		if not bucket then
			bucket = { vars = {}, values = {} }
			by_letter[letter] = bucket
			letters[#letters + 1] = letter
		end
		bucket.vars[#bucket.vars + 1] = code
		bucket.values[#bucket.values + 1] = make_lua_string(label)
	end

	local defs = {}
	for i, letter in ipairs(letters) do
		local bucket = by_letter[letter]
		defs[i] = "local " .. table_concat(bucket.vars, ", ") .. " =\n\t" .. table_concat(bucket.values, ", ")
	end
	local header = table_concat(defs, "\n") .. "\n"

	local data = export.category(wanted)

	local single_rows = {}
	for i, single in ipairs(data.singles) do
		single_rows[i] = format('\t\t[0x%06X] = %s,', single[1], single[2])
	end

	local range_rows = {}
	for i, range in ipairs(data.ranges) do
		range_rows[i] = format('\t\t{ 0x%06X, 0x%06X, %s },', range[1], range[2], range[3])
	end

	local groups = {
		"\tsingles = {\n" .. table_concat(single_rows, "\n") .. "\n\t};",
		"\tranges = {\n" .. table_concat(range_rows, "\n") .. "\n\t};",
	}

	local banner = "-- Generated by [[Module:Unicode data/build]] control_formatted (Unicode version " .. unicode_ver .. ")\n"
	local footer = [=[
data.ranges.length = #data.ranges
return data
]=]
	return banner .. header .. "\nlocal data = {\n" .. table_concat(groups, "\n\n") .. "\n}\n\n" .. footer
end

return export