وحدة:Unicode data/build
المظهر
local export = {}
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local m_str_utils = require("Module:string utilities")
local byte = string.byte
local dump = mw.dumpObject
local format = string.format
local gmatch = m_str_utils.gmatch
local gsplit = m_str_utils.gsplit
local gsub = string.gsub
local match = string.match
local sub = string.sub
local table_concat = table.concat
local table_insert = table.insert
local table_sort = table.sort
local tonumber = tonumber
local trim = m_str_utils.trim
local u = m_str_utils.char
local u_codepoint = mw.ustring.codepoint
-- To update Unicode data:
-- 1. Update all raw data files (see [[Special:PrefixIndex/Module:Unicode_data/raw/]])
-- 2. Update unicode_ver below
-- 3. Run the _formatted functions, copy the return values of each to the corresponding data modules, and save them
-- 4. Update Unicode_version in [[Module:character list]] and [[Appendix:Unicode]]
-- Version string embedded in the "Generated by ..." header line that every
-- *_formatted function in this module emits.
local unicode_ver = "17.0" -- update as needed once raw data files have been updated.
-- Upper bound of the code point scan performed by gather_singles_ranges
-- (0x10FFFF, the last code point of the Unicode code space).
local CODEPOINT_MAX = 0x10FFFF
-- Fetch the unparsed wikitext of a page.
--
-- mod: full page title, e.g. "Module:Unicode data/raw/Blocks.txt".
-- Returns the page content as a string.
-- Raises an error naming the page when the title is invalid or the page has
-- no content, instead of letting a nil leak into the line iterator and fail
-- there with an unrelated message.
local function get_text(mod)
	local title = mw.title.new(mod)
	if title == nil then
		error(format("Invalid page title: %s", tostring(mod)))
	end
	local content = title:getContent()
	if content == nil then
		error(format("Page %s does not exist or has no content", tostring(mod)))
	end
	return content
end
-- Trim a raw field and map the empty string to nil.
--
-- val: one field of a data line.
-- Returns the trimmed field, or nil when nothing is left after trimming.
local function normalize_value(val)
	local trimmed = trim(val)
	if trimmed == "" then
		return nil
	end
	return trimmed
end
-- Split a data line on ";" and return every field as a separate value.
--
-- line: one line of a raw data file (comments already removed by the caller).
-- Returns the fields in order, each passed through normalize_value, so an
-- empty field comes back as nil in its position.
local function unpack_line(line)
	local fields, count = {}, 0
	local rest = line
	while true do
		local head, tail = match(rest, "^([^;]*);(.*)$")
		if not head then
			break
		end
		count = count + 1
		fields[count] = normalize_value(head)
		rest = tail
	end
	-- Whatever follows the last ";" (or the whole line if there was none).
	count = count + 1
	fields[count] = normalize_value(rest)
	-- Explicit bounds so nil fields in the middle are preserved.
	return unpack(fields, 1, count)
end
-- Iterate over the non-empty lines of a page.
--
-- mod: full page title passed on to get_text.
-- Returns whatever gmatch returns for the pattern "[^\n]+", i.e. an iterator
-- over maximal runs of non-newline characters.
local function iterate_lines(mod)
	local text = get_text(mod)
	return gmatch(text, "[^\n]+")
end
-- Iterate over the data records of a raw Unicode data page.
--
-- mod: full page title of the raw data file.
-- Returns an iterator function. Each call strips everything from the first
-- "#" on the next line, skips the line if nothing but whitespace remains, and
-- otherwise returns its ";"-separated fields (see unpack_line). Returns nil
-- when the page is exhausted.
--
-- Whitespace-only lines must be skipped here: unpack_line would turn them
-- into a single nil, and a nil first value ends a generic `for` loop, which
-- would silently drop the rest of the file.
-- NOTE: a record whose first field is empty still yields nil first and ends a
-- generic `for` early; the data files are assumed not to contain such lines.
local function iterate_unicode_data(mod)
	local next_line = iterate_lines(mod)
	return function()
		while true do
			local line = next_line()
			if line == nil then
				return nil
			end
			-- Parenthesis-free assignment keeps only gsub's first result.
			local content = gsub(line, "#.*", "")
			if match(content, "%S") then
				return unpack_line(content)
			end
		end
	end
end
-- Bytes that have a short backslash escape in Lua source, mapped to the
-- escape text to emit.
local escapes = {
	["\a"] = "\\a",
	["\b"] = "\\b",
	["\t"] = "\\t",
	["\n"] = "\\n",
	["\v"] = "\\v",
	["\f"] = "\\f",
	["\r"] = "\\r",
	["\""] = "\\\"",
	["\\"] = "\\\\",
}
-- Return the Lua-source escape for a single byte.
--
-- ch: a one-byte string.
-- Returns the short escape from `escapes` when there is one, otherwise a
-- backslash followed by the byte value as three decimal digits.
local function escape_byte(ch)
	local short = escapes[ch]
	if short then
		return short
	end
	return format("\\%03d", byte(ch))
end
-- Escape a string so that it contains only printable ASCII.
--
-- str: arbitrary byte string.
-- Returns str with every control character, double quote, backslash and byte
-- in the range 128-255 replaced by its escape_byte form. Only the substituted
-- string is returned, not gsub's replacement count.
local function escape(str)
	local escaped = gsub(str, "[%c\"\\\128-\255]", escape_byte)
	return escaped
end
-- Render a string as a double-quoted Lua string literal.
--
-- str: the string to quote.
-- Returns str wrapped in double quotes, with control characters, double
-- quotes and backslashes replaced via escape_byte. Bytes 128-255 are left
-- as they are (unlike escape).
local function make_lua_string(str)
	local body = gsub(str, "[%c\"\\]", escape_byte)
	return '"' .. body .. '"'
end
-- Store a value for one code point or for every code point of a range,
-- keyed by the result of u(code point).
--
-- data: table to fill.
-- k: hexadecimal code point ("0301") or inclusive range ("0300..0314").
-- v: value to store under each key.
local function add(data, k, v)
	local first, last = match(k, "^(.-)%.%.(.*)$")
	if not first then
		-- No ".." separator: a single code point.
		data[u(tonumber(k, 16))] = v
		return
	end
	for cp = tonumber(first, 16), tonumber(last, 16) do
		data[u(cp)] = v
	end
end
-- Store a value for one code point or for every code point of a range,
-- keyed by the numeric code point.
--
-- data: table to fill.
-- k: hexadecimal code point ("1F600") or inclusive range ("0041..005A").
-- v: value to store under each key.
local function add_num(data, k, v)
	local lo, hi = match(k, "^(.-)%.%.(.*)$")
	if lo then
		lo, hi = tonumber(lo, 16), tonumber(hi, 16)
	else
		-- No ".." separator: treat the single code point as a one-element range.
		lo = tonumber(k, 16)
		hi = lo
	end
	for cp = lo, hi do
		data[cp] = v
	end
end
-- Compress a code-point-indexed table into single entries and ranges.
--
-- arr: table indexed by numeric code point (0 .. CODEPOINT_MAX); missing
--      entries are nil.
-- max_singles_group: longest run of equal values that is still written out
--      as individual entries (defaults to 1).
-- Returns two arrays:
--   singles: { code point, value } pairs for short runs;
--   ranges:  { first, last, value } triples for longer runs.
-- Runs whose value is nil are not emitted.
local function gather_singles_ranges(arr, max_singles_group)
	max_singles_group = max_singles_group or 1
	local singles, ranges = {}, {}
	local run_start, run_val = nil, nil
	-- The scan goes one index past CODEPOINT_MAX so that a run reaching the
	-- last code point is still flushed when the value changes there.
	for cp = 0, CODEPOINT_MAX + 1 do
		local val = arr[cp]
		if val ~= run_val then
			-- The previous run ended at cp - 1; emit it unless it was a gap.
			if run_start ~= nil and run_val ~= nil then
				if cp - run_start > max_singles_group then
					ranges[#ranges + 1] = { run_start, cp - 1, run_val }
				else
					for member = run_start, cp - 1 do
						singles[#singles + 1] = { member, run_val }
					end
				end
			end
			run_start, run_val = cp, val
		elseif run_start == nil then
			-- Only reachable for the leading nil run before any value is seen.
			run_start = cp
		end
	end
	return singles, ranges
end
-- Build the table of non-zero canonical combining classes.
--
-- Reads DerivedCombiningClass.txt and returns a table mapping u(code point)
-- to the numeric class; entries whose class is 0 are left out.
function export.combining_classes()
	local classes = {}
	for cp, class in iterate_unicode_data("Module:Unicode data/raw/DerivedCombiningClass.txt") do
		local ccc = tonumber(class)
		if ccc ~= 0 then
			add(classes, cp, ccc)
		end
	end
	return classes
end
-- Generate the source of [[Module:Unicode data/combining classes]].
--
-- Returns Lua source text: a "Generated by" header line followed by a table
-- constructor with one `["<escaped char>"] = <class>,` row per character,
-- ordered by code point, each annotated with its U+XXXX value.
function export.combining_classes_formatted()
	local entries = {}
	for ch, class in pairs(export.combining_classes()) do
		entries[#entries + 1] = { cp = u_codepoint(ch), class = class }
	end
	-- pairs() order is unspecified, so order explicitly by code point.
	table_sort(entries, function (a, b) return a.cp < b.cp end)
	local rows = {}
	for i, entry in ipairs(entries) do
		rows[i] = format("\t[\"%s\"] = %d,\t-- U+%04X", escape(u(entry.cp)), entry.class, entry.cp)
	end
	local header = "-- Generated by [[Module:Unicode data/build]] combining_classes_formatted (Unicode version " .. unicode_ver .. ")\n"
	return header .. "return {\n" .. table_concat(rows, "\n") .. "\n}"
end
-- Parse Blocks.txt into a list of block ranges.
--
-- Returns an array, in file order, of { first, last, block_name } where first
-- and last are numeric code points (inclusive). A record that gives a single
-- code point instead of a "first..last" range produces first == last.
function export.blocks()
	local data = {}
	for cp, block_name in iterate_unicode_data("Module:Unicode data/raw/Blocks.txt") do
		local first, last = match(cp, "^(.-)%.%.(.*)$")
		if first then
			first, last = tonumber(first, 16), tonumber(last, 16)
		else
			-- Single code point: parse the field itself. (This branch used to
			-- read an undefined variable `k` and raised instead of parsing.)
			first = tonumber(cp, 16)
			last = first
		end
		table_insert(data, { first, last, block_name })
	end
	return data
end
-- Generate the source of [[Module:Unicode data/blocks]].
--
-- Returns Lua source text that defines a local `blocks` array with one
-- `{ first, last, "name" }` row per block from export.blocks(), records its
-- length in `blocks.length`, and returns it.
function export.blocks_formatted()
	local rows = {}
	for i, block in ipairs(export.blocks()) do
		rows[i] = format("\t{ 0x%06X, 0x%06X, %-60s },", block[1], block[2], make_lua_string(block[3]))
	end
	local header = "-- Generated by [[Module:Unicode data/build]] blocks_formatted (Unicode version " .. unicode_ver .. ")\n"
	return header .. "local blocks = {\n" .. table_concat(rows, "\n") .. "\n}\nblocks.length = #blocks\n\nreturn blocks"
end
-- Parse NameAliases.txt.
--
-- Returns a table mapping numeric code point to a table of
-- { [alias] = alias type }.
-- Raises an error if the same alias is listed twice for one code point.
function export.name_aliases()
	local by_codepoint = {}
	for cp, alias, tp in iterate_unicode_data("Module:Unicode data/raw/NameAliases.txt") do
		local cp_num = tonumber(cp, 16)
		local aliases = by_codepoint[cp_num]
		if aliases == nil then
			aliases = {}
			by_codepoint[cp_num] = aliases
		end
		-- A freshly created table is empty, so this only fires on a repeat.
		if aliases[alias] ~= nil then
			error(format("Alias %s specified more than once in the data", dump(alias)))
		end
		aliases[alias] = tp
	end
	return by_codepoint
end
-- Generate the source of [[Module:Unicode data/aliases]].
--
-- The output first declares one local variable per alias type (named after
-- the type and holding its name as a string), then returns a table with one
-- `[0xXXXXXX] = { { type, "alias" }; ... };` entry per code point. Alias rows
-- refer to the type by that variable name, not by a string literal.
-- Raises an error if the data uses an alias type missing from `defnames`.
function export.name_aliases_formatted()
	local data = export.name_aliases()
	local defnames = { "correction", "control", "alternate", "figment", "abbreviation" }
	local has_def, defstrings = {}, {}
	for i, name in ipairs(defnames) do
		has_def[name] = true
		defstrings[i] = make_lua_string(name)
	end
	local defs = "local " .. table_concat(defnames, ", ") .. " =\n\t" .. table_concat(defstrings, ", ")
	local formatted_rows = {}
	-- Walk the code points in numeric order so the output is sorted.
	for cp = 0, 0x10FFFF do
		local aliases = data[cp]
		if aliases then
			local subrows = {}
			for alias, alias_type in pairs(aliases) do
				if not has_def[alias_type] then
					error(format("Missing string '%s' from defnames", alias_type))
				end
				subrows[#subrows + 1] = format('\t\t{ %12s, %s };', alias_type, make_lua_string(alias))
			end
			-- pairs() order is unspecified; sort the rendered rows for stable output.
			table_sort(subrows)
			formatted_rows[#formatted_rows + 1] = format("\t[0x%06X] = {\n%s\n\t};", cp, table_concat(subrows, "\n"))
		end
	end
	local header = "-- Generated by [[Module:Unicode data/build]] name_aliases_formatted (Unicode version " .. unicode_ver .. ")\n"
	return header .. defs .. "\n\nreturn {\n" .. table_concat(formatted_rows, "\n") .. "\n}"
end
-- Build the script data for every code point.
--
-- Sources (raw copies of the Unicode data files):
--   * PropertyValueAliases.txt: the "sc" records pair each script code with
--     its long name;
--   * Scripts.txt: assigns a long script name to code points and ranges.
-- Script extensions (ScriptExtensions.txt) are not currently used.
--
-- Returns a table with:
--   singles: { code point, script code } pairs (runs of at most 2);
--   ranges:  { first, last, script code } triples;
--   aliases: script code -> long name with "_" replaced by a space.
function export.scripts()
	local code_to_name = {}
	local name_to_code = {}
	for kind, sc, name in iterate_unicode_data("Module:Unicode data/raw/PropertyValueAliases.txt") do
		if kind == "sc" then
			-- Keep only gsub's first result (the string), not the count.
			local display_name = gsub(name, "_", " ")
			code_to_name[sc] = display_name
			name_to_code[name] = sc
		end
	end
	local cp_sc = {}
	for cp, script in iterate_unicode_data("Module:Unicode data/raw/Scripts.txt") do
		-- Scripts.txt uses long names; store the short code instead.
		add_num(cp_sc, cp, name_to_code[script])
	end
	local singles, ranges = gather_singles_ranges(cp_sc, 2)
	local result = {}
	result.singles = singles
	result.ranges = ranges
	result.aliases = code_to_name
	return result
end
-- Generate the source of [[Module:Unicode data/scripts]].
--
-- Returns Lua source text defining a local `data` table with three groups
-- (`singles`, `ranges`, `aliases`) built from export.scripts(), followed by a
-- footer that stores `#data.ranges` in `data.ranges.length` and returns the
-- table.
function export.scripts_formatted()
	local data = export.scripts()

	local single_rows = {}
	for i, entry in ipairs(data.singles) do
		single_rows[i] = format('\t\t[0x%05X] = "%s",', entry[1], entry[2])
	end

	local range_rows = {}
	for i, entry in ipairs(data.ranges) do
		range_rows[i] = format('\t\t{ 0x%05X, 0x%05X, "%s" },', entry[1], entry[2], entry[3])
	end

	-- pairs() order is unspecified, so sort the aliases by script code.
	local alias_pairs = {}
	for code, name in pairs(data.aliases) do
		alias_pairs[#alias_pairs + 1] = { code, name }
	end
	table_sort(alias_pairs, function (a, b) return a[1] < b[1] end)
	local alias_rows = {}
	for i, pair in ipairs(alias_pairs) do
		alias_rows[i] = format('\t\t%s = %s,', pair[1], make_lua_string(pair[2]))
	end

	local formatted_groups = {
		"\tsingles = {\n" .. table_concat(single_rows, "\n") .. "\n\t},",
		"\tranges = {\n" .. table_concat(range_rows, "\n") .. "\n\t},",
		"\t--Scripts.txt gives full names; here we consider them aliases to save space.\n\taliases = {\n" .. table_concat(alias_rows, "\n") .. "\n\t},",
	}

	local header = "-- Generated by [[Module:Unicode data/build]] scripts_formatted (Unicode version " .. unicode_ver .. ")\n\n"
	-- Trailing code of the generated module, written out line by line.
	local footer = "-- Required for binary search function in [[Module:Language/scripts]].\n"
		.. "-- Cannot get length of module loaded with mw.loadData.\n"
		.. "data.ranges.length = #data.ranges\n"
		.. "return data\n"
	return header .. "local data = {\n" .. table_concat(formatted_groups, "\n\n") .. "\n}\n\n" .. footer
end
-- Build the General_Category data for every code point.
--
-- kind_filter: optional table; when given, only category codes `gc` for which
--   kind_filter[gc] is truthy are kept (both in the long-name list and in the
--   code point data). When omitted, everything is kept.
--
-- Returns a table with:
--   singles:    { code point, gc } pairs (runs of length 1);
--   ranges:     { first, last, gc } triples;
--   long_names: gc -> long name, from the "gc" records of
--               PropertyValueAliases.txt.
function export.category(kind_filter)
	local function is_wanted(gc)
		return not kind_filter or kind_filter[gc]
	end

	local long_names = {}
	for kind, gc, name in iterate_unicode_data("Module:Unicode data/raw/PropertyValueAliases.txt") do
		if kind == "gc" and is_wanted(gc) then
			long_names[gc] = name
		end
	end

	local cp_gc = {}
	for cp, gc in iterate_unicode_data("Module:Unicode data/raw/DerivedGeneralCategory.txt") do
		if is_wanted(gc) then
			add_num(cp_gc, cp, gc)
		end
	end

	local singles, ranges = gather_singles_ranges(cp_gc, 1)
	local result = {}
	result.singles = singles
	result.ranges = ranges
	result.long_names = long_names
	return result
end
-- Generate the source of [[Module:Unicode data/category]].
--
-- Returns Lua source text defining a local `data` table with three groups
-- (`singles`, `ranges`, `long_names`) built from export.category(), followed
-- by a footer that stores `#data.ranges` in `data.ranges.length` and returns
-- the table.
function export.category_formatted()
	local data = export.category()
	local formatted_groups = {}
	local formatted_rows

	formatted_rows = {}
	for _, tuple in ipairs(data.singles) do
		table_insert(formatted_rows, format('\t\t[0x%05X] = "%s",', unpack(tuple)))
	end
	table_insert(formatted_groups, "\tsingles = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")

	formatted_rows = {}
	for _, tuple in ipairs(data.ranges) do
		table_insert(formatted_rows, format('\t\t{ 0x%06X, 0x%06X, "%s" },', unpack(tuple)))
	end
	table_insert(formatted_groups, "\tranges = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")

	formatted_rows = {}
	-- pairs() order is unspecified, so sort the long names by category code.
	local long_name_pairs = {}
	for k, v in pairs(data.long_names) do
		table_insert(long_name_pairs, { k, v })
	end
	table_sort(long_name_pairs, function (a, b) return a[1] < b[1] end)
	for _, tuple in ipairs(long_name_pairs) do
		-- Left-justify the code to width 2. The former "%-02s" combined the
		-- "0" flag with %s, which has no defined meaning and is rejected by
		-- newer Lua versions; "%-2s" yields the same text.
		table_insert(formatted_rows, format('\t\t%-2s = %s,', tuple[1], make_lua_string(tuple[2])))
	end
	table_insert(formatted_groups, "\tlong_names = {\n" .. table_concat(formatted_rows, "\n") .. "\n\t},")

	local header = "-- Generated by [[Module:Unicode data/build]] category_formatted (Unicode version " .. unicode_ver .. ")\n\n"
	local footer = "data.ranges.length = #data.ranges\nreturn data\n"
	return header .. "local data = {\n" .. table_concat(formatted_groups, "\n\n") .. "\n}\n\n" .. footer
end
-- Generate the source of [[Module:Unicode data/control]].
--
-- Only the general categories listed in `control_classes` are included. The
-- output starts with `local` declarations, one statement per first letter of
-- the category code, that bind each code (e.g. Cc) to its descriptive string;
-- the `singles` and `ranges` rows then reference those variables by name
-- rather than repeating the string. A footer stores `#data.ranges` in
-- `data.ranges.length` and returns the table.
function export.control_formatted()
	local control_classes = {
		{ "Cc", "control" },
		{ "Cf", "format" },
		{ "Cs", "surrogate" },
		{ "Co", "private-use" },
		{ "Cn", "unassigned" },
		{ "Zs", "space-separator" },
		{ "Zl", "line-separator" },
		{ "Zp", "paragraph-separator" },
	}

	-- Group the classes by the first letter of their code, remembering the
	-- order in which each letter first appears.
	local filter = {}
	local letters = {}
	local group_by_letter = {}
	for _, pair in ipairs(control_classes) do
		local code, label = pair[1], pair[2]
		filter[code] = label
		local letter = sub(code, 1, 1)
		local group = group_by_letter[letter]
		if not group then
			group = { vars = {}, values = {} }
			group_by_letter[letter] = group
			letters[#letters + 1] = letter
		end
		group.vars[#group.vars + 1] = code
		group.values[#group.values + 1] = make_lua_string(label)
	end

	-- One multiple-assignment `local` statement per letter group.
	local declarations = {}
	for i, letter in ipairs(letters) do
		local group = group_by_letter[letter]
		declarations[i] = "local " .. table_concat(group.vars, ", ") .. " =\n\t" .. table_concat(group.values, ", ")
	end
	local header = table_concat(declarations, "\n") .. "\n"

	local data = export.category(filter)

	local single_rows = {}
	for i, entry in ipairs(data.singles) do
		single_rows[i] = format('\t\t[0x%06X] = %s,', entry[1], entry[2])
	end
	local range_rows = {}
	for i, entry in ipairs(data.ranges) do
		range_rows[i] = format('\t\t{ 0x%06X, 0x%06X, %s },', entry[1], entry[2], entry[3])
	end
	local formatted_groups = {
		"\tsingles = {\n" .. table_concat(single_rows, "\n") .. "\n\t};",
		"\tranges = {\n" .. table_concat(range_rows, "\n") .. "\n\t};",
	}

	local banner = "-- Generated by [[Module:Unicode data/build]] control_formatted (Unicode version " .. unicode_ver .. ")\n"
	local footer = "data.ranges.length = #data.ranges\nreturn data\n"
	return banner .. header .. "\nlocal data = {\n" .. table_concat(formatted_groups, "\n\n") .. "\n}\n\n" .. footer
end
return export