-- Module:Unicode data/build regex
-- Table of functions exported by this module (returned at the end of the file).
local export = {}
-- Iterator factory loaded from Module:string/gline; used below to walk the
-- content of a wiki page (presumably one line per step -- confirm in that module).
local gline = require("Module:string/gline")
-- Local alias for table.insert.
local table_insert = table.insert
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
--- Returns the gline iterator over the raw content of a wiki page.
-- @param mod Full page title, e.g. "Module:Unicode data/raw/DerivedName.txt".
-- @return Whatever gline() returns for the page content (used as a generic-for
--         iterator by the caller; presumably yields one line per step).
local function iterate_lines(mod)
	local title = mw.title.new(mod)
	local content = title.content
	return gline(content)
end
-- Code points that are always flagged in the generated character class, in
-- addition to the code points that have no entry in DerivedName.txt.
-- Each entry is either a single code point or an inclusive { first, last } pair.
local exclude_chars = {
	-- Private use characters (U+E000-U+F8FF, U+F0000-U+FFFFD & U+100000-U+10FFFD).
	{ 0xE000, 0xF8FF },
	{ 0xF0000, 0xFFFFD },
	{ 0x100000, 0x10FFFD },
	-- High and low surrogates (U+D800-U+DFFF).
	{ 0xD800, 0xDFFF },
	-- C1 control characters (U+0080-U+009F).
	{ 0x0080, 0x009F },
	-- Soft hyphen (U+00AD).
	0x00AD,
	-- Directionality formatting characters (U+202A-U+202E & U+2066-U+2069).
	{ 0x202A, 0x202E },
	{ 0x2066, 0x2069 },
	-- Invisible mathematical operators (U+2061-U+2064).
	{ 0x2061, 0x2064 },
	-- Deprecated formatting characters (U+206A-U+206F & U+FEFF).
	{ 0x206A, 0x206F },
	0xFEFF,
	-- Interlinear/rubytext annotation for internal processing (U+FFF9-U+FFFB).
	{ 0xFFF9, 0xFFFB },
	-- Language tags (U+E0001 & U+E0020-U+E007F).
	0xE0001,
	{ 0xE0020, 0xE007F },
	-- Noncharacters: U+FDD0-U+FDEF plus the last two code points of each of
	-- the 17 planes (U+nFFFE & U+nFFFF for n = 0x0-0x10).
	{ 0xFDD0, 0xFDEF },
	0xFFFE, 0xFFFF,
	0x1FFFE, 0x1FFFF,
	0x2FFFE, 0x2FFFF,
	0x3FFFE, 0x3FFFF,
	0x4FFFE, 0x4FFFF,
	0x5FFFE, 0x5FFFF,
	0x6FFFE, 0x6FFFF,
	0x7FFFE, 0x7FFFF,
	0x8FFFE, 0x8FFFF,
	0x9FFFE, 0x9FFFF,
	0xAFFFE, 0xAFFFF,
	0xBFFFE, 0xBFFFF,
	0xCFFFE, 0xCFFFF,
	0xDFFFE, 0xDFFFF,
	0xEFFFE, 0xEFFFF,
	0xFFFFE, 0xFFFFF,
	0x10FFFE, 0x10FFFF,
}
--- Flags every code point in the inclusive range [cp1, cp2] as true.
-- @param univ Table indexed by code point; modified in place.
-- @param cp1  First code point of the range.
-- @param cp2  Last code point of the range (nothing is written if cp2 < cp1).
local function add_range(univ, cp1, cp2)
	for codepoint = cp1, cp2 do
		univ[codepoint] = true
	end
end
--- Sets every code point in the inclusive range [cp1, cp2] to false.
-- The entries are kept in the table (as false) rather than deleted.
-- @param univ Table indexed by code point; modified in place.
-- @param cp1  First code point of the range.
-- @param cp2  Last code point of the range (nothing is written if cp2 < cp1).
local function remove_range(univ, cp1, cp2)
	for codepoint = cp1, cp2 do
		univ[codepoint] = false
	end
end
--- Clears the flag of every code point listed on a DerivedName.txt-style page.
-- @param data Table indexed by code point; listed code points are set to false.
-- @param mod  Title of the page holding the raw data.
local function clear_named_codepoints(data, mod)
	for line in iterate_lines(mod) do
		-- Data lines start with a hex code point ("0041") or a range
		-- ("3400..4DBF"); lines that start with anything else are skipped.
		local token = line:match("^(%x[%x.]+)")
		if token then
			local first, last = token:match("^(%x%x+)%.%.(%x%x+)")
			if last ~= nil then
				remove_range(data, tonumber(first, 16), tonumber(last, 16))
			else
				-- tonumber() returns nil for a token containing stray dots,
				-- so a malformed entry is ignored instead of being misparsed.
				local codepoint = tonumber(token, 16)
				if codepoint then
					data[codepoint] = false
				end
			end
		end
	end
end

--- Serializes the runs of flagged code points as character-class tokens.
-- A run of three or more becomes "\x{AAAA}-\x{BBBB}", a run of two becomes
-- two adjacent "\x{...}" escapes, and a single code point one escape.
-- @param data Table indexed by code point; truthy means "flagged".
-- @return Array of token strings in ascending code point order.
local function build_tokens(data)
	local tokens = {}
	local run_start = nil
	-- The loop deliberately runs one past U+10FFFF: index 0x110000 is never
	-- flagged, so a run that reaches the last code point is still flushed.
	for i = 0, 0x110000 do
		if data[i] then
			run_start = run_start or i
		elseif run_start ~= nil then
			local run_end = i - 1
			if run_start + 1 < run_end then
				table_insert(tokens, ("\\x{%04X}-\\x{%04X}"):format(run_start, run_end))
			elseif run_start < run_end then
				table_insert(tokens, ("\\x{%04X}\\x{%04X}"):format(run_start, run_end))
			else
				table_insert(tokens, ("\\x{%04X}"):format(run_start))
			end
			run_start = nil
		end
	end
	return tokens
end

--- Builds a bracketed character class of flagged code points.
-- A code point is flagged when it has no entry in
-- "Module:Unicode data/raw/DerivedName.txt", or when it is listed in
-- exclude_chars (which takes precedence over having a name).
-- @return A string of the form "[\x{0080}-\x{009F}\x{00AD}...]".
function export.make_regex()
	local data = {}
	-- Start with every code point flagged, then clear the ones that have a name.
	add_range(data, 0, 0x10FFFF)
	clear_named_codepoints(data, "Module:Unicode data/raw/DerivedName.txt")
	-- Re-flag the explicitly excluded code points, named or not.
	for _, range in ipairs(exclude_chars) do
		if type(range) == "table" then
			add_range(data, unpack(range))
		else
			add_range(data, range, range)
		end
	end
	return "[" .. table.concat(build_tokens(data), "") .. "]"
end
return export