انتقل إلى المحتوى

وحدة:Unicode data/build regex

من ويكاموس، القاموس الحر
local export = {}

local gline = require("Module:string/gline")
local table_insert = table.insert
local unpack = unpack or table.unpack -- Lua 5.2 compatibility

--- Hand the content of a wiki page to `gline` and return whatever it returns.
-- The result is consumed by a generic `for` in this file, so `gline` is
-- presumably a line-by-line iterator factory (verify against
-- Module:string/gline).
-- @param mod Full page title, e.g. "Module:Unicode data/raw/DerivedName.txt".
local function iterate_lines(mod)
	local page = mw.title.new(mod)
	local text = page.content
	return gline(text)
end

-- Codepoints that are always added back into the generated character class,
-- after the named codepoints have been removed from it.
-- Each entry is either a single codepoint or a `{ first, last }` inclusive
-- range; the list must stay a gap-free sequence because it is walked with
-- `ipairs`.
local exclude_chars = {
	-- Private use characters (U+E000-U+F8FF, U+F0000-U+FFFFD & U+100000-U+10FFFD).
	{ 0xE000, 0xF8FF },
	{ 0xF0000, 0xFFFFD },
	{ 0x100000, 0x10FFFD },
	-- High and low surrogates (U+D800-U+DFFF).
	{ 0xD800, 0xDFFF },
	-- C1 control characters (U+0080-U+009F).
	{ 0x0080, 0x009F },
	-- Soft hyphen (U+00AD).
	0x00AD,
	-- Directionality formatting characters (U+202A-U+202E & U+2066-U+2069).
	{ 0x202A, 0x202E },
	{ 0x2066, 0x2069 },
	-- Invisible mathematical operators (U+2061-U+2064).
	{ 0x2061, 0x2064 },
	-- Deprecated formatting characters (U+206A-U+206F & U+FEFF).
	{ 0x206A, 0x206F },
	0xFEFF,
	-- Interlinear/rubytext annotation for internal processing (U+FFF9-U+FFFB).
	{ 0xFFF9, 0xFFFB },
	-- Language tags (U+E0001 & U+E0020-U+E007F).
	0xE0001,
	{ 0xE0020, 0xE007F },

	-- Noncharacters: U+FDD0-U+FDEF plus the last two codepoints of each of
	-- the 17 planes (U+nFFFE and U+nFFFF for n = 0x0 .. 0x10).
	{ 0xFDD0, 0xFDEF },
	0xFFFE, 0xFFFF,
	0x1FFFE, 0x1FFFF,
	0x2FFFE, 0x2FFFF,
	0x3FFFE, 0x3FFFF,
	0x4FFFE, 0x4FFFF,
	0x5FFFE, 0x5FFFF,
	0x6FFFE, 0x6FFFF,
	0x7FFFE, 0x7FFFF,
	0x8FFFE, 0x8FFFF,
	0x9FFFE, 0x9FFFF,
	0xAFFFE, 0xAFFFF,
	0xBFFFE, 0xBFFFF,
	0xCFFFE, 0xCFFFF,
	0xDFFFE, 0xDFFFF,
	0xEFFFE, 0xEFFFF,
	0xFFFFE, 0xFFFFF,
	0x10FFFE, 0x10FFFF,
}

--- Mark every codepoint in the inclusive range [cp1, cp2] as present.
-- @param univ Table indexed by codepoint; entries are set to `true`.
-- @param cp1 First codepoint of the range.
-- @param cp2 Last codepoint of the range (nothing is written if cp2 < cp1).
local function add_range(univ, cp1, cp2)
	for codepoint = cp1, cp2 do
		univ[codepoint] = true
	end
end

--- Mark every codepoint in the inclusive range [cp1, cp2] as absent.
-- Entries are set to `false` rather than deleted, so the keys stay in the
-- table.
-- @param univ Table indexed by codepoint.
-- @param cp1 First codepoint of the range.
-- @param cp2 Last codepoint of the range (nothing is written if cp2 < cp1).
local function remove_range(univ, cp1, cp2)
	for codepoint = cp1, cp2 do
		univ[codepoint] = false
	end
end

--- Build a regex character class from the Unicode name data.
--
-- Every codepoint from U+0000 to U+10FFFF starts out included. Codepoints
-- listed at the start of a line in "Module:Unicode data/raw/DerivedName.txt"
-- (either a single hex value or a `XXXX..YYYY` range) are then dropped, and
-- finally everything in `exclude_chars` is put back in unconditionally.
--
-- @return A string of the form "[...]" made of `\x{HHHH}` escapes. Runs of
--   three or more consecutive codepoints are written as `\x{A}-\x{B}`, runs
--   of exactly two as both escapes side by side, and isolated codepoints as a
--   single escape.
function export.make_regex()
	-- data[cp] is true while cp is part of the output class.
	local data = {}
	-- Pending run of consecutive single-codepoint lines that has not been
	-- cleared from `data` yet; both are nil when no run is pending.
	local coalesce1 = nil
	local coalesce2 = nil
	
	add_range(data, 0, 0x10FFFF)
	
	-- Fetch codepoints
	for line in iterate_lines("Module:Unicode data/raw/DerivedName.txt") do
		-- Leading token of hex digits and dots; other lines (comments,
		-- blanks) yield nil and are ignored.
		local range = line:match("^(%x[%x.]+)")
		if range then
			local cp1, cp2 = range:match("^(%x%x+)%.%.(%x%x+)")
			if cp2 ~= nil then
				-- Explicit `first..last` range: clear it, then flush any
				-- pending run of single codepoints.
				cp1, cp2 = tonumber(cp1, 16), tonumber(cp2, 16)
				remove_range(data, cp1, cp2)
				if coalesce1 then
					remove_range(data, coalesce1, coalesce2)
				end
				coalesce1 = nil
				coalesce2 = nil
			else
				local codepoint = range:match("^%x%x+")
				if codepoint then
					-- Convert the captured hex digits, not the whole token:
					-- a token with stray trailing dots would otherwise turn
					-- into nil and break the arithmetic below.
					codepoint = tonumber(codepoint, 16)
					if not coalesce1 then
						coalesce1 = codepoint
					elseif coalesce2 + 1 ~= codepoint then
						-- Not adjacent to the pending run: flush it and
						-- start a new one here.
						remove_range(data, coalesce1, coalesce2)
						coalesce1 = codepoint
					end
					coalesce2 = codepoint
				end
			end
		end
	end
	-- Flush the run still pending after the last line.
	if coalesce1 ~= nil then
		remove_range(data, coalesce1, coalesce2)
	end
	
	-- Exclude codepoints
	for _, range in ipairs(exclude_chars) do
		if type(range) == "table" then
			add_range(data, unpack(range))
		else
			add_range(data, range, range)
		end
	end
	
	-- Combine ranges
	local tokens = {}
	coalesce1 = nil
	-- The loop runs one past U+10FFFF on purpose: data[0x110000] is nil, which
	-- closes a run that reaches the very last codepoint.
	for i = 0, 0x110000 do
		if data[i] then
			coalesce1 = coalesce1 or i
		elseif coalesce1 ~= nil then
			coalesce2 = i - 1
			if coalesce1 + 1 < coalesce2 then
				-- Three or more codepoints: hyphenated range.
				table_insert(tokens, ("\\x{%04X}-\\x{%04X}"):format(coalesce1, coalesce2))
			elseif coalesce1 < coalesce2 then
				-- Exactly two codepoints: list both, no hyphen.
				table_insert(tokens, ("\\x{%04X}\\x{%04X}"):format(coalesce1, coalesce2))
			else
				table_insert(tokens, ("\\x{%04X}"):format(coalesce1))
			end
			coalesce1 = nil
		end
	end
	
	return "[" .. table.concat(tokens, "") .. "]"
end

return export