模組:Jpan-sortkey

Ùi Wiktionary

可在模組:Jpan-sortkey/doc建立此模組的說明文件

-- Table returned by this module; the public functions are attached to it below.
local export = {}

-- Character-set fragments read from Module:ja/data/range. They are spliced
-- into "[...]" sets in the patterns further down.
local range_data = mw.loadData("Module:ja/data/range")
local kanji_pattern = range_data.kanji
local latin_pattern = range_data.latin

-- Short aliases for the mw.ustring functions used in this module.
local ustring = mw.ustring
local find, gsub = ustring.find, ustring.gsub
local toNFC, toNFD = ustring.toNFC, ustring.toNFD
local u = ustring.char

-- Byte-level Lua pattern for one UTF-8 encoded character: a NUL/ASCII or lead
-- byte followed by any number of continuation bytes.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

-- Builds a sort key from a (mostly kana) string.
--
-- Steps, in order:
--   1. Run the text through Module:ja's kata_to_hira (presumably katakana ->
--      hiragana; the vowel tables below only list hiragana) and decompose it
--      to NFD so that dakuten/handakuten are separate combining characters.
--   2. If the first character carries a dakuten or handakuten, drop the mark
--      and append one or two apostrophes to the end of the string instead.
--   3. Replace every long vowel mark "ー" with the vowel of the kana before it.
--   4. Replace "・" and "゠" with a space.
--   5. Hand the result to Module:Hani-sortkey; when that changes the string,
--      record tracking keys through Module:debug/track.
--
-- text: the string to convert.
-- lang: language code; forwarded to Module:Hani-sortkey and used for the
--       per-language tracking key. May be nil, in which case only the
--       generic tracking key is recorded.
-- sc:   forwarded unchanged to Module:Hani-sortkey.
-- Returns whatever Module:Hani-sortkey's makeSortKey returns.
function export.sortkey_from_string(text, lang, sc)
	text = toNFD(require("Module:ja").kata_to_hira(text))
	
	-- If the first character has dakuten, replace it with the corresponding character without dakuten and add an apostrophe to the end, e.g. がす > かす'
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x3099) .. "(.*)", "%1%2'")
	-- Similar thing, but with handakuten and two apostrophes, e.g. ぱす -> はす''
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x309A) .. "(.*)", "%1%2''")
	
	-- Replace the long vowel mark with the vowel that it stands for
	if text:find("ー", 1, true) then
		-- from[i] lists the kana whose vowel is to[i].
		local from = {
			"あぁかさたなはまやゃらわ",
			"いぃきしちにひみり",
			"うぅくすつぬふむゆゅる",
			"えぇけせてねへめれ",
			"おぉこそとのほもよょろ"
		}
		local to = {"あ", "い", "う", "え", "お"}
		-- Optional combining dakuten/handakuten between the kana and the mark.
		local dh = u(0x3099) .. u(0x309A)
		for i, v in ipairs(from) do
			local pattern = "([" .. v .. "][" .. dh .. "]?)ー"
			local replacement = "%1" .. to[i]
			-- One gsub pass only expands the first mark of a run such as
			-- "かーー" (matches cannot overlap), so repeat until nothing is
			-- left to replace. Each replacement removes one "ー", so this
			-- always terminates.
			local count
			repeat
				text, count = gsub(text, pattern, replacement)
			until count == 0
		end
	end
	
	text = gsub(text, "[・゠]", " ")
	
	local ret = require("Module:Hani-sortkey").makeSortKey(text, lang, sc)
	
	-- Module:Hani-sortkey altered the text (presumably because Han characters
	-- were still present), so note that the fallback was used.
	if ret ~= text then
		local keys = {"Jpan-sortkey/fallback"}
		-- Guard the concatenation: a nil lang would otherwise raise an error.
		if lang then
			keys[2] = "Jpan-sortkey/fallback/" .. lang
		end
		require("Module:debug/track")(keys)
	end
	
	return ret
end

-- Removes the characters " ", "-", ".", "^" and "%" from a reading taken from
-- a headword template. The outer parentheses drop gsub's replacement count.
local function strip_reading_markup(reading)
	return (reading:gsub("[ %-%.^%%]", ""))
end

-- Returns the part of `content` belonging to the level-2 section whose header
-- is `langname`, or nil when the page has no such section.
local function language_section(content, langname)
	-- Escape pattern magic characters so that a name containing e.g. "-" or
	-- "." is matched literally instead of being read as pattern syntax.
	local escaped = langname:gsub("%p", "%%%0")
	local first, last = content:find("%f[^%z%s]==%s*" .. escaped .. "%s*==")
	if not first then
		return nil
	end
	-- The section ends just before the next level-2 header, or at the end of
	-- the page when there is none.
	local next_header = content:find("%f[^%z%s]==[^\n=]+==", last)
	return content:sub(first, next_header and next_header - 1)
end

-- Scans the templates in `section` and returns the reading supplied by the
-- first headword template that has one, or nil when none does.
--   {{<lang>-pos}}                        -> second positional argument
--   {{head|<lang>|...|kana|<reading>}}    -> argument following "kana"
--   {{<lang>-noun}}, {{<lang>-verb}}, ... -> first positional argument
local function reading_from_headwords(section, lang, findTemplates)
	-- Built once per call instead of once per template.
	local pos_templates = {
		[lang .. "-noun"] = true,
		[lang .. "-verb"] = true,
		[lang .. "-adj"] = true,
		[lang .. "-phrase"] = true,
		[lang .. "-verb form"] = true,
		[lang .. "-verb-suru"] = true
	}
	for template, args in findTemplates(section) do
		if template == lang .. "-pos" then
			if args[2] then
				return strip_reading_markup(args[2])
			end
		elseif template == "head" then
			if args[1] == lang then
				for i, arg in ipairs(args) do
					-- Returning here also ends the template scan, so a later
					-- template can no longer overwrite the reading.
					if arg == "kana" and args[i + 1] then
						return args[i + 1]
					end
				end
			end
		elseif pos_templates[template] and args[1] then
			return strip_reading_markup(args[1])
		end
	end
	return nil
end

-- Assembles a reading for `text` from the arguments of a kanjitab template.
-- Positional argument i has the form "<kana><digits>": the reading and,
-- optionally, how many kanji it covers (default 1). args["k" .. i] replaces
-- the reading when present, args["o" .. i] is appended to it, and a resulting
-- reading of "-" counts as empty. The non-kanji text that follows the covered
-- kanji is appended after each reading.
-- Returns the concatenated string, which may be empty.
local function sortkey_from_kanjitab(text, args)
	-- non_kanji[1] is the text before the first kanji and non_kanji[k + 1]
	-- the text between the k-th kanji and the next one (or the end).
	local non_kanji = {}
	local border = 1
	gsub(text, "()[" .. kanji_pattern .. "々]()", function(from, to)
		non_kanji[#non_kanji + 1] = mw.ustring.sub(text, border, from - 1)
		border = to
	end)
	non_kanji[#non_kanji + 1] = mw.ustring.sub(text, border)
	
	local parts = {non_kanji[1]}
	local id = 1
	for i, arg in ipairs(args) do
		-- match() returns only the captures, so nothing is written to a
		-- global "_" placeholder.
		local kana, length = mw.ustring.match(arg, "^([^0-9]*)([0-9]*)$")
		local reading
		local span = 1
		if kana and kana ~= "" then
			span = tonumber(length) or 1
			reading = (args["k" .. i] or kana) .. (args["o" .. i] or "")
			if reading == "-" then
				reading = nil
			end
		end
		id = id + span
		parts[#parts + 1] = (reading or "") .. (non_kanji[id] or "")
	end
	return table.concat(parts)
end

-- Produces the sort key for `text` in language `lang`.
--
-- When `lang` is not "mul" and the text contains a digit, a kanji, a Latin
-- character or "々", the page titled `text` is read and its section for the
-- language is searched for a reading: first in headword templates, then in
-- the {{<lang>-kanjitab}} template. A kanjitab `sortkey` argument is returned
-- exactly as written. In every other case the (possibly replaced) text is
-- passed to export.sortkey_from_string.
--
-- text: page name / term to build the key for.
-- lang: language code.
-- sc:   forwarded unchanged to export.sortkey_from_string.
-- Returns the sort key string.
function export.makeSortKey(text, lang, sc)
	if lang ~= "mul" and find(text, "[0-9" .. kanji_pattern .. latin_pattern .. "々]") then
		local langname = require("Module:languages").getByCode(lang):getCanonicalName()
		-- mw.title.new returns nil for a string that is not a valid title.
		local title = mw.title.new(toNFC(text))
		local content = title and title:getContent()
		local section = content and language_section(content, langname)
		if section then
			local findTemplates = require("Module:templateparser").findTemplates
			local reading = reading_from_headwords(section, lang, findTemplates)
			if reading then
				text = reading
			else
				for template, args in findTemplates(section) do
					if template == lang .. "-kanjitab" then
						-- An explicit sortkey is used as is, without any of
						-- the normalisation done by sortkey_from_string.
						if args.sortkey then
							return args.sortkey
						end
						local sortkey = sortkey_from_kanjitab(text, args)
						if sortkey ~= "" then
							text = sortkey
						end
						-- Only the first kanjitab in the section is used.
						break
					end
				end
			end
		end
	end
	return export.sortkey_from_string(text, lang, sc)
end

-- Module result: the table carrying sortkey_from_string and makeSortKey.
return export